In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install ../input/talib-package/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl
import talib as ta 

Processing /kaggle/input/talib-package/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl
Installing collected packages: talib-binary
Successfully installed talib-binary-0.4.19
[0m

In [3]:
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
import missingno as msno

import statsmodels.api as sm
from pylab import rcParams

from tqdm import tqdm

# import ta
from talib import abstract

In [4]:
# Load the JPX training price history; the displayed rows are keyed by RowId = <date>_<SecuritiesCode>.
data = pd.read_csv('/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')
# display(data)
# Independent copy that is later passed to add_features as the training input.
train = data.copy()

In [6]:
# Date-window check: show the rows dated later than (last row's date - 100 days).
dates = pd.to_datetime(data['Date'])
lastday = dates.iloc[-1] - pd.DateOffset(100)
data.loc[dates > lastday]

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
2196531,20210826_1301,2021-08-26,1301,3015.0,3020.0,2990.0,3020.0,7300,1.0,,False,0.011609
2196532,20210826_1332,2021-08-26,1332,597.0,602.0,592.0,600.0,874600,1.0,,False,0.020101
2196533,20210826_1333,2021-08-26,1333,2489.0,2500.0,2462.0,2494.0,155700,1.0,,False,0.019584
2196534,20210826_1375,2021-08-26,1375,1621.0,1634.0,1610.0,1634.0,65300,1.0,,False,-0.000624
2196535,20210826_1376,2021-08-26,1376,1509.0,1509.0,1486.0,1504.0,5100,1.0,,False,0.001333
...,...,...,...,...,...,...,...,...,...,...,...,...
2332526,20211203_9990,2021-12-03,9990,514.0,528.0,513.0,528.0,44200,1.0,,False,0.034816
2332527,20211203_9991,2021-12-03,9991,782.0,794.0,782.0,794.0,35900,1.0,,False,0.025478
2332528,20211203_9993,2021-12-03,9993,1690.0,1690.0,1645.0,1645.0,7200,1.0,,False,-0.004302
2332529,20211203_9994,2021-12-03,9994,2388.0,2396.0,2380.0,2389.0,6500,1.0,,False,0.009098


In [None]:
def add_features(input_df, train=True):
    """Add candlestick-shadow and TA-Lib features, computed separately per security.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Price rows containing at least ``SecuritiesCode``, ``Date``, ``Open``,
        ``High``, ``Low`` and ``Close``. Rows of one security are processed in
        the order they appear in ``input_df``.
    train : bool, default True
        When True every row of each security is used. When False only the
        last ``max(EMA periods)`` rows of each security are kept before the
        indicators are computed (inference-time windowing).

    Returns
    -------
    pandas.DataFrame
        All securities stacked in ascending ``SecuritiesCode`` order with a
        fresh 0..N-1 index, the original columns plus ``upper_shadow``,
        ``lower_shadow``, ``RSI``, ``EMA7``/``EMA15``/``EMA30``/``EMA90`` and
        one column per TA-Lib "Pattern Recognition" function.

    Notes
    -----
    * Missing values are forward-filled before the indicators are computed and
      back-filled afterwards. The back-fill fabricates indicator values for
      the warm-up rows; the original author flagged this as something to test
      and possibly remove.
    * Both fills run over every column of the frame, not just the indicators.
    """
    ema_periods = (7, 15, 30, 90)
    # Longest look-back used below; defines the inference-time window.
    max_window = max(ema_periods)

    df_list = []
    # groupby(sort=True) visits securities in ascending code order and keeps
    # each group's original row order, replacing one full boolean scan per code.
    for _, group in tqdm(input_df.groupby('SecuritiesCode', sort=True)):
        if not train:
            # At inference, cut each security down to the longest window (rows, not columns).
            group = group.iloc[-max_window:]
        df = group.reset_index(drop=True)

        df['Date'] = pd.to_datetime(df['Date'])

        # shadows
        df['upper_shadow'] = df['High'] - np.maximum(df['Open'], df['Close'])
        df['lower_shadow'] = np.minimum(df['Open'], df['Close']) - df['Low']

        # lagged features (not implemented yet)
        # Data is daily, so candidates are 7, 30, 180 and 360 days back.
        # lagged close, target (open question from the author: what exactly is Target - a return?)

        # Fill missing values before computing window-based features.
        df = df.ffill()

        # TA-lib features - RSI (library default period), EMA 7-90
        df['RSI'] = ta.RSI(df['Close'])
        for period in ema_periods:
            df[f'EMA{period}'] = ta.EMA(df['Close'], period)

        for indicator in ta.get_function_groups()['Pattern Recognition']:
            df[str(indicator)] = getattr(ta, str(indicator))(df.Open, df.High, df.Low, df.Close)

        # Back-fill the indicator warm-up rows. These values are not real
        # observations, so evaluate the effect and consider dropping them instead.
        df = df.bfill()

        # volatility (not implemented yet)

        df_list.append(df)

    # ignore_index gives the stacked frame a unique index; per-security indices
    # would otherwise repeat and break column-wise concatenation downstream.
    df_feature_added = pd.concat(df_list, ignore_index=True)

    return df_feature_added


In [None]:
# test
# Smoke test: build the feature frame from the training copy and display it.
df_added = add_features(train)
df_added

In [None]:
# Select one security here so this cell does not depend on a later cell defining df_ta.
df_ta = df_added[df_added.SecuritiesCode == 1301]
# Rows whose EMA90 equals the first row's EMA90.
# Author's note: with period = p, the p-th EMA value is computed from the preceding p-1 data points.
df_ta[df_ta.EMA90 == df_ta.iloc[0].EMA90]

In [None]:
# Close price against its exponential moving averages for a single security.
df_ta = df_added[df_added.SecuritiesCode == 1301]

fig, ax = plt.subplots()
# Each line carries a label; without labels the legend would be empty.
for column in ['Close', 'EMA7', 'EMA15', 'EMA30', 'EMA90']:
    ax.plot(df_ta['Date'], df_ta[column], label=column)
ax.set(title='SecuritiesCode 1301: Close vs. EMA', xlabel='Date', ylabel='Price')
ax.legend();

In [None]:
def preprocess_train(df):
    """Fit the encoders on a feature frame and build the training matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``add_features``; must contain ``Target``, the ordinal,
        scaled and ``CDL*`` pattern columns used below, and the four metadata
        columns that are dropped.

    Returns
    -------
    dfc_scaled : pandas.DataFrame
        Ordinal-encoded ``Date``/``SecuritiesCode``, standard-scaled numeric
        features and the raw ``CDL*`` pattern columns, on a 0..N-1 index.
    y : pandas.DataFrame
        Single ``Target`` column on the same 0..N-1 index.
    list
        ``[ordinal, stdsc]`` - the fitted encoder and scaler, in the order
        ``preprocess_inference`` expects.
    """
    # remove columns - Date removed temporarily
    # The index is reset because the encoded/scaled arrays below are wrapped in
    # fresh 0..N-1 frames; concatenating them column-wise with a frame that
    # keeps a repeated or non-range index would fail or misalign rows.
    dfc = (df.drop(columns=['RowId', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag'])
             .reset_index(drop=True))
    stdsc = StandardScaler()
    # Categories unseen during fit (e.g. a new date at inference) are encoded
    # as -1 by transform() instead of raising; fit_transform output is unchanged.
    ordinal = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

    target = ['Target']
    ord_features = ['Date', 'SecuritiesCode']
    scaled_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'upper_shadow', 'lower_shadow',
                      'RSI', 'EMA7', 'EMA15', 'EMA30', 'EMA90']
    # Read the pattern columns from the argument rather than a notebook global.
    pattern_features = [c for c in dfc.columns if c.startswith('CDL')]

    date_code_ord = ordinal.fit_transform(dfc.loc[:, ord_features])
    scaled = stdsc.fit_transform(dfc.loc[:, scaled_features])
    dfc_scaled = pd.concat([pd.DataFrame(date_code_ord, columns=ord_features),
                            pd.DataFrame(scaled, columns=scaled_features),
                            dfc.loc[:, pattern_features]], axis=1)

    y = dfc.loc[:, target]
    return dfc_scaled, y, [ordinal, stdsc]
    

# Build the training matrix and keep the fitted [ordinal, stdsc] pair for inference.
X_scaled, y, trained_scalers = preprocess_train(df_added)

X_scaled

In [None]:
def preprocess_inference(df, trained_scalers: list):
    """Apply already-fitted encoders to a feature frame (no fitting, no target).

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``add_features`` for the rows to score.
    trained_scalers : list
        ``[ordinal, stdsc]`` as returned by ``preprocess_train``.

    Returns
    -------
    pandas.DataFrame
        Ordinal-encoded ``Date``/``SecuritiesCode``, standard-scaled numeric
        features and the raw ``CDL*`` pattern columns, on a 0..N-1 index whose
        row order matches ``df``.
    """
    ordinal = trained_scalers[0]
    stdsc = trained_scalers[1]

    # remove columns - Date removed temporarily
    # The index is reset because the transformed arrays below are wrapped in
    # fresh 0..N-1 frames; a repeated or non-range index on the input would
    # otherwise break or misalign the column-wise concat.
    dfc = (df.drop(columns=['RowId', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag'])
             .reset_index(drop=True))

    ord_features = ['Date', 'SecuritiesCode']
    scaled_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'upper_shadow', 'lower_shadow',
                      'RSI', 'EMA7', 'EMA15', 'EMA30', 'EMA90']
    # Read the pattern columns from the argument rather than a notebook global.
    pattern_features = [c for c in dfc.columns if c.startswith('CDL')]

    # NOTE(review): OrdinalEncoder.transform raises on values not seen during
    # fit unless the encoder was created with handle_unknown='use_encoded_value';
    # dates newer than the training data hit this path - confirm encoder config.
    date_code_ord = ordinal.transform(dfc.loc[:, ord_features])
    scaled = stdsc.transform(dfc.loc[:, scaled_features])
    dfc_scaled = pd.concat([pd.DataFrame(date_code_ord, columns=ord_features),
                            pd.DataFrame(scaled, columns=scaled_features),
                            dfc.loc[:, pattern_features]], axis=1)

    return dfc_scaled
    

# X_test_scaled = preprocess_inference(df_added, trained_scalers)

# X_test_scaled

In [None]:
# base model - lgbm
# LightGBM regressor with library-default hyper-parameters, fitted on the full training matrix.
lgb = LGBMRegressor().fit(X_scaled, y)



In [None]:
# using supplement data as test data
# Load first: every statement below depends on supp_data being defined.
supp_data = pd.read_csv('/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv')
display(supp_data)

# Longest indicator look-back (EMA90).
EMA_MAX = 90

# Training history followed by the supplemental rows, on a fresh 0..N-1 index.
data_for_inference = pd.concat([data, supp_data]).reset_index(drop=True)

# Sanity checks: the day after the first supplemental date, then the last 90 combined rows.
display(pd.to_datetime(supp_data['Date'])[0] + pd.DateOffset(1))
data_for_inference.iloc[-90:]

In [None]:
# Competition API: create the environment and the iterator consumed by the prediction loop.
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
# Rolling price history: starts as train + supplemental prices and grows by one
# batch per iteration. A separate name keeps the loaded `data` frame intact.
history = data_for_inference
# Calendar-day look-back kept in the rolling window.
LOOKBACK_DAYS = 100

# iter_test yields a 6-tuple per step; no counter is needed once `history` is seeded above.
for prices, options, financials, trades, secondary_prices, sample_prediction in iter_test:
    # Merge with the earlier data and keep only the most recent window.
    current_date = pd.to_datetime(prices['Date']).max()  # scalar cutoff base, not a Series
    lastday = current_date - pd.DateOffset(LOOKBACK_DAYS)
    history = pd.concat([history, prices], ignore_index=True)
    history = history.loc[pd.to_datetime(history['Date']) > lastday]

    # Unique 0..N-1 index so feature rows and preprocessed rows line up by position.
    feat = add_features(history).reset_index(drop=True)
    X = preprocess_inference(feat, trained_scalers)

    # Score only the rows for the date being predicted, keyed by security code.
    is_today = (pd.to_datetime(feat['Date']) == current_date).to_numpy()
    pred_by_code = pd.Series(lgb.predict(X.loc[is_today]),
                             index=feat.loc[is_today, 'SecuritiesCode'].to_numpy())

    # Align to the submission rows by code rather than by position; a code
    # without a prediction falls back to the lowest predicted value.
    aligned = sample_prediction['SecuritiesCode'].map(pred_by_code)
    aligned = aligned.fillna(aligned.min())
    # method='first' breaks ties, so ranks are the distinct integers 0..N-1.
    sample_prediction['Rank'] = (aligned.rank(method='first', ascending=False) - 1).astype(int)
    env.predict(sample_prediction)