# Alpha 전략 feat.인호님

기존 데이터에 인호님 코드 합쳐 돌려보기

## Basic settings

### Import libraries

In [35]:
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor

from tqdm import tqdm

In [36]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

from sklearn.impute import SimpleImputer

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [37]:
## custom library

import eda_util as eutil
import submission_config as subconfig
import submission_util as subutil

In [38]:
pd.set_option('display.float_format', lambda x: f'{x:,g}')

In [39]:
BASE_PATH = subconfig.BASE_PATH
DATA_PATH = subconfig.DATA_PATH

OUTPUT_PATH = subconfig.OUTPUT_PATH

### Import data & preprocessing

In [40]:
krx_df = pd.read_csv(subconfig.krx_df_PATH)

In [41]:
krx_df.columns = ['date', 'code', 'name', 'volume', 'open', 'high', 'low', 'close']

In [42]:
krx_df['date'] = pd.to_datetime(krx_df['date'], format='%Y%m%d')

In [43]:
return_df = pd.read_pickle(subconfig.return_df_PATH)
close_df = pd.read_pickle(subconfig.adjclose_df_PATH)

In [44]:
open_df = pd.read_pickle(subconfig.adjopen_df_PATH)
high_df = pd.read_pickle(subconfig.adjhigh_df_PATH)
low_df = pd.read_pickle(subconfig.adjlow_df_PATH)

In [45]:
## date list

holidays = return_df.isnull().all(axis=1)
tradingdays = ~holidays

holidays = holidays.index[holidays]
tradingdays = tradingdays.index[tradingdays]

In [46]:
TRAIN_START = pd.to_datetime(subconfig.TRAIN_START, format='%Y-%m-%d')
SIMOS_END = pd.to_datetime(subconfig.SIMOS_END, format='%Y-%m-%d')

In [47]:
tradingdays = tradingdays[(tradingdays >= TRAIN_START) & (tradingdays <= SIMOS_END)]

In [48]:
dacon_sid_list = [ii[1:] for ii in krx_df['code'].unique()]

In [49]:
return_df = return_df.loc[tradingdays, :].dropna(axis='columns', how='all')
return_df = return_df.loc[:, dacon_sid_list]

close_df = close_df.loc[tradingdays, :].dropna(axis='columns', how='all')
close_df = close_df.loc[:, dacon_sid_list]

In [50]:
open_df = open_df.loc[tradingdays, :].dropna(axis='columns', how='all')
open_df = open_df.loc[:, dacon_sid_list]

high_df = high_df.loc[tradingdays, :].dropna(axis='columns', how='all')
high_df = high_df.loc[:, dacon_sid_list]

low_df = low_df.loc[tradingdays, :].dropna(axis='columns', how='all')
low_df = low_df.loc[:, dacon_sid_list]

In [51]:
SIMOS_START = subconfig.SIMOS_START
# simOS_END = subconfig.SIMOS_END

### Import additional data

In [52]:
volume_df = pd.read_pickle(subconfig.volume_df_PATH)
dollarvolume_df = pd.read_pickle(subconfig.dollarvolume_df_PATH)
marketcap_df = pd.read_pickle(subconfig.marketcap_df_PATH)
market_cat_df = pd.read_pickle(DATA_PATH / 'market_cat_df_20140101_20230705.pickle')

In [53]:
volume_df = volume_df.loc[tradingdays, :].dropna(axis='columns', how='all')
volume_df = volume_df.loc[:, dacon_sid_list]

dollarvolume_df = dollarvolume_df.loc[tradingdays, :].dropna(axis='columns', how='all')
dollarvolume_df = dollarvolume_df.loc[:, dacon_sid_list]

marketcap_df = marketcap_df.loc[tradingdays, :].dropna(axis='columns', how='all')
marketcap_df = marketcap_df.loc[:, dacon_sid_list]

In [54]:
volume_df = volume_df.shift(1)
dollarvolume_df = dollarvolume_df.shift(1)
marketcap_df = marketcap_df.shift(1)

In [55]:
market_cat_inrange = market_cat_df[market_cat_df['trdDd'].isin(tradingdays)]

In [56]:
KOSPI_sid_list = market_cat_inrange[market_cat_inrange['is_KOSPI'] == True]['ISU_SRT_CD'].unique()
KOSDAQ_sid_list = market_cat_inrange[market_cat_inrange['is_KOSDAQ'] == True]['ISU_SRT_CD'].unique()
KONEX_sid_list = market_cat_inrange[market_cat_inrange['is_KONEX'] == True]['ISU_SRT_CD'].unique()

### Parameters

In [57]:
PORTFOLIO_DATE = subconfig.PORTFOLIO_DATE

RDVADV_WINDOW = subconfig.WINDOWS['rdvadv'] # 20

## Alphas

### Integrating my data with Inho's code

In [58]:
# Your function to calculate SMAPE
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

In [59]:
results_df = pd.DataFrame(columns=['code'] + ['return_day_' + str(i) for i in range(1, 16)])

preds_df_fin_xgb = pd.DataFrame()
smapes_df_fin_xgb = pd.DataFrame()

preds_df_fin_lgbm = pd.DataFrame()
smapes_df_fin_lgbm = pd.DataFrame()

preds_df_fin_catboost = pd.DataFrame()
smapes_df_fin_catboost = pd.DataFrame()

In [60]:
code = '005930'

In [64]:
train_close = pd.DataFrame(
    data={
        'open': open_df.loc[TRAIN_START:SIMOS_START, code],
        'high': high_df.loc[TRAIN_START:SIMOS_START, code],
        'low': low_df.loc[TRAIN_START:SIMOS_START, code],
        'close': close_df.loc[TRAIN_START:SIMOS_START, code],
        'dollarvolume': dollarvolume_df.loc[TRAIN_START:SIMOS_START, code],
        'marketcap': marketcap_df.loc[TRAIN_START:SIMOS_START, code],
    }
    )

In [65]:
train_close

Unnamed: 0_level_0,open,high,low,close,dollarvolume,marketcap
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-06-01,80500,81300,80100,80600,,
2021-06-02,80400,81400,80300,80800,1.13546e+12,4.81164e+14
2021-06-03,81300,83000,81100,82800,1.32771e+12,4.82358e+14
2021-06-04,82700,82700,81500,82200,2.43812e+12,4.94298e+14
2021-06-07,82700,82800,81600,81900,1.48779e+12,4.90716e+14
...,...,...,...,...,...,...
2023-05-24,68100,68700,68000,68500,5.85923e+11,4.08333e+14
2023-05-25,69900,70000,68700,68800,5.60469e+11,4.0893e+14
2023-05-26,69800,70400,69500,70300,9.84569e+11,4.10721e+14
2023-05-30,71300,72300,71200,72300,1.37067e+12,4.19676e+14


In [67]:
# Iterate over each unique stock
for code in tqdm(dacon_sid_list):
    
    # Filter by stock code
    # Note: All prices are adjusted
    # TODO: Add normalized rdvadv signal to the columns

    train_close = pd.DataFrame(
        data={
            'open': open_df.loc[TRAIN_START:SIMOS_START, code],
            'high': high_df.loc[TRAIN_START:SIMOS_START, code],
            'low': low_df.loc[TRAIN_START:SIMOS_START, code],
            'close': close_df.loc[TRAIN_START:SIMOS_START, code],
            'dollarvolume': dollarvolume_df.loc[TRAIN_START:SIMOS_START, code],
            'marketcap': marketcap_df.loc[TRAIN_START:SIMOS_START, code],
        }
        )
    train_close = train_close.iloc[1:, :]

    # Store original data for reference
    original_data = train_close.copy()

    # Create return columns for each day
    returns = []
    smapes_xgb = []
    smapes_lgbm = []
    smapes_catboost = []

    preds_df_xgb = pd.DataFrame()
    preds_df_lgbm = pd.DataFrame()
    preds_df_catboost = pd.DataFrame()
    
    # For each day from 1 to 15
    for day in range(1, 16):
        # Scale data
        X = train_close[:]
        y = train_close['close']
        
        scaler = MinMaxScaler(feature_range=(-1, 1))
        data_scaled = scaler.fit_transform(X)
        data_scaled2 = y
        
        X_train = data_scaled[:-day]
        y_train = data_scaled2[day:]
        X_test = data_scaled[-day]
        
        X_train = X_train[:int(len(X_train) * 0.9)]
        X_val = X_train[int(len(X_train) * 0.9):]
        y_train = y_train[:int(len(y_train) * 0.9)]
        y_val = y_train[int(len(y_train) * 0.9):] 

        # Train XGBoost
        xgb_model = XGBRegressor()
        xgb_model.fit(X_train, y_train)
        vals_xgb = xgb_model.predict(X_val)
        smapes_xgb.append(smape(y_val, vals_xgb))
        
        preds_xgb = xgb_model.predict([data_scaled[-day]])
        preds_df_xgb = pd.concat([preds_df_xgb, pd.DataFrame(preds_xgb)], axis = 0)

        # Train LightGBM
        lgbm_model = LGBMRegressor()
        lgbm_model.fit(X_train, y_train)
        vals_lgbm = lgbm_model.predict(X_val)
        smapes_lgbm.append(smape(y_val, vals_lgbm))
        
        preds_lgbm = lgbm_model.predict([data_scaled[-day]])
        preds_df_lgbm = pd.concat([preds_df_lgbm, pd.DataFrame(preds_lgbm)], axis = 0)
    
    smapes_df_xgb = pd.DataFrame(smapes_xgb)
    smapes_df_lgbm = pd.DataFrame(smapes_lgbm)
    smapes_df_catboost = pd.DataFrame(smapes_catboost)

    preds_df_fin_xgb = pd.concat([preds_df_fin_xgb, preds_df_xgb], axis = 1)
    smapes_df_fin_xgb = pd.concat([smapes_df_fin_xgb, smapes_df_xgb], axis = 1)

    preds_df_fin_lgbm = pd.concat([preds_df_fin_lgbm, preds_df_lgbm], axis = 1)
    smapes_df_fin_lgbm = pd.concat([smapes_df_fin_lgbm, smapes_df_lgbm], axis = 1)

100%|██████████| 2000/2000 [42:10<00:00,  1.27s/it] 


In [1]:
smapes_df_xgb.to_pickle(OUTPUT_PATH / 'smapes_df_xgb.pickle')

NameError: name 'smapes_df_xgb' is not defined

In [68]:
final = np.zeros((len(smapes_df_fin_xgb), len(smapes_df_fin_xgb.columns)))

for i in range(0, len(smapes_df_fin_xgb.columns)):
    for j in range(0, len(smapes_df_fin_xgb)):
        weights = [1 / smapes_df_fin_xgb.iloc[j:j+1, i].values[0],
                   
                   1 / smapes_df_fin_lgbm.iloc[j:j+1, i].values[0]]
        

        weights /= np.sum(weights) 
        final[j][i] = weights[0] * preds_df_fin_xgb.iloc[j:j+1, i].values[0] \
                            + weights[1] * preds_df_fin_lgbm.iloc[j:j+1, i].values[0] \
                            + weights[2] * preds_df_fin_lgbm.iloc[j:j+1, i].values[0]

IndexError: index 2 is out of bounds for axis 0 with size 2

In [None]:
final_df = pd.DataFrame(final)
final_values = pd.DataFrame((final_df.iloc[-1] - final_df.iloc[0]) / final_df.iloc[0])
final_values_sharpe = -pd.DataFrame(((final_df.iloc[-1] - final_df.iloc[0]) / final_df.iloc[0]) / final_df.pct_change().std())

In [None]:
final_values.index = dacon_sid_list
final_values.columns = ['VALUE']
final_values.reset_index(inplace = True)
final_values.columns = ['종목코드', 'VALUE']