# Alpha 전략 feat.인호님

기존 데이터에 인호님 코드 합쳐 돌려보기

## Basic settings

### Import libraries

In [1]:
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor

from tqdm import tqdm

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

from sklearn.impute import SimpleImputer

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [3]:
## custom library

import eda_util as eutil
import submission_config as subconfig
import submission_util as subutil

In [4]:
pd.set_option('display.float_format', lambda x: f'{x:,g}')

In [5]:
BASE_PATH = subconfig.BASE_PATH
DATA_PATH = subconfig.DATA_PATH

OUTPUT_PATH = subconfig.OUTPUT_PATH

### Import data & preprocessing

In [6]:
krx_df = pd.read_csv(subconfig.krx_df_PATH)

In [7]:
krx_df.columns = ['date', 'code', 'name', 'volume', 'open', 'high', 'low', 'close']

In [8]:
krx_df['date'] = pd.to_datetime(krx_df['date'], format='%Y%m%d')

In [9]:
return_df = pd.read_pickle(subconfig.return_df_PATH)
close_df = pd.read_pickle(subconfig.adjclose_df_PATH)

In [10]:
open_df = pd.read_pickle(subconfig.adjopen_df_PATH)
high_df = pd.read_pickle(subconfig.adjhigh_df_PATH)
low_df = pd.read_pickle(subconfig.adjlow_df_PATH)

In [11]:
## date list

holidays = return_df.isnull().all(axis=1)
tradingdays = ~holidays

holidays = holidays.index[holidays]
tradingdays = tradingdays.index[tradingdays]

In [12]:
TRAIN_START = pd.to_datetime(subconfig.TRAIN_START, format='%Y-%m-%d')
SIMOS_END = pd.to_datetime(subconfig.SIMOS_END, format='%Y-%m-%d')

In [13]:
tradingdays = tradingdays[(tradingdays >= TRAIN_START) & (tradingdays <= SIMOS_END)]

In [14]:
dacon_sid_list = [ii[1:] for ii in krx_df['code'].unique()]

In [15]:
return_df = return_df.loc[tradingdays, :].dropna(axis='columns', how='all')
return_df = return_df.loc[:, dacon_sid_list]

close_df = close_df.loc[tradingdays, :].dropna(axis='columns', how='all')
close_df = close_df.loc[:, dacon_sid_list]

In [16]:
open_df = open_df.loc[tradingdays, :].dropna(axis='columns', how='all')
open_df = open_df.loc[:, dacon_sid_list]

high_df = high_df.loc[tradingdays, :].dropna(axis='columns', how='all')
high_df = high_df.loc[:, dacon_sid_list]

low_df = low_df.loc[tradingdays, :].dropna(axis='columns', how='all')
low_df = low_df.loc[:, dacon_sid_list]

In [17]:
SIMOS_START = subconfig.SIMOS_START
# simOS_END = subconfig.SIMOS_END

### Import additional data

In [18]:
volume_df = pd.read_pickle(subconfig.volume_df_PATH)
dollarvolume_df = pd.read_pickle(subconfig.dollarvolume_df_PATH)
marketcap_df = pd.read_pickle(subconfig.marketcap_df_PATH)
market_cat_df = pd.read_pickle(DATA_PATH / 'market_cat_df_20140101_20230705.pickle')

In [19]:
volume_df = volume_df.loc[tradingdays, :].dropna(axis='columns', how='all')
volume_df = volume_df.loc[:, dacon_sid_list]

dollarvolume_df = dollarvolume_df.loc[tradingdays, :].dropna(axis='columns', how='all')
dollarvolume_df = dollarvolume_df.loc[:, dacon_sid_list]

marketcap_df = marketcap_df.loc[tradingdays, :].dropna(axis='columns', how='all')
marketcap_df = marketcap_df.loc[:, dacon_sid_list]

In [20]:
volume_df = volume_df.shift(1)
dollarvolume_df = dollarvolume_df.shift(1)
marketcap_df = marketcap_df.shift(1)

In [21]:
market_cat_inrange = market_cat_df[market_cat_df['trdDd'].isin(tradingdays)]

In [22]:
KOSPI_sid_list = market_cat_inrange[market_cat_inrange['is_KOSPI'] == True]['ISU_SRT_CD'].unique()
KOSDAQ_sid_list = market_cat_inrange[market_cat_inrange['is_KOSDAQ'] == True]['ISU_SRT_CD'].unique()
KONEX_sid_list = market_cat_inrange[market_cat_inrange['is_KONEX'] == True]['ISU_SRT_CD'].unique()

### Parameters

In [23]:
PORTFOLIO_DATE = subconfig.PORTFOLIO_DATE

RDVADV_WINDOW = subconfig.WINDOWS['rdvadv'] # 20

## Alphas

### Integrating my data with Inho's code

In [24]:
# Your function to calculate SMAPE
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

In [25]:
results_df = pd.DataFrame(columns=['code'] + ['return_day_' + str(i) for i in range(1, 16)])

preds_df_fin_xgb = pd.DataFrame()
smapes_df_fin_xgb = pd.DataFrame()

preds_df_fin_lgbm = pd.DataFrame()
smapes_df_fin_lgbm = pd.DataFrame()

preds_df_fin_catboost = pd.DataFrame()
smapes_df_fin_catboost = pd.DataFrame()

In [26]:
code = '005930'

In [27]:
train_close = pd.DataFrame(
    data={
        'open': open_df.loc[TRAIN_START:SIMOS_START, code],
        'high': high_df.loc[TRAIN_START:SIMOS_START, code],
        'low': low_df.loc[TRAIN_START:SIMOS_START, code],
        'close': close_df.loc[TRAIN_START:SIMOS_START, code],
        'dollarvolume': dollarvolume_df.loc[TRAIN_START:SIMOS_START, code],
        'marketcap': marketcap_df.loc[TRAIN_START:SIMOS_START, code],
    }
    )

In [28]:
train_close

Unnamed: 0_level_0,open,high,low,close,dollarvolume,marketcap
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-06-01,80500,81300,80100,80600,,
2021-06-02,80400,81400,80300,80800,1.13546e+12,4.81164e+14
2021-06-03,81300,83000,81100,82800,1.32771e+12,4.82358e+14
2021-06-04,82700,82700,81500,82200,2.43812e+12,4.94298e+14
2021-06-07,82700,82800,81600,81900,1.48779e+12,4.90716e+14
...,...,...,...,...,...,...
2023-05-24,68100,68700,68000,68500,5.85923e+11,4.08333e+14
2023-05-25,69900,70000,68700,68800,5.60469e+11,4.0893e+14
2023-05-26,69800,70400,69500,70300,9.84569e+11,4.10721e+14
2023-05-30,71300,72300,71200,72300,1.37067e+12,4.19676e+14


아래 코드는 Ryzen 5 5600X 6 Core (CPU 12) 로 돌렸을 때 

40분 가량 걸림. 

Windows에서 GPU 연산은 활용하기 어려움. 

- XGB: conda는 지원안함, Windows는 version conflict 남
- LGBM: Linux만 지원

In [29]:
# Iterate over each unique stock
for code in tqdm(dacon_sid_list):
    
    # Filter by stock code
    # Note: All prices are adjusted
    # TODO: Add normalized rdvadv signal to the columns

    train_close = pd.DataFrame(
        data={
            'open': open_df.loc[TRAIN_START:SIMOS_START, code],
            'high': high_df.loc[TRAIN_START:SIMOS_START, code],
            'low': low_df.loc[TRAIN_START:SIMOS_START, code],
            'close': close_df.loc[TRAIN_START:SIMOS_START, code],
            'dollarvolume': dollarvolume_df.loc[TRAIN_START:SIMOS_START, code],
            'marketcap': marketcap_df.loc[TRAIN_START:SIMOS_START, code],
        }
        )
    train_close = train_close.iloc[1:, :] # nan 있는 1st row 제거

    # Store original data for reference
    original_data = train_close.copy()

    # Create return columns for each day
    returns = []
    smapes_xgb = []
    smapes_lgbm = []
    smapes_catboost = []

    preds_df_xgb = pd.DataFrame()
    preds_df_lgbm = pd.DataFrame()
    preds_df_catboost = pd.DataFrame()
    
    # For each day from 1 to 15
    for day in range(1, 16):
        # Scale data
        X = train_close[:]
        y = train_close['close']
        
        scaler = MinMaxScaler(feature_range=(-1, 1))
        data_scaled = scaler.fit_transform(X)
        data_scaled2 = y
        
        X_train = data_scaled[:-day]
        y_train = data_scaled2[day:]
        X_test = data_scaled[-day]
        
        X_train = X_train[:int(len(X_train) * 0.9)]
        X_val = X_train[int(len(X_train) * 0.9):]
        y_train = y_train[:int(len(y_train) * 0.9)]
        y_val = y_train[int(len(y_train) * 0.9):] 

        # Train XGBoost
        xgb_model = XGBRegressor()
        xgb_model.fit(X_train, y_train)
        vals_xgb = xgb_model.predict(X_val)
        smapes_xgb.append(smape(y_val, vals_xgb))
        
        preds_xgb = xgb_model.predict([data_scaled[-day]])
        preds_df_xgb = pd.concat([preds_df_xgb, pd.DataFrame(preds_xgb)], axis = 0)

        # Train LightGBM
        lgbm_model = LGBMRegressor()
        lgbm_model.fit(X_train, y_train)
        vals_lgbm = lgbm_model.predict(X_val)
        smapes_lgbm.append(smape(y_val, vals_lgbm))
        
        preds_lgbm = lgbm_model.predict([data_scaled[-day]])
        preds_df_lgbm = pd.concat([preds_df_lgbm, pd.DataFrame(preds_lgbm)], axis = 0)
    
    smapes_df_xgb = pd.DataFrame(smapes_xgb)
    smapes_df_lgbm = pd.DataFrame(smapes_lgbm)
    smapes_df_catboost = pd.DataFrame(smapes_catboost)

    preds_df_fin_xgb = pd.concat([preds_df_fin_xgb, preds_df_xgb], axis = 1)
    smapes_df_fin_xgb = pd.concat([smapes_df_fin_xgb, smapes_df_xgb], axis = 1)

    preds_df_fin_lgbm = pd.concat([preds_df_fin_lgbm, preds_df_lgbm], axis = 1)
    smapes_df_fin_lgbm = pd.concat([smapes_df_fin_lgbm, smapes_df_lgbm], axis = 1)

100%|██████████| 2000/2000 [41:24<00:00,  1.24s/it]


In [30]:
smapes_df_xgb.to_pickle(OUTPUT_PATH / 'smapes_df_xgb.pickle')
smapes_df_lgbm.to_pickle(OUTPUT_PATH / 'smapes_df_lgbm.pickle')
smapes_df_catboost.to_pickle(OUTPUT_PATH / 'smapes_df_catboost.pickle')

preds_df_fin_xgb.to_pickle(OUTPUT_PATH / 'preds_df_fin_xgb.pickle')
smapes_df_fin_xgb.to_pickle(OUTPUT_PATH / 'smapes_df_fin_xgb.pickle')

preds_df_fin_lgbm.to_pickle(OUTPUT_PATH / 'preds_df_fin_lgbm.pickle')
smapes_df_fin_lgbm.to_pickle(OUTPUT_PATH / 'smapes_df_fin_lgbm.pickle')

In [33]:
smapes_df_fin_xgb

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19
0,0.0699052,0.058065,0.117029,0.124798,0.0917049,0.105412,0.0988542,0.0438919,0.134958,0.0778532,...,0.0884825,0.134007,0.186222,0.0863442,0.0883335,0.104126,0.0727708,0.103358,0.103467,0.0778086
1,0.13463,0.0741976,0.159751,0.149066,0.200484,0.104945,0.156881,0.0465584,0.142274,0.0651934,...,0.109797,0.183278,0.190375,0.149862,0.130703,0.14417,0.112534,0.174479,0.158781,0.124014
2,0.198049,0.106346,0.160569,0.163689,0.140981,0.120253,0.139048,0.0579159,0.164267,0.130925,...,0.0956133,0.223187,0.174518,0.153077,0.10617,0.161914,0.135345,0.225823,0.172003,0.157455
3,0.11168,0.12312,0.152484,0.189295,0.209138,0.171138,0.168657,0.0621615,0.190347,0.0835519,...,0.100259,0.144588,0.221179,0.1226,0.130515,0.234942,0.194321,0.183389,0.179422,0.181259
4,0.129518,0.142591,0.256707,0.208979,0.185799,0.156203,0.172054,0.0888665,0.205213,0.100602,...,0.144013,0.160576,0.259496,0.142788,0.118669,0.134371,0.148486,0.174892,0.138403,0.196364
5,0.178912,0.193523,0.273933,0.255933,0.121682,0.16025,0.189538,0.0871419,0.205121,0.126664,...,0.165031,0.178573,0.258908,0.153733,0.122516,0.22054,0.244227,0.181482,0.194613,0.233621
6,0.185297,0.143033,0.310627,0.241394,0.113727,0.257785,0.208948,0.077487,0.201698,0.128376,...,0.16431,0.214352,0.251321,0.144223,0.139601,0.214162,0.205102,0.197801,0.185588,0.225104
7,0.234084,0.148889,0.354036,0.248993,0.147894,0.218536,0.19195,0.100383,0.231574,0.147611,...,0.138076,0.246665,0.249377,0.0618952,0.0986775,0.214042,0.242436,0.194029,0.157269,0.182286
8,0.173977,0.157133,0.263334,0.356054,0.161435,0.192319,0.226324,0.0783575,0.173314,0.190389,...,0.165246,0.295896,0.235593,0.0663719,0.197268,0.233012,0.255734,0.114601,0.193615,0.241563
9,0.185119,0.239193,0.334192,0.303845,0.166279,0.149384,0.188473,0.0828146,0.170147,0.195213,...,0.186031,0.236524,0.258713,0.139583,0.163234,0.239708,0.227976,0.151437,0.146879,0.223672


In [35]:
smapes_df_fin_xgb.shape

(15, 2000)

In [36]:
smapes_df_fin_lgbm.shape

(15, 2000)

In [40]:
final = np.zeros((len(smapes_df_fin_xgb), len(smapes_df_fin_xgb.columns)))

for i in range(0, len(smapes_df_fin_xgb.columns)):
    for j in range(0, len(smapes_df_fin_xgb)):
        weights = [1 / smapes_df_fin_xgb.iloc[j:j+1, i].values[0],
                   
                   1 / smapes_df_fin_lgbm.iloc[j:j+1, i].values[0]]
        

        weights /= np.sum(weights) 
        

        final[j][i] = weights[0] * preds_df_fin_xgb.iloc[j:j+1, i].values[0] \
                            + weights[1] * preds_df_fin_lgbm.iloc[j:j+1, i].values[0] \
                        #     + weights[2] * preds_df_fin_lgbm.iloc[j:j+1, i].values[0]

  1 / smapes_df_fin_lgbm.iloc[j:j+1, i].values[0]]
  weights /= np.sum(weights)


In [41]:
final_df = pd.DataFrame(final)
final_values = pd.DataFrame((final_df.iloc[-1] - final_df.iloc[0]) / final_df.iloc[0])
final_values_sharpe = -pd.DataFrame(((final_df.iloc[-1] - final_df.iloc[0]) / final_df.iloc[0]) / final_df.pct_change().std())

In [56]:
final_values_sharpe.index = dacon_sid_list
final_values_sharpe.columns = ['VALUE']
final_values_sharpe.reset_index(inplace = True)
final_values_sharpe.columns = ['종목코드', 'VALUE']

In [57]:
final_values_sharpe.set_index('종목코드', inplace = True)

In [58]:
final_values_sharpe['VALUE']

종목코드
060310     3.34115
095570    -2.90035
006840    0.538436
054620   -0.159679
265520     5.04852
            ...   
189980    0.446128
000540    -1.00456
003280    -2.02937
037440    0.773508
238490     4.08125
Name: VALUE, Length: 2000, dtype: float64

### Submission

In [59]:
alpha_feat_inho = subutil.Submission(
    alpha_series=final_values_sharpe['VALUE'],
    alpha_name='alpha_feat_inho_ReverseSharpe',
)

In [60]:
alpha_feat_inho.get_rank(export_path=OUTPUT_PATH)

Saved to E:\VSCodeProjects\daconKRX2023\output\alpha_feat_inho_ReverseSharpe.csv


Unnamed: 0_level_0,순위
종목코드,Unnamed: 1_level_1
A060310,119
A095570,1801
A006840,201
A054620,202
A265520,32
...,...
A189980,1797
A000540,1798
A003280,1799
A037440,1800
