In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV

from matplotlib import pyplot as plt

from tqdm import tqdm

In [2]:
krx = '/data/hun/KRX_marketdata/'

In [3]:
tickers = pd.read_csv(krx+'ticker_recent.csv', encoding ='cp949')
tickers.columns = ('full_ticker', 'ticker', 'name_full', 'name',
              'name_eng', 'listed_data', 'market', 'security_category',
              'related_department', 'preferred', 'face_value', 'shares')
tickers = tickers.drop(['related_department'], axis=1).dropna().reset_index(drop=True)

In [3]:
factors= pd.read_csv(krx+'factors.csv')
factors['ticker'] = factors.ticker.apply(lambda x: '0'*(6-len(str(x)))+str(x))

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
prices = pd.read_csv(krx+'prices_final.csv')
prices['ticker'] = prices.ticker.apply(lambda x: '0'*(6-len(str(x)))+str(x))

interest1 = pd.read_excel(krx+'stat_107302.xls')
interest2 = pd.read_excel(krx+'stat_107301.xls')
interest = pd.concat([interest1.transpose(),interest2.transpose().drop(["Unnamed: 0"])]).reset_index()
int_col = ['date','tb3y','tb5y','tb10y','cb3y','cd3m','call','br']
interest.columns = int_col
interest.drop([0],axis=0, inplace=True)
interest["date"] = interest.date.apply(lambda x: f'{x[:4]}/{x[4:6]}')
interest.replace('-', np.nan, inplace=True)

interest = interest.dropna()

interest['term_spread'] = interest['tb3y'].astype('float')- interest.br.astype('float')
interest['credit_spread'] = interest['cb3y'].astype('float') - interest.tb3y.astype('float')

factors = pd.merge(factors, interest[['date','cd3m','term_spread','credit_spread']], how='left', on='date')

factors['trans_turn'] = prices.total_transactions/prices['size']

######## factors.to_csv(krx+'factors.csv', index=False)

In [4]:
factors =factors.dropna()

In [5]:
factors.reset_index(drop = True, inplace = True)

In [6]:
factors.describe()

Unnamed: 0,ret,RM_RF,smb,hml,div_ret,PBR,EPR,BPR,size,share_turn,mom1,mom6,mom12,beta,beta_seq,ido_vol,cd3m,term_spread,credit_spread,trans_turn
count,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0,367098.0
mean,0.022826,0.002942,0.029222,0.004118,1.271247,1.644664,0.083455,1.040695,0.500123,0.387793,0.023124,0.023249,0.022955,0.46341,28.214998,0.821128,2.661427,0.531033,0.7064,0.41238
std,0.892897,0.09125,0.043141,0.052979,1.98909,5.732081,0.65699,1.306708,0.288705,3.868207,0.896392,0.36379,0.256622,5.291533,3523.952771,89.935896,1.371765,0.549941,0.521864,4.920248
min,-0.985862,-0.887071,-0.046105,-0.479083,0.0,0.0,0.0,0.0,0.000434,4e-06,-0.985862,-0.459365,-0.406501,-670.942221,0.0,0.0,0.63,-0.34,0.22,4e-06
25%,-0.070175,-0.038905,0.002915,-0.014061,0.0,0.48,0.0,0.333333,0.25,0.029265,-0.070295,-0.024374,-0.014802,-0.046389,0.050577,0.006902,1.58,0.14,0.43,0.029537
50%,-0.004076,0.007577,0.019467,0.008086,0.41,0.94,0.042863,0.78125,0.5,0.086005,-0.003953,0.004187,0.006765,0.357565,0.310596,0.014389,2.63,0.41,0.53,0.087124
75%,0.071103,0.051134,0.042318,0.030847,1.92,1.77,0.104058,1.428571,0.750136,0.24206,0.071339,0.041642,0.036077,0.941776,1.221496,0.031341,3.55,0.74,0.83,0.247815
max,437.666667,0.184805,0.34117,0.167935,57.92,1044.0,100.0,100.0,1.0,855.512794,437.666667,72.8849,36.465624,974.667766,949977.254124,16318.994101,6.03,2.47,4.38,1777.534815


R_oos

In [7]:
def ros(pred, real):
    mse_m = ((pred.reshape(-1) - real)**2).mean()
    mse_bm = (real**2).mean()
    return 1 - mse_m / mse_bm


### Preprocessing


In [8]:
def xvar(data):
    return data.drop(['date','ticker','name'], axis=1).astype('float')

In [9]:
def xy(data):
    return data.drop(['ret'], axis=1), data.ret

In [10]:
dates = factors.date.unique()[::-1]

# Pooling

In [11]:
#preprocessing

length = len(dates)
ind_train = int(length*0.7)+2
# pool_train_pre = factors[factors.date.apply(lambda x: x in dates[0:ind_train])]
# pool_test = factors[factors.date.apply(lambda x: x in dates[ind_train:])]

In [12]:
ind_tv = int(length*0.4)

In [13]:
ind_train - ind_tv

72

In [14]:
length - ind_tv

140

In [15]:
dates[ind_tv], dates[ind_train]

('2010/01', '2016/01')

In [16]:
## Time series split

pool_train = []
pool_val = []
cnt = ind_tv
start= 0
for i in tqdm(range(12)):
    train_ind = dates[start:cnt]
    val_ind = dates[cnt:min(cnt+12, length)]
    
    pool_train.append(xvar(factors[factors.date.apply(lambda x: x in train_ind)]))
    pool_val.append(xvar(factors[factors.date.apply(lambda x: x in val_ind)]))
    cnt += 12
    start += 12

100%|██████████| 12/12 [00:53<00:00,  4.49s/it]


In [17]:
def cv_pcr(exx, exy, evx, evy):
    tem_pcr = 0
    tem_pls = 0
    max_pcr = 0
    max_pls = 0
    pred_pcr = []
    pred_pls =[]
    mse_pcr = 0
    mse_pls = 0
    for i in  [i for i in range(1,16)]:
        pls = PLSRegression(n_components = i).fit(exx, exy)
        pcr = make_pipeline(StandardScaler(), PCA(n_components=i), LinearRegression()).fit(exx, exy)
        tem1, tem2 = pcr.score(evx, evy), pls.score(evx, evy)
        if i==1:
            tem_pcr, tem_pls = tem1, tem2
            max_pcr, max_pls = 1, 1
            pred_pcr, pred_pls = pcr.predict(evx), pls.predict(evx)
            mse_pcr, mse_pls = mse(exy, pcr.predict(exx)), mse(exy, pls.predict(exx))
        else:
            if tem_pcr < tem1:
                tem_pcr, max_pcr = tem1, i
                mse_pcr = mse(exy, pcr.predict(exx))
                pred_pcr = pcr.predict(evx)
            if tem_pls < tem2:
                tem_pls, max_pls = tem2, i
                mse_pls = mse(exy, pls.predict(exx))
                pred_pls = pls.predict(evx)
    
    return tem_pcr, max_pcr, mse_pcr, tem_pls, max_pls, mse_pls, pred_pcr, pred_pls


### 실패작

In [27]:
### 이건 실패한 모델
results = pd.DataFrame(columns = ['model','period','r_squared','train_loss_mse', 'complexity'])
results.to_csv('ML_train_res2.cv', index = False)
mod_name = ['lin','en','pls','pcr','rf','gbr']


for period in tqdm(range(5)):
    ex, ev= pool_train[period],  pool_val[period]
    exx, exy = xy(ex)
    evx, evy = xy(ev)
    
#     lin = LinearRegression().fit(exx, exy)
    en = ElasticNet(random_state=0).fit(exx, exy)
    pcr_r, pcr_com, mse_pcr, pls_r, pls_com, mse_pls, pred_pcr, pred_pls = cv_pcr(exx, exy, evx, evy)
    print(f'pcr finished {period}')
    rf = RandomForestRegressor(random_state=0).fit(exx, exy)
    gbr = GradientBoostingRegressor(random_state=0).fit(exx, exy)
    print(f'comp finished {period}')
    
    r_squared = [lin.score(evx, evy), en.score(evx, evy), pls_r, pcr_r, \
                 rf.score(evx, evy), gbr.score(evx, evy)]
    mse_loss = [mse(exy, lin.predict(exx)), mse(exy, en.predict(exx)), mse_pls, mse_pcr, \
                mse(exy, rf.predict(exx)), mse(exy, gbr.predict(exx))]
    
    mean_depth = np.mean([estimator.tree_.max_depth for estimator in rf.estimators_])
    mean_features = np.mean([estimator.max_features_ for estimator in gbr.estimators_.reshape(-1)])
    
    complexity = [0, (en.coef_ != 0).sum(), pcr_com, pls_com, mean_depth, mean_features]
    
    res_temp = pd.DataFrame({"model": mod_name, "period" : [period]*len(mod_name), "r_squared" : r_squared,
                            "train_loss_mse": mse_loss, "complexity": complexity})
    
    res_temp.to_csv('NN_train_res.cv', index = False, header = False, mode ='a')

  0%|          | 0/5 [00:00<?, ?it/s]

pcr finished 0
comp finished 0


 20%|██        | 1/5 [00:17<01:10, 17.59s/it]

pcr finished 1
comp finished 1


 40%|████      | 2/5 [06:05<05:50, 116.78s/it]

pcr finished 2
comp finished 2


 60%|██████    | 3/5 [12:54<06:48, 204.49s/it]

pcr finished 3
comp finished 3


 80%|████████  | 4/5 [20:50<04:45, 285.95s/it]

pcr finished 4
comp finished 4


100%|██████████| 5/5 [29:51<00:00, 358.27s/it]


In [33]:
# results.to_csv('ML_train_res.csv', index = False)

In [51]:
mse_loss

[0.7025880538489462,
 1.3590887252942987,
 1.2397783563100855,
 0.8797793362611417,
 0.09520281940207949,
 0.029753838562563764]

In [83]:
results

Unnamed: 0,model,period,r_squared,train_loss,complexity,train_loss_mse
0,lin,0,-0.231385,,0.0,<function mean_squared_error at 0x7f1cbc4d6950>
1,en,0,0.032811,,2.0,<function mean_squared_error at 0x7f1cbc4d6950>
2,pls,0,-0.074628,,1.0,<function mean_squared_error at 0x7f1cbc4d6950>
3,pcr,0,0.112349,,1.0,<function mean_squared_error at 0x7f1cbc4d6950>
4,rf,0,0.893907,,51.05,<function mean_squared_error at 0x7f1cbc4d6950>
5,gbr,0,0.680366,,19.0,<function mean_squared_error at 0x7f1cbc4d6950>
0,lin,1,-2.209271,,0.0,<function mean_squared_error at 0x7f1cbc4d6950>
1,en,1,0.001071,,2.0,<function mean_squared_error at 0x7f1cbc4d6950>
2,pls,1,-1.3633,,1.0,<function mean_squared_error at 0x7f1cbc4d6950>
3,pcr,1,0.010919,,1.0,<function mean_squared_error at 0x7f1cbc4d6950>


# Test

In [48]:
model_pick = []
for i in mod_name:
    sub = results[results.model == i]
    model_pick.append(sub[sub.r_squared == sub.r_squared.max()].period.values[0])
model_pick

[2, 0, 2, 2, 0, 2]

In [86]:
xt, yt = xy(xvar(pool_test))
xx, yy = xy(xvar(pool_train_pre))

In [55]:
%%time
lin = LinearRegression().fit(xx, yy)
en = ElasticNet(random_state=0).fit(xx, yy)
pcr_r, pcr_com, mse_pcr, pls_r, pls_com, mse_pls = cv_pcr(xx, yy, xt, yt)
print('rf started')
rf = RandomForestRegressor(random_state=0).fit(xx, yy)
gbr = GradientBoostingRegressor(random_state=0).fit(xx, yy)


r_squared = [lin.score(xt, yt), en.score(xt, yt), pls_r, pcr_r, \
             rf.score(xt, yt), gbr.score(xt, yt)]
mse_loss = [mse(yy, lin.predict(xx)), mse(yy, en.predict(xx)), mse_pls, mse_pcr, \
            mse(yy, rf.predict(xx)), mse(yy, gbr.predict(xx))]

mean_depth = np.mean([estimator.tree_.max_depth for estimator in rf.estimators_])
mean_features = np.mean([estimator.max_features_ for estimator in gbr.estimators_.reshape(-1)])

complexity = [0, (en.coef_ != 0).sum(), pcr_com, pls_com, mean_depth, mean_features]

test_res = pd.DataFrame({"model": mod_name, "period" : [period]*len(mod_name), "r_squared" : r_squared,
                        "train_loss_mse": mse_loss, "complexity": complexity})
test_res

rf started
CPU times: user 17min 11s, sys: 12min 46s, total: 29min 58s
Wall time: 11min 28s


Unnamed: 0,model,period,r_squared,train_loss_mse,complexity
0,lin,4,-16.506596,0.619776,0.0
1,en,4,0.011281,1.154922,2.0
2,pls,4,-10.32048,1.059508,1.0
3,pcr,4,-0.362283,0.766287,1.0
4,rf,4,-44.797024,0.076586,58.07
5,gbr,4,-96.411266,0.029035,19.0


In [71]:
# test_res.to_csv('ML_pricing_res_[full train, full test].csv', index = False)

#### 여기부터 수정하기

# Test
### Time Series val test

In [30]:
# results = pd.DataFrame(columns = ['model','period','r_squared','train_loss_mse', 'complexity'])
# results.to_csv('ML_test_res_rolling.csv', index = False)
# mod_name = ['lin','en','pls','pcr','rf','gbr']
# preds = pd.DataFrame(columns = mod_name)
# preds.to_csv('ML_test_pred.csv', index = False)

for period in tqdm(range(1)) #range(len(pool_train)+1)):
#     ex, ev= pool_train[period],  pool_val[period]

    ex, ev= pool_train[10],  pool_val[11]
    
    exx, exy = xy(ex)
    evx, evy = xy(ev)
    

    lin = LinearRegression().fit(exx, exy)
    en = ElasticNet(random_state=0).fit(exx, exy)
    pcr_r, pcr_com, mse_pcr, pls_r, pls_com, mse_pls, pred_pcr, pred_pls = cv_pcr(exx, exy, evx, evy)
    print(f'pcr finished {period}')
    rf = RandomForestRegressor(random_state=12).fit(exx, exy)
    gbr = GradientBoostingRegressor(random_state=12).fit(exx, exy)

    r_squared = [lin.score(evx, evy), en.score(evx, evy), pls_r, pcr_r, \
                 rf.score(evx, evy), gbr.score(evx, evy)]

    mse_loss = [mse(exy, lin.predict(exx)), mse(exy, en.predict(exx)), mse_pls, mse_pcr, \
                mse(exy, rf.predict(exx)), mse(exy, gbr.predict(exx))]

    predicted = [lin.predict(evx), en.predict(evx), pred_pls, pred_pcr, rf.predict(evx), gbr.predict(evx)]

    mean_depth = np.mean([estimator.tree_.max_depth for estimator in rf.estimators_])
    mean_features = np.mean([estimator.max_features_ for estimator in gbr.estimators_.reshape(-1)])

    complexity = [0, (en.coef_ != 0).sum(), pcr_com, pls_com, mean_depth, mean_features]

    res_temp = pd.DataFrame({"model": mod_name, "period" : [period]*len(mod_name), "r_squared" : r_squared,
                            "train_loss_mse": mse_loss, "complexity": complexity})
    pred_temp = pd.DataFrame({"lin":predicted[0], "en":predicted[1], "pls":predicted[2].reshape(-1), 
                              "pcr":predicted[3], "rf":predicted[4], "gbr":predicted[5]})
    
#     res_temp.to_csv('ML_test_res_rolling_for2021.csv', index = False, header = False, mode ='a')
#     pred_temp.to_csv('ML_test_pred_for2021.csv', index = False, header = False, mode ='a')

  0%|          | 0/13 [00:00<?, ?it/s]

pcr finished 0


  8%|▊         | 1/13 [04:56<59:14, 296.18s/it]

pcr finished 1


 15%|█▌        | 2/13 [10:20<55:51, 304.65s/it]

pcr finished 2


 23%|██▎       | 3/13 [15:54<52:14, 313.46s/it]

pcr finished 3


 31%|███       | 4/13 [21:58<49:16, 328.50s/it]

pcr finished 4


 38%|███▊      | 5/13 [28:14<45:42, 342.77s/it]

pcr finished 5


 46%|████▌     | 6/13 [34:53<41:58, 359.76s/it]

pcr finished 6


 54%|█████▍    | 7/13 [41:43<37:28, 374.79s/it]

pcr finished 7


 62%|██████▏   | 8/13 [49:03<32:51, 394.23s/it]

pcr finished 8


 69%|██████▉   | 9/13 [56:53<27:48, 417.03s/it]

pcr finished 9


 77%|███████▋  | 10/13 [1:04:53<21:47, 435.90s/it]

pcr finished 10


 85%|████████▍ | 11/13 [1:13:02<15:03, 451.88s/it]

pcr finished 11


 92%|█████████▏| 12/13 [1:21:22<06:46, 406.91s/it]


IndexError: list index out of range

In [None]:
# results = pd.DataFrame(columns = ['model','period','r_squared','train_loss_mse', 'complexity'])
# results.to_csv('ML_test_res_rolling.csv', index = False)
# mod_name = ['lin','en','pls','pcr','rf','gbr']
# preds = pd.DataFrame(columns = mod_name)
# preds.to_csv('ML_test_pred.csv', index = False)

for period in tqdm(range(1)): #range(len(pool_train)+1)):
#     ex, ev= pool_train[period],  pool_val[period]

    ex, ev= pool_train[10],  pool_val[11]
    
    exx, exy = xy(ex)
    evx, evy = xy(ev)
    

    lin = LinearRegression().fit(exx, exy)
    en = ElasticNet(random_state=0).fit(exx, exy)
    pcr_r, pcr_com, mse_pcr, pls_r, pls_com, mse_pls, pred_pcr, pred_pls = cv_pcr(exx, exy, evx, evy)
    print(f'pcr finished {period}')
    rf = RandomForestRegressor(random_state=12).fit(exx, exy)
    gbr = GradientBoostingRegressor(random_state=12).fit(exx, exy)

    r_squared = [lin.score(evx, evy), en.score(evx, evy), pls_r, pcr_r, \
                 rf.score(evx, evy), gbr.score(evx, evy)]

    mse_loss = [mse(exy, lin.predict(exx)), mse(exy, en.predict(exx)), mse_pls, mse_pcr, \
                mse(exy, rf.predict(exx)), mse(exy, gbr.predict(exx))]

    predicted = [lin.predict(evx), en.predict(evx), pred_pls, pred_pcr, rf.predict(evx), gbr.predict(evx)]

    mean_depth = np.mean([estimator.tree_.max_depth for estimator in rf.estimators_])
    mean_features = np.mean([estimator.max_features_ for estimator in gbr.estimators_.reshape(-1)])

    complexity = [0, (en.coef_ != 0).sum(), pcr_com, pls_com, mean_depth, mean_features]

    res_temp = pd.DataFrame({"model": mod_name, "period" : [period]*len(mod_name), "r_squared" : r_squared,
                            "train_loss_mse": mse_loss, "complexity": complexity})
    pred_temp = pd.DataFrame({"lin":predicted[0], "en":predicted[1], "pls":predicted[2].reshape(-1), 
                              "pcr":predicted[3], "rf":predicted[4], "gbr":predicted[5]})
    
    res_temp.to_csv('ML_test_res_rolling_for2021.csv', index = False)
    pred_temp.to_csv('ML_test_pred_for2021.csv', index = False)
    

  0%|          | 0/1 [00:00<?, ?it/s]

pcr finished 0
