In [1]:
import os
import FinanceDataReader as fdr
from tqdm import tqdm
import pandas as pd

# pycaret 이용

## 삼성전자(005930) 테스트

* 종가와 상관계수 높은 특성 : KOSPI(0.79), KOSDAQ(0.87), DOW(0.58), NASDAQ(0.50), S&P500(0.56)
* 예측 순서(종가와 상관계수 높은 특성 먼저 예측)
    * 날짜로 종가 예측
    * 종가로 KOSDAQ 예측(종가와 상관계수 0.87)
    * 종가와 KOSDAQ으로 NASDAQ 예측(KOSDAQ과 상관계수 0.68)
    * 종가, KOSDAQ, NASDAQ으로 S&P500 예측(NASDAQ과 상관계수 0.93)
    * 종가, KOSDAQ, NASDAQ, S&P500으로 DOW 예측(S&P500과 상관계수 0.95)
    * 종가, KOSDAQ, NASDAQ, S&P500, DOW로 KOSPI 예측
    * 종가, 5가지 특성으로 다른 종가 예측 

In [2]:
# 외부 데이터 가져오기
df = pd.read_csv('../data/20210104_20211119_data_interpolate.csv')
df.head()

Unnamed: 0,Date,exchange_rate,kospi,kosdaq,kospi100,dow,nasdaq,sp500,미국채10년-2년,BTC,Gold,Oil,US10Y,US2Y
0,2021-01-04,1086.48,2944.45,977.62,3052.74,30223.89,12698.4,3700.65,0.8009,36499000.0,37.1,32.38,0.918,0.1171
1,2021-01-05,1086.42,2990.57,985.76,3099.42,30391.6,12819.0,3726.86,0.832,37962000.0,37.18,33.94,0.955,0.123
2,2021-01-06,1087.93,2968.21,981.39,3070.05,30829.4,12740.8,3748.14,0.9002,40731000.0,36.6,34.11,1.039,0.1388
3,2021-01-07,1094.28,3031.68,988.86,3139.48,31041.13,13067.5,3803.79,0.9402,44575000.0,36.48,34.53,1.081,0.1408
4,2021-01-08,1092.93,3152.18,987.79,3293.96,31097.97,13202.0,3824.68,0.9821,47241000.0,35.26,35.43,1.119,0.1369


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           230 non-null    object 
 1   exchange_rate  230 non-null    float64
 2   kospi          230 non-null    float64
 3   kosdaq         230 non-null    float64
 4   kospi100       230 non-null    float64
 5   dow            230 non-null    float64
 6   nasdaq         230 non-null    float64
 7   sp500          230 non-null    float64
 8   미국채10년-2년      230 non-null    float64
 9   BTC            230 non-null    float64
 10  Gold           230 non-null    float64
 11  Oil            230 non-null    float64
 12  US10Y          230 non-null    float64
 13  US2Y           230 non-null    float64
dtypes: float64(13), object(1)
memory usage: 25.3+ KB


In [4]:
# 종목 정보 가져오기
path = '../data'
list_name = 'Stock_List.csv'
stock_list = pd.read_csv(os.path.join(path,list_name))
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


In [5]:
# 종목 코드 6자리로 맞추기
stock_list['종목코드'] = stock_list["종목코드"].astype(str).str.zfill(6)
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


In [6]:
start_date = '20210104'
end_date = '20211119'

In [7]:
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])
code = '005930'
data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
data = pd.merge(Business_days, data, how = 'outer')
data['weekday'] = data.Date.apply(lambda x : x.weekday())
data['weeknum'] = data.Date.apply(lambda x : x.strftime('%V'))
data.Close = data.Close.interpolate()

data['Date'] = data['Date'].astype('str')
data = pd.merge(data, df, on='Date')

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 230 entries, 0 to 229
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           230 non-null    object 
 1   Close          230 non-null    float64
 2   weekday        230 non-null    int64  
 3   weeknum        230 non-null    object 
 4   exchange_rate  230 non-null    float64
 5   kospi          230 non-null    float64
 6   kosdaq         230 non-null    float64
 7   kospi100       230 non-null    float64
 8   dow            230 non-null    float64
 9   nasdaq         230 non-null    float64
 10  sp500          230 non-null    float64
 11  미국채10년-2년      230 non-null    float64
 12  BTC            230 non-null    float64
 13  Gold           230 non-null    float64
 14  Oil            230 non-null    float64
 15  US10Y          230 non-null    float64
 16  US2Y           230 non-null    float64
dtypes: float64(14), int64(1), object(2)
memory usage: 32.3

### 데이터 셋 만들기

In [9]:
mod_data = data[['Date','Close','weekday','weeknum','kospi','kosdaq','dow','nasdaq','sp500']]
mod_data.head()

Unnamed: 0,Date,Close,weekday,weeknum,kospi,kosdaq,dow,nasdaq,sp500
0,2021-01-04,83000.0,0,1,2944.45,977.62,30223.89,12698.4,3700.65
1,2021-01-05,83900.0,1,1,2990.57,985.76,30391.6,12819.0,3726.86
2,2021-01-06,82200.0,2,1,2968.21,981.39,30829.4,12740.8,3748.14
3,2021-01-07,82900.0,3,1,3031.68,988.86,31041.13,13067.5,3803.79
4,2021-01-08,88800.0,4,1,3152.18,987.79,31097.97,13202.0,3824.68


In [10]:
train_data = mod_data.loc[mod_data['weeknum'].astype(int) < 44]
train_data

Unnamed: 0,Date,Close,weekday,weeknum,kospi,kosdaq,dow,nasdaq,sp500
0,2021-01-04,83000.0,0,01,2944.45,977.62,30223.89,12698.4,3700.65
1,2021-01-05,83900.0,1,01,2990.57,985.76,30391.60,12819.0,3726.86
2,2021-01-06,82200.0,2,01,2968.21,981.39,30829.40,12740.8,3748.14
3,2021-01-07,82900.0,3,01,3031.68,988.86,31041.13,13067.5,3803.79
4,2021-01-08,88800.0,4,01,3152.18,987.79,31097.97,13202.0,3824.68
...,...,...,...,...,...,...,...,...,...
210,2021-10-25,70200.0,0,43,3020.54,994.31,35743.78,15226.7,4566.48
211,2021-10-26,71100.0,1,43,3049.08,1011.76,35755.83,15235.7,4574.79
212,2021-10-27,70100.0,2,43,3025.49,1008.95,35491.48,15235.8,4551.68
213,2021-10-28,70700.0,3,43,3009.55,1000.13,35729.89,15448.1,4596.42


In [11]:
val_data = mod_data.loc[mod_data['weeknum'].astype(int) == 44]
val_data

Unnamed: 0,Date,Close,weekday,weeknum,kospi,kosdaq,dow,nasdaq,sp500
215,2021-11-01,69900.0,0,44,2978.94,998.57,35913.68,15595.9,4613.67
216,2021-11-02,71500.0,1,44,3013.49,1009.44,36053.09,15649.6,4630.65
217,2021-11-03,70400.0,2,44,2975.71,1005.0,36157.02,15811.6,4660.57
218,2021-11-04,70600.0,3,44,2983.22,1001.43,36124.66,15940.3,4680.06
219,2021-11-05,70200.0,4,44,2969.27,1001.35,36329.07,15971.6,4697.53


In [12]:
val_x = val_data[['Date','weekday','weeknum']]
val_y = val_data[['Date','Close']]

In [13]:
test_data = mod_data.loc[mod_data['weeknum'].astype(int) == 44]
test_data

Unnamed: 0,Date,Close,weekday,weeknum,kospi,kosdaq,dow,nasdaq,sp500
215,2021-11-01,69900.0,0,44,2978.94,998.57,35913.68,15595.9,4613.67
216,2021-11-02,71500.0,1,44,3013.49,1009.44,36053.09,15649.6,4630.65
217,2021-11-03,70400.0,2,44,2975.71,1005.0,36157.02,15811.6,4660.57
218,2021-11-04,70600.0,3,44,2983.22,1001.43,36124.66,15940.3,4680.06
219,2021-11-05,70200.0,4,44,2969.27,1001.35,36329.07,15971.6,4697.53


In [14]:
test_x = test_data[['Date','weekday','weeknum']]
test_y = test_data[['Date','Close']]

# 종가 예측

#### 1. kosdaq 예측
* 2. nasdaq
* 3. sp500
* 4. dow
* 5. kospi
* 6. 종가

In [15]:
from pycaret.regression import *

In [16]:
# setup
ex = setup(train_data, target='kosdaq', ignore_features=['Close','kospi','dow','nasdaq','sp500'])

Unnamed: 0,Description,Value
0,session_id,1191
1,Target,kosdaq
2,Original Data,"(215, 9)"
3,Missing Values,False
4,Numeric Features,1
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(150, 18)"


In [17]:
# 모델 비교
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,10.2635,225.3782,14.2963,0.856,0.0146,0.0105,0.123
gbr,Gradient Boosting Regressor,10.7268,234.9409,14.8309,0.8477,0.0152,0.0109,0.021
knn,K Neighbors Regressor,10.8756,233.8619,14.414,0.8468,0.0147,0.0111,0.012
catboost,CatBoost Regressor,11.4971,251.5552,15.477,0.8382,0.0159,0.0117,0.436
xgboost,Extreme Gradient Boosting,12.4124,302.5969,17.0581,0.795,0.0175,0.0126,0.077
dt,Decision Tree Regressor,13.2943,350.056,17.9565,0.7815,0.0184,0.0135,0.013
ada,AdaBoost Regressor,13.9448,329.5143,17.6466,0.7785,0.0179,0.0141,0.029
et,Extra Trees Regressor,13.6007,348.6067,18.1819,0.771,0.0185,0.0138,0.094
lightgbm,Light Gradient Boosting Machine,14.3817,424.5266,19.9865,0.7181,0.0202,0.0146,0.2
ridge,Ridge Regression,16.3257,437.1757,20.39,0.716,0.0207,0.0166,0.011


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=1191, verbose=0, warm_start=False)

In [18]:
# 모델 생성
rf = create_model('rf')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,11.8773,253.247,15.9137,0.8368,0.0164,0.0122
1,10.3184,284.717,16.8736,0.7312,0.0169,0.0104
2,18.0931,598.758,24.4695,0.7019,0.0256,0.0188
3,6.9751,80.687,8.9826,0.9589,0.0091,0.0071
4,9.4516,136.5882,11.6871,0.9385,0.0119,0.0095
5,7.3821,89.1004,9.4393,0.9331,0.0092,0.0073
6,8.3601,144.3874,12.0161,0.8699,0.0124,0.0085
7,8.7975,162.5888,12.751,0.9018,0.0131,0.009
8,12.2325,368.1653,19.1876,0.8071,0.0199,0.0127
9,9.1469,135.5427,11.6423,0.8811,0.0117,0.0092


In [19]:
# 모델 튜닝
tuned_rf = tune_model(rf)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,15.7541,314.9119,17.7458,0.797,0.0181,0.0161
1,14.8464,398.1857,19.9546,0.624,0.0201,0.015
2,20.464,813.302,28.5185,0.5951,0.0297,0.0213
3,8.5025,139.4843,11.8103,0.929,0.0123,0.0088
4,9.9461,154.2853,12.4212,0.9305,0.0128,0.0101
5,8.7555,154.4628,12.4283,0.884,0.0125,0.0088
6,6.7541,75.6545,8.698,0.9318,0.0088,0.0068
7,8.6892,122.9502,11.0883,0.9257,0.0113,0.0088
8,14.1501,408.2877,20.2061,0.7861,0.0209,0.0146
9,8.0034,94.6347,9.728,0.917,0.0097,0.008


In [20]:
# 모델 훈련 완료
kosdaq_prediction_rf = finalize_model(tuned_rf)

In [21]:
# kosdaq 예측
kosdaq_pred = predict_model(kosdaq_prediction_rf, val_x)
kosdaq_pred

Unnamed: 0,Date,weekday,weeknum,Label
215,2021-11-01,0,44,1013.014999
216,2021-11-02,1,44,1013.014999
217,2021-11-03,2,44,1013.014999
218,2021-11-04,3,44,1013.014999
219,2021-11-05,4,44,1013.014999


In [22]:
# val_x에 예측한 kosdaq 값 추가
val_x['kosdaq'] = kosdaq_pred['Label']
val_x

Unnamed: 0,Date,weekday,weeknum,kosdaq
215,2021-11-01,0,44,1013.014999
216,2021-11-02,1,44,1013.014999
217,2021-11-03,2,44,1013.014999
218,2021-11-04,3,44,1013.014999
219,2021-11-05,4,44,1013.014999


* 1. kosdaq 예측
#### 2. nasdaq
* 3. sp500
* 4. dow
* 5. kospi
* 6. 다른 종가

In [23]:
# setup
ex = setup(train_data, target='nasdaq', ignore_features=['Close','kospi','dow','sp500'])

Unnamed: 0,Description,Value
0,session_id,2653
1,Target,nasdaq
2,Original Data,"(215, 9)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(150, 19)"


In [24]:
# 모델 비교
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,145.0238,36773.1003,188.5477,0.9214,0.0137,0.0104,0.564
rf,Random Forest Regressor,146.045,38951.0891,193.8231,0.9131,0.0142,0.0105,0.13
ada,AdaBoost Regressor,163.4772,42568.8226,205.1057,0.9083,0.0148,0.0117,0.044
gbr,Gradient Boosting Regressor,143.8716,40868.0135,198.2589,0.9082,0.0145,0.0104,0.031
et,Extra Trees Regressor,150.0384,41658.7954,198.6405,0.9039,0.0146,0.0108,0.131
xgboost,Extreme Gradient Boosting,151.7414,42719.4455,200.745,0.9001,0.0147,0.0109,0.09
llar,Lasso Least Angle Regression,183.4414,52998.1339,227.1543,0.8922,0.0164,0.0132,0.011
lasso,Lasso Regression,185.0594,53365.2318,228.5578,0.8878,0.0165,0.0133,0.01
br,Bayesian Ridge,186.7611,54781.0578,231.1404,0.8856,0.0167,0.0134,0.011
ridge,Ridge Regression,186.8812,54575.3133,230.9704,0.8854,0.0166,0.0134,0.013


<catboost.core.CatBoostRegressor at 0x1a92b8811f0>

In [25]:
# 모델 생성
et = create_model('et')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,140.5584,31317.961,176.9688,0.9155,0.013,0.0102
1,196.3852,91921.4143,303.1854,0.7959,0.0227,0.0142
2,148.3499,50339.2047,224.364,0.7724,0.0166,0.0106
3,185.7989,62095.919,249.1905,0.8689,0.0185,0.0135
4,129.0367,22588.4329,150.2945,0.9589,0.0106,0.0091
5,153.9135,38788.0926,196.9469,0.9002,0.0144,0.0113
6,144.1467,34987.9626,187.0507,0.9433,0.0136,0.0103
7,119.9001,20418.4146,142.893,0.9722,0.0105,0.0087
8,143.6331,39757.8651,199.3937,0.9467,0.0149,0.0106
9,138.6612,24372.6874,156.1175,0.9651,0.0111,0.0099


In [26]:
# 모델 튜닝
tuned_et = tune_model(et)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,159.694,38896.8878,197.2229,0.895,0.0143,0.0115
1,181.0328,52886.6325,229.9709,0.8826,0.0167,0.0131
2,111.6464,22542.5869,150.1419,0.8981,0.0105,0.0078
3,283.7286,113648.0245,337.1172,0.76,0.0243,0.0203
4,181.9303,48645.0675,220.5563,0.9114,0.0158,0.0129
5,179.2293,55800.1157,236.2205,0.8565,0.0172,0.0131
6,141.2997,36079.4678,189.946,0.9415,0.0131,0.0099
7,213.3664,60763.7505,246.503,0.9173,0.0178,0.0153
8,231.0248,82984.1941,288.0698,0.8888,0.0213,0.0169
9,252.1673,89499.0097,299.1639,0.8718,0.0213,0.018


In [27]:
# 훈련 완료
nasdaq_prediction_ef = finalize_model(tuned_et)

In [28]:
# nasdaq 예측
nasdaq_pred = predict_model(nasdaq_prediction_ef, val_x)
nasdaq_pred

Unnamed: 0,Date,weekday,weeknum,kosdaq,Label
215,2021-11-01,0,44,1013.014999,14882.28063
216,2021-11-02,1,44,1013.014999,14878.714132
217,2021-11-03,2,44,1013.014999,14895.620934
218,2021-11-04,3,44,1013.014999,14897.983528
219,2021-11-05,4,44,1013.014999,14895.800864


In [29]:
# val_x에 예측 nasdaq 열 추가
val_x['nasdaq'] = nasdaq_pred['Label']
val_x

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq
215,2021-11-01,0,44,1013.014999,14882.28063
216,2021-11-02,1,44,1013.014999,14878.714132
217,2021-11-03,2,44,1013.014999,14895.620934
218,2021-11-04,3,44,1013.014999,14897.983528
219,2021-11-05,4,44,1013.014999,14895.800864


* 1. kosdaq 예측
* 2. nasdaq
#### 3. sp500
* 4. dow
* 5. kospi
* 6. 다른 종가

In [30]:
# setup
ex = setup(train_data, target='sp500', ignore_features=['Close','kospi','dow'])

Unnamed: 0,Description,Value
0,session_id,5035
1,Target,sp500
2,Original Data,"(215, 9)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(150, 20)"


In [31]:
# 모델 비교
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,19.4286,633.8435,24.4982,0.9883,0.006,0.0047,0.024
et,Extra Trees Regressor,19.8198,694.151,25.7859,0.9873,0.0064,0.0049,0.136
catboost,CatBoost Regressor,19.9617,823.6416,27.669,0.9858,0.0068,0.0049,0.757
br,Bayesian Ridge,22.2131,800.7966,27.6717,0.9849,0.0067,0.0054,0.011
lr,Linear Regression,22.4663,806.3225,27.743,0.9849,0.0068,0.0054,0.01
lar,Least Angle Regression,22.5184,810.7008,27.805,0.9848,0.0068,0.0054,0.011
ridge,Ridge Regression,22.4276,829.7059,28.025,0.9842,0.0068,0.0054,0.012
rf,Random Forest Regressor,22.3874,893.7971,28.9917,0.9833,0.0071,0.0054,0.149
lightgbm,Light Gradient Boosting Machine,24.3546,1020.2753,31.273,0.9817,0.0075,0.0058,0.074
lasso,Lasso Regression,24.0921,966.8262,29.839,0.9812,0.0072,0.0058,0.011


GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=5035, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [32]:
# 모델 생성
gbr = create_model('gbr')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,21.2056,619.7144,24.8941,0.9861,0.0061,0.0051
1,18.6657,635.9258,25.2176,0.9888,0.0061,0.0045
2,19.5055,808.5132,28.4344,0.9842,0.0073,0.0049
3,18.108,501.5726,22.3958,0.9878,0.0052,0.0042
4,16.0304,447.3973,21.1518,0.9932,0.0053,0.0039
5,20.5609,655.2192,25.5973,0.9883,0.0061,0.0049
6,14.957,280.1861,16.7388,0.9959,0.0041,0.0037
7,12.578,217.1131,14.7348,0.9969,0.0035,0.003
8,27.2065,1207.7205,34.7523,0.9826,0.0086,0.0067
9,25.4687,965.0726,31.0656,0.9791,0.0079,0.0064


In [33]:
# 모델 튜닝
tuned_gbr = tune_model(gbr)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,20.2314,621.5062,24.93,0.986,0.0061,0.0048
1,15.2534,388.1814,19.7023,0.9932,0.0047,0.0036
2,19.6599,730.4544,27.0269,0.9858,0.0069,0.0049
3,17.0168,515.3454,22.7012,0.9875,0.0052,0.004
4,13.6074,323.4699,17.9853,0.9951,0.0043,0.0032
5,23.3916,857.9912,29.2915,0.9846,0.0072,0.0057
6,14.0276,299.5644,17.3079,0.9956,0.0044,0.0035
7,14.1216,332.0623,18.2226,0.9953,0.0045,0.0034
8,25.879,1145.0701,33.8389,0.9835,0.0085,0.0064
9,28.3706,1105.781,33.2533,0.976,0.0084,0.0071


In [34]:
# 훈련 완료
sp500_prediction_gbr = finalize_model(tuned_gbr)

In [35]:
# nasdaq 예측
sp500_pred = predict_model(sp500_prediction_gbr, val_x)
sp500_pred

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,Label
215,2021-11-01,0,44,1013.014999,14882.28063,4461.974317
216,2021-11-02,1,44,1013.014999,14878.714132,4466.044291
217,2021-11-03,2,44,1013.014999,14895.620934,4469.903961
218,2021-11-04,3,44,1013.014999,14897.983528,4478.601341
219,2021-11-05,4,44,1013.014999,14895.800864,4478.81872


In [36]:
# val_x에 예측 nasdaq 열 추가
val_x['sp500'] = sp500_pred['Label']
val_x

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500
215,2021-11-01,0,44,1013.014999,14882.28063,4461.974317
216,2021-11-02,1,44,1013.014999,14878.714132,4466.044291
217,2021-11-03,2,44,1013.014999,14895.620934,4469.903961
218,2021-11-04,3,44,1013.014999,14897.983528,4478.601341
219,2021-11-05,4,44,1013.014999,14895.800864,4478.81872


* 1. kosdaq 예측
* 2. nasdaq
* 3. sp500
#### 4. dow
* 5. kospi
* 6. 종가

In [37]:
# setup
ex = setup(train_data, target='dow', ignore_features=['Close','kospi'])

Unnamed: 0,Description,Value
0,session_id,8276
1,Target,dow
2,Original Data,"(215, 9)"
3,Missing Values,False
4,Numeric Features,4
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(150, 21)"


In [38]:
# 모델 비교
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,100.4756,18108.9585,131.0547,0.9914,0.004,0.003,0.011
ridge,Ridge Regression,99.8623,18031.946,130.6733,0.9914,0.004,0.003,0.012
br,Bayesian Ridge,101.486,18144.8715,131.3049,0.9913,0.004,0.003,0.011
lr,Linear Regression,101.2249,19170.422,134.5967,0.9909,0.0041,0.003,0.651
en,Elastic Net,136.1229,26946.9784,162.0873,0.9865,0.0049,0.0041,0.012
huber,Huber Regressor,142.8897,31932.346,174.2273,0.9841,0.0052,0.0043,0.026
gbr,Gradient Boosting Regressor,141.7069,37121.033,184.953,0.9821,0.0056,0.0042,0.028
xgboost,Extreme Gradient Boosting,156.3488,41561.149,197.4837,0.9798,0.0059,0.0047,0.081
rf,Random Forest Regressor,155.3648,43283.8562,199.2772,0.9797,0.006,0.0046,0.126
et,Extra Trees Regressor,154.135,44284.7194,200.7946,0.9796,0.0061,0.0046,0.107


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=8276,
      selection='cyclic', tol=0.0001, warm_start=False)

In [39]:
# 모델 생성
ridge = create_model('ridge')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,132.4979,28517.2051,168.8704,0.9874,0.0052,0.004
1,112.3061,21931.0,148.0912,0.9844,0.0044,0.0033
2,109.076,22275.7578,149.2507,0.9912,0.0047,0.0033
3,96.2711,19574.3516,139.9084,0.9941,0.0044,0.0029
4,62.9531,6438.5293,80.2404,0.9973,0.0025,0.0019
5,119.6544,19553.7754,139.8348,0.9909,0.0043,0.0036
6,142.6168,30326.7617,174.1458,0.9866,0.0052,0.0043
7,87.6957,11042.2793,105.0823,0.9953,0.0031,0.0026
8,72.4431,7493.2515,86.5636,0.9922,0.0026,0.0021
9,63.1091,13166.5488,114.7456,0.9949,0.0035,0.0019


In [40]:
# 모델 튜닝
tuned_ridge = tune_model(ridge)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,132.4495,28257.1133,168.0985,0.9876,0.0051,0.004
1,112.8038,21895.625,147.9717,0.9844,0.0044,0.0033
2,110.7232,24239.4453,155.6902,0.9904,0.0049,0.0034
3,96.7112,20216.0352,142.1831,0.9939,0.0045,0.003
4,64.6637,6805.3589,82.4946,0.9971,0.0026,0.002
5,117.0198,19871.0215,140.9646,0.9908,0.0043,0.0035
6,147.1706,31727.4141,178.1219,0.9859,0.0053,0.0044
7,87.8655,11388.209,106.7156,0.9951,0.0032,0.0026
8,69.9896,7259.9136,85.2051,0.9924,0.0025,0.0021
9,59.3641,12275.1191,110.7931,0.9952,0.0034,0.0018


In [41]:
# 훈련 완료
dow_prediction_ridge = finalize_model(tuned_ridge)

In [42]:
# dow 예측
dow_pred_ridge = predict_model(dow_prediction_ridge, val_x)
dow_pred_ridge

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,Label
215,2021-11-01,0,44,1013.014999,14882.28063,4461.974317,34982.164062
216,2021-11-02,1,44,1013.014999,14878.714132,4466.044291,35036.757812
217,2021-11-03,2,44,1013.014999,14895.620934,4469.903961,35057.34375
218,2021-11-04,3,44,1013.014999,14897.983528,4478.601341,35165.890625
219,2021-11-05,4,44,1013.014999,14895.800864,4478.81872,35160.136719


In [43]:
# val_x에 dow 열 추가
val_x['dow'] = dow_pred_ridge['Label']
val_x

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow
215,2021-11-01,0,44,1013.014999,14882.28063,4461.974317,34982.164062
216,2021-11-02,1,44,1013.014999,14878.714132,4466.044291,35036.757812
217,2021-11-03,2,44,1013.014999,14895.620934,4469.903961,35057.34375
218,2021-11-04,3,44,1013.014999,14897.983528,4478.601341,35165.890625
219,2021-11-05,4,44,1013.014999,14895.800864,4478.81872,35160.136719


* 1. kosdaq 예측
* 2. nasdaq
* 3. sp500
* 4. dow
#### 5. kospi
* 6.  종가

In [45]:
# setup
ex = setup(train_data, target='kospi', ignore_features=['Close'])

Unnamed: 0,Description,Value
0,session_id,2100
1,Target,kospi
2,Original Data,"(215, 9)"
3,Missing Values,False
4,Numeric Features,5
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(150, 22)"


In [47]:
# 모델 비교
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,18.4439,668.9576,25.5937,0.925,0.0082,0.0059,0.825
rf,Random Forest Regressor,20.0017,743.7509,26.8528,0.9151,0.0086,0.0064,0.145
gbr,Gradient Boosting Regressor,20.0583,730.7977,26.7851,0.9137,0.0086,0.0064,0.031
et,Extra Trees Regressor,20.3223,815.1824,27.9017,0.906,0.009,0.0065,0.109
xgboost,Extreme Gradient Boosting,21.011,867.7442,28.921,0.8991,0.0093,0.0067,0.084
lr,Linear Regression,21.7098,997.1215,29.8186,0.8947,0.0096,0.007,0.01
ridge,Ridge Regression,22.6671,1053.7848,30.3771,0.89,0.0098,0.0073,0.011
lasso,Lasso Regression,23.7909,1111.6021,31.3645,0.8851,0.0101,0.0076,0.014
ada,AdaBoost Regressor,26.8607,1178.5351,33.7346,0.8677,0.0108,0.0086,0.048
lightgbm,Light Gradient Boosting Machine,25.203,1265.8848,33.948,0.8572,0.011,0.0081,0.031


<catboost.core.CatBoostRegressor at 0x1a92b7e09d0>

In [48]:
# 모델 생성
cat = create_model('catboost')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,17.5016,677.7654,26.0339,0.9451,0.0085,0.0056
1,18.3021,539.0143,23.2167,0.9463,0.0076,0.0059
2,17.9925,506.7321,22.5107,0.899,0.007,0.0056
3,13.7889,518.2283,22.7646,0.9034,0.0073,0.0044
4,22.6545,1113.7838,33.3734,0.9033,0.0107,0.0073
5,18.7212,519.1603,22.7851,0.9519,0.0071,0.0059
6,15.5986,547.3433,23.3954,0.9355,0.0075,0.0051
7,23.0853,824.4715,28.7136,0.9278,0.0095,0.0076
8,15.4631,512.5339,22.6392,0.9146,0.0073,0.005
9,21.3314,930.5427,30.5048,0.9236,0.0098,0.0068


In [49]:
# 모델 튜닝
tuned_cat = tune_model(cat)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,23.1455,893.6466,29.8939,0.9276,0.0097,0.0075
1,22.9494,1024.5956,32.0093,0.8978,0.0105,0.0075
2,17.639,416.8605,20.4172,0.9169,0.0064,0.0055
3,19.9584,811.1189,28.4801,0.8488,0.0092,0.0064
4,22.6564,1128.9326,33.5996,0.902,0.0108,0.0072
5,19.6201,623.0025,24.96,0.9423,0.008,0.0062
6,16.9582,575.7479,23.9947,0.9321,0.0077,0.0054
7,23.9842,1086.8998,32.9682,0.9049,0.0109,0.0079
8,18.2883,690.8897,26.2848,0.8848,0.0086,0.0059
9,18.7042,718.7974,26.8104,0.941,0.0087,0.006


In [50]:
# 훈련 완료
kospi_prediction_cat = finalize_model(tuned_cat)

In [51]:
# kospi 예측
kospi_pred_cat = predict_model(kospi_prediction_cat, val_x)
kospi_pred_cat

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,Label
215,2021-11-01,0,44,1013.014999,14882.28063,4461.974317,34982.164062,3109.411832
216,2021-11-02,1,44,1013.014999,14878.714132,4466.044291,35036.757812,3108.417746
217,2021-11-03,2,44,1013.014999,14895.620934,4469.903961,35057.34375,3106.819014
218,2021-11-04,3,44,1013.014999,14897.983528,4478.601341,35165.890625,3069.184414
219,2021-11-05,4,44,1013.014999,14895.800864,4478.81872,35160.136719,3066.742507


In [52]:
# val_x에 kospi 열 추가
val_x['kospi'] = kospi_pred_cat['Label']
val_x

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
215,2021-11-01,0,44,1013.014999,14882.28063,4461.974317,34982.164062,3109.411832
216,2021-11-02,1,44,1013.014999,14878.714132,4466.044291,35036.757812,3108.417746
217,2021-11-03,2,44,1013.014999,14895.620934,4469.903961,35057.34375,3106.819014
218,2021-11-04,3,44,1013.014999,14897.983528,4478.601341,35165.890625,3069.184414
219,2021-11-05,4,44,1013.014999,14895.800864,4478.81872,35160.136719,3066.742507


* 1. kosdaq 예측
* 2. nasdaq
* 3. sp500
* 4. dow
* 5. kospi
#### 6.  종가

In [53]:
# setup
ex = setup(train_data, target='Close')

Unnamed: 0,Description,Value
0,session_id,7184
1,Target,Close
2,Original Data,"(215, 9)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(150, 23)"


In [54]:
# 모델 비교
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,723.4306,961650.3324,945.3836,0.9463,0.0117,0.009,0.941
gbr,Gradient Boosting Regressor,728.5927,1018906.1459,977.203,0.9427,0.0119,0.009,0.034
et,Extra Trees Regressor,716.8878,1081710.4917,975.5914,0.9397,0.0118,0.0088,0.107
rf,Random Forest Regressor,741.1089,1143883.2275,998.5408,0.9388,0.0122,0.0092,0.124
xgboost,Extreme Gradient Boosting,823.689,1177164.1812,1047.999,0.9289,0.013,0.0103,0.09
lasso,Lasso Regression,887.3498,1202621.0438,1090.4754,0.9225,0.0136,0.0111,0.011
lr,Linear Regression,893.9685,1218696.3375,1096.0486,0.9213,0.0137,0.0112,0.009
lightgbm,Light Gradient Boosting Machine,888.1073,1395922.2117,1153.3382,0.9207,0.0144,0.0111,0.167
ridge,Ridge Regression,893.3836,1271625.25,1119.9745,0.9202,0.0139,0.0111,0.011
ada,AdaBoost Regressor,920.1512,1339943.7572,1131.0081,0.9178,0.0138,0.0114,0.049


<catboost.core.CatBoostRegressor at 0x1a92b861610>

In [55]:
# 모델 생성
cat = create_model('catboost')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1035.8996,2234364.6155,1494.7791,0.9152,0.0182,0.0129
1,566.782,462342.9831,679.9581,0.9396,0.0083,0.007
2,787.1739,1064445.4356,1031.7196,0.9207,0.0128,0.0099
3,930.4299,1251533.0685,1118.7194,0.9362,0.0135,0.0113
4,496.3631,462020.5568,679.7209,0.971,0.0083,0.0061
5,412.8904,282524.6283,531.5305,0.9819,0.007,0.0053
6,857.2338,1167432.1867,1080.4778,0.9485,0.0136,0.0108
7,741.312,763979.8122,874.0594,0.9469,0.0109,0.0093
8,712.8512,911459.8132,954.704,0.9517,0.011,0.0085
9,693.3698,1016400.224,1008.1668,0.9515,0.0133,0.009


In [56]:
# 모델 튜닝
tuned_cat = tune_model(cat)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1270.7983,3280836.7612,1811.308,0.8755,0.0225,0.0161
1,569.8644,527243.3619,726.1153,0.9311,0.0088,0.0069
2,896.4909,1491324.9206,1221.1981,0.8889,0.015,0.0111
3,818.8868,1049718.3992,1024.5577,0.9465,0.0123,0.01
4,546.8715,520515.75,721.4678,0.9673,0.0087,0.0066
5,588.1413,661102.5814,813.0821,0.9576,0.011,0.0077
6,744.8115,881447.4875,938.8543,0.9611,0.0115,0.0092
7,776.2547,806735.6835,898.1847,0.9439,0.0113,0.0098
8,800.3352,1325861.457,1151.4606,0.9297,0.0135,0.0096
9,1087.736,1868876.4481,1367.0686,0.9108,0.0171,0.0137


In [58]:
# 훈련 완료
close_prediction_cat = finalize_model(tuned_cat)

In [59]:
# close 예측
close_pred_cat = predict_model(close_prediction_cat, val_x)
close_pred_cat

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,Label
215,2021-11-01,0,44,1013.014999,14882.28063,4461.974317,34982.164062,3109.411832,74553.64292
216,2021-11-02,1,44,1013.014999,14878.714132,4466.044291,35036.757812,3108.417746,74602.552563
217,2021-11-03,2,44,1013.014999,14895.620934,4469.903961,35057.34375,3106.819014,74356.379211
218,2021-11-04,3,44,1013.014999,14897.983528,4478.601341,35165.890625,3069.184414,71646.465879
219,2021-11-05,4,44,1013.014999,14895.800864,4478.81872,35160.136719,3066.742507,71707.955498


In [61]:
# val_x에 close 열 추가
val_x['Close'] = close_pred_cat['Label']
val_x

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,Close
215,2021-11-01,0,44,1013.014999,14882.28063,4461.974317,34982.164062,3109.411832,74553.64292
216,2021-11-02,1,44,1013.014999,14878.714132,4466.044291,35036.757812,3108.417746,74602.552563
217,2021-11-03,2,44,1013.014999,14895.620934,4469.903961,35057.34375,3106.819014,74356.379211
218,2021-11-04,3,44,1013.014999,14897.983528,4478.601341,35165.890625,3069.184414,71646.465879
219,2021-11-05,4,44,1013.014999,14895.800864,4478.81872,35160.136719,3066.742507,71707.955498


In [70]:
stock_close = {}
stock_close['Date'] = val_x['Date']
stock_close[code] = val_x['Close']

In [71]:
stock_close

{'Date': 215    2021-11-01
 216    2021-11-02
 217    2021-11-03
 218    2021-11-04
 219    2021-11-05
 Name: Date, dtype: object,
 '005930': 215    74553.642920
 216    74602.552563
 217    74356.379211
 218    71646.465879
 219    71707.955498
 Name: Close, dtype: float64}

In [72]:
all_stock_close = pd.DataFrame(stock_close)
all_stock_close

Unnamed: 0,Date,005930
215,2021-11-01,74553.64292
216,2021-11-02,74602.552563
217,2021-11-03,74356.379211
218,2021-11-04,71646.465879
219,2021-11-05,71707.955498


# 평가

In [65]:
def evaluation_index(pred, y):
    y_mean = y.mean()
    
    # MSE
    mse = ( (y - pred) ** 2 ).sum() / len(pred)
        
    # RMSE
    rmse = mse ** (1/2)
    
    # MAE
    mae = ( abs( y - pred ) ).sum() / len(pred)
    
    # R2
    y_mean = y.mean()
    upper = ( (y - pred) ** 2 ).sum()
    lower = ( (y - y_mean) ** 2 ).sum()
    fraction = upper / lower
    r2 = 1 - fraction
    
    #NMAE(Normalized Mean Absolute Error)-정규화 평균 절대 오차 척도
    nmae = ( abs( y - pred ) / y ).sum() / len(pred)
    
    # 평가지표 출력
    print(f"R2 = {r2}")
    print(f"MSE = {mse}")
    print(f"RMSE = {rmse}")
    print(f"MAE = {mae}")
    print(f"NMAE = {nmae}")
    print(f"NMAE * 100 = {nmae*100}")

In [66]:
evaluation_index(val_x['Close'], val_y['Close'])

R2 = -33.26715389361069
MSE = 10060836.383164098
RMSE = 3171.882151525195
MAE = 2853.3992142376082
NMAE = 0.040493988724138494
NMAE * 100 = 4.04939887241385
