In [80]:
import FinanceDataReader as fdr
from tqdm import tqdm
import pandas as pd

# 상장 시장별 예측(Catboost이용)

## 모든 종목 예측

* 종가와 상관계수 높은 특성 : KOSDAQ(0.87), KOSPI(0.79), DOW(0.58), NASDAQ(0.50), S&P500(0.56)
* 예측 순서(종가와 상관계수 높은 특성 먼저 예측)
    * weekday, weeknumdm로 KOSDAQ 예측(종가와 상관계수 0.87)
    * weekday, weeknum, KOSDAQ으로 NASDAQ 예측(KOSDAQ과 상관계수 0.68)
    * weekday, weeknum, KOSDAQ, NASDAQ으로 S&P500 예측(NASDAQ과 상관계수 0.93)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500으로 DOW 예측(S&P500과 상관계수 0.95)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW로 KOSPI 예측
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW, KOSPI로 다른 종가 예측
* 바로 전날 데이터 이용해 예측

In [81]:
# 제출 점수 :
# 자체 결정계수 점수 : 53.77794

In [82]:
from pycaret.regression import *

# 데이터 불러오기(첫번째 주)
* 훈련 : 10.04 ~ 10.29
* 예측 : 11.01 ~ 11.05

In [83]:
start_date = '2021-10-04'
end_date = '2021-11-04'

### train set

In [84]:
data1 = pd.read_csv('../data/20210104-20211126_stock_index.csv')
stock_list = pd.read_csv('../data/stock_list.csv')

In [85]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    235 non-null    object 
 1   kosdaq  235 non-null    float64
 2   nasdaq  235 non-null    float64
 3   dow     235 non-null    float64
 4   sp500   235 non-null    float64
 5   kospi   235 non-null    float64
dtypes: float64(5), object(1)
memory usage: 11.1+ KB


In [86]:
data1['Date'] = pd.to_datetime(data1['Date'])
data1['weeknum'] = data1['Date'].dt.weekofyear
data1['weekday'] = data1['Date'].dt.weekday
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     235 non-null    datetime64[ns]
 1   kosdaq   235 non-null    float64       
 2   nasdaq   235 non-null    float64       
 3   dow      235 non-null    float64       
 4   sp500    235 non-null    float64       
 5   kospi    235 non-null    float64       
 6   weeknum  235 non-null    int64         
 7   weekday  235 non-null    int64         
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 14.8 KB


In [87]:
data1.head()

Unnamed: 0,Date,kosdaq,nasdaq,dow,sp500,kospi,weeknum,weekday
0,2021-01-04,977.62,12698.4,30223.89,3700.65,2944.45,1,0
1,2021-01-05,985.76,12819.0,30391.6,3726.86,2990.57,1,1
2,2021-01-06,981.39,12740.8,30829.4,3748.14,2968.21,1,2
3,2021-01-07,988.86,13067.5,31041.13,3803.79,3031.68,1,3
4,2021-01-08,987.79,13202.0,31097.97,3824.68,3152.18,1,4


In [88]:
data1.columns

Index(['Date', 'kosdaq', 'nasdaq', 'dow', 'sp500', 'kospi', 'weeknum',
       'weekday'],
      dtype='object')

In [89]:
data1 = data1[['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'dow', 'sp500', 'kospi']]

In [90]:
data1.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,dow,sp500,kospi
0,2021-01-04,1,0,977.62,12698.4,30223.89,3700.65,2944.45
1,2021-01-05,1,1,985.76,12819.0,30391.6,3726.86,2990.57
2,2021-01-06,1,2,981.39,12740.8,30829.4,3748.14,2968.21
3,2021-01-07,1,3,988.86,13067.5,31041.13,3803.79,3031.68
4,2021-01-08,1,4,987.79,13202.0,31097.97,3824.68,3152.18


In [91]:
train1 = data1[(data1['Date'] >= start_date) & (data1['Date'] <= end_date)]
train1.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,dow,sp500,kospi
195,2021-10-04,40,0,969.285,14255.5,34003.58,4300.46,2990.675
196,2021-10-05,40,1,955.37,14433.8,34315.99,4345.72,2962.17
197,2021-10-06,40,2,922.36,14501.9,34417.98,4363.55,2908.31
198,2021-10-07,40,3,953.43,14654.0,34754.15,4399.76,2959.46
199,2021-10-08,40,4,953.11,14579.5,34746.71,4391.36,2956.3


In [92]:
train1.tail()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,dow,sp500,kospi
214,2021-10-29,43,4,992.33,15498.4,35819.59,4605.38,2970.68
215,2021-11-01,44,0,998.57,15595.9,35913.68,4613.67,2978.94
216,2021-11-02,44,1,1009.44,15649.6,36053.09,4630.65,3013.49
217,2021-11-03,44,2,1005.0,15811.6,36157.02,4660.57,2975.71
218,2021-11-04,44,3,1001.43,15940.3,36124.66,4680.06,2983.22


In [93]:
train1 = train1.reset_index()
train1 = train1.drop('index', axis=1)
train1.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,dow,sp500,kospi
0,2021-10-04,40,0,969.285,14255.5,34003.58,4300.46,2990.675
1,2021-10-05,40,1,955.37,14433.8,34315.99,4345.72,2962.17
2,2021-10-06,40,2,922.36,14501.9,34417.98,4363.55,2908.31
3,2021-10-07,40,3,953.43,14654.0,34754.15,4399.76,2959.46
4,2021-10-08,40,4,953.11,14579.5,34746.71,4391.36,2956.3


In [94]:
train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     24 non-null     datetime64[ns]
 1   weeknum  24 non-null     int64         
 2   weekday  24 non-null     int64         
 3   kosdaq   24 non-null     float64       
 4   nasdaq   24 non-null     float64       
 5   dow      24 non-null     float64       
 6   sp500    24 non-null     float64       
 7   kospi    24 non-null     float64       
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 1.6 KB


In [95]:
# 종목 코드 6자리로 맞추기
stock_list['종목코드'] = stock_list['종목코드'].astype(str).str.zfill(6)

In [96]:
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


### test set

In [97]:
test1 = data1[(data1['Date'] >= '2021-11-01') & (data1['Date'] <= '2021-11-05')].reset_index()
test1 = test1.drop('index', axis=1)
test1.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,dow,sp500,kospi
0,2021-11-01,44,0,998.57,15595.9,35913.68,4613.67,2978.94
1,2021-11-02,44,1,1009.44,15649.6,36053.09,4630.65,3013.49
2,2021-11-03,44,2,1005.0,15811.6,36157.02,4660.57,2975.71
3,2021-11-04,44,3,1001.43,15940.3,36124.66,4680.06,2983.22
4,2021-11-05,44,4,1001.35,15971.6,36329.07,4697.53,2969.27


In [98]:
test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     5 non-null      datetime64[ns]
 1   weeknum  5 non-null      int64         
 2   weekday  5 non-null      int64         
 3   kosdaq   5 non-null      float64       
 4   nasdaq   5 non-null      float64       
 5   dow      5 non-null      float64       
 6   sp500    5 non-null      float64       
 7   kospi    5 non-null      float64       
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 448.0 bytes


In [99]:
test1 = test1[['Date','weeknum','weekday']]
test1

Unnamed: 0,Date,weeknum,weekday
0,2021-11-01,44,0
1,2021-11-02,44,1
2,2021-11-03,44,2
3,2021-11-04,44,3
4,2021-11-05,44,4


## 3가지 날짜 특성으로 5가지 외부 특성 순서대로 예측하기

In [100]:
start_date = '2021-10-04'
end_date = '2021-11-04'

features = ['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'sp500', 'dow', 'kospi']
days1 = ['2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08']
days2 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']
pred_test1 = pd.DataFrame()
for d in tqdm(range(5)):
    train_start_day = d
    train_end_day = d
    test_day = d+1
    temp_train = train1[(train1['Date'] >= days1[train_start_day]) & (train1['Date'] <= days2[train_end_day])]
    temp_test = test1[test1['Date'] == days2[test_day]]
    
    for i in range(5):      
        ex = setup(temp_train, target = features[i+3], ignore_features = features[i+4:], fold=4,
                      fold_shuffle=True, silent=True, use_gpu=True)
        model = create_model('catboost')
        final_model = finalize_model(model)
        pred = predict_model(final_model, temp_test)
        
        temp_test[features[i+3]] = pred['Label']
    
    pred_test1 = pd.concat([pred_test1, temp_test])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,21.4816,615.0072,24.7993,0.2045,0.0082,0.0071
1,18.3648,965.3607,31.0703,0.4642,0.0106,0.0063
2,8.0191,129.4418,11.3772,0.6294,0.0038,0.0027
3,16.9876,430.9811,20.7601,-0.1723,0.0069,0.0057
Mean,16.2133,535.1977,22.0017,0.2815,0.0074,0.0054
SD,5.0032,302.865,7.1499,0.3026,0.0024,0.0017


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:02<00:00, 48.41s/it]


In [101]:
pred_test1

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,sp500,dow,kospi
0,2021-11-01,44,0,969.212896,14506.027898,4374.346336,34540.70005,2972.818283
1,2021-11-02,44,1,996.130465,15253.017991,4566.198763,35697.13386,3013.898914
2,2021-11-03,44,2,1003.685315,15507.310373,4593.508834,35736.631211,3003.717113
3,2021-11-04,44,3,995.710558,15486.116757,4606.498971,35910.840208,2984.410936
4,2021-11-05,44,4,993.271141,15495.87256,4622.599512,35846.924735,2977.474141


In [102]:
# pred_test1.to_csv('prac_4w_pred_1w_test1_cat.csv', index=False)

## 훈련, 테스트 셋 준비

### 데이터 불러오기

In [103]:
all_data = pd.read_csv('../data/2021-01-04_2021-11-26_all_data.csv')
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86950 entries, 0 to 86949
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    86950 non-null  object 
 1   kosdaq  86950 non-null  float64
 2   nasdaq  86950 non-null  float64
 3   dow     86950 non-null  float64
 4   sp500   86950 non-null  float64
 5   kospi   86950 non-null  float64
 6   code    86950 non-null  int64  
 7   market  86950 non-null  object 
 8   close   82872 non-null  float64
dtypes: float64(6), int64(1), object(2)
memory usage: 6.0+ MB


In [104]:
all_data['code'] = all_data['code'].astype(str).str.zfill(6)
all_data.head()

Unnamed: 0,Date,kosdaq,nasdaq,dow,sp500,kospi,code,market,close
0,2021-01-04,977.62,12698.4,30223.89,3700.65,2944.45,5930,KOSPI,83000.0
1,2021-01-05,985.76,12819.0,30391.6,3726.86,2990.57,5930,KOSPI,83900.0
2,2021-01-06,981.39,12740.8,30829.4,3748.14,2968.21,5930,KOSPI,82200.0
3,2021-01-07,988.86,13067.5,31041.13,3803.79,3031.68,5930,KOSPI,82900.0
4,2021-01-08,987.79,13202.0,31097.97,3824.68,3152.18,5930,KOSPI,88800.0


### 훈련 셋 준비

#### KOPSI 종목 데이터만 추출

In [105]:
kospi_train = all_data[all_data['market'] == 'KOSPI']
kospi_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43475 entries, 0 to 43474
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    43475 non-null  object 
 1   kosdaq  43475 non-null  float64
 2   nasdaq  43475 non-null  float64
 3   dow     43475 non-null  float64
 4   sp500   43475 non-null  float64
 5   kospi   43475 non-null  float64
 6   code    43475 non-null  object 
 7   market  43475 non-null  object 
 8   close   41440 non-null  float64
dtypes: float64(6), object(3)
memory usage: 3.3+ MB


In [106]:
kospi_train.head()

Unnamed: 0,Date,kosdaq,nasdaq,dow,sp500,kospi,code,market,close
0,2021-01-04,977.62,12698.4,30223.89,3700.65,2944.45,5930,KOSPI,83000.0
1,2021-01-05,985.76,12819.0,30391.6,3726.86,2990.57,5930,KOSPI,83900.0
2,2021-01-06,981.39,12740.8,30829.4,3748.14,2968.21,5930,KOSPI,82200.0
3,2021-01-07,988.86,13067.5,31041.13,3803.79,3031.68,5930,KOSPI,82900.0
4,2021-01-08,987.79,13202.0,31097.97,3824.68,3152.18,5930,KOSPI,88800.0


#### KOSDAQ 종목 데이터만 추출

In [107]:
kosdaq_train = all_data[all_data['market'] == 'KOSDAQ']
kosdaq_train = kosdaq_train.reset_index()
kosdaq_train = kosdaq_train.drop('index', axis=1)
kosdaq_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43475 entries, 0 to 43474
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    43475 non-null  object 
 1   kosdaq  43475 non-null  float64
 2   nasdaq  43475 non-null  float64
 3   dow     43475 non-null  float64
 4   sp500   43475 non-null  float64
 5   kospi   43475 non-null  float64
 6   code    43475 non-null  object 
 7   market  43475 non-null  object 
 8   close   41432 non-null  float64
dtypes: float64(6), object(3)
memory usage: 3.0+ MB


In [108]:
kosdaq_train.head()

Unnamed: 0,Date,kosdaq,nasdaq,dow,sp500,kospi,code,market,close
0,2021-01-04,977.62,12698.4,30223.89,3700.65,2944.45,91990,KOSDAQ,151300.0
1,2021-01-05,985.76,12819.0,30391.6,3726.86,2990.57,91990,KOSDAQ,152300.0
2,2021-01-06,981.39,12740.8,30829.4,3748.14,2968.21,91990,KOSDAQ,150300.0
3,2021-01-07,988.86,13067.5,31041.13,3803.79,3031.68,91990,KOSDAQ,154700.0
4,2021-01-08,987.79,13202.0,31097.97,3824.68,3152.18,91990,KOSDAQ,162900.0


### 테스트 셋 준비

In [109]:
test = pd.DataFrame()
test_start_date = '2021-11-01'
test_end_date = '2021-11-05'
Business_days = pd.DataFrame(pd.date_range(start=test_start_date, end=test_end_date, freq='B'), columns=['Date'])
for code, market in tqdm(stock_list[['종목코드', '상장시장']].values):
    temp_code = [code] * 5
    temp_market = [market] * 5
    
    pred_test1['code'] = temp_code
    pred_test1['market'] = temp_market
    
    test = pd.concat([test, pred_test1])

100%|███████████████████████████████████████████████████████████████████████████████| 370/370 [00:00<00:00, 883.31it/s]


In [110]:
# 5일 * 370종목 = 1850
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850 entries, 0 to 4
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     1850 non-null   datetime64[ns]
 1   weeknum  1850 non-null   int64         
 2   weekday  1850 non-null   int64         
 3   kosdaq   1850 non-null   float64       
 4   nasdaq   1850 non-null   float64       
 5   sp500    1850 non-null   float64       
 6   dow      1850 non-null   float64       
 7   kospi    1850 non-null   float64       
 8   code     1850 non-null   object        
 9   market   1850 non-null   object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 159.0+ KB


In [111]:
test.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,sp500,dow,kospi,code,market
0,2021-11-01,44,0,969.212896,14506.027898,4374.346336,34540.70005,2972.818283,5930,KOSPI
1,2021-11-02,44,1,996.130465,15253.017991,4566.198763,35697.13386,3013.898914,5930,KOSPI
2,2021-11-03,44,2,1003.685315,15507.310373,4593.508834,35736.631211,3003.717113,5930,KOSPI
3,2021-11-04,44,3,995.710558,15486.116757,4606.498971,35910.840208,2984.410936,5930,KOSPI
4,2021-11-05,44,4,993.271141,15495.87256,4622.599512,35846.924735,2977.474141,5930,KOSPI


#### KOSPI 종목 데이터만 추출

In [112]:
kospi_test = test[test['market'] == 'KOSPI']
kospi_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 925 entries, 0 to 4
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     925 non-null    datetime64[ns]
 1   weeknum  925 non-null    int64         
 2   weekday  925 non-null    int64         
 3   kosdaq   925 non-null    float64       
 4   nasdaq   925 non-null    float64       
 5   sp500    925 non-null    float64       
 6   dow      925 non-null    float64       
 7   kospi    925 non-null    float64       
 8   code     925 non-null    object        
 9   market   925 non-null    object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 79.5+ KB


In [113]:
kospi_test.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,sp500,dow,kospi,code,market
0,2021-11-01,44,0,969.212896,14506.027898,4374.346336,34540.70005,2972.818283,5930,KOSPI
1,2021-11-02,44,1,996.130465,15253.017991,4566.198763,35697.13386,3013.898914,5930,KOSPI
2,2021-11-03,44,2,1003.685315,15507.310373,4593.508834,35736.631211,3003.717113,5930,KOSPI
3,2021-11-04,44,3,995.710558,15486.116757,4606.498971,35910.840208,2984.410936,5930,KOSPI
4,2021-11-05,44,4,993.271141,15495.87256,4622.599512,35846.924735,2977.474141,5930,KOSPI


#### KOSDAQ 종목 데이터만 추출

In [114]:
kosdaq_test = test[test['market'] == 'KOSDAQ'].reset_index()
kosdaq_test = kosdaq_test.drop('index', axis=1)
kosdaq_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925 entries, 0 to 924
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     925 non-null    datetime64[ns]
 1   weeknum  925 non-null    int64         
 2   weekday  925 non-null    int64         
 3   kosdaq   925 non-null    float64       
 4   nasdaq   925 non-null    float64       
 5   sp500    925 non-null    float64       
 6   dow      925 non-null    float64       
 7   kospi    925 non-null    float64       
 8   code     925 non-null    object        
 9   market   925 non-null    object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 72.4+ KB


In [115]:
kosdaq_test.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,sp500,dow,kospi,code,market
0,2021-11-01,44,0,969.212896,14506.027898,4374.346336,34540.70005,2972.818283,91990,KOSDAQ
1,2021-11-02,44,1,996.130465,15253.017991,4566.198763,35697.13386,3013.898914,91990,KOSDAQ
2,2021-11-03,44,2,1003.685315,15507.310373,4593.508834,35736.631211,3003.717113,91990,KOSDAQ
3,2021-11-04,44,3,995.710558,15486.116757,4606.498971,35910.840208,2984.410936,91990,KOSDAQ
4,2021-11-05,44,4,993.271141,15495.87256,4622.599512,35846.924735,2977.474141,91990,KOSDAQ


## 상장 시장 별 종가 예측하기

### KOSPI

In [116]:
days1 = ['2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08']
days2 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']

kospi_stock_list = stock_list[stock_list['상장시장'] == 'KOSPI']

temp_pred_test = pd.DataFrame()
for d in range(5):
    train_start_day = d
    train_end_day = d
    test_day = d+1
    temp_train = kospi_train[ (kospi_train['Date'] >= days1[train_start_day]) &
                              (kospi_train['Date'] <= days2[train_end_day]) ]
    
    temp_test = kospi_test[kospi_test['Date'] == days2[test_day]]

    ex = setup(temp_train, target = 'close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
    cat = create_model('catboost')
    pred = predict_model(cat, temp_test)
    temp_test['close'] = pred['Label']
    temp_pred_test = pd.concat([temp_pred_test, temp_test])

kospi_pred = temp_pred_test[['Date','code', 'close']]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,24003.093,1122596537.3063,33505.1718,0.973,0.6277,0.7573
1,22603.4687,948029908.9137,30790.0943,0.9656,0.6076,0.6969
2,26580.7507,1362945144.4824,36918.087,0.9622,0.6469,0.793
3,23644.795,999264121.0547,31611.1392,0.9619,0.7334,1.0636
Mean,24208.0268,1108208927.9393,33206.1231,0.9657,0.6539,0.8277
SD,1463.186,160174773.6108,2358.4568,0.0045,0.048,0.1405


In [117]:
kospi_pred.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 925 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    925 non-null    datetime64[ns]
 1   code    925 non-null    object        
 2   close   925 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 28.9+ KB


### KOSDAQ

In [118]:
days1 = ['2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08']
days2 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']

kosdaq_stock_list = stock_list[stock_list['상장시장'] == 'KOSDAQ']

temp_pred_test = pd.DataFrame()
for d in range(5):
    train_start_day = d
    train_end_day = d
    test_day = d+1
    temp_train = kosdaq_train[ (kosdaq_train['Date'] >= days1[train_start_day]) &
                              (kosdaq_train['Date'] <= days2[train_end_day]) ]
    
    temp_test = kosdaq_test[kosdaq_test['Date'] == days2[test_day]]

    ex = setup(temp_train, target = 'close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
    cat = create_model('catboost')
    pred = predict_model(cat, temp_test)
    temp_test['close'] = pred['Label']
    temp_pred_test = pd.concat([temp_pred_test, temp_test])

kosdaq_pred = temp_pred_test[['Date','code', 'close']]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9594.7304,210807510.711,14519.2118,0.9469,0.4401,0.4251
1,10469.6829,235771319.8107,15354.8468,0.9353,0.5065,0.5196
2,8945.5504,133996591.8728,11575.6897,0.9576,0.4854,0.4995
3,9536.999,193088342.6089,13895.6231,0.9431,0.4713,0.4706
Mean,9636.7407,193415941.2508,13836.3429,0.9457,0.4758,0.4787
SD,543.8865,37507369.8845,1404.1217,0.008,0.0241,0.0355


In [119]:
kosdaq_pred.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 925 entries, 0 to 924
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    925 non-null    datetime64[ns]
 1   code    925 non-null    object        
 2   close   925 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 28.9+ KB


### KOSPI, KOSDAQ 예측값 합치기

In [120]:
all_data = pd.concat([kospi_pred,kosdaq_pred])
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850 entries, 0 to 924
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1850 non-null   datetime64[ns]
 1   code    1850 non-null   object        
 2   close   1850 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 57.8+ KB


In [121]:
all_data = all_data.sort_values('Date')
all_data = all_data.reset_index().drop('index', axis=1)
all_data

Unnamed: 0,Date,code,close
0,2021-11-01,005930,59024.119220
1,2021-11-01,215200,75285.819229
2,2021-11-01,112040,93995.504732
3,2021-11-01,067630,24002.295991
4,2021-11-01,218410,25544.179190
...,...,...,...
1845,2021-11-05,064350,46749.643188
1846,2021-11-05,004170,241524.943399
1847,2021-11-05,298050,683153.565087
1848,2021-11-05,153130,99477.192071


In [122]:
all_data['code'] = all_data['code'].astype(int)
all_data = all_data.sort_values(['code','Date'])
all_data

Unnamed: 0,Date,code,close
329,2021-11-01,60,33879.194865
595,2021-11-02,60,33750.773853
777,2021-11-03,60,43889.078001
1332,2021-11-04,60,41348.023605
1807,2021-11-05,60,31360.235247
...,...,...,...
11,2021-11-01,950130,16477.537396
542,2021-11-02,950130,19719.678121
1096,2021-11-03,950130,23634.813986
1279,2021-11-04,950130,17047.446498


In [123]:
all_data = all_data.reset_index().drop('index', axis=1)
all_data

Unnamed: 0,Date,code,close
0,2021-11-01,60,33879.194865
1,2021-11-02,60,33750.773853
2,2021-11-03,60,43889.078001
3,2021-11-04,60,41348.023605
4,2021-11-05,60,31360.235247
...,...,...,...
1845,2021-11-01,950130,16477.537396
1846,2021-11-02,950130,19719.678121
1847,2021-11-03,950130,23634.813986
1848,2021-11-04,950130,17047.446498


In [124]:
all_data['code'] = all_data['code'].astype(str).str.zfill(6)
all_data.head()

Unnamed: 0,Date,code,close
0,2021-11-01,60,33879.194865
1,2021-11-02,60,33750.773853
2,2021-11-03,60,43889.078001
3,2021-11-04,60,41348.023605
4,2021-11-05,60,31360.235247


In [125]:
all_data_pivot = all_data.pivot_table(index=['Date'], columns =['code'], values=['close'])

In [126]:
all_data_pivot

Unnamed: 0_level_0,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close
code,000060,000080,000100,000120,000150,000240,000250,000270,000660,000670,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-11-01,33879.194865,35886.182408,62770.68564,140540.792275,95346.984625,29261.69339,45152.532758,77160.032384,102793.989628,611726.65001,...,46455.430904,43055.131506,75417.30316,31692.496441,30191.540393,49104.06477,266591.854254,257177.183421,29730.174735,16477.537396
2021-11-02,33750.773853,36395.650783,61488.415662,136085.006018,95862.827693,28712.198589,46702.227509,78492.520197,110090.041385,703696.566765,...,47984.563507,52338.233353,86791.857535,33940.836849,31029.658596,52010.982925,321347.449953,249839.430573,37304.071672,19719.678121
2021-11-03,43889.078001,48132.985225,73507.788569,148908.645305,106666.790643,42473.959209,47102.460469,91619.835816,115055.396751,678125.915021,...,50076.703036,44127.704524,81348.62237,36284.188304,32645.838179,54830.933253,302528.05842,242332.106912,34604.476952,23634.813986
2021-11-04,41348.023605,43848.526307,69275.391445,142169.16765,100564.826068,31127.755873,46470.104781,81361.125261,114672.239653,662962.982156,...,47150.302742,58034.225908,87056.477211,33212.745024,29753.985988,50617.409705,277610.087293,256940.518607,42486.568406,17047.446498
2021-11-05,31360.235247,32484.981538,62234.42166,137571.510267,101957.371336,32370.90265,46849.663762,79122.845175,106692.425768,666732.980809,...,48676.268032,52035.21402,90320.743635,34512.115454,30886.561166,49373.669249,313808.646618,252345.030351,33732.641337,19911.318794


In [127]:
all_data_pivot.columns = all_data_pivot.columns.droplevel(0)
all_data_pivot

code,000060,000080,000100,000120,000150,000240,000250,000270,000660,000670,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-11-01,33879.194865,35886.182408,62770.68564,140540.792275,95346.984625,29261.69339,45152.532758,77160.032384,102793.989628,611726.65001,...,46455.430904,43055.131506,75417.30316,31692.496441,30191.540393,49104.06477,266591.854254,257177.183421,29730.174735,16477.537396
2021-11-02,33750.773853,36395.650783,61488.415662,136085.006018,95862.827693,28712.198589,46702.227509,78492.520197,110090.041385,703696.566765,...,47984.563507,52338.233353,86791.857535,33940.836849,31029.658596,52010.982925,321347.449953,249839.430573,37304.071672,19719.678121
2021-11-03,43889.078001,48132.985225,73507.788569,148908.645305,106666.790643,42473.959209,47102.460469,91619.835816,115055.396751,678125.915021,...,50076.703036,44127.704524,81348.62237,36284.188304,32645.838179,54830.933253,302528.05842,242332.106912,34604.476952,23634.813986
2021-11-04,41348.023605,43848.526307,69275.391445,142169.16765,100564.826068,31127.755873,46470.104781,81361.125261,114672.239653,662962.982156,...,47150.302742,58034.225908,87056.477211,33212.745024,29753.985988,50617.409705,277610.087293,256940.518607,42486.568406,17047.446498
2021-11-05,31360.235247,32484.981538,62234.42166,137571.510267,101957.371336,32370.90265,46849.663762,79122.845175,106692.425768,666732.980809,...,48676.268032,52035.21402,90320.743635,34512.115454,30886.561166,49373.669249,313808.646618,252345.030351,33732.641337,19911.318794


In [128]:
all_data_pivot.to_csv('prac_4w_pred_1w_cat.csv', index=False)

# 데이터 불러오기(두번째 주)
* 훈련 : 11.01 ~ 11.26
* 예측 : 11.29 ~ 12.03

In [None]:
start_date = '2021-11-01'
end_date = '2021-11-26'

### train set2

In [None]:
code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
code_data = pd.merge(Business_days, code_data, how = 'outer')

In [None]:
data2 = pd.read_csv('../data/20210104-20211126_stock_index.csv')

In [None]:
data2.info()

In [None]:
data2['Date'] = pd.to_datetime(data2['Date'])
data2.info()

In [None]:
data2.tail()

In [None]:
data2['weeknum'] = data2['Date'].dt.weekofyear
data2['weekday'] = data2['Date'].dt.weekday
data2.head()

In [None]:
data2 = data2[['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'dow', 'sp500', 'kospi']]

In [None]:
data2.head()

In [None]:
train2 = data2[(data2['Date'] >= start_date) & (data2['Date'] <= end_date)]
train2.head()

In [None]:
train2.tail()

In [None]:
train2 = train2.reset_index()
train2 = train2.drop('index', axis=1)
train2.head()

In [None]:
train2.info()

In [None]:
# 종목 코드 6자리로 맞추기
stock_list['종목코드'] = stock_list['종목코드'].astype(str).str.zfill(6)

In [None]:
stock_list.head()

### test set

In [None]:
Business_days = pd.DataFrame(pd.date_range(start='2021-11-29', end='2021-12-03', freq='B'), columns=['Date'])
test2 = Business_days
test2['weeknum'] = test2['Date'].dt.weekofyear
test2['weekday'] = test2['Date'].dt.weekday
test2.head()

In [None]:
test2.info()

## 3가지 날짜 특성으로 5가지 외부 특성 순서대로 예측하기

In [None]:
features = ['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'sp500', 'dow', 'kospi']

for i in tqdm(range(4)):
    model = setup(train2, target = features[i+3], ignore_features = features[i+4:], fold=4, fold_shuffle=True,
                  silent=True, use_gpu=True)
    top1_model = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])
    final_model = finalize_model(top1_model)
    pred = predict_model(final_model, test2)

    test2[features[i+3]] = pred['Label']

In [None]:
pred_test2 = test2
pred_test2

In [None]:
pred_test2.to_csv('prac_4w_pred_1w_test2.csv', index=False)

## 8가지 특성으로 종가 예측하기

In [None]:
days1 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']
stock_pred_set1 = {}
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

for code in tqdm(stock_list['종목코드'].values):
    code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    code_data = pd.merge(Business_days, code_data, how = 'outer')
    train1['Close'] = code_data['Close']
    temp_pred_test = pd.DataFrame()
    for d in range(5):
        train_end_day = d
        test_day = d+1
        temp_train = train1[train1['Date'] <= days1[train_end_day]]
        temp_test = pred_test1[pred_test1['Date'] == days1[test_day]]

        model = setup(temp_train, target = 'Close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
        top1_model = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])
        final_model = finalize_model(top1_model)
        pred = predict_model(final_model, temp_test)
        temp_test[code] = pred['Label']
        temp_pred_test = pd.concat([temp_pred_test, temp_test])
    
    stock_pred_set1[code] = temp_pred_test[code]

In [None]:
len(stock_pred_set1)

In [None]:
pred1 = pd.DataFrame(stock_pred_set1)
pred1.head()

# 제출 파일 만들기

In [None]:
sub = pd.read_csv('../data/sample_submission.csv')
sub

In [None]:
sub.info()

In [None]:
for code in tqdm(sub.columns.values[1:]):
    temp = list(stock_pred_dict1[code]) + list(stock_pred_dict2[code])
    sub[code] = temp

In [None]:
sub

In [None]:
sub.info()

In [None]:
sub.to_csv('sub05_pycaret04.csv', index=False)