In [129]:
import FinanceDataReader as fdr
from tqdm import tqdm
import pandas as pd

# 상장 시장별 예측(Catboost이용)

## 모든 종목 예측

* 종가와 상관계수 높은 특성 : KOSDAQ(0.87), KOSPI(0.79), DOW(0.58), NASDAQ(0.50), S&P500(0.56)
* 예측 순서(종가와 상관계수 높은 특성 먼저 예측)
    * weekday, weeknumdm로 KOSDAQ 예측(종가와 상관계수 0.87)
    * weekday, weeknum, KOSDAQ으로 NASDAQ 예측(KOSDAQ과 상관계수 0.68)
    * weekday, weeknum, KOSDAQ, NASDAQ으로 S&P500 예측(NASDAQ과 상관계수 0.93)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500으로 DOW 예측(S&P500과 상관계수 0.95)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW로 KOSPI 예측
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW, KOSPI로 다른 종가 예측
* 바로 전날 데이터 이용해 예측

In [130]:
# 제출 점수 :
# 자체 결정계수 점수 : 53.77794

In [131]:
from pycaret.regression import *

# 데이터 불러오기(첫번째 주)
* 훈련 : 10.04 ~ 10.29
* 예측 : 11.01 ~ 11.05

In [132]:
start_date = '2021-10-04'
end_date = '2021-11-04'

### train set

In [133]:
data1 = pd.read_csv('../data/20210104-20211126_stock_index.csv')
stock_list = pd.read_csv('../data/stock_list.csv')

In [134]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    235 non-null    object 
 1   kosdaq  235 non-null    float64
 2   nasdaq  235 non-null    float64
 3   dow     235 non-null    float64
 4   sp500   235 non-null    float64
 5   kospi   235 non-null    float64
dtypes: float64(5), object(1)
memory usage: 11.1+ KB


In [135]:
data1['Date'] = pd.to_datetime(data1['Date'])
data1['weeknum'] = data1['Date'].dt.weekofyear
data1['weekday'] = data1['Date'].dt.weekday
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     235 non-null    datetime64[ns]
 1   kosdaq   235 non-null    float64       
 2   nasdaq   235 non-null    float64       
 3   dow      235 non-null    float64       
 4   sp500    235 non-null    float64       
 5   kospi    235 non-null    float64       
 6   weeknum  235 non-null    int64         
 7   weekday  235 non-null    int64         
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 14.8 KB


In [136]:
data1.head()

Unnamed: 0,Date,kosdaq,nasdaq,dow,sp500,kospi,weeknum,weekday
0,2021-01-04,977.62,12698.4,30223.89,3700.65,2944.45,1,0
1,2021-01-05,985.76,12819.0,30391.6,3726.86,2990.57,1,1
2,2021-01-06,981.39,12740.8,30829.4,3748.14,2968.21,1,2
3,2021-01-07,988.86,13067.5,31041.13,3803.79,3031.68,1,3
4,2021-01-08,987.79,13202.0,31097.97,3824.68,3152.18,1,4


In [137]:
data1.columns

Index(['Date', 'kosdaq', 'nasdaq', 'dow', 'sp500', 'kospi', 'weeknum',
       'weekday'],
      dtype='object')

In [138]:
data1 = data1[['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'dow', 'sp500', 'kospi']]

In [139]:
data1.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,dow,sp500,kospi
0,2021-01-04,1,0,977.62,12698.4,30223.89,3700.65,2944.45
1,2021-01-05,1,1,985.76,12819.0,30391.6,3726.86,2990.57
2,2021-01-06,1,2,981.39,12740.8,30829.4,3748.14,2968.21
3,2021-01-07,1,3,988.86,13067.5,31041.13,3803.79,3031.68
4,2021-01-08,1,4,987.79,13202.0,31097.97,3824.68,3152.18


In [140]:
train1 = data1[(data1['Date'] >= start_date) & (data1['Date'] <= end_date)]
train1.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,dow,sp500,kospi
195,2021-10-04,40,0,969.285,14255.5,34003.58,4300.46,2990.675
196,2021-10-05,40,1,955.37,14433.8,34315.99,4345.72,2962.17
197,2021-10-06,40,2,922.36,14501.9,34417.98,4363.55,2908.31
198,2021-10-07,40,3,953.43,14654.0,34754.15,4399.76,2959.46
199,2021-10-08,40,4,953.11,14579.5,34746.71,4391.36,2956.3


In [141]:
train1.tail()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,dow,sp500,kospi
214,2021-10-29,43,4,992.33,15498.4,35819.59,4605.38,2970.68
215,2021-11-01,44,0,998.57,15595.9,35913.68,4613.67,2978.94
216,2021-11-02,44,1,1009.44,15649.6,36053.09,4630.65,3013.49
217,2021-11-03,44,2,1005.0,15811.6,36157.02,4660.57,2975.71
218,2021-11-04,44,3,1001.43,15940.3,36124.66,4680.06,2983.22


In [142]:
train1 = train1.reset_index()
train1 = train1.drop('index', axis=1)
train1.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,dow,sp500,kospi
0,2021-10-04,40,0,969.285,14255.5,34003.58,4300.46,2990.675
1,2021-10-05,40,1,955.37,14433.8,34315.99,4345.72,2962.17
2,2021-10-06,40,2,922.36,14501.9,34417.98,4363.55,2908.31
3,2021-10-07,40,3,953.43,14654.0,34754.15,4399.76,2959.46
4,2021-10-08,40,4,953.11,14579.5,34746.71,4391.36,2956.3


In [143]:
train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     24 non-null     datetime64[ns]
 1   weeknum  24 non-null     int64         
 2   weekday  24 non-null     int64         
 3   kosdaq   24 non-null     float64       
 4   nasdaq   24 non-null     float64       
 5   dow      24 non-null     float64       
 6   sp500    24 non-null     float64       
 7   kospi    24 non-null     float64       
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 1.6 KB


In [144]:
# 종목 코드 6자리로 맞추기
stock_list['종목코드'] = stock_list['종목코드'].astype(str).str.zfill(6)

In [145]:
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


### test set

In [146]:
test1 = data1[(data1['Date'] >= '2021-11-01') & (data1['Date'] <= '2021-11-05')].reset_index()
test1 = test1.drop('index', axis=1)
test1.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,dow,sp500,kospi
0,2021-11-01,44,0,998.57,15595.9,35913.68,4613.67,2978.94
1,2021-11-02,44,1,1009.44,15649.6,36053.09,4630.65,3013.49
2,2021-11-03,44,2,1005.0,15811.6,36157.02,4660.57,2975.71
3,2021-11-04,44,3,1001.43,15940.3,36124.66,4680.06,2983.22
4,2021-11-05,44,4,1001.35,15971.6,36329.07,4697.53,2969.27


In [147]:
test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     5 non-null      datetime64[ns]
 1   weeknum  5 non-null      int64         
 2   weekday  5 non-null      int64         
 3   kosdaq   5 non-null      float64       
 4   nasdaq   5 non-null      float64       
 5   dow      5 non-null      float64       
 6   sp500    5 non-null      float64       
 7   kospi    5 non-null      float64       
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 448.0 bytes


In [148]:
test1 = test1[['Date','weeknum','weekday']]
test1

Unnamed: 0,Date,weeknum,weekday
0,2021-11-01,44,0
1,2021-11-02,44,1
2,2021-11-03,44,2
3,2021-11-04,44,3
4,2021-11-05,44,4


## 3가지 날짜 특성으로 5가지 외부 특성 순서대로 예측하기

In [149]:
start_date = '2021-10-04'
end_date = '2021-11-04'

features = ['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'sp500', 'dow', 'kospi']
days1 = ['2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08']
days2 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']
pred_test1 = pd.DataFrame()
for d in tqdm(range(5)):
    train_start_day = d
    train_end_day = d
    test_day = d+1
    temp_train = train1[(train1['Date'] >= days1[train_start_day]) & (train1['Date'] <= days2[train_end_day])]
    temp_test = test1[test1['Date'] == days2[test_day]]
    
    for i in range(5):      
        ex = setup(temp_train, target = features[i+3], ignore_features = features[i+4:], fold=4,
                      fold_shuffle=True, silent=True, use_gpu=True)
        model = create_model('catboost')
        final_model = finalize_model(model)
        pred = predict_model(final_model, temp_test)
        
        temp_test[features[i+3]] = pred['Label']
    
    pred_test1 = pd.concat([pred_test1, temp_test])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,7.0009,82.1682,9.0647,0.8492,0.003,0.0023
1,29.8542,996.1527,31.5619,0.0303,0.0105,0.0099
2,12.6934,227.6359,15.0876,0.4579,0.005,0.0042
3,37.6411,1979.2739,44.489,-0.4072,0.0151,0.0128
Mean,21.7974,821.3077,25.0508,0.2326,0.0084,0.0073
SD,12.4273,753.3641,13.92,0.4694,0.0047,0.0042


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:51<00:00, 46.38s/it]


In [150]:
pred_test1

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,sp500,dow,kospi
0,2021-11-01,44,0,993.774033,15185.93078,4518.670516,35386.295974,3008.656576
1,2021-11-02,44,1,994.836778,15208.904303,4556.283664,35639.944399,3019.374808
2,2021-11-03,44,2,998.831589,15436.412607,4584.647643,35685.86947,3004.333055
3,2021-11-04,44,3,995.124467,15398.079867,4601.396368,35868.255806,2988.707422
4,2021-11-05,44,4,995.488771,15474.500642,4634.933521,35919.049859,2974.385061


In [151]:
# pred_test1.to_csv('prac_4w_pred_1w_test1_cat.csv', index=False)

## 훈련, 테스트 셋 준비

### 데이터 불러오기

In [152]:
all_data = pd.read_csv('../data/2021-01-04_2021-11-26_all_data.csv')
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86950 entries, 0 to 86949
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    86950 non-null  object 
 1   kosdaq  86950 non-null  float64
 2   nasdaq  86950 non-null  float64
 3   dow     86950 non-null  float64
 4   sp500   86950 non-null  float64
 5   kospi   86950 non-null  float64
 6   code    86950 non-null  int64  
 7   market  86950 non-null  object 
 8   close   82872 non-null  float64
dtypes: float64(6), int64(1), object(2)
memory usage: 6.0+ MB


In [153]:
all_data['code'] = all_data['code'].astype(str).str.zfill(6)
all_data.head()

Unnamed: 0,Date,kosdaq,nasdaq,dow,sp500,kospi,code,market,close
0,2021-01-04,977.62,12698.4,30223.89,3700.65,2944.45,5930,KOSPI,83000.0
1,2021-01-05,985.76,12819.0,30391.6,3726.86,2990.57,5930,KOSPI,83900.0
2,2021-01-06,981.39,12740.8,30829.4,3748.14,2968.21,5930,KOSPI,82200.0
3,2021-01-07,988.86,13067.5,31041.13,3803.79,3031.68,5930,KOSPI,82900.0
4,2021-01-08,987.79,13202.0,31097.97,3824.68,3152.18,5930,KOSPI,88800.0


### 훈련 셋 준비

#### KOPSI 종목 데이터만 추출

In [154]:
kospi_train = all_data[all_data['market'] == 'KOSPI']
kospi_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43475 entries, 0 to 43474
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    43475 non-null  object 
 1   kosdaq  43475 non-null  float64
 2   nasdaq  43475 non-null  float64
 3   dow     43475 non-null  float64
 4   sp500   43475 non-null  float64
 5   kospi   43475 non-null  float64
 6   code    43475 non-null  object 
 7   market  43475 non-null  object 
 8   close   41440 non-null  float64
dtypes: float64(6), object(3)
memory usage: 3.3+ MB


In [155]:
kospi_train.head()

Unnamed: 0,Date,kosdaq,nasdaq,dow,sp500,kospi,code,market,close
0,2021-01-04,977.62,12698.4,30223.89,3700.65,2944.45,5930,KOSPI,83000.0
1,2021-01-05,985.76,12819.0,30391.6,3726.86,2990.57,5930,KOSPI,83900.0
2,2021-01-06,981.39,12740.8,30829.4,3748.14,2968.21,5930,KOSPI,82200.0
3,2021-01-07,988.86,13067.5,31041.13,3803.79,3031.68,5930,KOSPI,82900.0
4,2021-01-08,987.79,13202.0,31097.97,3824.68,3152.18,5930,KOSPI,88800.0


#### KOSDAQ 종목 데이터만 추출

In [156]:
kosdaq_train = all_data[all_data['market'] == 'KOSDAQ']
kosdaq_train = kosdaq_train.reset_index()
kosdaq_train = kosdaq_train.drop('index', axis=1)
kosdaq_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43475 entries, 0 to 43474
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    43475 non-null  object 
 1   kosdaq  43475 non-null  float64
 2   nasdaq  43475 non-null  float64
 3   dow     43475 non-null  float64
 4   sp500   43475 non-null  float64
 5   kospi   43475 non-null  float64
 6   code    43475 non-null  object 
 7   market  43475 non-null  object 
 8   close   41432 non-null  float64
dtypes: float64(6), object(3)
memory usage: 3.0+ MB


In [157]:
kosdaq_train.head()

Unnamed: 0,Date,kosdaq,nasdaq,dow,sp500,kospi,code,market,close
0,2021-01-04,977.62,12698.4,30223.89,3700.65,2944.45,91990,KOSDAQ,151300.0
1,2021-01-05,985.76,12819.0,30391.6,3726.86,2990.57,91990,KOSDAQ,152300.0
2,2021-01-06,981.39,12740.8,30829.4,3748.14,2968.21,91990,KOSDAQ,150300.0
3,2021-01-07,988.86,13067.5,31041.13,3803.79,3031.68,91990,KOSDAQ,154700.0
4,2021-01-08,987.79,13202.0,31097.97,3824.68,3152.18,91990,KOSDAQ,162900.0


### 테스트 셋 준비

In [158]:
test = pd.DataFrame()
test_start_date = '2021-11-01'
test_end_date = '2021-11-05'
Business_days = pd.DataFrame(pd.date_range(start=test_start_date, end=test_end_date, freq='B'), columns=['Date'])
for code, market in tqdm(stock_list[['종목코드', '상장시장']].values):
    temp_code = [code] * 5
    temp_market = [market] * 5
    
    pred_test1['code'] = temp_code
    pred_test1['market'] = temp_market
    
    test = pd.concat([test, pred_test1])

100%|███████████████████████████████████████████████████████████████████████████████| 370/370 [00:00<00:00, 893.95it/s]


In [159]:
# 5일 * 370종목 = 1850
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850 entries, 0 to 4
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     1850 non-null   datetime64[ns]
 1   weeknum  1850 non-null   int64         
 2   weekday  1850 non-null   int64         
 3   kosdaq   1850 non-null   float64       
 4   nasdaq   1850 non-null   float64       
 5   sp500    1850 non-null   float64       
 6   dow      1850 non-null   float64       
 7   kospi    1850 non-null   float64       
 8   code     1850 non-null   object        
 9   market   1850 non-null   object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 159.0+ KB


In [160]:
test.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,sp500,dow,kospi,code,market
0,2021-11-01,44,0,993.774033,15185.93078,4518.670516,35386.295974,3008.656576,5930,KOSPI
1,2021-11-02,44,1,994.836778,15208.904303,4556.283664,35639.944399,3019.374808,5930,KOSPI
2,2021-11-03,44,2,998.831589,15436.412607,4584.647643,35685.86947,3004.333055,5930,KOSPI
3,2021-11-04,44,3,995.124467,15398.079867,4601.396368,35868.255806,2988.707422,5930,KOSPI
4,2021-11-05,44,4,995.488771,15474.500642,4634.933521,35919.049859,2974.385061,5930,KOSPI


#### KOSPI 종목 데이터만 추출

In [161]:
kospi_test = test[test['market'] == 'KOSPI']
kospi_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 925 entries, 0 to 4
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     925 non-null    datetime64[ns]
 1   weeknum  925 non-null    int64         
 2   weekday  925 non-null    int64         
 3   kosdaq   925 non-null    float64       
 4   nasdaq   925 non-null    float64       
 5   sp500    925 non-null    float64       
 6   dow      925 non-null    float64       
 7   kospi    925 non-null    float64       
 8   code     925 non-null    object        
 9   market   925 non-null    object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 79.5+ KB


In [162]:
kospi_test.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,sp500,dow,kospi,code,market
0,2021-11-01,44,0,993.774033,15185.93078,4518.670516,35386.295974,3008.656576,5930,KOSPI
1,2021-11-02,44,1,994.836778,15208.904303,4556.283664,35639.944399,3019.374808,5930,KOSPI
2,2021-11-03,44,2,998.831589,15436.412607,4584.647643,35685.86947,3004.333055,5930,KOSPI
3,2021-11-04,44,3,995.124467,15398.079867,4601.396368,35868.255806,2988.707422,5930,KOSPI
4,2021-11-05,44,4,995.488771,15474.500642,4634.933521,35919.049859,2974.385061,5930,KOSPI


#### KOSDAQ 종목 데이터만 추출

In [163]:
kosdaq_test = test[test['market'] == 'KOSDAQ'].reset_index()
kosdaq_test = kosdaq_test.drop('index', axis=1)
kosdaq_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925 entries, 0 to 924
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     925 non-null    datetime64[ns]
 1   weeknum  925 non-null    int64         
 2   weekday  925 non-null    int64         
 3   kosdaq   925 non-null    float64       
 4   nasdaq   925 non-null    float64       
 5   sp500    925 non-null    float64       
 6   dow      925 non-null    float64       
 7   kospi    925 non-null    float64       
 8   code     925 non-null    object        
 9   market   925 non-null    object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 72.4+ KB


In [164]:
kosdaq_test.head()

Unnamed: 0,Date,weeknum,weekday,kosdaq,nasdaq,sp500,dow,kospi,code,market
0,2021-11-01,44,0,993.774033,15185.93078,4518.670516,35386.295974,3008.656576,91990,KOSDAQ
1,2021-11-02,44,1,994.836778,15208.904303,4556.283664,35639.944399,3019.374808,91990,KOSDAQ
2,2021-11-03,44,2,998.831589,15436.412607,4584.647643,35685.86947,3004.333055,91990,KOSDAQ
3,2021-11-04,44,3,995.124467,15398.079867,4601.396368,35868.255806,2988.707422,91990,KOSDAQ
4,2021-11-05,44,4,995.488771,15474.500642,4634.933521,35919.049859,2974.385061,91990,KOSDAQ


## 상장 시장 별 종가 예측하기

### KOSPI

In [165]:
days1 = ['2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08']
days2 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']

kospi_stock_list = stock_list[stock_list['상장시장'] == 'KOSPI']

temp_pred_test = pd.DataFrame()
for d in tqdm(range(5)):
    train_start_day = d
    train_end_day = d
    test_day = d+1
    temp_train = kospi_train[ (kospi_train['Date'] >= days1[train_start_day]) &
                              (kospi_train['Date'] <= days2[train_end_day]) ]
    
    temp_test = kospi_test[kospi_test['Date'] == days2[test_day]]

    ex = setup(temp_train, target = 'close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
    cat = create_model('catboost')
    pred = predict_model(cat, temp_test)
    temp_test['close'] = pred['Label']
    temp_pred_test = pd.concat([temp_pred_test, temp_test])

kospi_pred = temp_pred_test[['Date','code', 'close']]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,25011.3649,1095705981.2361,33101.4498,0.9669,0.7071,0.939
1,27588.1116,2022797554.7693,44975.5217,0.9426,0.6105,0.7246
2,25101.6531,1188616018.3477,34476.311,0.9587,0.6123,0.7082
3,22817.8025,877063068.2434,29615.2506,0.9733,0.7316,0.9879
Mean,25129.7331,1296045655.6491,35542.1333,0.9604,0.6654,0.8399
SD,1688.4496,434566405.6098,5727.3395,0.0115,0.0547,0.1249


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:54<00:00, 10.87s/it]


In [166]:
kospi_pred.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 925 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    925 non-null    datetime64[ns]
 1   code    925 non-null    object        
 2   close   925 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 28.9+ KB


### KOSDAQ

In [167]:
days1 = ['2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08']
days2 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']

kosdaq_stock_list = stock_list[stock_list['상장시장'] == 'KOSDAQ']

temp_pred_test = pd.DataFrame()
for d in tqdm(range(5)):
    train_start_day = d
    train_end_day = d
    test_day = d+1
    temp_train = kosdaq_train[ (kosdaq_train['Date'] >= days1[train_start_day]) &
                              (kosdaq_train['Date'] <= days2[train_end_day]) ]
    
    temp_test = kosdaq_test[kosdaq_test['Date'] == days2[test_day]]

    ex = setup(temp_train, target = 'close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
    cat = create_model('catboost')
    pred = predict_model(cat, temp_test)
    temp_test['close'] = pred['Label']
    temp_pred_test = pd.concat([temp_pred_test, temp_test])

kosdaq_pred = temp_pred_test[['Date','code', 'close']]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,9553.9677,185740022.6179,13628.6471,0.9557,0.4967,0.5007
1,9966.2901,168049509.1903,12963.3911,0.9595,0.5211,0.547
2,10579.0285,233260840.5683,15272.8792,0.9364,0.4863,0.4934
3,9447.5611,170342748.9918,13051.542,0.9413,0.4711,0.4689
Mean,9886.7119,189348280.3421,13729.1149,0.9482,0.4938,0.5025
SD,444.1866,26249668.3022,927.192,0.0096,0.0182,0.0283


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:54<00:00, 10.93s/it]


In [168]:
kosdaq_pred.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 925 entries, 0 to 924
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    925 non-null    datetime64[ns]
 1   code    925 non-null    object        
 2   close   925 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 28.9+ KB


### KOSPI, KOSDAQ 예측값 합치기

In [169]:
all_data = pd.concat([kospi_pred,kosdaq_pred])
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850 entries, 0 to 924
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1850 non-null   datetime64[ns]
 1   code    1850 non-null   object        
 2   close   1850 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 57.8+ KB


In [170]:
all_data = all_data.sort_values('Date')
all_data = all_data.reset_index().drop('index', axis=1)
all_data

Unnamed: 0,Date,code,close
0,2021-11-01,005930,65087.011461
1,2021-11-01,215200,74720.900754
2,2021-11-01,112040,116079.716977
3,2021-11-01,067630,24519.100835
4,2021-11-01,218410,23714.014319
...,...,...,...
1845,2021-11-05,064350,50100.455297
1846,2021-11-05,004170,240062.512795
1847,2021-11-05,298050,669046.272999
1848,2021-11-05,153130,96797.067366


In [171]:
all_data['code'] = all_data['code'].astype(int)
all_data = all_data.sort_values(['code','Date'])
all_data

Unnamed: 0,Date,code,close
329,2021-11-01,60,32805.006597
595,2021-11-02,60,33904.344944
777,2021-11-03,60,33277.854022
1332,2021-11-04,60,33505.766958
1807,2021-11-05,60,39368.183024
...,...,...,...
11,2021-11-01,950130,19441.572841
542,2021-11-02,950130,14629.074601
1096,2021-11-03,950130,20396.261123
1279,2021-11-04,950130,20754.384458


In [172]:
all_data = all_data.reset_index().drop('index', axis=1)
all_data

Unnamed: 0,Date,code,close
0,2021-11-01,60,32805.006597
1,2021-11-02,60,33904.344944
2,2021-11-03,60,33277.854022
3,2021-11-04,60,33505.766958
4,2021-11-05,60,39368.183024
...,...,...,...
1845,2021-11-01,950130,19441.572841
1846,2021-11-02,950130,14629.074601
1847,2021-11-03,950130,20396.261123
1848,2021-11-04,950130,20754.384458


In [173]:
all_data['code'] = all_data['code'].astype(str).str.zfill(6)
all_data.head()

Unnamed: 0,Date,code,close
0,2021-11-01,60,32805.006597
1,2021-11-02,60,33904.344944
2,2021-11-03,60,33277.854022
3,2021-11-04,60,33505.766958
4,2021-11-05,60,39368.183024


In [174]:
all_data_pivot = all_data.pivot_table(index=['Date'], columns =['code'], values=['close'])

In [175]:
all_data_pivot

Unnamed: 0_level_0,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close,close
code,000060,000080,000100,000120,000150,000240,000250,000270,000660,000670,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-11-01,32805.006597,33934.213051,58000.069543,129592.028749,96203.65821,26881.023532,45476.497247,77737.904173,106735.735918,684127.1457,...,45315.54859,49573.280411,84390.372197,31935.364168,28997.53273,56143.701959,304631.778937,262644.668832,35772.681802,19441.572841
2021-11-02,33904.344944,36170.13258,62083.595915,138664.997641,98651.154303,26998.768973,44930.13356,81620.164634,114386.64931,706359.340706,...,47922.863554,47866.594764,81229.500485,34642.956883,32142.309947,52650.991017,304579.196835,252154.596114,29685.376944,14629.074601
2021-11-03,33277.854022,34052.806024,61511.25963,137401.313935,98651.517178,26810.045852,46766.197291,75610.709844,97616.67512,612815.369637,...,50229.026623,54312.992098,84306.407678,36095.684497,32151.797524,53515.49609,307218.533312,242829.143889,32992.341994,20396.261123
2021-11-04,33505.766958,34785.34567,62064.075446,138889.121557,98753.69825,31002.268061,46356.772871,80908.500282,104616.602723,664119.855229,...,47176.778363,50465.863234,81115.413659,33812.617003,30660.177354,51554.141847,307621.61358,247195.691149,38332.33637,20754.384458
2021-11-05,39368.183024,42205.940066,67565.438365,140666.004351,105066.262593,34168.572239,46030.871478,84878.468283,104080.228016,670477.240448,...,45888.681093,54446.0064,87415.545999,33332.661207,29749.530379,49334.220886,325306.199446,242752.82167,32504.770365,19858.206238


In [176]:
all_data_pivot.columns = all_data_pivot.columns.droplevel(0)
all_data_pivot

code,000060,000080,000100,000120,000150,000240,000250,000270,000660,000670,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-11-01,32805.006597,33934.213051,58000.069543,129592.028749,96203.65821,26881.023532,45476.497247,77737.904173,106735.735918,684127.1457,...,45315.54859,49573.280411,84390.372197,31935.364168,28997.53273,56143.701959,304631.778937,262644.668832,35772.681802,19441.572841
2021-11-02,33904.344944,36170.13258,62083.595915,138664.997641,98651.154303,26998.768973,44930.13356,81620.164634,114386.64931,706359.340706,...,47922.863554,47866.594764,81229.500485,34642.956883,32142.309947,52650.991017,304579.196835,252154.596114,29685.376944,14629.074601
2021-11-03,33277.854022,34052.806024,61511.25963,137401.313935,98651.517178,26810.045852,46766.197291,75610.709844,97616.67512,612815.369637,...,50229.026623,54312.992098,84306.407678,36095.684497,32151.797524,53515.49609,307218.533312,242829.143889,32992.341994,20396.261123
2021-11-04,33505.766958,34785.34567,62064.075446,138889.121557,98753.69825,31002.268061,46356.772871,80908.500282,104616.602723,664119.855229,...,47176.778363,50465.863234,81115.413659,33812.617003,30660.177354,51554.141847,307621.61358,247195.691149,38332.33637,20754.384458
2021-11-05,39368.183024,42205.940066,67565.438365,140666.004351,105066.262593,34168.572239,46030.871478,84878.468283,104080.228016,670477.240448,...,45888.681093,54446.0064,87415.545999,33332.661207,29749.530379,49334.220886,325306.199446,242752.82167,32504.770365,19858.206238


In [177]:
all_data_pivot.to_csv('prac_4w_pred_1w_cat.csv', index=False)

# 데이터 불러오기(두번째 주)
* 훈련 : 11.01 ~ 11.26
* 예측 : 11.29 ~ 12.03

In [None]:
start_date = '2021-11-01'
end_date = '2021-11-26'

### train set2

In [None]:
code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
code_data = pd.merge(Business_days, code_data, how = 'outer')

In [None]:
data2 = pd.read_csv('../data/20210104-20211126_stock_index.csv')

In [None]:
data2.info()

In [None]:
data2['Date'] = pd.to_datetime(data2['Date'])
data2.info()

In [None]:
data2.tail()

In [None]:
data2['weeknum'] = data2['Date'].dt.weekofyear
data2['weekday'] = data2['Date'].dt.weekday
data2.head()

In [None]:
data2 = data2[['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'dow', 'sp500', 'kospi']]

In [None]:
data2.head()

In [None]:
train2 = data2[(data2['Date'] >= start_date) & (data2['Date'] <= end_date)]
train2.head()

In [None]:
train2.tail()

In [None]:
train2 = train2.reset_index()
train2 = train2.drop('index', axis=1)
train2.head()

In [None]:
train2.info()

In [None]:
# 종목 코드 6자리로 맞추기
stock_list['종목코드'] = stock_list['종목코드'].astype(str).str.zfill(6)

In [None]:
stock_list.head()

### test set

In [None]:
Business_days = pd.DataFrame(pd.date_range(start='2021-11-29', end='2021-12-03', freq='B'), columns=['Date'])
test2 = Business_days
test2['weeknum'] = test2['Date'].dt.weekofyear
test2['weekday'] = test2['Date'].dt.weekday
test2.head()

In [None]:
test2.info()

## 3가지 날짜 특성으로 5가지 외부 특성 순서대로 예측하기

In [None]:
features = ['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'sp500', 'dow', 'kospi']

for i in tqdm(range(4)):
    model = setup(train2, target = features[i+3], ignore_features = features[i+4:], fold=4, fold_shuffle=True,
                  silent=True, use_gpu=True)
    top1_model = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])
    final_model = finalize_model(top1_model)
    pred = predict_model(final_model, test2)

    test2[features[i+3]] = pred['Label']

In [None]:
pred_test2 = test2
pred_test2

In [None]:
pred_test2.to_csv('prac_4w_pred_1w_test2.csv', index=False)

## 8가지 특성으로 종가 예측하기

In [None]:
days1 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']
stock_pred_set1 = {}
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

for code in tqdm(stock_list['종목코드'].values):
    code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    code_data = pd.merge(Business_days, code_data, how = 'outer')
    train1['Close'] = code_data['Close']
    temp_pred_test = pd.DataFrame()
    for d in range(5):
        train_end_day = d
        test_day = d+1
        temp_train = train1[train1['Date'] <= days1[train_end_day]]
        temp_test = pred_test1[pred_test1['Date'] == days1[test_day]]

        model = setup(temp_train, target = 'Close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
        top1_model = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])
        final_model = finalize_model(top1_model)
        pred = predict_model(final_model, temp_test)
        temp_test[code] = pred['Label']
        temp_pred_test = pd.concat([temp_pred_test, temp_test])
    
    stock_pred_set1[code] = temp_pred_test[code]

In [None]:
len(stock_pred_set1)

In [None]:
pred1 = pd.DataFrame(stock_pred_set1)
pred1.head()

# 제출 파일 만들기

In [None]:
sub = pd.read_csv('../data/sample_submission.csv')
sub

In [None]:
sub.info()

In [None]:
for code in tqdm(sub.columns.values[1:]):
    temp = list(stock_pred_dict1[code]) + list(stock_pred_dict2[code])
    sub[code] = temp

In [None]:
sub

In [None]:
sub.info()

In [None]:
sub.to_csv('sub05_pycaret04.csv', index=False)