In [1]:
import FinanceDataReader as fdr
from tqdm import tqdm
import pandas as pd

# 상장 시장별 예측(Pycaret, Catboost이용)

## 모든 종목 예측

* 종가와 상관계수 높은 특성 : KOSDAQ(0.87), KOSPI(0.79), DOW(0.58), NASDAQ(0.50), S&P500(0.56)
* 예측 순서(종가와 상관계수 높은 특성 먼저 예측)
    * weekday, weeknumdm로 KOSDAQ 예측(종가와 상관계수 0.87)
    * weekday, weeknum, KOSDAQ으로 NASDAQ 예측(KOSDAQ과 상관계수 0.68)
    * weekday, weeknum, KOSDAQ, NASDAQ으로 S&P500 예측(NASDAQ과 상관계수 0.93)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500으로 DOW 예측(S&P500과 상관계수 0.95)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW로 KOSPI 예측
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW, KOSPI로 다른 종가 예측
* 바로 전날 데이터 이용해 예측
* 증감 비율로 예측

In [2]:
# 제출 점수 :
# 자체 결정계수 점수 : 3.22868

In [3]:
from pycaret.regression import *

# 데이터 불러오기(첫번째 주)
* 훈련 : 10.04 ~ 10.29
* 예측 : 11.01 ~ 11.05

### 원 데이터 불러오기

In [4]:
data = pd.read_csv('../data/2021-01-04_2021-11-26_all_data2.csv')
stock_list = pd.read_csv('../data/stock_list.csv')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86950 entries, 0 to 86949
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         86950 non-null  object 
 1   kosdaq       86950 non-null  float64
 2   nasdaq       86950 non-null  float64
 3   dow          86950 non-null  float64
 4   sp500        86950 non-null  float64
 5   kospi        86950 non-null  float64
 6   code         86950 non-null  int64  
 7   market       86950 non-null  object 
 8   close        86950 non-null  float64
 9   kospi_rate   86950 non-null  float64
 10  nasdaq_rate  86950 non-null  float64
 11  dow_rate     86950 non-null  float64
 12  sp500_rate   86950 non-null  float64
 13  kosdaq_rate  86950 non-null  float64
 14  close_rate   86950 non-null  float64
dtypes: float64(12), int64(1), object(2)
memory usage: 10.0+ MB


In [6]:
data['code'] = data['code'].astype(str).str.zfill(6)
data.head()

Unnamed: 0,Date,kosdaq,nasdaq,dow,sp500,kospi,code,market,close,kospi_rate,nasdaq_rate,dow_rate,sp500_rate,kosdaq_rate,close_rate
0,2021-01-04,977.62,12698.4,30223.89,3700.65,2944.45,5930,KOSPI,83000.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-01-05,985.76,12819.0,30391.6,3726.86,2990.57,5930,KOSPI,83900.0,0.015663,0.009497,0.005549,0.007083,0.008326,0.010843
2,2021-01-06,981.39,12740.8,30829.4,3748.14,2968.21,5930,KOSPI,82200.0,-0.007477,-0.0061,0.014405,0.00571,-0.004433,-0.020262
3,2021-01-07,988.86,13067.5,31041.13,3803.79,3031.68,5930,KOSPI,82900.0,0.021383,0.025642,0.006868,0.014847,0.007612,0.008516
4,2021-01-08,987.79,13202.0,31097.97,3824.68,3152.18,5930,KOSPI,88800.0,0.039747,0.010293,0.001831,0.005492,-0.001082,0.07117


In [7]:
data.columns

Index(['Date', 'kosdaq', 'nasdaq', 'dow', 'sp500', 'kospi', 'code', 'market',
       'close', 'kospi_rate', 'nasdaq_rate', 'dow_rate', 'sp500_rate',
       'kosdaq_rate', 'close_rate'],
      dtype='object')

### 주식 지수에 필요한 열만 추출

In [8]:
index_data = data[['Date', 'code', 'market', 'kospi_rate', 'nasdaq_rate', 'dow_rate', 'sp500_rate', 'kosdaq_rate']]
index_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86950 entries, 0 to 86949
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         86950 non-null  object 
 1   code         86950 non-null  object 
 2   market       86950 non-null  object 
 3   kospi_rate   86950 non-null  float64
 4   nasdaq_rate  86950 non-null  float64
 5   dow_rate     86950 non-null  float64
 6   sp500_rate   86950 non-null  float64
 7   kosdaq_rate  86950 non-null  float64
dtypes: float64(5), object(3)
memory usage: 5.3+ MB


In [9]:
index_data.head()

Unnamed: 0,Date,code,market,kospi_rate,nasdaq_rate,dow_rate,sp500_rate,kosdaq_rate
0,2021-01-04,5930,KOSPI,0.0,0.0,0.0,0.0,0.0
1,2021-01-05,5930,KOSPI,0.015663,0.009497,0.005549,0.007083,0.008326
2,2021-01-06,5930,KOSPI,-0.007477,-0.0061,0.014405,0.00571,-0.004433
3,2021-01-07,5930,KOSPI,0.021383,0.025642,0.006868,0.014847,0.007612
4,2021-01-08,5930,KOSPI,0.039747,0.010293,0.001831,0.005492,-0.001082


### train set

In [10]:
start_date = '2021-10-04'
end_date = '2021-11-04'

In [11]:
index_train = index_data[(index_data['Date'] >= start_date) & (index_data['Date'] <= end_date)]
index_train = index_train.reset_index(drop=True)
index_train.head()

Unnamed: 0,Date,code,market,kospi_rate,nasdaq_rate,dow_rate,sp500_rate,kosdaq_rate
0,2021-10-04,5930,KOSPI,-0.009441,-0.021364,-0.009435,-0.012988,-0.014153
1,2021-10-05,5930,KOSPI,-0.009531,0.012507,0.009188,0.010524,-0.014356
2,2021-10-06,5930,KOSPI,-0.018183,0.004718,0.002972,0.004103,-0.034552
3,2021-10-07,5930,KOSPI,0.017588,0.010488,0.009767,0.008298,0.033685
4,2021-10-08,5930,KOSPI,-0.001068,-0.005084,-0.000214,-0.001909,-0.000336


In [12]:
index_train.tail()

Unnamed: 0,Date,code,market,kospi_rate,nasdaq_rate,dow_rate,sp500_rate,kosdaq_rate
8875,2021-10-29,99320,KOSDAQ,-0.012916,0.003256,0.002511,0.001949,-0.007799
8876,2021-11-01,99320,KOSDAQ,0.002781,0.006291,0.002627,0.0018,0.006288
8877,2021-11-02,99320,KOSDAQ,0.011598,0.003443,0.003882,0.00368,0.010886
8878,2021-11-03,99320,KOSDAQ,-0.012537,0.010352,0.002883,0.006461,-0.004398
8879,2021-11-04,99320,KOSDAQ,0.002524,0.00814,-0.000895,0.004182,-0.003552


In [13]:
index_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8880 entries, 0 to 8879
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         8880 non-null   object 
 1   code         8880 non-null   object 
 2   market       8880 non-null   object 
 3   kospi_rate   8880 non-null   float64
 4   nasdaq_rate  8880 non-null   float64
 5   dow_rate     8880 non-null   float64
 6   sp500_rate   8880 non-null   float64
 7   kosdaq_rate  8880 non-null   float64
dtypes: float64(5), object(3)
memory usage: 555.1+ KB


### test set

In [14]:
index_test = index_data[(index_data['Date'] >= '2021-11-01') & (index_data['Date'] <= '2021-11-05')]
index_test = index_test[['Date', 'code', 'market']]
index_test = index_test.reset_index(drop=True)
index_test.head()

Unnamed: 0,Date,code,market
0,2021-11-01,5930,KOSPI
1,2021-11-02,5930,KOSPI
2,2021-11-03,5930,KOSPI
3,2021-11-04,5930,KOSPI
4,2021-11-05,5930,KOSPI


In [15]:
index_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850 entries, 0 to 1849
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    1850 non-null   object
 1   code    1850 non-null   object
 2   market  1850 non-null   object
dtypes: object(3)
memory usage: 43.5+ KB


## 3가지 특성으로 5가지 외부 특성 순서대로 예측하기

In [16]:
start_date = '2021-10-04'
end_date = '2021-11-04'

features = ['Date', 'code', 'market', 'kospi_rate', 'nasdaq_rate', 'dow_rate', 'sp500_rate', 'kosdaq_rate']
days1 = ['2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08']
days2 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']
pred_index_test = pd.DataFrame()
for d in tqdm(range(5)):
    train_start_day = d
    train_end_day = d
    test_day = d+1
    temp_train = index_train[(index_train['Date'] >= days1[train_start_day]) & (index_train['Date'] <= days2[train_end_day])]
    temp_test = index_test[index_test['Date'] == days2[test_day]]
    
    for i in range(5):      
        ex = setup(temp_train, target = features[i+3], ignore_features = features[i+4:], fold=4,
                      fold_shuffle=True, silent=True, use_gpu=True)
        model = create_model('catboost')
        final_model = finalize_model(model)
        pred = predict_model(final_model, temp_test)
        
        temp_test[features[i+3]] = pred['Label']
    
    pred_index_test = pd.concat([pred_index_test, temp_test])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0,0.0,0.0,1.0,0.0,0.0001
1,0.0,0.0,0.0,1.0,0.0,0.0001
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
Mean,0.0,0.0,0.0,1.0,0.0,0.0001
SD,0.0,0.0,0.0,0.0,0.0,0.0


100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [11:51<00:00, 142.31s/it]


In [17]:
pred_index_test = pred_index_test.reset_index(drop=True)
pred_index_test

Unnamed: 0,Date,code,market,kospi_rate,nasdaq_rate,dow_rate,sp500_rate,kosdaq_rate
0,2021-11-01,005930,KOSPI,-0.003548,0.008357,-0.001033,0.003379,0.003352
1,2021-11-01,000660,KOSPI,-0.003549,0.008357,-0.001033,0.003379,0.003352
2,2021-11-01,035420,KOSPI,-0.003547,0.008357,-0.001033,0.003379,0.003352
3,2021-11-01,035720,KOSPI,-0.003547,0.008357,-0.001033,0.003379,0.003352
4,2021-11-01,207940,KOSPI,-0.003547,0.008357,-0.001033,0.003379,0.003352
...,...,...,...,...,...,...,...,...
1845,2021-11-05,220630,KOSDAQ,-0.000403,-0.000409,0.000793,0.000623,0.001221
1846,2021-11-05,064260,KOSDAQ,-0.000399,-0.000409,0.000793,0.000623,0.001221
1847,2021-11-05,287410,KOSDAQ,-0.000412,-0.000409,0.000795,0.000623,0.001221
1848,2021-11-05,110790,KOSDAQ,-0.000401,-0.000409,0.000793,0.000623,0.001221


In [18]:
# pred_index_test.to_csv('prac_4w_pred_1w_index_test_cat.csv', index=False)

## 종가예측 위한 훈련, 테스트 셋 준비

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86950 entries, 0 to 86949
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         86950 non-null  object 
 1   kosdaq       86950 non-null  float64
 2   nasdaq       86950 non-null  float64
 3   dow          86950 non-null  float64
 4   sp500        86950 non-null  float64
 5   kospi        86950 non-null  float64
 6   code         86950 non-null  object 
 7   market       86950 non-null  object 
 8   close        86950 non-null  float64
 9   kospi_rate   86950 non-null  float64
 10  nasdaq_rate  86950 non-null  float64
 11  dow_rate     86950 non-null  float64
 12  sp500_rate   86950 non-null  float64
 13  kosdaq_rate  86950 non-null  float64
 14  close_rate   86950 non-null  float64
dtypes: float64(12), object(3)
memory usage: 10.0+ MB


### 훈련 셋 준비

In [20]:
start_date = '2021-10-04'
end_date = '2021-11-04'

In [21]:
data.columns

Index(['Date', 'kosdaq', 'nasdaq', 'dow', 'sp500', 'kospi', 'code', 'market',
       'close', 'kospi_rate', 'nasdaq_rate', 'dow_rate', 'sp500_rate',
       'kosdaq_rate', 'close_rate'],
      dtype='object')

In [22]:
close_train = data[ (data['Date'] >= start_date) & (data['Date'] <= end_date) ]
close_train = close_train[ ['Date', 'code', 'market', 'kospi_rate', 'nasdaq_rate',
 'dow_rate', 'sp500_rate', 'kosdaq_rate', 'close_rate'] ]
close_train = close_train.reset_index(drop=True)
close_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8880 entries, 0 to 8879
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         8880 non-null   object 
 1   code         8880 non-null   object 
 2   market       8880 non-null   object 
 3   kospi_rate   8880 non-null   float64
 4   nasdaq_rate  8880 non-null   float64
 5   dow_rate     8880 non-null   float64
 6   sp500_rate   8880 non-null   float64
 7   kosdaq_rate  8880 non-null   float64
 8   close_rate   8880 non-null   float64
dtypes: float64(6), object(3)
memory usage: 624.5+ KB


In [23]:
close_train.head()

Unnamed: 0,Date,code,market,kospi_rate,nasdaq_rate,dow_rate,sp500_rate,kosdaq_rate,close_rate
0,2021-10-04,5930,KOSPI,-0.009441,-0.021364,-0.009435,-0.012988,-0.014153,-0.006831
1,2021-10-05,5930,KOSPI,-0.009531,0.012507,0.009188,0.010524,-0.014356,-0.006878
2,2021-10-06,5930,KOSPI,-0.018183,0.004718,0.002972,0.004103,-0.034552,-0.012465
3,2021-10-07,5930,KOSPI,0.017588,0.010488,0.009767,0.008298,0.033685,0.004208
4,2021-10-08,5930,KOSPI,-0.001068,-0.005084,-0.000214,-0.001909,-0.000336,-0.001397


### 테스트 셋 준비

In [24]:
pred_index_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850 entries, 0 to 1849
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         1850 non-null   object 
 1   code         1850 non-null   object 
 2   market       1850 non-null   object 
 3   kospi_rate   1850 non-null   float64
 4   nasdaq_rate  1850 non-null   float64
 5   dow_rate     1850 non-null   float64
 6   sp500_rate   1850 non-null   float64
 7   kosdaq_rate  1850 non-null   float64
dtypes: float64(5), object(3)
memory usage: 115.8+ KB


In [25]:
pred_index_test.head()

Unnamed: 0,Date,code,market,kospi_rate,nasdaq_rate,dow_rate,sp500_rate,kosdaq_rate
0,2021-11-01,5930,KOSPI,-0.003548,0.008357,-0.001033,0.003379,0.003352
1,2021-11-01,660,KOSPI,-0.003549,0.008357,-0.001033,0.003379,0.003352
2,2021-11-01,35420,KOSPI,-0.003547,0.008357,-0.001033,0.003379,0.003352
3,2021-11-01,35720,KOSPI,-0.003547,0.008357,-0.001033,0.003379,0.003352
4,2021-11-01,207940,KOSPI,-0.003547,0.008357,-0.001033,0.003379,0.003352


In [26]:
pred_close_test = pred_index_test

In [27]:
pred_close_test.tail()

Unnamed: 0,Date,code,market,kospi_rate,nasdaq_rate,dow_rate,sp500_rate,kosdaq_rate
1845,2021-11-05,220630,KOSDAQ,-0.000403,-0.000409,0.000793,0.000623,0.001221
1846,2021-11-05,64260,KOSDAQ,-0.000399,-0.000409,0.000793,0.000623,0.001221
1847,2021-11-05,287410,KOSDAQ,-0.000412,-0.000409,0.000795,0.000623,0.001221
1848,2021-11-05,110790,KOSDAQ,-0.000401,-0.000409,0.000793,0.000623,0.001221
1849,2021-11-05,99320,KOSDAQ,-0.0004,-0.000409,0.000793,0.000623,0.001221


## 상장 시장 별 종가 비율 예측하기

In [28]:
market = ['KOSPI', 'KOSDAQ']
days1 = ['2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08']
days2 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']


temp_pred_close = pd.DataFrame()

for market in tqdm(market):
    # 상장 시장에 따라 추출
    train = close_train[close_train['market'] == market].reset_index(drop=True)
    test = pred_close_test[pred_close_test['market'] == market].reset_index(drop=True)

    for d in range(5):
        # 훈련, 테스트 날짜 지정
        train_start_day = d
        train_end_day = d
        test_day = d+1

        # 훈련, 테스트 셋 추출
        temp_train = train[ (train['Date'] >= days1[train_start_day]) &
        (train['Date'] <= days2[train_end_day]) ].reset_index(drop=True)
        temp_test = test[test['Date'] == days2[test_day]].reset_index(drop=True)

        # 머신 러닝 훈련
        ex = setup(temp_train, target = 'close_rate', silent=True, fold=4, fold_shuffle=True, use_gpu=True, verbose=False)
        cat = create_model('catboost')
        pred = predict_model(cat, temp_test)
        temp_test['close_rate'] = pred['Label']
        temp_pred_close = pd.concat([temp_pred_close, temp_test])

pred_close_rate = temp_pred_close[['Date', 'code', 'market', 'close_rate']]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0199,0.0009,0.0296,0.0786,0.0245,1.6138
1,0.0201,0.0009,0.0305,0.0508,0.0249,1.4468
2,0.0209,0.0011,0.0329,0.0618,0.0272,1.7216
3,0.0214,0.001,0.0309,-0.0487,0.0242,1.8184
Mean,0.0206,0.001,0.031,0.0356,0.0252,1.6501
SD,0.0006,0.0001,0.0012,0.0497,0.0012,0.1379


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:49<00:00, 54.97s/it]


In [29]:
pred_close_rate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850 entries, 0 to 184
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        1850 non-null   object 
 1   code        1850 non-null   object 
 2   market      1850 non-null   object 
 3   close_rate  1850 non-null   float64
dtypes: float64(1), object(3)
memory usage: 72.3+ KB


In [30]:
pred_close_rate.head()

Unnamed: 0,Date,code,market,close_rate
0,2021-11-01,5930,KOSPI,-0.002903
1,2021-11-01,660,KOSPI,0.001866
2,2021-11-01,35420,KOSPI,-0.00279
3,2021-11-01,35720,KOSPI,-0.00279
4,2021-11-01,207940,KOSPI,0.002129


In [31]:
pred_close_rate = pred_close_rate.sort_values(['code', 'Date'])
pred_close_rate = pred_close_rate.reset_index(drop=True)
pred_close_rate

Unnamed: 0,Date,code,market,close_rate
0,2021-11-01,000060,KOSPI,0.002328
1,2021-11-02,000060,KOSPI,-0.003304
2,2021-11-03,000060,KOSPI,0.005864
3,2021-11-04,000060,KOSPI,0.023918
4,2021-11-05,000060,KOSPI,0.002650
...,...,...,...,...
1845,2021-11-01,950130,KOSDAQ,-0.016779
1846,2021-11-02,950130,KOSDAQ,0.022060
1847,2021-11-03,950130,KOSDAQ,0.006508
1848,2021-11-04,950130,KOSDAQ,0.004333


## 종가 비율 값 원래 값으로 변환

In [32]:
# 원 데이터
data.head()

Unnamed: 0,Date,kosdaq,nasdaq,dow,sp500,kospi,code,market,close,kospi_rate,nasdaq_rate,dow_rate,sp500_rate,kosdaq_rate,close_rate
0,2021-01-04,977.62,12698.4,30223.89,3700.65,2944.45,5930,KOSPI,83000.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-01-05,985.76,12819.0,30391.6,3726.86,2990.57,5930,KOSPI,83900.0,0.015663,0.009497,0.005549,0.007083,0.008326,0.010843
2,2021-01-06,981.39,12740.8,30829.4,3748.14,2968.21,5930,KOSPI,82200.0,-0.007477,-0.0061,0.014405,0.00571,-0.004433,-0.020262
3,2021-01-07,988.86,13067.5,31041.13,3803.79,3031.68,5930,KOSPI,82900.0,0.021383,0.025642,0.006868,0.014847,0.007612,0.008516
4,2021-01-08,987.79,13202.0,31097.97,3824.68,3152.18,5930,KOSPI,88800.0,0.039747,0.010293,0.001831,0.005492,-0.001082,0.07117


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86950 entries, 0 to 86949
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         86950 non-null  object 
 1   kosdaq       86950 non-null  float64
 2   nasdaq       86950 non-null  float64
 3   dow          86950 non-null  float64
 4   sp500        86950 non-null  float64
 5   kospi        86950 non-null  float64
 6   code         86950 non-null  object 
 7   market       86950 non-null  object 
 8   close        86950 non-null  float64
 9   kospi_rate   86950 non-null  float64
 10  nasdaq_rate  86950 non-null  float64
 11  dow_rate     86950 non-null  float64
 12  sp500_rate   86950 non-null  float64
 13  kosdaq_rate  86950 non-null  float64
 14  close_rate   86950 non-null  float64
dtypes: float64(12), object(3)
memory usage: 10.0+ MB


In [34]:
mod_data = data[ ['Date','code','market','close'] ]
mod_data

Unnamed: 0,Date,code,market,close
0,2021-01-04,005930,KOSPI,83000.0
1,2021-01-05,005930,KOSPI,83900.0
2,2021-01-06,005930,KOSPI,82200.0
3,2021-01-07,005930,KOSPI,82900.0
4,2021-01-08,005930,KOSPI,88800.0
...,...,...,...,...
86945,2021-11-22,099320,KOSDAQ,48350.0
86946,2021-11-23,099320,KOSDAQ,46900.0
86947,2021-11-24,099320,KOSDAQ,47150.0
86948,2021-11-25,099320,KOSDAQ,46600.0


In [35]:
mod_data = mod_data.sort_values(['code', 'Date']).reset_index(drop=True)
mod_data

Unnamed: 0,Date,code,market,close
0,2021-01-04,000060,KOSPI,14250.0
1,2021-01-05,000060,KOSPI,14050.0
2,2021-01-06,000060,KOSPI,14250.0
3,2021-01-07,000060,KOSPI,14700.0
4,2021-01-08,000060,KOSPI,14750.0
...,...,...,...,...
86945,2021-11-22,950130,KOSDAQ,15950.0
86946,2021-11-23,950130,KOSDAQ,15500.0
86947,2021-11-24,950130,KOSDAQ,15200.0
86948,2021-11-25,950130,KOSDAQ,15300.0


In [36]:
pred_close_rate['code'].unique()[:5]

array(['000060', '000080', '000100', '000120', '000150'], dtype=object)

In [37]:
stock_close_list = []
for code in tqdm(pred_close_rate['code'].unique()):
    # 종목 별로 셋 가져오기
    temp_data = mod_data[mod_data['code'] == code].reset_index(drop=True)
    temp_pred = pred_close_rate[pred_close_rate['code'] == code].reset_index(drop=True)

    # 날짜 지정
    present_days = ['2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']
    
    # 당일 종가 = 전날 종가 * (1 + 당일 비율)
    # 2021-11-01일의 예측 종가 = 2021-10-29일의 실제 종가 * (1 + 2021-11-01 예측 비율)
    # 2021-11-02일의 예측 종가 = 2021-11-01일의 예측 종가 * (1 + 2021-11-02 예측 비율)
    
    # 2021-11-01일의 예측 종가 계산
    one_stock_close_list = []
    # 2021-10-29일의 실제 종가
    before_close = float(temp_data[temp_data['Date'] == '2021-10-29']['close'].values)
    # 2021-11-01일의 예측 비율
    present_rate = float(temp_pred[temp_pred['Date'] == present_days[0]]['close_rate'].values)
    close = before_close * (1 + present_rate)
    one_stock_close_list.append(close)
    # 2021-11-02 ~ 05일의 예측 종가 계산
    for i in range(4):
        before_close = one_stock_close_list[i]
        present_rate = float(temp_pred[temp_pred['Date'] == present_days[i+1]]['close_rate'].values)
        close = before_close * (1 + present_rate)
        one_stock_close_list.append(close)
    stock_close_list += one_stock_close_list

100%|███████████████████████████████████████████████████████████████████████████████| 370/370 [00:03<00:00, 113.07it/s]


In [38]:
len(stock_close_list)

1850

In [39]:
stock_close_list[:5]

[27764.49912982529,
 27672.773475499987,
 27835.04682443043,
 28500.798377089628,
 28576.322026084064]

In [40]:
mod_data[ (mod_data['Date']>='2021-11-01') & (mod_data['Date']<='2021-11-05') & (mod_data['code']=='000060') ]

Unnamed: 0,Date,code,market,close
215,2021-11-01,60,KOSPI,27850.0
216,2021-11-02,60,KOSPI,29250.0
217,2021-11-03,60,KOSPI,30250.0
218,2021-11-04,60,KOSPI,29450.0
219,2021-11-05,60,KOSPI,29550.0


In [41]:
pred_close_rate['close'] = stock_close_list
pred_close_rate

Unnamed: 0,Date,code,market,close_rate,close
0,2021-11-01,000060,KOSPI,0.002328,27764.499130
1,2021-11-02,000060,KOSPI,-0.003304,27672.773475
2,2021-11-03,000060,KOSPI,0.005864,27835.046824
3,2021-11-04,000060,KOSPI,0.023918,28500.798377
4,2021-11-05,000060,KOSPI,0.002650,28576.322026
...,...,...,...,...,...
1845,2021-11-01,950130,KOSDAQ,-0.016779,16813.085583
1846,2021-11-02,950130,KOSDAQ,0.022060,17183.990468
1847,2021-11-03,950130,KOSDAQ,0.006508,17295.820041
1848,2021-11-04,950130,KOSDAQ,0.004333,17370.759710


In [42]:
pred_close_rate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850 entries, 0 to 1849
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        1850 non-null   object 
 1   code        1850 non-null   object 
 2   market      1850 non-null   object 
 3   close_rate  1850 non-null   float64
 4   close       1850 non-null   float64
dtypes: float64(2), object(3)
memory usage: 72.4+ KB


### 예측 파일 pivot_table로 변환

In [43]:
pred_pivot = pred_close_rate.pivot_table(index='Date', columns='code', values='close')
pred_pivot

code,000060,000080,000100,000120,000150,000240,000250,000270,000660,000670,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-11-01,27764.49913,34758.542985,59756.057998,141503.511612,102950.989132,16740.452467,47374.400253,85399.739588,103192.235635,690497.691266,...,49061.401867,51936.698981,85278.08113,36232.424198,25923.409088,52931.49712,339129.540899,259748.733053,26927.608269,16813.085583
2021-11-02,27672.773475,34599.202297,59516.532112,140796.416126,102184.715265,16703.99153,47726.745965,85155.310393,103625.271618,687988.509704,...,49113.560615,52898.135319,85898.99033,36309.739364,25921.62335,52819.41757,341346.307653,260194.239296,26919.029746,17183.990468
2021-11-03,27835.046824,34807.583937,59899.065078,141585.549662,101635.379457,16679.461199,48150.661862,85373.731053,104499.141777,687609.757213,...,49680.640361,53155.759857,87165.515079,37012.262812,26145.0157,53132.302797,340985.058237,260957.606072,26627.320994,17295.820041
2021-11-04,28500.798377,34901.779399,60829.802987,144155.142133,103181.01348,16787.221965,48375.355284,85918.692257,105464.429873,693434.175653,...,50120.624917,53711.960556,88036.147297,37190.078437,25888.024835,52640.624145,345355.618662,259319.691412,26534.926097,17370.75971
2021-11-05,28576.322026,35036.217192,61068.688871,144841.296668,104036.8633,16930.149135,48431.323573,86642.152898,106424.04296,695699.190327,...,50303.633031,54248.381002,88014.891185,38564.568864,26080.445542,52962.513018,341376.958207,259466.599307,26347.99709,18024.580671


In [44]:
pred_pivot.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 2021-11-01 to 2021-11-05
Columns: 370 entries, 000060 to 950130
dtypes: float64(370)
memory usage: 14.5+ KB


In [45]:
pred_pivot = pred_pivot.reset_index()
pred_pivot

code,Date,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,27764.49913,34758.542985,59756.057998,141503.511612,102950.989132,16740.452467,47374.400253,85399.739588,103192.235635,...,49061.401867,51936.698981,85278.08113,36232.424198,25923.409088,52931.49712,339129.540899,259748.733053,26927.608269,16813.085583
1,2021-11-02,27672.773475,34599.202297,59516.532112,140796.416126,102184.715265,16703.99153,47726.745965,85155.310393,103625.271618,...,49113.560615,52898.135319,85898.99033,36309.739364,25921.62335,52819.41757,341346.307653,260194.239296,26919.029746,17183.990468
2,2021-11-03,27835.046824,34807.583937,59899.065078,141585.549662,101635.379457,16679.461199,48150.661862,85373.731053,104499.141777,...,49680.640361,53155.759857,87165.515079,37012.262812,26145.0157,53132.302797,340985.058237,260957.606072,26627.320994,17295.820041
3,2021-11-04,28500.798377,34901.779399,60829.802987,144155.142133,103181.01348,16787.221965,48375.355284,85918.692257,105464.429873,...,50120.624917,53711.960556,88036.147297,37190.078437,25888.024835,52640.624145,345355.618662,259319.691412,26534.926097,17370.75971
4,2021-11-05,28576.322026,35036.217192,61068.688871,144841.296668,104036.8633,16930.149135,48431.323573,86642.152898,106424.04296,...,50303.633031,54248.381002,88014.891185,38564.568864,26080.445542,52962.513018,341376.958207,259466.599307,26347.99709,18024.580671


## 예측 종가 파일로 출력

In [46]:
pred_pivot.to_csv('prac4w_pred1w_rate_cat2.csv', index=False)