In [1]:
import FinanceDataReader as fdr
from tqdm import tqdm
import pandas as pd

# Pycaret 이용

## 모든 종목 예측

* 종가와 상관계수 높은 특성 : KOSDAQ(0.87), KOSPI(0.79), DOW(0.58), NASDAQ(0.50), S&P500(0.56)
* 예측 순서(종가와 상관계수 높은 특성 먼저 예측)
    * weekday, weeknum으로 KOSDAQ 예측(종가와 상관계수 0.87)
    * weekday, weeknum, KOSDAQ으로 NASDAQ 예측(KOSDAQ과 상관계수 0.68)
    * weekday, weeknum, KOSDAQ, NASDAQ으로 S&P500 예측(NASDAQ과 상관계수 0.93)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500으로 DOW 예측(S&P500과 상관계수 0.95)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW로 KOSPI 예측
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW, KOSPI로 다른 종가 예측 

In [2]:
# Submission score : 309.6832896345
# Self-computed score : 309.87425

In [3]:
from pycaret.regression import *

# 데이터 불러오기(첫번째 주)
* 훈련 : 10.04 ~ 10.29
* 예측 : 11.01 ~ 11.05

## 외부 데이터 가져오기

### 훈련, 테스트, 종목 데이터 가져오기

In [4]:
# Load the week-1 inputs: index features for the training window, the index
# features for the prediction week, and the list of stocks to predict.
train1 = pd.read_csv('20211004_29_train.csv')
test1 = pd.read_csv('20211004_29_pred_1101_05_test.csv')
stock_list = pd.read_csv('stock_list.csv')

In [5]:
# Column names, dtypes and non-null counts of the week-1 training frame.
train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     20 non-null     object 
 1   weekday  20 non-null     int64  
 2   weeknum  20 non-null     int64  
 3   kosdaq   20 non-null     float64
 4   nasdaq   20 non-null     float64
 5   sp500    20 non-null     float64
 6   dow      20 non-null     float64
 7   kospi    20 non-null     float64
dtypes: float64(5), int64(2), object(1)
memory usage: 1.4+ KB


In [6]:
# Column names, dtypes and non-null counts of the week-1 prediction frame.
test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     5 non-null      object 
 1   weekday  5 non-null      int64  
 2   weeknum  5 non-null      int64  
 3   kosdaq   5 non-null      float64
 4   nasdaq   5 non-null      float64
 5   sp500    5 non-null      float64
 6   dow      5 non-null      float64
 7   kospi    5 non-null      float64
dtypes: float64(5), int64(2), object(1)
memory usage: 448.0+ bytes


In [7]:
# First rows of the week-1 training frame.
train1.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-10-04,0,40,969.285,14255.5,4300.46,34003.58,2990.675
1,2021-10-05,1,40,955.37,14433.8,4345.72,34315.99,2962.17
2,2021-10-06,2,40,922.36,14501.9,4363.55,34417.98,2908.31
3,2021-10-07,3,40,953.43,14654.0,4399.76,34754.15,2959.46
4,2021-10-08,4,40,953.11,14579.5,4391.36,34746.71,2956.3


In [8]:
# Last rows of the week-1 training frame.
train1.tail()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
15,2021-10-25,0,43,994.31,15226.7,4566.48,35743.78,3020.54
16,2021-10-26,1,43,1011.76,15235.7,4574.79,35755.83,3049.08
17,2021-10-27,2,43,1008.95,15235.8,4551.68,35491.48,3025.49
18,2021-10-28,3,43,1000.13,15448.1,4596.42,35729.89,3009.55
19,2021-10-29,4,43,992.33,15498.4,4605.38,35819.59,2970.68


In [9]:
# Full week-1 prediction frame (5 rows).
test1

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-11-01,0,44,972.0994,14971.576678,4483.7295,35186.918,2974.401
1,2021-11-02,1,44,972.51996,14975.816364,4484.8213,35193.47,2974.6333
2,2021-11-03,2,44,971.2291,14958.857623,4480.517,35167.76,2972.5112
3,2021-11-04,3,44,973.40875,14980.056049,4485.9893,35200.984,2977.359
4,2021-11-05,4,44,973.42645,14980.056049,4486.3213,35205.234,2974.0273


In [10]:
# First rows of the stock list (name, code, listing market).
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


In [11]:
# Pad stock codes to 6 digits: cast to str, then left-fill with zeros.
stock_list['종목코드'] = stock_list["종목코드"].astype(str).str.zfill(6)
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


# 훈련, 테스트 셋에 종목코드 열 추가하기
* 각 종목 별 셋 만들기
* 각 종목 별 셋을 모두 합치기

## train set

In [12]:
# Week-1 training window; both bounds are passed to pd.date_range and fdr.DataReader.
start_date = '2021-10-04'
end_date = '2021-10-29'

In [13]:
# One 'Date' row per business day (freq='B') from start_date to end_date.
# freq='B' only skips weekends; exchange holidays stay in the calendar.
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

In [388]:
# Build the long-format week-1 training frame: one copy of `train1` per stock,
# tagged with the stock code and that stock's daily close.
# - `train1` is left untouched (the per-stock columns go on a copy), so
#   re-running this cell or inspecting `train1` later is unaffected by the loop.
# - The row count is no longer hard-coded to 20; `assign` broadcasts the code.
# - Per-stock frames are collected in a list and concatenated once instead of
#   growing `all_train1` with `pd.concat` on every iteration.
per_stock_frames = []
for code in tqdm(stock_list['종목코드'].values):
    # Daily closes for this stock, with the date index moved into a column.
    close_prices = fdr.DataReader(code, start=start_date, end=end_date)[['Close']].reset_index()
    # Left-join onto the business-day calendar: exactly one row per calendar
    # day, in calendar order; days without a quote get NaN.
    close_prices = pd.merge(Business_days, close_prices, on='Date', how='left')
    # Positional assignment via to_numpy() raises if the calendar and `train1`
    # differ in length, instead of silently misaligning or NaN-filling rows.
    per_stock_frames.append(
        train1.assign(code=code, Close=close_prices['Close'].to_numpy())
    )

# An empty stock list yields an empty frame.
all_train1 = pd.concat(per_stock_frames) if per_stock_frames else pd.DataFrame()

100%|██████████| 370/370 [01:20<00:00,  4.60it/s]


In [389]:
# 20 days * 370 stocks = 7400 rows
# NOTE: the saved output reports 6660 non-null Close values, i.e. 740 rows have no target.
all_train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7400 entries, 0 to 19
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     7400 non-null   object 
 1   weekday  7400 non-null   int64  
 2   weeknum  7400 non-null   int64  
 3   kosdaq   7400 non-null   float64
 4   nasdaq   7400 non-null   float64
 5   sp500    7400 non-null   float64
 6   dow      7400 non-null   float64
 7   kospi    7400 non-null   float64
 8   code     7400 non-null   object 
 9   Close    6660 non-null   float64
dtypes: float64(6), int64(2), object(2)
memory usage: 635.9+ KB


In [390]:
# First rows of the combined week-1 training frame.
# NOTE: in the saved output the 2021-10-04 row has no Close value.
all_train1.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,code,Close
0,2021-10-04,0,40,969.285,14255.5,4300.46,34003.58,2990.675,5930,
1,2021-10-05,1,40,955.37,14433.8,4345.72,34315.99,2962.17,5930,72200.0
2,2021-10-06,2,40,922.36,14501.9,4363.55,34417.98,2908.31,5930,71300.0
3,2021-10-07,3,40,953.43,14654.0,4399.76,34754.15,2959.46,5930,71600.0
4,2021-10-08,4,40,953.11,14579.5,4391.36,34746.71,2956.3,5930,71500.0


In [391]:
# Last stocks in the list, to compare against the tail of all_train1.
stock_list.tail()

Unnamed: 0,종목명,종목코드,상장시장
365,맘스터치,220630,KOSDAQ
366,다날,64260,KOSDAQ
367,제이시스메디칼,287410,KOSDAQ
368,크리스에프앤씨,110790,KOSDAQ
369,쎄트렉아이,99320,KOSDAQ


In [392]:
# Last rows of the combined week-1 training frame (should belong to the last stock in the list).
all_train1.tail()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,code,Close
15,2021-10-25,0,43,994.31,15226.7,4566.48,35743.78,3020.54,99320,50200.0
16,2021-10-26,1,43,1011.76,15235.7,4574.79,35755.83,3049.08,99320,49550.0
17,2021-10-27,2,43,1008.95,15235.8,4551.68,35491.48,3025.49,99320,48200.0
18,2021-10-28,3,43,1000.13,15448.1,4596.42,35729.89,3009.55,99320,49750.0
19,2021-10-29,4,43,992.33,15498.4,4605.38,35819.59,2970.68,99320,50200.0


In [393]:
# Persist the combined week-1 training frame; the repeated 0..19 index is not written.
all_train1.to_csv('20211004_29_all_train.csv', index=False)

## test set

In [14]:
# Build the long-format week-1 prediction frame: one copy of `test1` per stock,
# tagged with the stock code.
# `assign` returns a copy, so `test1` is not mutated, and it broadcasts the code
# to however many rows `test1` has (no hard-coded 5). The copies are
# concatenated once rather than on every iteration.
per_stock_frames = [
    test1.assign(code=code)
    for code in tqdm(stock_list['종목코드'].values)
]
# An empty stock list yields an empty frame.
all_test1 = pd.concat(per_stock_frames) if per_stock_frames else pd.DataFrame()

100%|██████████| 370/370 [00:00<00:00, 569.00it/s]


In [15]:
# 5 days * 370 stocks = 1850 rows
all_test1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850 entries, 0 to 4
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     1850 non-null   object 
 1   weekday  1850 non-null   int64  
 2   weeknum  1850 non-null   int64  
 3   kosdaq   1850 non-null   float64
 4   nasdaq   1850 non-null   float64
 5   sp500    1850 non-null   float64
 6   dow      1850 non-null   float64
 7   kospi    1850 non-null   float64
 8   code     1850 non-null   object 
dtypes: float64(5), int64(2), object(2)
memory usage: 144.5+ KB


In [16]:
# First rows of the combined week-1 prediction frame.
all_test1.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,code
0,2021-11-01,0,44,972.0994,14971.576678,4483.7295,35186.918,2974.401,5930
1,2021-11-02,1,44,972.51996,14975.816364,4484.8213,35193.47,2974.6333,5930
2,2021-11-03,2,44,971.2291,14958.857623,4480.517,35167.76,2972.5112,5930
3,2021-11-04,3,44,973.40875,14980.056049,4485.9893,35200.984,2977.359,5930
4,2021-11-05,4,44,973.42645,14980.056049,4486.3213,35205.234,2974.0273,5930


In [17]:
# Last stocks in the list, to compare against the tail of all_test1.
stock_list.tail()

Unnamed: 0,종목명,종목코드,상장시장
365,맘스터치,220630,KOSDAQ
366,다날,64260,KOSDAQ
367,제이시스메디칼,287410,KOSDAQ
368,크리스에프앤씨,110790,KOSDAQ
369,쎄트렉아이,99320,KOSDAQ


In [18]:
# Last rows of the combined week-1 prediction frame.
all_test1.tail()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,code
0,2021-11-01,0,44,972.0994,14971.576678,4483.7295,35186.918,2974.401,99320
1,2021-11-02,1,44,972.51996,14975.816364,4484.8213,35193.47,2974.6333,99320
2,2021-11-03,2,44,971.2291,14958.857623,4480.517,35167.76,2972.5112,99320
3,2021-11-04,3,44,973.40875,14980.056049,4485.9893,35200.984,2977.359,99320
4,2021-11-05,4,44,973.42645,14980.056049,4486.3213,35205.234,2974.0273,99320


In [20]:
# Persist the combined week-1 prediction frame without the index.
# NOTE(review): this file name starts with 20210104_1029 while the other week-1
# files use 20211004_29 — looks like a typo; check for readers of this path before renaming.
all_test1.to_csv('20210104_1029_pred_1101_05_all_test.csv', index=False)

## 첫번째 주 종가 예측

### 8가지 특성으로 종목별 종가 예측

In [316]:
# Week-1 model: select, tune, refit and predict with PyCaret.

# Rows without a Close value have no target to learn from (all_train1.info()
# reports fewer non-null Close values than rows), so drop them explicitly
# rather than relying on how setup() treats a NaN target.
train_rows1 = all_train1.dropna(subset=['Close'])

# session_id fixes PyCaret's random state. The folds are shuffled
# (fold_shuffle=True), so without a seed the scores and the selected model
# can differ between runs.
model1 = setup(
    train_rows1,
    target='Close',
    ignore_features=['weeknum'],
    silent=False,
    categorical_features=['code'],
    fold=4,
    fold_shuffle=True,
    session_id=42,
)

# Single best candidate ranked by cross-validated MAPE, with some estimators excluded.
best_model1 = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])

# NOTE(review): tune_model is called without `optimize`, so it tunes for its
# default metric rather than the MAPE used for selection — confirm this is intended.
tuned_model1 = tune_model(best_model1)

# Refit the tuned model on the full setup data, then predict the week-1 rows.
# 'weeknum' is dropped to match ignore_features above.
final_model1 = finalize_model(tuned_model1)
pred1 = predict_model(final_model1, all_test1.drop('weeknum', axis=1))

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,51086.6638,6944414361.404,83333.1528,0.6625,1.029,1.2294
1,51054.1798,6781494106.0287,82349.8276,0.694,1.0666,1.3718
2,49828.7439,7234524123.0217,85056.0058,0.5828,1.0837,1.5255
3,48494.6254,6317768613.9321,79484.3923,0.733,1.0208,1.3467
Mean,50116.0532,6819550301.0966,82555.8446,0.6681,1.0501,1.3684
SD,1064.6292,332049485.0155,2020.5987,0.0552,0.026,0.1055


In [None]:
# Save the finalized week-1 pipeline and model under the name 'pred_model1'.
# NOTE: this cell has no execution count in the saved notebook.
save_model(final_model1, 'pred_model1')

In [318]:
# Structure of the week-1 predictions; predict_model adds the 'Label' column.
pred1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850 entries, 0 to 4
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     1850 non-null   object 
 1   weekday  1850 non-null   int64  
 2   kosdaq   1850 non-null   float64
 3   nasdaq   1850 non-null   float64
 4   sp500    1850 non-null   float64
 5   dow      1850 non-null   float64
 6   kospi    1850 non-null   float64
 7   code     1850 non-null   object 
 8   Label    1850 non-null   float64
dtypes: float64(6), int64(1), object(2)
memory usage: 144.5+ KB


In [320]:
# Last 20 prediction rows.
# NOTE(review): in the saved output two different codes receive identical
# Label values on every day — check whether 'code' actually influences the model.
pred1.tail(20)

Unnamed: 0,Date,weekday,kosdaq,nasdaq,sp500,dow,kospi,code,Label
0,2021-11-01,0,972.0994,14971.576678,4483.7295,35186.918,2974.401,64260,43430.105769
1,2021-11-02,1,972.51996,14975.816364,4484.8213,35193.47,2974.6333,64260,43503.211538
2,2021-11-03,2,971.2291,14958.857623,4480.517,35167.76,2972.5112,64260,43653.436538
3,2021-11-04,3,973.40875,14980.056049,4485.9893,35200.984,2977.359,64260,43698.8
4,2021-11-05,4,973.42645,14980.056049,4486.3213,35205.234,2974.0273,64260,43421.142308
0,2021-11-01,0,972.0994,14971.576678,4483.7295,35186.918,2974.401,287410,43430.105769
1,2021-11-02,1,972.51996,14975.816364,4484.8213,35193.47,2974.6333,287410,43503.211538
2,2021-11-03,2,971.2291,14958.857623,4480.517,35167.76,2972.5112,287410,43653.436538
3,2021-11-04,3,973.40875,14980.056049,4485.9893,35200.984,2977.359,287410,43698.8
4,2021-11-05,4,973.42645,14980.056049,4486.3213,35205.234,2974.0273,287410,43421.142308


# 데이터 불러오기(두번째 주)
* 훈련 : 10.25 ~ 11.19
* 예측 : 11.29 ~ 12.03

In [400]:
# Load the week-2 inputs. stock_list is re-read from disk, which replaces the
# zero-padded version built for week 1.
train2 = pd.read_csv('20211025_1119_train.csv')
test2 = pd.read_csv('20211025_1119_pred_1129_1203_test.csv')
stock_list = pd.read_csv('stock_list.csv')

In [401]:
# Pad stock codes to 6 digits (same step as for week 1, needed again after the reload).
stock_list['종목코드'] = stock_list["종목코드"].astype(str).str.zfill(6)
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


In [402]:
# Column names, dtypes and non-null counts of the week-2 training frame.
train2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     20 non-null     object 
 1   weekday  20 non-null     int64  
 2   weeknum  20 non-null     int64  
 3   kosdaq   20 non-null     float64
 4   nasdaq   20 non-null     float64
 5   sp500    20 non-null     float64
 6   dow      20 non-null     float64
 7   kospi    20 non-null     float64
dtypes: float64(5), int64(2), object(1)
memory usage: 1.4+ KB


In [403]:
# Column names, dtypes and non-null counts of the week-2 prediction frame.
test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     5 non-null      object 
 1   weekday  5 non-null      int64  
 2   weeknum  5 non-null      int64  
 3   kosdaq   5 non-null      float64
 4   nasdaq   5 non-null      float64
 5   sp500    5 non-null      float64
 6   dow      5 non-null      float64
 7   kospi    5 non-null      float64
dtypes: float64(5), int64(2), object(1)
memory usage: 448.0+ bytes


In [404]:
# Structure of the stock list; the code column is object (string) after padding.
stock_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   종목명     370 non-null    object
 1   종목코드    370 non-null    object
 2   상장시장    370 non-null    object
dtypes: object(3)
memory usage: 8.8+ KB


In [405]:
# First rows of the week-2 training frame.
train2.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-10-25,0,43,994.31,15226.7,4566.48,35743.78,3020.54
1,2021-10-26,1,43,1011.76,15235.7,4574.79,35755.83,3049.08
2,2021-10-27,2,43,1008.95,15235.8,4551.68,35491.48,3025.49
3,2021-10-28,3,43,1000.13,15448.1,4596.42,35729.89,3009.55
4,2021-10-29,4,43,992.33,15498.4,4605.38,35819.59,2970.68


In [406]:
# Last rows of the week-2 training frame.
train2.tail()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
15,2021-11-15,0,46,1029.03,15853.8,4682.81,36087.98,2999.52
16,2021-11-16,1,46,1035.46,15973.9,4700.9,36144.13,2997.21
17,2021-11-17,2,46,1031.26,15921.6,4688.67,35931.52,2962.42
18,2021-11-18,3,46,1032.77,15993.7,4706.64,35871.34,2947.38
19,2021-11-19,4,46,1041.92,16057.4,4697.96,35602.18,2971.02


In [407]:
# First rows of the week-2 prediction frame.
test2.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-11-29,0,48,994.300008,15516.016764,4613.382896,35954.46,2971.095891
1,2021-11-30,1,48,1009.960002,15730.499891,4652.258954,36125.67,3003.252271
2,2021-12-01,2,48,997.817456,15516.016764,4613.382896,35941.957,2971.845081
3,2021-12-02,3,48,997.817456,15516.016764,4613.382896,35830.055,2977.736577
4,2021-12-03,4,48,1000.700012,15516.016764,4613.382896,35812.945,2977.736577


In [408]:
# First rows of the stock list.
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


# 두번째 주 훈련, 테스트 셋에 종목코드 열 추가하기
* 각 종목 별 셋 만들기
* 각 종목 별 셋을 모두 합치기

In [409]:
# Week-2 training window.
# NOTE: this rebinds the same start_date / end_date names the week-1 cells use,
# so re-running a week-1 cell after this one picks up the week-2 dates.
start_date = '2021-10-25'
end_date = '2021-11-19'

In [410]:
# Business-day calendar (freq='B') for the week-2 window; replaces the week-1 Business_days.
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

## train set2

In [411]:
# Build the long-format week-2 training frame: one copy of `train2` per stock,
# tagged with the stock code and that stock's daily close.
# - `train2` is left untouched (the per-stock columns go on a copy), so
#   re-running this cell or inspecting `train2` later is unaffected by the loop.
# - The row count is no longer hard-coded to 20; `assign` broadcasts the code.
# - Per-stock frames are collected in a list and concatenated once instead of
#   growing `all_train2` with `pd.concat` on every iteration.
per_stock_frames = []
for code in tqdm(stock_list['종목코드'].values):
    # Daily closes for this stock, with the date index moved into a column.
    close_prices = fdr.DataReader(code, start=start_date, end=end_date)[['Close']].reset_index()
    # Left-join onto the business-day calendar: exactly one row per calendar
    # day, in calendar order; days without a quote get NaN.
    close_prices = pd.merge(Business_days, close_prices, on='Date', how='left')
    # Positional assignment via to_numpy() raises if the calendar and `train2`
    # differ in length, instead of silently misaligning or NaN-filling rows.
    per_stock_frames.append(
        train2.assign(code=code, Close=close_prices['Close'].to_numpy())
    )

# An empty stock list yields an empty frame.
all_train2 = pd.concat(per_stock_frames) if per_stock_frames else pd.DataFrame()

100%|██████████| 370/370 [01:13<00:00,  5.03it/s]


In [412]:
# 20 days * 370 stocks = 7400 rows
# NOTE: the saved output reports 7397 non-null Close values, i.e. 3 rows have no target.
all_train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7400 entries, 0 to 19
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     7400 non-null   object 
 1   weekday  7400 non-null   int64  
 2   weeknum  7400 non-null   int64  
 3   kosdaq   7400 non-null   float64
 4   nasdaq   7400 non-null   float64
 5   sp500    7400 non-null   float64
 6   dow      7400 non-null   float64
 7   kospi    7400 non-null   float64
 8   code     7400 non-null   object 
 9   Close    7397 non-null   float64
dtypes: float64(6), int64(2), object(2)
memory usage: 635.9+ KB


In [413]:
# First rows of the combined week-2 training frame.
all_train2.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,code,Close
0,2021-10-25,0,43,994.31,15226.7,4566.48,35743.78,3020.54,5930,70200.0
1,2021-10-26,1,43,1011.76,15235.7,4574.79,35755.83,3049.08,5930,71100.0
2,2021-10-27,2,43,1008.95,15235.8,4551.68,35491.48,3025.49,5930,70100.0
3,2021-10-28,3,43,1000.13,15448.1,4596.42,35729.89,3009.55,5930,70700.0
4,2021-10-29,4,43,992.33,15498.4,4605.38,35819.59,2970.68,5930,69800.0


In [414]:
# Last rows of the combined week-2 training frame.
all_train2.tail()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,code,Close
15,2021-11-15,0,46,1029.03,15853.8,4682.81,36087.98,2999.52,99320,46750.0
16,2021-11-16,1,46,1035.46,15973.9,4700.9,36144.13,2997.21,99320,48400.0
17,2021-11-17,2,46,1031.26,15921.6,4688.67,35931.52,2962.42,99320,49100.0
18,2021-11-18,3,46,1032.77,15993.7,4706.64,35871.34,2947.38,99320,48900.0
19,2021-11-19,4,46,1041.92,16057.4,4697.96,35602.18,2971.02,99320,49600.0


In [415]:
# Persist the combined week-2 training frame without the index.
all_train2.to_csv('20211025_1119_all_train.csv', index=False)

## test set2

In [416]:
# Build the long-format week-2 prediction frame: one copy of `test2` per stock,
# tagged with the stock code.
# `assign` returns a copy, so `test2` is not mutated, and it broadcasts the code
# to however many rows `test2` has (no hard-coded 5). The copies are
# concatenated once rather than on every iteration.
per_stock_frames = [
    test2.assign(code=code)
    for code in tqdm(stock_list['종목코드'].values)
]
# An empty stock list yields an empty frame.
all_test2 = pd.concat(per_stock_frames) if per_stock_frames else pd.DataFrame()

100%|██████████| 370/370 [00:00<00:00, 408.13it/s]


In [417]:
# 5 days * 370 stocks = 1850 rows
all_test2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850 entries, 0 to 4
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     1850 non-null   object 
 1   weekday  1850 non-null   int64  
 2   weeknum  1850 non-null   int64  
 3   kosdaq   1850 non-null   float64
 4   nasdaq   1850 non-null   float64
 5   sp500    1850 non-null   float64
 6   dow      1850 non-null   float64
 7   kospi    1850 non-null   float64
 8   code     1850 non-null   object 
dtypes: float64(5), int64(2), object(2)
memory usage: 144.5+ KB


In [418]:
# First rows of the combined week-2 prediction frame.
all_test2.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,code
0,2021-11-29,0,48,994.300008,15516.016764,4613.382896,35954.46,2971.095891,5930
1,2021-11-30,1,48,1009.960002,15730.499891,4652.258954,36125.67,3003.252271,5930
2,2021-12-01,2,48,997.817456,15516.016764,4613.382896,35941.957,2971.845081,5930
3,2021-12-02,3,48,997.817456,15516.016764,4613.382896,35830.055,2977.736577,5930
4,2021-12-03,4,48,1000.700012,15516.016764,4613.382896,35812.945,2977.736577,5930


In [419]:
# Last rows of the combined week-2 prediction frame.
all_test2.tail()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,code
0,2021-11-29,0,48,994.300008,15516.016764,4613.382896,35954.46,2971.095891,99320
1,2021-11-30,1,48,1009.960002,15730.499891,4652.258954,36125.67,3003.252271,99320
2,2021-12-01,2,48,997.817456,15516.016764,4613.382896,35941.957,2971.845081,99320
3,2021-12-02,3,48,997.817456,15516.016764,4613.382896,35830.055,2977.736577,99320
4,2021-12-03,4,48,1000.700012,15516.016764,4613.382896,35812.945,2977.736577,99320


In [420]:
# Persist the combined week-2 prediction frame without the index.
all_test2.to_csv('20211025_1119_pred_1129_1203_all_test.csv', index=False)

## 두번째 주 종가 예측

### 8가지 특성으로 종목별 종가 예측 2

In [271]:
# Week-2 model: select, tune, refit and predict with PyCaret.

# Rows without a Close value have no target to learn from (all_train2.info()
# reports fewer non-null Close values than rows), so drop them explicitly
# rather than relying on how setup() treats a NaN target.
train_rows2 = all_train2.dropna(subset=['Close'])

# session_id fixes PyCaret's random state. The folds are shuffled
# (fold_shuffle=True), so without a seed the scores and the selected model
# can differ between runs.
model2 = setup(
    train_rows2,
    target='Close',
    ignore_features=['weeknum'],
    silent=False,
    categorical_features=['code'],
    fold=4,
    fold_shuffle=True,
    session_id=42,
)

# Single best candidate ranked by cross-validated MAPE, with some estimators excluded.
best_model2 = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])

# NOTE(review): tune_model is called without `optimize`, so it tunes for its
# default metric rather than the MAPE used for selection — confirm this is intended.
tuned_model2 = tune_model(best_model2)

# Refit the tuned model on the full setup data, then predict the week-2 rows.
# 'weeknum' is dropped to match ignore_features above.
final_model2 = finalize_model(tuned_model2)
pred2 = predict_model(final_model2, all_test2.drop('weeknum', axis=1))

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,54555.0506,6491849792.1857,80572.0162,0.7112,1.1937,2.2899
1,53675.2668,6767807689.2578,82266.6864,0.5848,1.198,2.3732
2,54351.8039,6033302020.7202,77674.3331,0.639,1.187,2.2546
3,52625.5558,6163850618.3725,78510.1944,0.7545,1.192,2.3288
Mean,53801.9193,6364202530.134,79755.8075,0.6724,1.1927,2.3116
SD,753.235,286717188.0344,1792.6794,0.0653,0.0039,0.0442


In [279]:
# Save the finalized week-2 pipeline and model under the name 'pred_model2'.
save_model(final_model2, 'pred_model2')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='Close',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy='...
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  DecisionTreeRegressor(ccp_alpha=0.0, criterion='friedman_mse',
                                        max_depth=10, max_features=1.0,
                                        max_leaf_nodes=None,

In [272]:
# Structure of the week-2 predictions; predict_model adds the 'Label' column.
pred2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850 entries, 0 to 4
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     1850 non-null   object 
 1   weekday  1850 non-null   int64  
 2   kosdaq   1850 non-null   float64
 3   nasdaq   1850 non-null   float64
 4   sp500    1850 non-null   float64
 5   dow      1850 non-null   float64
 6   kospi    1850 non-null   float64
 7   code     1850 non-null   object 
 8   Label    1850 non-null   float64
dtypes: float64(6), int64(1), object(2)
memory usage: 144.5+ KB


In [273]:
# First 20 prediction rows.
# NOTE(review): the saved output shows the same code twice with identical rows,
# and it does not start with the first code shown by all_test2.head() — this
# looks like stale state from an out-of-order run; re-check after Restart & Run All.
pred2.head(20)

Unnamed: 0,Date,weekday,kosdaq,nasdaq,sp500,dow,kospi,code,Label
0,2021-11-29,0,994.300008,15516.016764,4613.382896,35954.46,2971.095891,660,87364.510541
1,2021-11-30,1,1009.960002,15730.499891,4652.258954,36125.67,3003.252271,660,91219.025495
2,2021-12-01,2,997.817456,15516.016764,4613.382896,35941.957,2971.845081,660,86959.464234
3,2021-12-02,3,997.817456,15516.016764,4613.382896,35830.055,2977.736577,660,87678.54536
4,2021-12-03,4,1000.700012,15516.016764,4613.382896,35812.945,2977.736577,660,87714.999685
0,2021-11-29,0,994.300008,15516.016764,4613.382896,35954.46,2971.095891,660,87364.510541
1,2021-11-30,1,1009.960002,15730.499891,4652.258954,36125.67,3003.252271,660,91219.025495
2,2021-12-01,2,997.817456,15516.016764,4613.382896,35941.957,2971.845081,660,86959.464234
3,2021-12-02,3,997.817456,15516.016764,4613.382896,35830.055,2977.736577,660,87678.54536
4,2021-12-03,4,1000.700012,15516.016764,4613.382896,35812.945,2977.736577,660,87714.999685


# 제출 파일 만들기

In [323]:
# Load the submission template: a 'Day' column plus one column per stock code.
# NOTE: this is read from ../data/, whereas the other CSVs are read from the working directory.
sub = pd.read_csv('../data/sample_submission.csv')
sub

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2021-11-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2021-11-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2021-11-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2021-11-05,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2021-11-29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2021-11-30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2021-12-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2021-12-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2021-12-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [326]:
# First five stock-code column names (index 0 is 'Day'); they are zero-padded strings.
sub.columns.values[1:6]

array(['000060', '000080', '000100', '000120', '000150'], dtype=object)

In [331]:
# Week-1 predictions for a single code.
# NOTE(review): the saved output has 10 rows for a 5-day week, so pred1 held
# duplicate rows for this code when the cell was run.
pred1[pred1['code']=='000660']['Label']

0    43430.105769
1    43503.211538
2    43653.436538
3    43698.800000
4    43421.142308
0    43430.105769
1    43503.211538
2    43653.436538
3    43698.800000
4    43421.142308
Name: Label, dtype: float64

In [324]:
# Fill the submission template with the predicted closes.
# The previous loop only evaluated `pred1[pred1['code']==code]` and discarded
# the result, so `sub` kept the all-zero values read from the sample file.
# Predictions from both weeks are reshaped to a Date x code table and copied
# into the matching `sub` columns, matched on the 'Day' string rather than on
# row position.
pred_wide = (
    pd.concat([pred1, pred2])
    # pivot() requires unique (Date, code) pairs; the saved single-code lookup
    # on pred1 shows each day twice, so duplicates are dropped first.
    .drop_duplicates(subset=['Date', 'code'])
    .pivot(index='Date', columns='code', values='Label')
)
for code in tqdm(sub.columns.values[1:]):
    if code in pred_wide.columns:
        # Days with no prediction keep the template value for that cell.
        sub[code] = sub['Day'].map(pred_wide[code]).fillna(sub[code])
pred1[['code','Label']]

Unnamed: 0,code,Label
0,000660,43430.105769
1,000660,43503.211538
2,000660,43653.436538
3,000660,43698.800000
4,000660,43421.142308
...,...,...
0,099320,43430.105769
1,099320,43503.211538
2,099320,43653.436538
3,099320,43698.800000


In [None]:
# Display the submission frame before writing it (cell has no execution count in the saved notebook).
sub

In [None]:
# Write the submission file without the index.
# NOTE(review): make sure `sub` has been filled with predictions before this runs;
# as loaded from the template it contains only zeros.
sub.to_csv('sub03_pycaret02.csv', index=False)