In [1]:
import os
import FinanceDataReader as fdr
from tqdm import tqdm
import pandas as pd

# Pycaret 이용

## 모든 종목 예측

* 종가와 상관계수 높은 특성 : KOSDAQ(0.87), KOSPI(0.79), DOW(0.58), NASDAQ(0.50), S&P500(0.56)
* 예측 순서(종가와 상관계수 높은 특성 먼저 예측)
    * weekday, weeknumdm로 KOSDAQ 예측(종가와 상관계수 0.87)
    * weekday, weeknum, KOSDAQ으로 NASDAQ 예측(KOSDAQ과 상관계수 0.68)
    * weekday, weeknum, KOSDAQ, NASDAQ으로 S&P500 예측(NASDAQ과 상관계수 0.93)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500으로 DOW 예측(S&P500과 상관계수 0.95)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW로 KOSPI 예측
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW, KOSPI로 다른 종가 예측 

In [2]:
# 제출 점수 :

In [3]:
from pycaret.regression import *

# 데이터 불러오기

## 외부 데이터 가져오기

In [4]:
# 외부 데이터 가져오기
df = pd.read_csv('../data/20210104_20211119_data_interpolate.csv')
df.head()

Unnamed: 0,Date,exchange_rate,kospi,kosdaq,kospi100,dow,nasdaq,sp500,미국채10년-2년,BTC,Gold,Oil,US10Y,US2Y
0,2021-01-04,1086.48,2944.45,977.62,3052.74,30223.89,12698.4,3700.65,0.8009,36499000.0,37.1,32.38,0.918,0.1171
1,2021-01-05,1086.42,2990.57,985.76,3099.42,30391.6,12819.0,3726.86,0.832,37962000.0,37.18,33.94,0.955,0.123
2,2021-01-06,1087.93,2968.21,981.39,3070.05,30829.4,12740.8,3748.14,0.9002,40731000.0,36.6,34.11,1.039,0.1388
3,2021-01-07,1094.28,3031.68,988.86,3139.48,31041.13,13067.5,3803.79,0.9402,44575000.0,36.48,34.53,1.081,0.1408
4,2021-01-08,1092.93,3152.18,987.79,3293.96,31097.97,13202.0,3824.68,0.9821,47241000.0,35.26,35.43,1.119,0.1369


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           230 non-null    object 
 1   exchange_rate  230 non-null    float64
 2   kospi          230 non-null    float64
 3   kosdaq         230 non-null    float64
 4   kospi100       230 non-null    float64
 5   dow            230 non-null    float64
 6   nasdaq         230 non-null    float64
 7   sp500          230 non-null    float64
 8   미국채10년-2년      230 non-null    float64
 9   BTC            230 non-null    float64
 10  Gold           230 non-null    float64
 11  Oil            230 non-null    float64
 12  US10Y          230 non-null    float64
 13  US2Y           230 non-null    float64
dtypes: float64(13), object(1)
memory usage: 25.3+ KB


In [6]:
df.columns

Index(['Date', 'exchange_rate', 'kospi', 'kosdaq', 'kospi100', 'dow', 'nasdaq',
       'sp500', '미국채10년-2년', 'BTC', 'Gold', 'Oil', 'US10Y', 'US2Y'],
      dtype='object')

### 필요한 특성만 추출

In [7]:
df = df[['Date','kosdaq','nasdaq','sp500','dow','kospi']]
df.head()

Unnamed: 0,Date,kosdaq,nasdaq,sp500,dow,kospi
0,2021-01-04,977.62,12698.4,3700.65,30223.89,2944.45
1,2021-01-05,985.76,12819.0,3726.86,30391.6,2990.57
2,2021-01-06,981.39,12740.8,3748.14,30829.4,2968.21
3,2021-01-07,988.86,13067.5,3803.79,31041.13,3031.68
4,2021-01-08,987.79,13202.0,3824.68,31097.97,3152.18


In [8]:
# 요일 번호와 주 번호 지정하여 열로 추가하기
df['Date'] = pd.to_datetime(df['Date'])
df['weekday'] = df['Date'].apply(lambda x : x.weekday())
df['weeknum'] = df['Date'].apply(lambda x : x.strftime('%V'))

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     230 non-null    datetime64[ns]
 1   kosdaq   230 non-null    float64       
 2   nasdaq   230 non-null    float64       
 3   sp500    230 non-null    float64       
 4   dow      230 non-null    float64       
 5   kospi    230 non-null    float64       
 6   weekday  230 non-null    int64         
 7   weeknum  230 non-null    object        
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 14.5+ KB


In [10]:
df.columns

Index(['Date', 'kosdaq', 'nasdaq', 'sp500', 'dow', 'kospi', 'weekday',
       'weeknum'],
      dtype='object')

In [11]:
df = df[['Date', 'weekday', 'weeknum','kosdaq', 'nasdaq', 'sp500', 'dow', 'kospi']]
df.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-01-04,0,1,977.62,12698.4,3700.65,30223.89,2944.45
1,2021-01-05,1,1,985.76,12819.0,3726.86,30391.6,2990.57
2,2021-01-06,2,1,981.39,12740.8,3748.14,30829.4,2968.21
3,2021-01-07,3,1,988.86,13067.5,3803.79,31041.13,3031.68
4,2021-01-08,4,1,987.79,13202.0,3824.68,31097.97,3152.18


### 훈련, 테스트 셋 나누기

In [12]:
start_date = '2021-10-04'
end_date = '2021-10-29'

In [13]:
train = df[(df['Date'] <= end_date) & (df['Date'] >= start_date)]
test =  df[(df['Date'] <= '2021-11-05') & (df['Date'] >= '2021-11-01')]

In [14]:
train = train.reset_index()
train = train.drop('index',axis=1)

In [15]:
train.shape, test.shape

((20, 8), (5, 8))

In [16]:
train.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-10-04,0,40,969.285,14255.5,4300.46,34003.58,2990.675
1,2021-10-05,1,40,955.37,14433.8,4345.72,34315.99,2962.17
2,2021-10-06,2,40,922.36,14501.9,4363.55,34417.98,2908.31
3,2021-10-07,3,40,953.43,14654.0,4399.76,34754.15,2959.46
4,2021-10-08,4,40,953.11,14579.5,4391.36,34746.71,2956.3


In [17]:
train.tail()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
15,2021-10-25,0,43,994.31,15226.7,4566.48,35743.78,3020.54
16,2021-10-26,1,43,1011.76,15235.7,4574.79,35755.83,3049.08
17,2021-10-27,2,43,1008.95,15235.8,4551.68,35491.48,3025.49
18,2021-10-28,3,43,1000.13,15448.1,4596.42,35729.89,3009.55
19,2021-10-29,4,43,992.33,15498.4,4605.38,35819.59,2970.68


In [18]:
test.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
215,2021-11-01,0,44,998.57,15595.9,4613.67,35913.68,2978.94
216,2021-11-02,1,44,1009.44,15649.6,4630.65,36053.09,3013.49
217,2021-11-03,2,44,1005.0,15811.6,4660.57,36157.02,2975.71
218,2021-11-04,3,44,1001.43,15940.3,4680.06,36124.66,2983.22
219,2021-11-05,4,44,1001.35,15971.6,4697.53,36329.07,2969.27


In [19]:
test = test[['Date', 'weekday', 'weeknum']]
test

Unnamed: 0,Date,weekday,weeknum
215,2021-11-01,0,44
216,2021-11-02,1,44
217,2021-11-03,2,44
218,2021-11-04,3,44
219,2021-11-05,4,44


## 종목 정보 가져오기

In [20]:
# 종목 정보 가져오기
path = '../data'
list_name = 'Stock_List.csv'
stock_list = pd.read_csv(os.path.join(path,list_name))
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


In [21]:
stock_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   종목명     370 non-null    object
 1   종목코드    370 non-null    int64 
 2   상장시장    370 non-null    object
dtypes: int64(1), object(2)
memory usage: 8.8+ KB


In [22]:
# 종목 코드 6자리로 맞추기
stock_list['종목코드'] = stock_list["종목코드"].astype(str).str.zfill(6)
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


# 종가 예측

* 1. kosdaq 예측
* 2. nasdaq
* 3. sp500
* 4. dow
* 5. kospi
* 6. 종가

In [23]:
train.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-10-04,0,40,969.285,14255.5,4300.46,34003.58,2990.675
1,2021-10-05,1,40,955.37,14433.8,4345.72,34315.99,2962.17
2,2021-10-06,2,40,922.36,14501.9,4363.55,34417.98,2908.31
3,2021-10-07,3,40,953.43,14654.0,4399.76,34754.15,2959.46
4,2021-10-08,4,40,953.11,14579.5,4391.36,34746.71,2956.3


In [25]:
train.to_csv('20211004_29_train.csv', index=False)

In [24]:
test

Unnamed: 0,Date,weekday,weeknum
215,2021-11-01,0,44
216,2021-11-02,1,44
217,2021-11-03,2,44
218,2021-11-04,3,44
219,2021-11-05,4,44


### 테스트셋의 5가지 특성 예측 하기

In [47]:
features = ['Date', 'weekday', 'weeknum','kosdaq', 'nasdaq', 'sp500', 'dow', 'kospi']
for i in range(5):
    model = setup(train, target = features[i+3], ignore_features = features[i+4:], fold=4, fold_shuffle=True, silent=True)
    tuned_model = tune_model(compare_models(sort='MAPE', n_select=1))
    final_model = finalize_model(tuned_model)
    end_features = features[:i+3]
    pred = predict_model(final_model, test)
    test[features[i+3]] = pred['Label']

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,6.5614,59.1036,7.6879,0.9767,0.0026,0.0022
1,7.0384,71.3379,8.4462,0.9518,0.0028,0.0023
2,9.4792,190.4328,13.7997,0.6589,0.0046,0.0032
3,9.8656,266.8352,16.3351,-2.8068,0.0055,0.0033
Mean,8.2362,146.9274,11.5672,-0.0549,0.0039,0.0027
SD,1.4526,86.1647,3.6231,1.5937,0.0012,0.0005


In [48]:
test

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
215,2021-11-01,0,44,972.099426,14971.576678,4483.729492,35186.917969,2974.400879
216,2021-11-02,1,44,972.519958,14975.816364,4484.821289,35193.46875,2974.633301
217,2021-11-03,2,44,971.229126,14958.857623,4480.51709,35167.761719,2972.51123
218,2021-11-04,3,44,973.408752,14980.056049,4485.989258,35200.984375,2977.358887
219,2021-11-05,4,44,973.426453,14980.056049,4486.321289,35205.234375,2974.027344


In [49]:
test.to_csv('20211004_29_pred_1101_05_test.csv', index=False)

In [50]:
test = pd.read_csv('20211004_29_pred_1101_05_test.csv')
test

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-11-01,0,44,972.0994,14971.576678,4483.7295,35186.918,2974.401
1,2021-11-02,1,44,972.51996,14975.816364,4484.8213,35193.47,2974.6333
2,2021-11-03,2,44,971.2291,14958.857623,4480.517,35167.76,2972.5112
3,2021-11-04,3,44,973.40875,14980.056049,4485.9893,35200.984,2977.359
4,2021-11-05,4,44,973.42645,14980.056049,4486.3213,35205.234,2974.0273


In [53]:
train.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-10-04,0,40,969.285,14255.5,4300.46,34003.58,2990.675
1,2021-10-05,1,40,955.37,14433.8,4345.72,34315.99,2962.17
2,2021-10-06,2,40,922.36,14501.9,4363.55,34417.98,2908.31
3,2021-10-07,3,40,953.43,14654.0,4399.76,34754.15,2959.46
4,2021-10-08,4,40,953.11,14579.5,4391.36,34746.71,2956.3


In [54]:
stock_list['종목코드'].values[0]

'005930'

In [57]:
code_data = fdr.DataReader('005930', start = start_date, end = end_date)[['Close']].reset_index()
train['Close'] = code_data['Close']
train

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,Close
0,2021-10-04,0,40,969.285,14255.5,4300.46,34003.58,2990.675,72200.0
1,2021-10-05,1,40,955.37,14433.8,4345.72,34315.99,2962.17,71300.0
2,2021-10-06,2,40,922.36,14501.9,4363.55,34417.98,2908.31,71600.0
3,2021-10-07,3,40,953.43,14654.0,4399.76,34754.15,2959.46,71500.0
4,2021-10-08,4,40,953.11,14579.5,4391.36,34746.71,2956.3,69000.0
5,2021-10-11,0,41,946.63,14486.2,4361.19,34496.85,2936.34,68800.0
6,2021-10-12,1,41,940.15,14465.9,4350.64,34378.08,2916.38,69400.0
7,2021-10-13,2,41,953.47,14571.6,4363.8,34378.6,2944.41,70100.0
8,2021-10-14,3,41,983.43,14823.4,4438.23,34911.7,2988.64,70200.0
9,2021-10-15,4,41,990.54,14897.3,4471.37,35295.48,3015.06,70600.0


In [58]:
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])
code_data = fdr.DataReader('005930', start = start_date, end = end_date)[['Close']].reset_index()
code_data = pd.merge(Business_days, code_data, how = 'outer')
train['Close'] = code_data['Close']
train

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi,Close
0,2021-10-04,0,40,969.285,14255.5,4300.46,34003.58,2990.675,
1,2021-10-05,1,40,955.37,14433.8,4345.72,34315.99,2962.17,72200.0
2,2021-10-06,2,40,922.36,14501.9,4363.55,34417.98,2908.31,71300.0
3,2021-10-07,3,40,953.43,14654.0,4399.76,34754.15,2959.46,71600.0
4,2021-10-08,4,40,953.11,14579.5,4391.36,34746.71,2956.3,71500.0
5,2021-10-11,0,41,946.63,14486.2,4361.19,34496.85,2936.34,
6,2021-10-12,1,41,940.15,14465.9,4350.64,34378.08,2916.38,69000.0
7,2021-10-13,2,41,953.47,14571.6,4363.8,34378.6,2944.41,68800.0
8,2021-10-14,3,41,983.43,14823.4,4438.23,34911.7,2988.64,69400.0
9,2021-10-15,4,41,990.54,14897.3,4471.37,35295.48,3015.06,70100.0


In [61]:
train = df[(df['Date'] <= end_date) & (df['Date'] >= start_date)]
train = train.reset_index()
train = train.drop('index',axis=1)
train

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-10-04,0,40,969.285,14255.5,4300.46,34003.58,2990.675
1,2021-10-05,1,40,955.37,14433.8,4345.72,34315.99,2962.17
2,2021-10-06,2,40,922.36,14501.9,4363.55,34417.98,2908.31
3,2021-10-07,3,40,953.43,14654.0,4399.76,34754.15,2959.46
4,2021-10-08,4,40,953.11,14579.5,4391.36,34746.71,2956.3
5,2021-10-11,0,41,946.63,14486.2,4361.19,34496.85,2936.34
6,2021-10-12,1,41,940.15,14465.9,4350.64,34378.08,2916.38
7,2021-10-13,2,41,953.47,14571.6,4363.8,34378.6,2944.41
8,2021-10-14,3,41,983.43,14823.4,4438.23,34911.7,2988.64
9,2021-10-15,4,41,990.54,14897.3,4471.37,35295.48,3015.06


## 8가지 특성으로 종목별 종가 예측

In [64]:
stock_pred_dict = {}
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])
for code in tqdm(stock_list['종목코드'].values):
    code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    code_data = pd.merge(Business_days, code_data, how = 'outer')
    train['Close'] = code_data['Close']

    model=setup(train, target = 'Close', silent=True, fold=4, fold_shuffle=True)
    tuned_model = tune_model(
        compare_models(sort='MAPE', n_select=1, 
    exclude=['knn','huber','llar','omp','par'])
    )
    tuned_model_ = finalize_model(tuned_model)
    pred = predict_model(tuned_model_, test)
    
    stock_pred_dict[code] = pred['Label']

Unnamed: 0,Description,Value
0,session_id,6102
1,Target,Close
2,Original Data,"(20, 9)"
3,Missing Values,True
4,Numeric Features,5
5,Categorical Features,2
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(12, 17)"


IntProgress(value=0, description='Processing: ', max=74)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,537.25,462253.5417,660.8258,-0.4432,0.0167,0.0136,0.1525
en,Elastic Net,552.9079,434218.5127,620.4142,-0.7101,0.0159,0.0142,0.015
rf,Random Forest Regressor,582.75,493840.7917,666.4469,-0.1148,0.0169,0.0148,0.1875
ada,AdaBoost Regressor,585.0694,481500.2894,687.0558,-0.5741,0.0174,0.0148,0.0675
xgboost,Extreme Gradient Boosting,590.3796,478143.2629,674.2679,-0.9432,0.017,0.0149,0.1075
br,Bayesian Ridge,614.2472,547685.6406,694.8492,-0.1777,0.0176,0.0156,0.0125
ridge,Ridge Regression,660.5726,649069.4648,799.903,-2.1998,0.0207,0.017,0.0125
gbr,Gradient Boosting Regressor,714.7777,807180.7011,805.6704,-0.8844,0.0204,0.0181,0.0275
dt,Decision Tree Regressor,875.0,1139583.3333,995.8209,-1.5738,0.0253,0.0222,0.0125
lr,Linear Regression,1041.6341,1733593.4874,1255.0947,-9.6614,0.0327,0.0267,0.0125


In [None]:
len(stock_pred_dict)

3

In [None]:
stock_pred = pd.DataFrame(stock_pred_dict)
stock_pred

In [None]:
stock_pred.columns

In [None]:
stock_pred * 2

In [None]:
type(stock_pred['005930'])

In [None]:
final_stock_pred = pd.concat([stock_pred, stock_pred])
final_stock_pred

In [None]:
final_stock_pred = final_stock_pred.reset_index()
final_stock_pred

In [None]:
final_stock_pred = final_stock_pred.drop('index',axis=1)
final_stock_pred

In [None]:
final_stock_pred.columns

# 제출 파일 만들기

In [None]:
sample_name = 'sample_submission.csv'
sub = pd.read_csv(os.path.join(path,sample_name))
sub

In [None]:
for code in final_stock_pred.columns:
    sub[code] = final_stock_pred[code]

In [None]:
sub

In [None]:
sub.to_csv('sub01_cat01.csv', index=False)