In [None]:
import FinanceDataReader as fdr
from tqdm import tqdm
import pandas as pd

# 한 종목씩 예측(Pycaret 이용)

## 모든 종목 예측

* 종가와 상관계수 높은 특성 : KOSDAQ(0.87), KOSPI(0.79), DOW(0.58), NASDAQ(0.50), S&P500(0.56)
* 예측 순서(종가와 상관계수 높은 특성 먼저 예측)
    * weekday, weeknum으로 KOSDAQ 예측(종가와 상관계수 0.87)
    * weekday, weeknum, KOSDAQ으로 NASDAQ 예측(KOSDAQ과 상관계수 0.68)
    * weekday, weeknum, KOSDAQ, NASDAQ으로 S&P500 예측(NASDAQ과 상관계수 0.93)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500으로 DOW 예측(S&P500과 상관계수 0.95)
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW로 KOSPI 예측
    * weekday, weeknum, KOSDAQ, NASDAQ, S&P500, DOW, KOSPI로 다른 종가 예측
* 바로 전날 데이터 이용해 예측

In [None]:
# 제출 점수 :
# 자체 결정계수 점수 : 4.27195

In [None]:
from pycaret.regression import *

# 데이터 불러오기(첫번째 주)
* 훈련 : 10.04 ~ 10.29
* 예측 : 11.01 ~ 11.05

In [None]:
# Date window (inclusive) used below to slice data1 into train1 and to fetch prices.
# It runs past the last October business day because the day-by-day loops train on
# everything up to the day before each test day.
start_date, end_date = '2021-10-04', '2021-11-04'

### train set

In [None]:
# Load the CSV that data1 is built from and the list of stock codes to predict.
data1 = pd.read_csv('../data/20210104_20211119_data_interpolate.csv')
stock_list = pd.read_csv('../data/stock_list.csv')

In [None]:
# Inspect data1 as loaded (before the Date column is parsed).
data1.info()

Index(['Date', 'exchange_rate', 'kospi', 'kosdaq', 'kospi100', 'dow', 'nasdaq',
       'sp500', '미국채10년-2년', 'BTC', 'Gold', 'Oil', 'US10Y', 'US2Y', 'weekday',
       'weeknum'],
      dtype='object')

In [None]:
# Parse Date and (re)derive the two calendar features from it.
data1['Date'] = pd.to_datetime(data1['Date'])
# `Series.dt.weekofyear` is deprecated since pandas 1.1 and removed in pandas 2.0;
# `isocalendar().week` is its replacement. It yields a nullable UInt32 column, so it
# is cast to int64 to keep the dtype the old accessor produced.
data1['weeknum'] = data1['Date'].dt.isocalendar().week.astype('int64')
data1['weekday'] = data1['Date'].dt.weekday  # Monday=0 ... Sunday=6
data1.info()

In [None]:
# Preview data1 after the Date column was parsed and the calendar features set.
data1.head()

In [None]:
# List the available columns before narrowing data1 down.
data1.columns

In [None]:
# Keep only Date, the two calendar features and the five market-index columns.
# This rebinds data1, so re-running cells above it needs a fresh read of the CSV.
data1 = data1[['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'dow', 'sp500', 'kospi']]

In [None]:
# Preview the narrowed data1.
data1.head()

In [None]:
# Rows of data1 whose Date lies inside [start_date, end_date], both ends included.
train1 = data1.loc[data1['Date'].between(start_date, end_date)]
train1.head()

In [None]:
# Check the last rows of the training window.
train1.tail()

In [None]:
# Renumber the rows from 0 and discard the old index rather than keeping it as a column.
train1 = train1.reset_index(drop=True)
train1.head()

In [None]:
# Inspect dtypes and non-null counts of the week-1 training frame.
train1.info()

In [None]:
# Left-pad the stock codes with zeros to 6 characters (they are converted to str first).
stock_list['종목코드'] = stock_list['종목코드'].astype(str).str.zfill(6)

In [None]:
# Preview the stock list with padded codes.
stock_list.head()

### test set

In [None]:
# Week-1 test rows: the five dates 2021-11-01 .. 2021-11-05 (inclusive), renumbered from 0.
test1 = data1.loc[data1['Date'].between('2021-11-01', '2021-11-05')].reset_index(drop=True)
test1.head()

In [None]:
# Inspect the week-1 test frame.
test1.info()

In [None]:
# Keep only Date and the two calendar features; the index columns are dropped here and
# predicted values for them are attached to per-day slices of this frame further down.
test1 = test1[['Date','weeknum','weekday']]
test1

## 3가지 날짜 특성으로 5가지 외부 특성 순서대로 예측하기

In [None]:
# Predict the five market-index features for each week-1 test day.
#
# For test day k the training rows are those dated up to the previous entry of
# `days1`. Within a day the targets are predicted in the order of `features[3:]`;
# each target may use Date, the calendar features and the targets already predicted
# before it, while the targets that come after it are passed as `ignore_features`.
features = ['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'sp500', 'dow', 'kospi']
days1 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']
daily_preds = []  # one single-day frame per test day; concatenated once after the loop
for d in tqdm(range(5)):
    train_end_day = d
    test_day = d+1
    temp_train = train1[train1['Date'] <= days1[train_end_day]]
    # .copy(): columns are added to this frame below; writing into a filtered view of
    # test1 triggers SettingWithCopyWarning and is not guaranteed to stick.
    temp_test = test1[test1['Date'] == days1[test_day]].copy()

    for i in range(5):
        model = setup(temp_train, target = features[i+3], ignore_features = features[i+4:], fold=4, fold_shuffle=True, silent=True)
        top1_model = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])
        final_model = finalize_model(top1_model)
        pred = predict_model(final_model, temp_test)

        # The prediction becomes an input column for the next target.
        temp_test[features[i+3]] = pred['Label']

    daily_preds.append(temp_test)

# Build the result in one go instead of growing a DataFrame inside the loop.
pred_test1 = pd.concat(daily_preds)

In [None]:
# Show the week-1 test frame with the predicted index columns attached.
pred_test1

In [None]:
# Save the predicted week-1 features (written to the current working directory).
pred_test1.to_csv('prac_4w_pred_1w_test1.csv', index=False)

## 8가지 특성으로 종가 예측하기

In [None]:
# Day-by-day closing-price prediction for every stock in stock_list.
#
# For each stock, and for each week-1 test day, a model is selected and fitted on the
# training rows dated up to the previous entry of `days1`, then applied to that day's
# row of pred_test1 (the predicted index features). The result per stock is a Series
# of five predictions stored in stock_pred_set1[code].
days1 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']
stock_pred_set1 = {}
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

for code in tqdm(stock_list['종목코드'].values):
    code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    code_data = pd.merge(Business_days, code_data, how = 'outer')
    # Look the price up by Date. Assigning code_data['Close'] directly aligns on the
    # row label, which silently pairs prices with the wrong day whenever the two
    # frames do not contain exactly the same dates in the same order.
    train1['Close'] = train1['Date'].map(code_data.set_index('Date')['Close'])
    daily_preds = []  # one single-day frame per test day
    for d in range(5):
        train_end_day = d
        test_day = d+1
        temp_train = train1[train1['Date'] <= days1[train_end_day]]
        # .copy(): a column is added below; avoid writing into a view of pred_test1.
        temp_test = pred_test1[pred_test1['Date'] == days1[test_day]].copy()

        model = setup(temp_train, target = 'Close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
        top1_model = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])
        final_model = finalize_model(top1_model)
        pred = predict_model(final_model, temp_test)
        temp_test[code] = pred['Label']
        daily_preds.append(temp_test)

    # Concatenate once per stock instead of growing a DataFrame inside the loop.
    temp_pred_test = pd.concat(daily_preds)
    stock_pred_set1[code] = temp_pred_test[code]

In [None]:
# Number of stocks for which day-by-day predictions were collected.
len(stock_pred_set1)

In [None]:
# Assemble the per-stock prediction Series into one table, one column per stock code.
pred1 = pd.DataFrame(stock_pred_set1)
pred1.head()

In [20]:
# One model per stock for week 1: fit on the whole train1 window and predict the five
# test days at once. stock_pred_dict1[code] holds the predicted 'Label' Series.
stock_pred_dict1 = {}
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])
for code in tqdm(stock_list['종목코드'].values):
    code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    code_data = pd.merge(Business_days, code_data, how = 'outer')
    # Look the price up by Date. Assigning code_data['Close'] directly aligns on the
    # row label, which silently pairs prices with the wrong day whenever the two
    # frames do not contain exactly the same dates in the same order.
    train1['Close'] = train1['Date'].map(code_data.set_index('Date')['Close'])

    model = setup(train1, target = 'Close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
    top1_model1 = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])
    final_model1 = finalize_model(top1_model1)
    # Predict on pred_test1, which carries the predicted index features. test1 was
    # reduced to Date/weeknum/weekday earlier, so it lacks the index columns this
    # model was trained on.
    pred1 = predict_model(final_model1, pred_test1)

    stock_pred_dict1[code] = pred1['Label']

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
en,Elastic Net,1422.9994,3371779.25,1721.9214,0.8181,0.0314,0.0264,0.015
ridge,Ridge Regression,1483.0601,3127135.125,1706.2843,0.8159,0.0317,0.0275,0.0075
lr,Linear Regression,1968.4901,5093869.6875,2237.7336,0.6157,0.0415,0.0363,0.0075
et,Extra Trees Regressor,2031.6562,8242166.1406,2546.3695,0.3915,0.0473,0.0383,0.4875
rf,Random Forest Regressor,2379.3542,10402570.7083,2953.9397,0.1346,0.0551,0.044,0.5125
catboost,CatBoost Regressor,2513.4024,11055296.3162,3131.9963,0.13,0.0583,0.0466,0.815
ada,AdaBoost Regressor,2670.8333,16685729.1667,3502.7921,-0.2842,0.0654,0.05,0.07
gbr,Gradient Boosting Regressor,2920.0741,15802958.6954,3700.7898,-0.3984,0.07,0.0549,0.03
dt,Decision Tree Regressor,3196.875,18178072.9167,3941.45,-0.3679,0.074,0.06,0.0075
br,Bayesian Ridge,3376.0107,13881056.4341,3666.6958,-0.0087,0.066,0.0613,0.0075


100%|██████████████████████████████████████████████████████████████████████████████| 370/370 [1:25:29<00:00, 13.86s/it]


In [21]:
# Persist the pipeline held in final_model1. The per-stock loop rebinds final_model1
# on every iteration, so this is the model fitted for the last stock code only.
save_model(final_model1, 'one_by_one_pred_model1')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='Close',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy=...
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
   

In [None]:
# NOTE(review): in top-to-bottom order pred1 was last bound inside the per-stock loop
# (the predict_model output for the final stock code), not to the all-stock table
# built from stock_pred_set1 — confirm which of the two this file should contain.
pred1.to_csv('prac_4w_pred_1w.csv', index=False)

# 데이터 불러오기(두번째 주)
* 훈련 : 11.01 ~ 11.26
* 예측 : 11.29 ~ 12.03

In [None]:
# Week-2 date window (inclusive); replaces the week-1 values of the same two names.
start_date, end_date = '2021-11-01', '2021-11-26'

### train set2

In [None]:
# NOTE(review): scratch cell that depends on leftover state — `code` is whatever the
# last per-stock loop left behind, and Business_days was built from the previous
# start_date/end_date, not from the week-2 values assigned just above. The resulting
# code_data is overwritten by the later loops; confirm whether this cell is needed.
code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
code_data = pd.merge(Business_days, code_data, how = 'outer')

In [None]:
# Load the CSV that data2 (the week-2 source frame) is built from.
data2 = pd.read_csv('../data/20210104-20211126_stock_index.csv')

In [None]:
# Inspect data2 as loaded (before the Date column is parsed).
data2.info()

In [None]:
# Parse Date into datetimes so it can be compared against the window bounds and
# used with the .dt accessor.
data2['Date'] = pd.to_datetime(data2['Date'])
data2.info()

In [None]:
# Check the last rows of data2.
data2.tail()

In [None]:
# Derive the two calendar features from Date.
# `Series.dt.weekofyear` is deprecated since pandas 1.1 and removed in pandas 2.0;
# `isocalendar().week` is its replacement. It yields a nullable UInt32 column, so it
# is cast to int64 to keep the dtype the old accessor produced.
data2['weeknum'] = data2['Date'].dt.isocalendar().week.astype('int64')
data2['weekday'] = data2['Date'].dt.weekday  # Monday=0 ... Sunday=6
data2.head()

### test set2

In [27]:
# Load the week-2 test frame from CSV (Date stays a plain string column here).
test2 = pd.read_csv('../data/20211025_1119_pred_1129_1203_test.csv')

In [28]:
# Inspect the week-2 test frame.
test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     5 non-null      object 
 1   weekday  5 non-null      int64  
 2   weeknum  5 non-null      int64  
 3   kosdaq   5 non-null      float64
 4   nasdaq   5 non-null      float64
 5   sp500    5 non-null      float64
 6   dow      5 non-null      float64
 7   kospi    5 non-null      float64
dtypes: float64(5), int64(2), object(1)
memory usage: 448.0+ bytes


In [29]:
# Preview the week-2 test rows.
test2.head()

Unnamed: 0,Date,weekday,weeknum,kosdaq,nasdaq,sp500,dow,kospi
0,2021-11-29,0,48,994.300008,15516.016764,4613.382896,35954.46,2971.095891
1,2021-11-30,1,48,1009.960002,15730.499891,4652.258954,36125.67,3003.252271
2,2021-12-01,2,48,997.817456,15516.016764,4613.382896,35941.957,2971.845081
3,2021-12-02,3,48,997.817456,15516.016764,4613.382896,35830.055,2977.736577
4,2021-12-03,4,48,1000.700012,15516.016764,4613.382896,35812.945,2977.736577


### 2. 8가지 특성으로 종목별 두번째 주 종가 예측

In [30]:
# One model per stock for week 2: fit on the whole train2 window and predict the rows
# of test2. stock_pred_dict2[code] holds the predicted 'Label' Series.
# NOTE(review): train2 is only created in a cell further down the notebook, so this
# cell fails on a fresh top-to-bottom run and relies on leftover kernel state.
stock_pred_dict2 = {}
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])
for code in tqdm(stock_list['종목코드'].values):
    code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    code_data = pd.merge(Business_days, code_data, how = 'outer')
    # Look the price up by Date. Assigning code_data['Close'] directly aligns on the
    # row label, which silently pairs prices with the wrong day whenever the two
    # frames do not contain exactly the same dates in the same order.
    train2['Close'] = train2['Date'].map(code_data.set_index('Date')['Close'])

    model=setup(train2, target = 'Close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
    top1_model2 =compare_models(sort='MAPE', n_select=1,exclude=['knn','huber','llar','omp','par'])
    final_model2 = finalize_model(top1_model2)
    pred2 = predict_model(final_model2, test2)

    stock_pred_dict2[code] = pred2['Label']

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,902.5723,2101521.3426,1129.7983,-0.082,0.0234,0.019,0.0125
ada,AdaBoost Regressor,1055.2083,1591940.1042,1230.165,-0.2289,0.0257,0.022,0.075
rf,Random Forest Regressor,1087.9167,1490861.9271,1167.1208,0.287,0.0242,0.0225,0.4675
gbr,Gradient Boosting Regressor,1088.03,1862307.3872,1311.3386,-1.093,0.0275,0.0226,0.0325
catboost,CatBoost Regressor,1099.6972,1475070.159,1186.113,0.0522,0.0248,0.023,0.8575
dt,Decision Tree Regressor,1158.3333,2027500.0,1375.8786,-0.8596,0.0289,0.0241,0.0125
et,Extra Trees Regressor,1192.2292,1926720.2083,1303.2452,0.0974,0.0273,0.025,0.4675
br,Bayesian Ridge,1456.9519,2802002.1451,1556.7549,-0.1876,0.0321,0.0303,0.01
lightgbm,Light Gradient Boosting Machine,1462.5,2746987.5,1551.7694,-0.0614,0.0323,0.0304,0.0375
lr,Linear Regression,1771.099,7048812.0989,2190.2071,-1.8197,0.0457,0.0366,0.0075


100%|██████████████████████████████████████████████████████████████████████████████| 370/370 [1:31:41<00:00, 14.87s/it]


In [31]:
# Persist the pipeline held in final_model2. The per-stock loop rebinds final_model2
# on every iteration, so this is the model fitted for the last stock code only.
save_model(final_model2, 'one_by_one_pred_model2')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='Close',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy=...
                 ('fix_perfect', Remove_100(target='Close')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
              

In [32]:
# Number of stocks with week-2 predictions.
len(stock_pred_dict2)

370

In [None]:
# Keep only Date, the two calendar features and the five market-index columns.
data2 = data2[['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'dow', 'sp500', 'kospi']]

In [None]:
# Preview the narrowed data2.
data2.head()

In [None]:
# Rows of data2 whose Date lies inside [start_date, end_date], both ends included.
train2 = data2.loc[data2['Date'].between(start_date, end_date)]
train2.head()

In [None]:
# Check the last rows of the week-2 training window.
train2.tail()

In [None]:
# Renumber the rows from 0 and discard the old index rather than keeping it as a column.
train2 = train2.reset_index(drop=True)
train2.head()

In [None]:
# Inspect dtypes and non-null counts of the week-2 training frame.
train2.info()

In [None]:
# Left-pad the stock codes with zeros to 6 characters (they are converted to str first).
# Applying this to codes that are already 6 characters long leaves them unchanged.
stock_list['종목코드'] = stock_list['종목코드'].astype(str).str.zfill(6)

In [None]:
# Preview the stock list with padded codes.
stock_list.head()

### test set

In [None]:
# Build the week-2 test frame: the business days 2021-11-29 .. 2021-12-03 plus the two
# calendar features.
Business_days = pd.DataFrame(pd.date_range(start='2021-11-29', end='2021-12-03', freq='B'), columns=['Date'])
# .copy(): a plain assignment would make test2 and Business_days the same object, so
# every column added to test2 would also appear in Business_days.
test2 = Business_days.copy()
# `Series.dt.weekofyear` is deprecated since pandas 1.1 and removed in pandas 2.0;
# `isocalendar().week` replaces it (cast to int64 to keep the previous dtype).
test2['weeknum'] = test2['Date'].dt.isocalendar().week.astype('int64')
test2['weekday'] = test2['Date'].dt.weekday  # Monday=0 ... Sunday=6
test2.head()

In [None]:
# Inspect the week-2 test frame.
test2.info()

In [72]:
# Re-select Date and the two calendar features of test1.
# NOTE(review): this operates on the week-1 frame although it sits in the week-2
# section — presumably a leftover; confirm it is still needed.
test1 = test1[['Date','weeknum','weekday']]
test1

Unnamed: 0,Date,weeknum,weekday
0,2021-11-01,44,0
1,2021-11-02,44,1
2,2021-11-03,44,2
3,2021-11-04,44,3
4,2021-11-05,44,4


## 3가지 날짜 특성으로 5가지 외부 특성 순서대로 예측하기

In [None]:
# Predict the five market-index features for the week-2 test rows, in the order of
# `features[3:]`. Each target may use Date, the calendar features and the targets
# already predicted before it; the targets that come after it are passed as
# `ignore_features`. Predictions are written into test2 as new columns.
features = ['Date', 'weeknum', 'weekday', 'kosdaq', 'nasdaq', 'sp500', 'dow', 'kospi']

# Iterate over every target in features[3:]. A hard-coded range(4) stopped after
# 'dow', so 'kospi' was never predicted and was missing from the saved feature file.
for i in tqdm(range(len(features) - 3)):
    model = setup(train2, target = features[i+3], ignore_features = features[i+4:], fold=4, fold_shuffle=True,
                  silent=True, use_gpu=True)
    top1_model = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])
    final_model = finalize_model(top1_model)
    pred = predict_model(final_model, test2)

    # The prediction becomes an input column for the next target.
    test2[features[i+3]] = pred['Label']

In [None]:
# pred_test2 is a second name for the same object as test2 (no copy is made), so
# later changes to either are visible through both.
pred_test2 = test2
pred_test2

In [None]:
# Save the predicted week-2 features (written to the current working directory).
pred_test2.to_csv('prac_4w_pred_1w_test2.csv', index=False)

## 8가지 특성으로 종가 예측하기

In [None]:
# Day-by-day closing-price prediction per stock, using the week-1 names (days1,
# train1, pred_test1, stock_pred_set1).
# NOTE(review): this cell sits after start_date/end_date were reassigned to the week-2
# window. Prices are therefore fetched for the week-2 window and assigned by row
# label onto train1, whose rows were selected with the week-1 window, so Close no
# longer corresponds to train1's dates. Confirm whether this cell is a leftover or was
# meant to be adapted to week 2 (train2 / pred_test2).
days1 = ['2021-10-29', '2021-11-01', '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05']
stock_pred_set1 = {}
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

for code in tqdm(stock_list['종목코드'].values):
    # Closing prices for this stock, outer-merged with the business-day calendar.
    code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    code_data = pd.merge(Business_days, code_data, how = 'outer')
    # Aligned on the row label, not on Date.
    train1['Close'] = code_data['Close']
    temp_pred_test = pd.DataFrame()
    for d in range(5):
        train_end_day = d
        test_day = d+1
        # Train on rows dated up to days1[d]; predict the single row dated days1[d+1].
        temp_train = train1[train1['Date'] <= days1[train_end_day]]
        temp_test = pred_test1[pred_test1['Date'] == days1[test_day]]

        model = setup(temp_train, target = 'Close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
        top1_model = compare_models(sort='MAPE', n_select=1, exclude=['knn','huber','llar','omp','par'])
        final_model = finalize_model(top1_model)
        pred = predict_model(final_model, temp_test)
        # Writes into a filtered selection of pred_test1 (may emit SettingWithCopyWarning).
        temp_test[code] = pred['Label']
        temp_pred_test = pd.concat([temp_pred_test, temp_test])
    
    # Keep only this stock's prediction column.
    stock_pred_set1[code] = temp_pred_test[code]

In [None]:
# Number of stocks for which day-by-day predictions were collected.
len(stock_pred_set1)

In [None]:
# Assemble the per-stock prediction Series into one table, one column per stock code.
# This rebinds pred1.
pred1 = pd.DataFrame(stock_pred_set1)
pred1.head()

# 제출 파일 만들기

### test set2

In [None]:
# Load the week-2 test frame from CSV; this replaces whatever test2 held before.
test2 = pd.read_csv('../data/20211025_1119_pred_1129_1203_test.csv')

In [None]:
# Inspect the reloaded week-2 test frame.
test2.info()

In [None]:
# Preview the reloaded week-2 test rows.
test2.head()

### 2. 8가지 특성으로 종목별 두번째 주 종가 예측

In [None]:
# One model per stock for week 2: fit on the whole train2 window and predict the rows
# of test2. stock_pred_dict2[code] holds the predicted 'Label' Series.
stock_pred_dict2 = {}
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])
for code in tqdm(stock_list['종목코드'].values):
    code_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    code_data = pd.merge(Business_days, code_data, how = 'outer')
    # Look the price up by Date. Assigning code_data['Close'] directly aligns on the
    # row label, which silently pairs prices with the wrong day whenever the two
    # frames do not contain exactly the same dates in the same order.
    train2['Close'] = train2['Date'].map(code_data.set_index('Date')['Close'])

    model=setup(train2, target = 'Close', silent=True, fold=4, fold_shuffle=True, use_gpu=True)
    top1_model2 =compare_models(sort='MAPE', n_select=1,exclude=['knn','huber','llar','omp','par'])
    final_model2 = finalize_model(top1_model2)
    pred2 = predict_model(final_model2, test2)

    stock_pred_dict2[code] = pred2['Label']

In [None]:
# Persist the pipeline held in final_model2. The per-stock loop rebinds final_model2
# on every iteration, so this is the model fitted for the last stock code only.
save_model(final_model2, 'one_by_one_pred_model2')

In [None]:
# Number of stocks with week-2 predictions.
len(stock_pred_dict2)

In [None]:
# Load the sample submission file; its columns after the first are filled in below.
sub = pd.read_csv('../data/sample_submission.csv')
sub

In [None]:
# Inspect the submission template before filling it.
sub.info()

In [None]:
# Fill every column after the first with that stock's week-1 predictions followed by
# its week-2 predictions.
for code in tqdm(sub.columns.values[1:]):
    both_weeks = [*stock_pred_dict1[code], *stock_pred_dict2[code]]
    sub[code] = both_weeks

In [None]:
# Show the filled submission table.
sub

In [None]:
# Inspect the filled submission table.
sub.info()

In [None]:
# Write the submission file (to the current working directory, without the index).
sub.to_csv('sub05_pycaret04.csv', index=False)