In [3]:
import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV

In [12]:
# read stock code, list

path = './open'
list_name = 'Stock_List.csv'
sample_name = 'sample_submission.csv'

stock_list = pd.read_csv(os.path.join(path,list_name))
stock_list['종목코드'] = stock_list['종목코드'].apply(lambda x : str(x).zfill(6))
stock_list

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,005930,KOSPI
1,SK하이닉스,000660,KOSPI
2,NAVER,035420,KOSPI
3,카카오,035720,KOSPI
4,삼성바이오로직스,207940,KOSPI
...,...,...,...
365,맘스터치,220630,KOSDAQ
366,다날,064260,KOSDAQ
367,제이시스메디칼,287410,KOSDAQ
368,크리스에프앤씨,110790,KOSDAQ


In [13]:
# select start, end data
start_date = '20200613'
end_date = '20211031'

start_weekday = pd.to_datetime(start_date).weekday()
max_weeknum = pd.to_datetime(end_date).strftime('%V')
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

print(f'WEEKDAY of "start_date" : {start_weekday}')
print(f'NUM of WEEKS to "end_date" : {max_weeknum}')
print(f'HOW MANY "Business_days" : {Business_days.shape}', )
display(Business_days.head())

WEEKDAY of "start_date" : 5
NUM of WEEKS to "end_date" : 43
HOW MANY "Business_days" : (360, 1)


Unnamed: 0,Date
0,2020-06-15
1,2020-06-16
2,2020-06-17
3,2020-06-18
4,2020-06-19


In [14]:
# read submission file

# sample_name = 'sample_submission_week3.csv'
sample_submission = pd.read_csv(os.path.join(path,sample_name))

In [15]:

def get_prediction(x_close, y_close, x_close_public):
    
    param_grid = {
        'max_depth': [None, 50, 80, 110],
        'criterion': ["mse", "mae"],
        'n_estimators': [25, 50, 100, 150, 200]
    }

    prediction_close = 0.0
    for i, model in enumerate(models):
        if "max_depth" in dir(model):  # randomforest
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
            grid_search.fit(x_close, y_close)
            best_grid = grid_search.best_estimator_
            prediction_close += best_grid.predict(np.expand_dims(x_close_public, 0))*models_rate[i]
        else:            
            model.fit(x_close, y_close)
            prediction_close += model.predict(np.expand_dims(x_close_public, 0))*models_rate[i]
    
    return prediction_close

In [16]:
# public data, 토요일에 수정해야함

model1 = LinearRegression()
model2 = RandomForestRegressor(criterion="mae", n_estimators=200)
models = [model1, model2]

model1_rate = 0.7
model2_rate = 0.3
models_rate = [model1_rate, model2_rate]

for code in tqdm(stock_list['종목코드'].values):
    data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    data = pd.merge(Business_days, data, how = 'outer')
    
    data.Close = data.Close.ffill()
    data.Close = data.Close.bfill()

    data_close=data.drop(['Date'],axis=1)
    data_close=data_close.to_numpy()
    # data flatten  # data flatten for reshape
    data_close_flatten=data_close
    predictions = []

    
    # predict day 1

    # close
    data_close_day1 = data_close_flatten.reshape((-1,1))

    x_close = data_close_day1[:-2]  # training x
    y_close = data_close_day1[1:-1]  # training y
    x_close_public = data_close_day1[-1]  # predict x

    prediction_close = get_prediction(x_close, y_close, x_close_public)
    predictions.append(prediction_close)
    
    # predict day 1 finish
    
    
    # predict day 2
    
    # close
    # data_close_day2 = np.append(data_close_flatten, 0)  # padding
    data_close_day2 = data_close_flatten.reshape((-1,2))

    x_close = data_close_day2[:-2]
    y_close = data_close_day2[1:-1][:, 1]  # 2일차만 훈련, 예측함
    x_close_public = data_close_day2[-1]   

    prediction_close = get_prediction(x_close, y_close, x_close_public)
    predictions.append(prediction_close)
    
    # predict day 2 finish
    
    
    # predict day 3
    
    # close
    # data_close_day3 = np.append(data_close_flatten, 0)
    # data_close_day3 = np.insert(data_close_day3, 0, data_close_day3[0])
    data_close_day3 = data_close_flatten.reshape((-1, 3))

    x_close = data_close_day3[:-2]
    y_close = data_close_day3[1:-1][:, 2]
    x_close_public = data_close_day3[-1]
        
    prediction_close = get_prediction(x_close, y_close, x_close_public)
    predictions.append(prediction_close)
    
    # predict day 3 finish

    
    # predict day 4
    
    # close
    # data_close_day4 = np.insert(data_close_flatten, 0, data_close_flatten[0])
    # data_close_day4 = np.insert(data_close_day4, 0, data_close_day4[0])
    # data_close_day4 = np.append(data_close_day4, 0)
    # data_close_day4 = np.append(data_close_day4, 0)
    # data_close_day4 = np.append(data_close_day4, 0)
    data_close_day4 = data_close_flatten.reshape((-1, 4))

    x_close = data_close_day4[:-2]
    y_close = data_close_day4[1:-1][:, 3]
    x_close_public = data_close_day4[-1]

    prediction_close = get_prediction(x_close, y_close, x_close_public)
    predictions.append(prediction_close)
    
    # predict day 4 finish
    
    # predict day 5
    
    # close
    data_close_day5 = data_close_flatten.reshape((-1, 5))
    
    x_close = data_close_day5[:-2]
    y_close = data_close_day5[1:-1][:, 4]
    x_close_public = data_close_day5[-2]

    prediction_close = get_prediction(x_close, y_close, x_close_public)
    predictions.append(prediction_close)
    
    # predict day 5 finish
        
    #sample_submission.loc[:,code] = predictions*2
    sample_submission.loc[:4,code] = predictions
sample_submission.isna().sum().sum()

  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(

0

In [17]:

start_date = '20211101'
end_date = '20211105'

start_weekday = pd.to_datetime(start_date).weekday()
max_weeknum = pd.to_datetime(end_date).strftime('%V')
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

print(f'WEEKDAY of "start_date" : {start_weekday}')
print(f'NUM of WEEKS to "end_date" : {max_weeknum}')
print(f'HOW MANY "Business_days" : {Business_days.shape}', )
display(Business_days.head())

WEEKDAY of "start_date" : 0
NUM of WEEKS to "end_date" : 44
HOW MANY "Business_days" : (5, 1)


Unnamed: 0,Date
0,2021-11-01
1,2021-11-02
2,2021-11-03
3,2021-11-04
4,2021-11-05


In [18]:
real=[]
real_data=pd.read_csv('C:\\Users\\user\\Desktop\\computing\\outside\\dacon\\stock_predict2\\open\\sample_submission.csv')
for code in tqdm(stock_list['종목코드'].values):
    data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    data = pd.merge(Business_days, data, how = 'outer')
    data['weekday'] = data.Date.apply(lambda x : x.weekday())
    data['weeknum'] = data.Date.apply(lambda x : x.strftime('%V'))
    data.Close = data.Close.ffill()
    data.Close = data.Close.bfill()

    real_data_close = pd.pivot_table(data = data, values = 'Close', columns = 'weekday', index = 'weeknum')
    real_data_close_flatten = np.ravel(real_data_close.to_numpy())
    # print(real_data_close_flatten)
    

    real_data.loc[:4,code]=real_data_close_flatten


real_data

100%|██████████| 370/370 [02:28<00:00,  2.49it/s]


Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,27850,35200,60000,146000,103000,16600,49750,84300,106500,...,48300,51700,80500,36750,25700,52000,348500,261600,26600,17600
1,2021-11-02,29250,35050,61700,148500,107000,17350,48950,86000,107500,...,50800,54100,81600,35850,25950,52900,348000,258600,27100,18100
2,2021-11-03,30250,34050,61500,145500,110000,17100,50300,85000,105500,...,49450,54600,82700,34700,25450,51200,346500,253700,26750,18300
3,2021-11-04,29450,33800,61100,145000,125500,17750,50600,87000,106000,...,48700,53800,81500,35500,25500,50600,356500,249600,26350,17550
4,2021-11-05,29550,33450,60600,144500,133500,17300,50400,88000,107000,...,49800,53900,85100,35500,25100,50200,383500,247800,26050,17550
5,2021-11-29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2021-11-30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2021-12-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2021-12-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2021-12-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
type(real_data)
type(sample_submission)

label_np = real_data.to_numpy()[0:5,1:]
pred_np = sample_submission.to_numpy()[0:5, 1:]
print(pred_np)


result=np.mean(abs(label_np-pred_np)/label_np)*100
print(result)

[[28180.12881365511 34734.92628485734 60020.04352921345 ...
  261912.59341766758 26808.97314752352 17175.704372662134]
 [28415.167120868977 34737.04459334556 60572.3833241637 ...
  261906.23317460617 27012.530891756214 17420.20555263924]
 [28872.605189204907 34904.042605761504 60227.25688077925 ...
  261343.08593951914 27296.181555463736 17690.419807519997]
 [29681.511598592988 34670.545393590684 61068.17295893324 ...
  262661.71530017076 27120.71394298413 17884.932940598657]
 [29093.2147311636 36264.85271211459 61544.07339832012 ...
  258657.08531296384 27086.919007684537 18811.645664108295]]
3.488768339291749


In [None]:
sample_submission.to_csv('linear+rf_test.csv',index=False)

In [20]:
#private
# select start, end data

start_date = '20200711'
end_date = '20211126'

start_weekday = pd.to_datetime(start_date).weekday()
max_weeknum = pd.to_datetime(end_date).strftime('%V')
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

print(f'WEEKDAY of "start_date" : {start_weekday}')
print(f'NUM of WEEKS to "end_date" : {max_weeknum}')
print(f'HOW MANY "Business_days" : {Business_days.shape}', )
display(Business_days.head())

WEEKDAY of "start_date" : 5
NUM of WEEKS to "end_date" : 47
HOW MANY "Business_days" : (360, 1)


Unnamed: 0,Date
0,2020-07-13
1,2020-07-14
2,2020-07-15
3,2020-07-16
4,2020-07-17


In [21]:
# public data, 토요일에 수정해야함

model1 = LinearRegression()
model2 = RandomForestRegressor(criterion="mae", n_estimators=200)
models = [model1, model2]

model1_rate = 0.7
model2_rate = 0.3
models_rate = [model1_rate, model2_rate]

for code in tqdm(stock_list['종목코드'].values):
    data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    data = pd.merge(Business_days, data, how = 'outer')
    
    data.Close = data.Close.ffill()
    data.Close = data.Close.bfill()

    data_close=data.drop(['Date'],axis=1)
    data_close=data_close.to_numpy()
    # data flatten  # data flatten for reshape
    data_close_flatten=data_close
    predictions = []

    
    # predict day 1

    # close
    data_close_day1 = data_close_flatten.reshape((-1,1))

    x_close = data_close_day1[:-2]  # training x
    y_close = data_close_day1[1:-1]  # training y
    x_close_public = data_close_day1[-1]  # predict x

    prediction_close = get_prediction(x_close, y_close, x_close_public)
    predictions.append(prediction_close)
    
    # predict day 1 finish
    
    
    # predict day 2
    
    # close
    # data_close_day2 = np.append(data_close_flatten, 0)  # padding
    data_close_day2 = data_close_flatten.reshape((-1,2))

    x_close = data_close_day2[:-2]
    y_close = data_close_day2[1:-1][:, 1]  # 2일차만 훈련, 예측함
    x_close_public = data_close_day2[-1]   

    prediction_close = get_prediction(x_close, y_close, x_close_public)
    predictions.append(prediction_close)
    
    # predict day 2 finish
    
    
    # predict day 3
    
    # close
    # data_close_day3 = np.append(data_close_flatten, 0)
    # data_close_day3 = np.insert(data_close_day3, 0, data_close_day3[0])
    data_close_day3 = data_close_flatten.reshape((-1, 3))

    x_close = data_close_day3[:-2]
    y_close = data_close_day3[1:-1][:, 2]
    x_close_public = data_close_day3[-1]
        
    prediction_close = get_prediction(x_close, y_close, x_close_public)
    predictions.append(prediction_close)
    
    # predict day 3 finish

    
    # predict day 4
    
    # close
    # data_close_day4 = np.insert(data_close_flatten, 0, data_close_flatten[0])
    # data_close_day4 = np.insert(data_close_day4, 0, data_close_day4[0])
    # data_close_day4 = np.append(data_close_day4, 0)
    # data_close_day4 = np.append(data_close_day4, 0)
    # data_close_day4 = np.append(data_close_day4, 0)
    data_close_day4 = data_close_flatten.reshape((-1, 4))

    x_close = data_close_day4[:-2]
    y_close = data_close_day4[1:-1][:, 3]
    x_close_public = data_close_day4[-1]

    prediction_close = get_prediction(x_close, y_close, x_close_public)
    predictions.append(prediction_close)
    
    # predict day 4 finish
    
    # predict day 5
    
    # close
    data_close_day5 = data_close_flatten.reshape((-1, 5))
    
    x_close = data_close_day5[:-2]
    y_close = data_close_day5[1:-1][:, 4]
    x_close_public = data_close_day5[-2]

    prediction_close = get_prediction(x_close, y_close, x_close_public)
    predictions.append(prediction_close)
    
    # predict day 5 finish
        
    #sample_submission.loc[:,code] = predictions*2
    sample_submission.loc[5:9,code] = predictions
sample_submission.isna().sum().sum()

  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  self.best_estimator_.fit(X, y, **fit_params)
  return asarray(a).ndim
  1%|          | 4/370 [04:31<6:54:20, 67.93s/it]


KeyboardInterrupt: 

In [None]:
sample_submission.to_csv('linear+rf.csv',index=False)

In [None]:
sample_submission

In [None]:
# # public data, 토요일에 수정해야함

# model1 = LinearRegression()
# model2 = RandomForestRegressor(criterion="mae", n_estimators=200)
# models = [model1, model2]

# model1_rate = 0.7
# model2_rate = 0.3
# models_rate = [model1_rate, model2_rate]

# data = fdr.DataReader("000060", start = start_date, end = end_date)[['Close']].reset_index()
# data = pd.merge(Business_days, data, how = 'outer')
# data['weekday'] = data.Date.apply(lambda x : x.weekday())
# data['weeknum'] = data.Date.apply(lambda x : x.strftime('%V'))
# data.Close = data.Close.ffill()
# data.Close = data.Close.bfill()

# data_close = pd.pivot_table(data = data, values = 'Close', columns = 'weekday', index = 'weeknum')

# # data flatten  # data flatten for reshape
# data_close_flatten = np.ravel(data_close.to_numpy())

# predictions = []


# # predict day 1

# # close
# data_close_day1 = data_close_flatten.reshape((-1,1))

# x_close = data_close_day1[:-6]  # training x
# y_close = data_close_day1[1:-5]  # training y
# x_close_public = data_close_day1[-6]  # predict x

# prediction_close = get_prediction(x_close, y_close, x_close_public)
# predictions.append(prediction_close)

# # predict day 1 finish


# # predict day 2

# # close
# data_close_day2 = np.append(data_close_flatten, 0)  # padding
# data_close_day2 = data_close_day2.reshape((-1,2))

# x_close = data_close_day2[:-4]
# y_close = data_close_day2[1:-3][:, 1]  # 2일차만 훈련, 예측함
# x_close_public = data_close_day2[-4]   

# prediction_close = get_prediction(x_close, y_close, x_close_public)
# predictions.append(prediction_close)

# # predict day 2 finish


# # predict day 3

# # close
# data_close_day3 = np.append(data_close_flatten, 0)
# data_close_day3 = np.insert(data_close_day3, 0, data_close_day3[0])
# data_close_day3 = data_close_day3.reshape((-1, 3))

# x_close = data_close_day3[:-3]
# y_close = data_close_day3[1:-2][:, 2]
# x_close_public = data_close_day3[-3]

# prediction_close = get_prediction(x_close, y_close, x_close_public)
# predictions.append(prediction_close)

# # predict day 3 finish


# # predict day 4

# # close
# data_close_day4 = np.insert(data_close_flatten, 0, data_close_flatten[0])
# data_close_day4 = np.insert(data_close_day4, 0, data_close_day4[0])
# data_close_day4 = np.append(data_close_day4, 0)
# data_close_day4 = np.append(data_close_day4, 0)
# data_close_day4 = np.append(data_close_day4, 0)
# data_close_day4 = data_close_day4.reshape((-1, 4))

# x_close = data_close_day4[:-3]
# y_close = data_close_day4[1:-2][:, 3]
# x_close_public = data_close_day4[-3]

# prediction_close = get_prediction(x_close, y_close, x_close_public)
# predictions.append(prediction_close)

# # predict day 4 finish

# # predict day 5

# # close
# data_close_day5 = data_close_flatten.reshape((-1, 5))

# x_close = data_close_day5[:-2]
# y_close = data_close_day5[1:-1][:, 4]
# x_close_public = data_close_day5[-2]

# prediction_close = get_prediction(x_close, y_close, x_close_public)
# predictions.append(prediction_close)

# # predict day 5 finish


  self.best_estimator_.fit(X, y, **fit_params)


RandomForestRegressor(max_depth=110, n_estimators=25)
RandomForestRegressor(criterion='mae', max_depth=50, n_estimators=25)
RandomForestRegressor(criterion='mae', max_depth=150, n_estimators=25)
RandomForestRegressor(criterion='mae', max_depth=80, n_estimators=25)
RandomForestRegressor(max_depth=80, n_estimators=25)
