In [26]:
# !pip install finance-datareader

In [27]:
import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

#### 종목 코드 데이터

In [28]:
# Directory holding the input files and the name of the ticker list file.
path = './data'
list_name = 'Stock_List.csv'

# Read the ticker list and preview its first rows.
stock_list_file = os.path.join(path, list_name)
stock_list = pd.read_csv(stock_list_file)
stock_list.head()

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,5930,KOSPI
1,SK하이닉스,660,KOSPI
2,NAVER,35420,KOSPI
3,카카오,35720,KOSPI
4,삼성바이오로직스,207940,KOSPI


In [29]:
# Left-pad every ticker code with zeros to six characters
# (the preview above shows codes such as 5930 without their leading zeros).
stock_list['종목코드'] = stock_list['종목코드'].astype(str).str.zfill(6)
stock_list

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,005930,KOSPI
1,SK하이닉스,000660,KOSPI
2,NAVER,035420,KOSPI
3,카카오,035720,KOSPI
4,삼성바이오로직스,207940,KOSPI
...,...,...,...
365,맘스터치,220630,KOSDAQ
366,다날,064260,KOSDAQ
367,제이시스메디칼,287410,KOSDAQ
368,크리스에프앤씨,110790,KOSDAQ


#### 종목 주가 불러오기 테스트

In [30]:
# Smoke test of the price loader: Samsung Electronics (005930) over 2020.
sample_code = '005930'
start_date = '20200101'
end_date = '20201231'
stock = fdr.DataReader(sample_code, start=start_date, end=end_date)
stock

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,55500,56000,55000,55200,12993228,-0.010753
2020-01-03,56000,56600,54900,55500,15422255,0.005435
2020-01-06,54900,55600,54600,55500,10278951,0.000000
2020-01-07,55700,56400,55600,55800,10009778,0.005405
2020-01-08,56200,57400,55900,56800,23501171,0.017921
...,...,...,...,...,...,...
2020-12-23,72400,74000,72300,73900,19411326,0.022130
2020-12-24,74100,78800,74000,77800,32502870,0.052774
2020-12-28,79000,80100,78200,78700,40085044,0.011568
2020-12-29,78800,78900,77300,78300,30339449,-0.005083


In [31]:
# Build the calendar of weekdays (Mon-Fri) covering the modelling window.
start_date = '20210104'
end_date = '20211105'

# Weekday index of the first day and ISO week number ('%V') of the last day.
start_weekday = pd.to_datetime(start_date).weekday()
max_weeknum = pd.to_datetime(end_date).strftime('%V')

# One row per business day, stored in a single 'Date' column.
Business_days = pd.DataFrame({'Date': pd.date_range(start_date, end_date, freq='B')})

print(f'WEEKDAY of "start_date" : {start_weekday}')
print(f'NUM of WEEKS to "end_date" : {max_weeknum}')
print(f'HOW MANY "Business_days" : {Business_days.shape}')
display(Business_days.head())

WEEKDAY of "start_date" : 0
NUM of WEEKS to "end_date" : 44
HOW MANY "Business_days" : (220, 1)


Unnamed: 0,Date
0,2021-01-04
1,2021-01-05
2,2021-01-06
3,2021-01-07
4,2021-01-08


# Baseline 모델의 구성 소개 ( Sample )
* X : (월 ~ 금) * 42주간
* y : (다음주 월 ~ 금) * 42주간
    * y_0 : 다음주 월요일
    * y_1 : 다음주 화요일
    * y_2 : 다음주 수요일
    * y_3 : 다음주 목요일
    * y_4 : 다음주 금요일
* 이번주 월~금요일의 패턴을 학습해 다음주 월요일 ~ 금요일을 각각 예측하는 모델을 생성

* 이 과정을 모든 종목(370개)에 적용

In [32]:
# Use the ticker stored under index label 0 as the worked example.
sample_code = stock_list.at[0, '종목코드']
sample_code

'005930'

In [33]:
# Daily closing prices of the example ticker; reset_index() turns the Date
# index into an ordinary column so it can be merged on later.
sample = (
    fdr.DataReader(sample_code, start=start_date, end=end_date)
    .loc[:, ['Close']]
    .reset_index()
)
sample

Unnamed: 0,Date,Close
0,2021-01-04,83000
1,2021-01-05,83900
2,2021-01-06,82200
3,2021-01-07,82900
4,2021-01-08,88800
...,...,...
204,2021-11-01,69900
205,2021-11-02,71500
206,2021-11-03,70400
207,2021-11-04,70600


In [34]:
# Align the prices with the weekday calendar (joined on the shared 'Date'
# column); calendar days without a quote end up with NaN in Close.
sample = Business_days.merge(sample, how='outer')
sample

Unnamed: 0,Date,Close
0,2021-01-04,83000.0
1,2021-01-05,83900.0
2,2021-01-06,82200.0
3,2021-01-07,82900.0
4,2021-01-08,88800.0
...,...,...
215,2021-11-01,69900.0
216,2021-11-02,71500.0
217,2021-11-03,70400.0
218,2021-11-04,70600.0


In [35]:
# Inspect dtypes and non-null counts to see how many calendar days lack a Close value.
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220 entries, 0 to 219
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    220 non-null    datetime64[ns]
 1   Close   209 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 5.2 KB


In [36]:
# Tag each row with its day of the week (Monday is 0) and its ISO week
# number as a zero-padded string ('%V').
sample['weekday'] = sample['Date'].map(lambda ts: ts.weekday())
sample['weeknum'] = sample['Date'].map(lambda ts: ts.strftime('%V'))
sample

Unnamed: 0,Date,Close,weekday,weeknum
0,2021-01-04,83000.0,0,01
1,2021-01-05,83900.0,1,01
2,2021-01-06,82200.0,2,01
3,2021-01-07,82900.0,3,01
4,2021-01-08,88800.0,4,01
...,...,...,...,...
215,2021-11-01,69900.0,0,44
216,2021-11-02,71500.0,1,44
217,2021-11-03,70400.0,2,44
218,2021-11-04,70600.0,3,44


In [37]:
# Re-check the frame: weekday/weeknum should be fully populated while Close still has gaps.
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220 entries, 0 to 219
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     220 non-null    datetime64[ns]
 1   Close    209 non-null    float64       
 2   weekday  220 non-null    int64         
 3   weeknum  220 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 8.6+ KB


In [38]:
# Carry the last available close forward over days that have no quote.
sample['Close'] = sample['Close'].ffill()
sample

Unnamed: 0,Date,Close,weekday,weeknum
0,2021-01-04,83000.0,0,01
1,2021-01-05,83900.0,1,01
2,2021-01-06,82200.0,2,01
3,2021-01-07,82900.0,3,01
4,2021-01-08,88800.0,4,01
...,...,...,...,...
215,2021-11-01,69900.0,0,44
216,2021-11-02,71500.0,1,44
217,2021-11-03,70400.0,2,44
218,2021-11-04,70600.0,3,44


In [39]:
# Reshape to one row per ISO week and one column per weekday (0-4),
# holding the closing price for that day.
sample = sample.pivot_table(values='Close', index='weeknum', columns='weekday')
sample

weekday,0,1,2,3,4
weeknum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,83000.0,83900.0,82200.0,82900.0,88800.0
2,91000.0,90600.0,89700.0,89700.0,88000.0
3,85000.0,87000.0,87200.0,88100.0,86800.0
4,89400.0,86700.0,85600.0,83700.0,82000.0
5,83000.0,84400.0,84600.0,82500.0,83500.0
6,83000.0,82700.0,81600.0,81600.0,81600.0
7,84200.0,84900.0,83200.0,82100.0,82600.0
8,82200.0,82000.0,82000.0,85300.0,82500.0
9,82500.0,83600.0,84000.0,82400.0,82100.0
10,82000.0,81400.0,80900.0,82000.0,82800.0


In [40]:
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

In [41]:
# Single linear-regression estimator; it is refit for each target weekday below.
model = LinearRegression()

In [42]:
# Every week except the last two (weeks 43-44, i.e. 10/25-29 and 11/1-5, are left out).
test = sample.iloc[:-2]
test

weekday,0,1,2,3,4
weeknum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,83000.0,83900.0,82200.0,82900.0,88800.0
2,91000.0,90600.0,89700.0,89700.0,88000.0
3,85000.0,87000.0,87200.0,88100.0,86800.0
4,89400.0,86700.0,85600.0,83700.0,82000.0
5,83000.0,84400.0,84600.0,82500.0,83500.0
6,83000.0,82700.0,81600.0,81600.0,81600.0
7,84200.0,84900.0,83200.0,82100.0,82600.0
8,82200.0,82000.0,82000.0,85300.0,82500.0
9,82500.0,83600.0,84000.0,82400.0,82100.0
10,82000.0,81400.0,80900.0,82000.0,82800.0


In [43]:
# Feature matrix: one row per week, from 2021-01-04 through 2021-10-22.
x = sample.iloc[:-2].to_numpy()
x.shape

(42, 5)

In [44]:
# Targets: the week following each feature row (2021-01-11 through 2021-10-29).
y = sample.iloc[1:-1].to_numpy()

# One target vector per weekday column (Monday .. Friday).
y_values = [y[:, day] for day in range(5)]
y_0, y_1, y_2, y_3, y_4 = y_values

In [45]:
# Second-to-last week (10/25-29): the input row used to forecast the final week.
x_public = sample.iloc[-2].to_numpy()

### 예측

In [46]:
# Refit the model on each weekday target in turn and forecast that weekday
# from the latest feature week (reshaped to a single-row 2-D array).
latest_week = x_public.reshape(1, -1)
predictions = [
    model.fit(x, target).predict(latest_week)[0]
    for target in y_values
]
predictions

[70206.67660106532,
 69631.42785252717,
 69062.32129096359,
 69258.21096883612,
 68846.00977524318]

### 실제 Public 값

In [47]:
# Last row of the weekly table: the values for the final week, to compare with the forecasts.
sample.iloc[-1].values

array([69900., 71500., 70400., 70600., 70200.])

## 전체 모델링

In [48]:
# Load the submission template from the data directory.
sample_name = 'sample_submission.csv'
submission_file = os.path.join(path, sample_name)
sample_submission = pd.read_csv(submission_file)

In [49]:
# Show the template: a 'Day' column followed by one column per ticker code.
sample_submission

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2021-11-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2021-11-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2021-11-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2021-11-05,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2021-11-29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2021-11-30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2021-12-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2021-12-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2021-12-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
def _weekly_close_table(code, business_days, start, end):
    """Return a week-by-weekday table of closing prices for one ticker.

    Parameters
    ----------
    code : str
        Ticker code passed to ``fdr.DataReader``.
    business_days : pandas.DataFrame
        Frame with a ``Date`` column listing every weekday in the window.
    start, end : str
        Date bounds passed to ``fdr.DataReader``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ISO week number (``'%V'`` string), with one column per
        weekday number, holding the forward-filled closing price.
    """
    prices = fdr.DataReader(code, start=start, end=end)[['Close']].reset_index()
    # Align to the weekday calendar so days without a quote become NaN rows.
    daily = pd.merge(business_days, prices, how='outer')
    daily['weekday'] = daily['Date'].apply(lambda d: d.weekday())
    daily['weeknum'] = daily['Date'].apply(lambda d: d.strftime('%V'))
    # Carry the last available close forward over days without a quote.
    daily['Close'] = daily['Close'].ffill()
    return pd.pivot_table(data=daily, values='Close', columns='weekday', index='weeknum')


def _predict_next_week(weekly, model):
    """Fit one regression per weekday and forecast the week after ``weekly.iloc[-2]``.

    Rows ``[0:-2]`` are the features and rows ``[1:-1]`` (the following week)
    are the targets; the second-to-last row is the input used for the forecast.

    Parameters
    ----------
    weekly : pandas.DataFrame
        Output of ``_weekly_close_table``.
    model : estimator with ``fit`` / ``predict``
        Refit in place for every weekday target.

    Returns
    -------
    list of float
        One forecast per weekday column, in column order.
    """
    x = weekly.iloc[0:-2].to_numpy()      # weeks of 2021-01-04 .. 2021-10-22
    y = weekly.iloc[1:-1].to_numpy()      # weeks of 2021-01-11 .. 2021-10-29
    x_public = weekly.iloc[-2].to_numpy()  # latest feature week (10/25-29)

    predictions = []
    for day in range(y.shape[1]):
        model.fit(x, y[:, day])
        prediction = model.predict(np.expand_dims(x_public, 0))
        predictions.append(prediction[0])
    return predictions


model = LinearRegression()
for code in tqdm(stock_list['종목코드'].values):
    weekly = _weekly_close_table(code, Business_days, start_date, end_date)
    predictions = _predict_next_week(weekly, model)

    # The template lists more days than one forecast week (two weeks in the
    # preview), so the same weekly forecast is repeated to fill every row.
    n_repeats = len(sample_submission) // len(predictions)

    # Replace the whole column instead of writing through .loc[:, code]:
    # the template columns appear to hold integer zeros, and setting floats
    # into them in place relies on dtype upcasting that pandas deprecates.
    sample_submission[code] = predictions * n_repeats

# Total number of missing values left in the submission (expected to be 0).
sample_submission.isna().sum().sum()

100%|████████████████████████████████████████████████████████████████████████████████| 370/370 [02:01<00:00,  3.03it/s]


0

# 제출 파일 만들기

In [51]:
# List the submission column labels ('Day' plus the ticker codes).
sample_submission.columns

Index(['Day', '000060', '000080', '000100', '000120', '000150', '000240',
       '000250', '000270', '000660',
       ...
       '330860', '336260', '336370', '347860', '348150', '348210', '352820',
       '357780', '363280', '950130'],
      dtype='object', length=371)

In [52]:
# Re-apply six-character zero padding to every ticker column label,
# keeping 'Day' as the first column.
ticker_labels = [str(label).zfill(6) for label in sample_submission.columns[1:]]
columns = ['Day', *ticker_labels]
sample_submission.columns = columns

In [53]:
# Write the submission to the working directory, without the row index.
sample_submission.to_csv('BASELINE_Linear.csv',index=False)

In [54]:
# Display the filled-in submission for a final visual check.
sample_submission

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,27919.530611,34687.673458,60773.779528,142621.815394,104901.698658,16669.447967,47219.595113,85236.83317,103490.352393,...,49749.405974,51984.322942,84384.021784,36846.592704,25721.026664,53328.350326,336697.743579,262257.538308,27176.08509,17382.219194
1,2021-11-02,28750.750484,35032.651375,60312.130021,143012.927861,107216.342323,17001.594758,46672.351191,85360.327648,102788.687368,...,48923.253693,51539.56413,84739.157977,35258.227509,25330.52806,53645.466661,335662.149461,264562.230652,27415.549191,17338.522537
2,2021-11-03,28858.095631,34995.888574,60241.47041,145626.792237,111192.733424,17665.577952,45757.683516,85665.326378,102943.419081,...,48834.428543,49341.153729,85450.80606,35320.479243,25623.131825,55136.12148,329167.718872,264623.119599,27466.446666,17286.577226
3,2021-11-04,28901.301911,34866.098057,59701.569734,145351.536595,109530.741544,17846.022961,46380.15235,85494.864447,99958.476851,...,48954.93564,48626.256108,84823.889868,34343.016848,25878.11514,55964.135511,329482.609718,261821.140588,27594.351745,17247.876124
4,2021-11-05,28152.93854,34873.93908,59968.39316,143505.827198,108761.777883,18078.266972,46975.701291,84943.135732,100294.829339,...,49427.019462,47063.105078,86397.651814,34062.808374,26472.657621,55323.587424,321108.356663,264131.897754,27408.36665,17492.773824
5,2021-11-29,27919.530611,34687.673458,60773.779528,142621.815394,104901.698658,16669.447967,47219.595113,85236.83317,103490.352393,...,49749.405974,51984.322942,84384.021784,36846.592704,25721.026664,53328.350326,336697.743579,262257.538308,27176.08509,17382.219194
6,2021-11-30,28750.750484,35032.651375,60312.130021,143012.927861,107216.342323,17001.594758,46672.351191,85360.327648,102788.687368,...,48923.253693,51539.56413,84739.157977,35258.227509,25330.52806,53645.466661,335662.149461,264562.230652,27415.549191,17338.522537
7,2021-12-01,28858.095631,34995.888574,60241.47041,145626.792237,111192.733424,17665.577952,45757.683516,85665.326378,102943.419081,...,48834.428543,49341.153729,85450.80606,35320.479243,25623.131825,55136.12148,329167.718872,264623.119599,27466.446666,17286.577226
8,2021-12-02,28901.301911,34866.098057,59701.569734,145351.536595,109530.741544,17846.022961,46380.15235,85494.864447,99958.476851,...,48954.93564,48626.256108,84823.889868,34343.016848,25878.11514,55964.135511,329482.609718,261821.140588,27594.351745,17247.876124
9,2021-12-03,28152.93854,34873.93908,59968.39316,143505.827198,108761.777883,18078.266972,46975.701291,84943.135732,100294.829339,...,49427.019462,47063.105078,86397.651814,34062.808374,26472.657621,55323.587424,321108.356663,264131.897754,27408.36665,17492.773824
