### Stock Prediction Workshop

In [188]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor


### Data loading
유럽 입자 물리 연구소 주가데이터 로딩

In [189]:
# CERN.us.txt파일을 로딩한후 앞에서 10건을 출력해 보세요.
df = pd.read_csv('CERN.us.txt', sep=',')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
0,2005-02-25,6.375,6.545,6.365,6.5125,3816664,0
1,2005-02-28,6.4775,6.54,6.4625,6.5125,4163088,0
2,2005-03-01,6.5225,6.625,6.5,6.595,4320872,0
3,2005-03-02,6.605,6.6825,6.5625,6.62,3547720,0
4,2005-03-03,6.6475,6.6825,6.4375,6.5125,3183528,0


In [190]:
#데이터 프레임의 정보를 출력하세요.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3201 entries, 0 to 3200
Data columns (total 7 columns):
Date       3201 non-null object
Open       3201 non-null float64
High       3201 non-null float64
Low        3201 non-null float64
Close      3201 non-null float64
Volume     3201 non-null int64
OpenInt    3201 non-null int64
dtypes: float64(4), int64(2), object(1)
memory usage: 175.1+ KB


In [191]:
### Date 컬럼의 타입이 Object 타입이므로 Datetime 타입으로 변경하세요. 
# pandas 의 to_datetime을 이용하세요. 

df.Date = pd.to_datetime(df.Date)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3201 entries, 0 to 3200
Data columns (total 7 columns):
Date       3201 non-null datetime64[ns]
Open       3201 non-null float64
High       3201 non-null float64
Low        3201 non-null float64
Close      3201 non-null float64
Volume     3201 non-null int64
OpenInt    3201 non-null int64
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 175.1 KB


In [192]:
### 연도가 2010년도 이후 데이터만 추려서 df2에 담으세요. 
### Date 컬럼의 dt 객체의 year는 df.Date.dt.year 로 가져올수 있습니다. 

df2 = df[df.Date.dt.year >= 2010].copy() 
df2.head()


Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
1222,2010-01-04,20.94,21.103,20.713,21.068,3058908,0
1223,2010-01-05,21.355,22.345,21.25,22.313,7324724,0
1224,2010-01-06,22.5,22.873,22.265,22.548,6844460,0
1225,2010-01-07,22.423,22.75,22.23,22.68,2968936,0
1226,2010-01-08,22.578,22.663,22.345,22.633,3238372,0


In [193]:
### df2의 인덱스를 0부터 df2의 len 만큼 순차적인 정수값으로 세팅하세요. 
df2.index = range(len(df2))
df2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
0,2010-01-04,20.94,21.103,20.713,21.068,3058908,0
1,2010-01-05,21.355,22.345,21.25,22.313,7324724,0
2,2010-01-06,22.5,22.873,22.265,22.548,6844460,0
3,2010-01-07,22.423,22.75,22.23,22.68,2968936,0
4,2010-01-08,22.578,22.663,22.345,22.633,3238372,0


In [194]:
# 지수이동 평균, 단순이동평균을 구해서 feature 로 추가하세요. 
# 지수이동평균은 EMA_9, 5일 이동평균은 SMA_5, 10일 이동평균은 SMA_10, 15일 이동평균은 SMA_15, 30일 이동평균은 SMA_30으로 컬럼을 추가하세요. 

df2['EMA_9'] = df2['Close'].ewm(9).mean().shift()        #지수이동평균(9일)
df2['SMA_5'] = df2['Close'].rolling(5).mean().shift()    #단순이동평균(5일)
df2['SMA_10'] = df2['Close'].rolling(10).mean().shift()  #단순이동평균(10일)
df2['SMA_15'] = df2['Close'].rolling(15).mean().shift()  #단순이동평균(15일)
df2['SMA_30'] = df2['Close'].rolling(30).mean().shift()  #단순이동평균(30일)

df2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30
0,2010-01-04,20.94,21.103,20.713,21.068,3058908,0,,,,,
1,2010-01-05,21.355,22.345,21.25,22.313,7324724,0,21.068,,,,
2,2010-01-06,22.5,22.873,22.265,22.548,6844460,0,21.723263,,,,
3,2010-01-07,22.423,22.75,22.23,22.68,2968936,0,22.027594,,,,
4,2010-01-08,22.578,22.663,22.345,22.633,3238372,0,22.217302,,,,


In [195]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
0,2005-02-25,6.375,6.545,6.365,6.5125,3816664,0
1,2005-02-28,6.4775,6.54,6.4625,6.5125,4163088,0
2,2005-03-01,6.5225,6.625,6.5,6.595,4320872,0
3,2005-03-02,6.605,6.6825,6.5625,6.62,3547720,0
4,2005-03-03,6.6475,6.6825,6.4375,6.5125,3183528,0


In [196]:
### NaN이 있는 행을 삭제 하세요. dropna 
df2.dropna(inplace=True)

In [197]:
# train_data는 70%,test_data는 30%로 데이터를 분할하세요. 
# 시계열은 시간순으로 데이터를 분할해야합니다. (랜덤샘플링 하면 안됨)

test_size  = 0.3
test_split_idx  = int(df2.shape[0] * (1-test_size))

train_df  = df2.loc[:test_split_idx].copy()
test_df   = df2.loc[test_split_idx+1:].copy()

In [198]:
train_df.shape

(1335, 12)

In [199]:
test_df.shape

(614, 12)

### Y 값 변경

In [200]:
# Close 열의 주가는 다음날을 예측하는 것이 목표이므로 하루 앞으로 shift 해서, Close 컬럼을 대체하세요.
print(pd.concat([df2.Close, df2['Close'].shift(-1)], axis=1).head()
)
df2['Close'] = df2['Close'].shift(-1)

# 아래는 예시입니다(before/after)

     Close   Close
30  20.093  20.315
31  20.315  20.430
32  20.430  20.533
33  20.533  20.225
34  20.225  20.633


In [201]:
train_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30
30,2010-02-17,20.0,20.243,19.91,20.093,2047524,0,19.954861,19.6976,19.4897,19.6374,20.749333
31,2010-02-18,20.12,20.385,20.043,20.315,1936820,0,19.969223,19.7682,19.5267,19.601267,20.716833
32,2010-02-19,20.27,20.525,20.205,20.43,1841604,0,20.00503,19.9622,19.6292,19.580733,20.650233
33,2010-02-22,20.545,20.63,20.27,20.533,2054768,0,20.048882,20.0862,19.7799,19.604867,20.579633
34,2010-02-23,20.52,20.66,20.105,20.225,3282488,0,20.098679,20.2568,19.9197,19.712733,20.508067


In [202]:
### 불필요한 컬럼 삭제 
drop_cols = ['Date', 'Volume', 'Open', 'Low', 'High', 'OpenInt']

train_df = train_df.drop(drop_cols, 1)
test_df  = test_df.drop(drop_cols, 1)

In [203]:
### 학습용 데이터인 y_train 은 train_df의 Close 컬럼으로 할당하고, train_df에서는 Close 컬럼을 삭제하세요. 
### 테스트 데이터인 y_test는 test_df 의 Close 컬럼으로 할당하고, test_df에서 Close 컬럼을 삭제하세요.

y_train = train_df['Close'].copy()
X_train = train_df.drop(['Close'], 1)

y_test  = test_df['Close'].copy()
X_test  = test_df.drop(['Close'], 1)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1335 entries, 30 to 1364
Data columns (total 5 columns):
EMA_9     1335 non-null float64
SMA_5     1335 non-null float64
SMA_10    1335 non-null float64
SMA_15    1335 non-null float64
SMA_30    1335 non-null float64
dtypes: float64(5)
memory usage: 62.6 KB


In [204]:
train_df.head()

Unnamed: 0,Close,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30
30,20.093,19.954861,19.6976,19.4897,19.6374,20.749333
31,20.315,19.969223,19.7682,19.5267,19.601267,20.716833
32,20.43,20.00503,19.9622,19.6292,19.580733,20.650233
33,20.533,20.048882,20.0862,19.7799,19.604867,20.579633
34,20.225,20.098679,20.2568,19.9197,19.712733,20.508067


### 모델링 
- GradientBoostingRegressor 를 이용하여 모델링하고, 
- GridSearch 기능을 사용해 최적의 하이퍼파라미터를 찾으세요
- 최적이 파라미터를 출력하고, train 의 best score를 출력하세요.

In [206]:
%%time
from sklearn.ensemble import GradientBoostingRegressor

parameters = {
    'n_estimators': [100, 300, 400],
    'learning_rate': [0.001, 0.01, 0.05],
    'max_depth': [10, 12, 15],
    'random_state': [42]
}

model = GradientBoostingRegressor()
clf = GridSearchCV(model, parameters)
clf.fit(X_train, y_train)

print(f'Best params: {clf.best_params_}')
print(f'Best validation score = {clf.best_score_}')

Best params: {'learning_rate': 0.05, 'max_depth': 15, 'n_estimators': 400, 'random_state': 42}
Best validation score = -0.8449760794903611


In [207]:
# 테스트 데이터의 점수(결정계수)를 출력해보세요.
clf.score(X_test, y_test)

0.8734087636494429