In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn import svm
import seaborn as sns; sns.set()

df = pd.read_csv('../data/ETFs_main.csv')

In [2]:
"""기술지표: 이동평균, 거래량 이동 평균, rsi 등"""

def moving_average(df, n):
    MA = pd.Series(df['CLOSE_SPY'].rolling(n, min_periods=n).mean(), name = 'MA_'+str(n))
    df = df.join(MA)
    return df

def volume_moving_average(df, n):
    VMA = pd.Series(df['VOLUME'].rolling(n, min_periods=n).mean(), name = 'VMA_'+str(n))
    df = df.join(VMA)
    return df

def relative_strength_index(df, n):
    i = 0
    UpI = [0]
    DoI = [0]
    while i+1 <= df.index[-1]:
        UpMove = df.loc[i+1, 'HIGH'] - df.loc[i, 'HIGH']
        DoMove = df.loc[i, 'LOW'] - df.loc[i+1, 'LOW']
        if UpMove > DoMove and UpMove > 0:
            UpD = UpMove
        else:
            UpD = 0
        UpI.append(UpD)
        if DoMove > UpMove and DoMove > 0:
            DoD = DoMove
        else:
            DoD = 0
        DoI.append(DoD)
        i = i + 1
    UpI = pd.Series(UpI)
    DoI = pd.Series(DoI)
    PosDI = pd.Series(UpI.ewm(span=n, min_periods=n).mean())   # 지수 이동 평균
    NegDI = pd.Series(DoI.ewm(span=n, min_periods=n).mean())
    RSI = pd.Series(PosDI / (PosDI + NegDI), name = 'RSI_'+str(n))
    df = df.join(RSI)
    return df

In [3]:
"""지표 추가, 인덱스 설정, 결측치 제거, 일별 수익률 계산"""

df = moving_average(df, 45)
df = volume_moving_average(df, 45)
df = relative_strength_index(df, 14)

df = df.set_index('Dates')
df = df.dropna()

df['target'] = df['CLOSE_SPY'].pct_change()
df['target'] = np.where(df['target'] > 0, 1, -1)   # 오른 날은 1, 내린 날은 -1로 만들어줌
# df['target'] = df.apply(lambda x: 1 if df['target'] > 0 else -1, axis=1)

df['target'] = df['target'].shift(-1)   # 당일까지의 데이터로 다음날을 예측해야 하기 때문에 한칸씩 앞으로 당겨줌
df = df.dropna()

df['target'] = df['target'].astype(np.int64)   # 정수 처리
y_var = df['target']
x_var = df.drop(['target', 'OPEN', 'HIGH', 'LOW', 'VOLUME', 'CLOSE_SPY'], axis=1)
x_var.head()

Unnamed: 0_level_0,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-04-27,67.56,83.73,2.4474,0.55,51.84,24.54,12.45,41.75,143.551556,110669600.0,0.670018
2007-04-30,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,111646600.0,0.531751
2007-05-02,66.66,83.38,2.4366,0.59,49.59,24.66,13.08,42.02,143.680667,112161300.0,0.55405
2007-05-03,67.49,83.11,2.4346,0.6,49.28,24.69,13.09,42.435,143.780222,112342100.0,0.601028
2007-05-04,68.19,83.23,2.4006,0.6,48.3,24.6,12.91,42.595,143.905111,112885300.0,0.665987


In [4]:
"""훈련셋, 테스트셋 나누기"""

# 기간이 섞이면 안되니 순차적으로 split할 수 있게 shuffle=False로 둔다
X_train, X_test, y_train, y_test = train_test_split(x_var, y_var, test_size=0.3, shuffle=False, random_state=3)

# 양성, 음성 샘플 비율 
train_count = y_train.count()
test_count = y_test.count()

print('train set label ratio')
print(y_train.value_counts() / train_count)
print('test set label ratio')
print(y_test.value_counts() / test_count)

train set label ratio
 1    0.543501
-1    0.456499
Name: target, dtype: float64
test set label ratio
 1    0.530562
-1    0.469438
Name: target, dtype: float64


In [5]:
"""혼동 행렬을 계산하는 함수(오차 행렬)"""

def get_confusion_matrix(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc_score = roc_auc_score(y_test, pred)
    print('confusion matrix')
    print('accuracy: {}, precision: {}, recall: {}, F1: {}, ROC AUC score: {}'.format(accuracy, precision, recall, f1, roc_score) )


In [6]:
# xgboost 분류기를 활용하여 모델 만들기
xgb_dis = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb_dis.fit(X_train, y_train)
xgb_pred = xgb_dis.predict(X_test)
print(xgb_dis.score(X_train, y_train))
get_confusion_matrix(y_test, xgb_pred)

0.8763102725366876
confusion matrix
accuracy: 0.49144254278728605, precision: 0.5241935483870968, recall: 0.44930875576036866, F1: 0.4838709677419355, ROC AUC score: 0.4941856278801844


In [7]:
# Grid Search를 통해 다양한 파라미터들을 테스트한다
n_estimators = range(10, 30, 10)
params = {
    'bootstrap': [True],
    'n_estimators': n_estimators,
    'max_depth': [4,6,8],
    'min_samples_leaf': [2,3],
    'min_samples_split': [2,4,6],
    'max_features': [4]
}

my_cv = TimeSeriesSplit(n_splits=5).split(X_train)
clf = GridSearchCV(RandomForestClassifier(), params, cv=my_cv, n_jobs=-1)
clf.fit(X_train, y_train)

print('best parameter: ', clf.best_params_)   # 최적의 파라미터
print('best prediction: ', clf.best_score_)   # 가장 높은 정확도

# 테스트셋에서의 결과
pred_con = clf.predict(X_test)
get_confusion_matrix(y_test, pred_con)

best parameter:  {'bootstrap': True, 'max_depth': 4, 'max_features': 4, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 20}
best prediction:  0.5616352201257863
confusion matrix
accuracy: 0.4743276283618582, precision: 0.5135135135135135, recall: 0.17511520737327188, F1: 0.26116838487972505, ROC AUC score: 0.4938076036866359


In [11]:
"""0.05% 이상의 수익률을 얻었을 때만 상승했다고 보고 'target'값을 업데이트 한다."""

df_new = pd.read_csv('../data/ETFs_main.csv')
df_new = moving_average(df_new, 45)
df_new = volume_moving_average(df_new, 45)
df_new = relative_strength_index(df_new, 14)
df_new = df_new.set_index('Dates')
df_new = df_new.dropna()

df_new['target'] = df_new['CLOSE_SPY'].pct_change()
df_new['target'] = np.where(df_new['target'] > 0.0005, 1, -1)   


df_new['target'] = df_new['target'].shift(-1)   
df_new = df_new.dropna()
df_new['target'].value_counts()

 1.0    1375
-1.0    1351
Name: target, dtype: int64

In [14]:
# 그리고 다시 모델 훈련
df_new['target'] = df_new['target'].astype(np.int64) 
y_var = df_new['target']
x_var = df_new.drop(['target', 'OPEN', 'HIGH', 'LOW', 'VOLUME', 'CLOSE_SPY'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(x_var, y_var, test_size=0.3, shuffle=False, random_state=3)

my_cv = TimeSeriesSplit(n_splits=5).split(X_train)
clf = GridSearchCV(RandomForestClassifier(), params, cv=my_cv, n_jobs=-1)
clf.fit(X_train, y_train)
print('best parameter: ', clf.best_params_)  
print('best prediction: ', clf.best_score_)   

# 테스트셋에서의 결과
pred_con = clf.predict(X_test)
get_confusion_matrix(y_test, pred_con)

best parameter:  {'bootstrap': True, 'max_depth': 4, 'max_features': 4, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
best prediction:  0.5333333333333333
confusion matrix
accuracy: 0.5122249388753056, precision: 0.5095541401273885, recall: 0.19900497512437812, F1: 0.2862254025044723, ROC AUC score: 0.5069544106391122
