In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
matplotlib.rcParams['axes.unicode_minus']=False

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

kospi200 = pd.read_csv("./data/data.csv", encoding="cp949")
kospi200 = kospi200[kospi200.columns[:2]]
kospi200["종목코드"] = kospi200["종목코드"].map('{:06d}'.format)

name_dict = { i : j for i, j in zip(kospi200["종목명"], kospi200["종목코드"]) }
code_dict = { j : i for i, j in zip(kospi200["종목명"], kospi200["종목코드"]) }

import pickle
with open('raw_data_20180811.pickle','rb') as handle:
    raw_data = pickle.load(handle)
    
copy_keys = kospi200["종목코드"]

for i in copy_keys:
    if (len(raw_data[i]) < 504):
        del raw_data[i]
    elif (raw_data[i].duplicated().sum() > 20):
        del raw_data[i]

In [4]:
raw_data['005930'].head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01-02,21400.0,21600.0,21300.0,21600.0,21.233131,13151350
2012-01-03,21860.0,22100.0,21840.0,22100.0,21.724644,16927750
2012-01-04,22100.0,22200.0,21500.0,21600.0,21.233131,17103700
2012-01-05,21460.0,21580.0,21100.0,21100.0,20.741627,17298400
2012-01-06,21120.0,21320.0,20600.0,20800.0,20.446722,18816250


In [8]:
def train_test_split(df, train_ratio, X, y, random = False, dtrain = False):
    """
    Argument
    df : DataFrame object
    train_ratio : float, in range(0,1)
    X : sequence data, such as list, tuple (Train Features)
    y : str
    random : boolean, you need "numpy" library cause we use np.random.permutation function.
    
    Return
    train : 학습시킬 Feature data
    train_idx : 지도학습의 Y value
    test : test 검증할 Feature data
    test_idx : test set의 Y value
    """
    train_size = int(len(df)*train_ratio)
    if random:
        shuffle_indicies = np.random.permutation(len(df))
        train_indicies = shuffle_indicies[:train_size]
        test_indicies = shuffle_indicies[train_size:]
    else:
        normal_indicies = np.arange(len(df))
        train_indicies = normal_indicies[:train_size]
        test_indicies = normal_indicies[train_size:]
    if dtrain:
        train = df.iloc[:train_size]
        train_idx = df.iloc[:train_size]
        test = df.iloc[train_size:]
        test_idx = df.iloc[train_size:]
    else:
        train = df.iloc[:train_size][X]
        train_idx = df.iloc[:train_size][y]
        test = df.iloc[train_size:][X]
        test_idx = df.iloc[train_size:][y]
    return train, train_idx, test, test_idx

In [5]:
for i in raw_data.keys():
    raw_data[i] = raw_data[i].assign(pct_change = raw_data[i].Close.shift(1).pct_change().fillna(0))
    raw_data[i] = raw_data[i].assign(pct_change_real = raw_data[i].Close.pct_change().fillna(0)) #.shift(1)
    raw_data[i] = raw_data[i].assign(his_vol_7 = raw_data[i]["pct_change"].rolling(window = 7).std()*(252**0.5))
    raw_data[i] = raw_data[i].assign(his_vol_21 = raw_data[i]["pct_change"].rolling(window = 21).std()*(252**0.5))
    raw_data[i] = raw_data[i].assign(his_vol_30 = raw_data[i]["pct_change"].rolling(window = 30).std()*(252**0.5))
    raw_data[i] = raw_data[i].assign(close_ma_3 = raw_data[i].Close.shift(1).rolling(window =3).mean())
    raw_data[i] = raw_data[i].assign(close_ma_10 = raw_data[i].Close.shift(1).rolling(window =10).mean())
    raw_data[i] = raw_data[i].assign(volume_ma_3 = raw_data[i].Volume.shift(1).rolling(window =3).mean())
    raw_data[i] = raw_data[i].assign(sh_low = raw_data[i].Low.shift(1))
    raw_data[i] = raw_data[i].assign(sh_high = raw_data[i].High.shift(1))
    raw_data[i] = raw_data[i].assign(sh_adjclose = raw_data[i]['Adj Close'].shift(1))
    raw_data[i] = raw_data[i].assign(sh_volume = raw_data[i].Volume.shift(1))
for i in raw_data.keys():
    raw_data[i] = raw_data[i].assign(Y1 = raw_data[i].Close - raw_data[i].Close.rolling(window =7).mean().shift(1))
    raw_data[i] = raw_data[i].assign(Y2 = raw_data[i]["pct_change_real"] - raw_data[i]["pct_change"].rolling(window = 7).mean().shift(1))
    raw_data[i] = raw_data[i].assign(Y3 = raw_data[i].Close.rolling(window =7).mean() - raw_data[i].Close.rolling(window =7).mean().shift(1))
for i in raw_data.keys():
    raw_data[i] = raw_data[i].assign(Y1 = np.where(raw_data[i].Y1>=0, 1, 0))
    raw_data[i] = raw_data[i].assign(Y2 = np.where(raw_data[i].Y2>=0, 1, 0))
    raw_data[i] = raw_data[i].assign(Y3 = np.where(raw_data[i].Y3>=0, 1, 0))
for i in raw_data.keys():
    raw_data[i] = raw_data[i].dropna(axis=0)

In [10]:
predictors=['Open', 'pct_change', 'his_vol_7', 'his_vol_21', 'close_ma_3', 'close_ma_10']
target='Y3'

In [11]:
train = {}
train_y = {}
test = {}
test_y = {}
for code, df in raw_data.items():
    X_train, y_train, X_test, y_test = train_test_split(df, 0.8, predictors, target, dtrain=True)
    train[code] = X_train
    train_y[code] = y_train
    test[code] = X_test
    test_y[code] = y_test

In [16]:
train_scale = {}
train_y_scale = {}
test_scale = {}
test_y_scale = {}
for code, df in raw_data.items():
    X_train, y_train, X_test, y_test = train_test_split(df, 0.8, predictors, target, dtrain=True)
    train_scale[code] = X_train
    test_scale[code] = X_test

In [17]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
li = ["Open","pct_change","his_vol_7","his_vol_21","close_ma_3","close_ma_10"]
for code in raw_data.keys():
    train_scale[code][li] = std_scaler.fit_transform(train_scale[code][li].values)
    test_scale[code][li] = std_scaler.fit_transform(test_scale[code][li].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [19]:
from xgboost.sklearn import XGBClassifier as XGBClassifier_wrapper
from xgboost import XGBClassifier

xgb_clf = XGBClassifier_wrapper(random_state=42)
xgb_clf2 = XGBClassifier(random_state=42)

xgb_clf_scale = XGBClassifier_wrapper(random_state=42)
xgb_clf2_scale = XGBClassifier(random_state=42)

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, roc_curve, auc
from sklearn import metrics 
from sklearn.model_selection import cross_val_predict

In [32]:
import warnings
warnings.simplefilter('ignore', DeprecationWarning)

def TestAccuracy(train, test, predictors, target, alg):
    score_set = pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'auc', 'f1'])
    for i in raw_data.keys():
        X_train = train[i][predictors].values
        y_train = train[i][target].values
        X_test = test[i][predictors].values
        y_test = test[i][target].values
        a=[]
        alg.fit(X_train, y_train)
        y_pred = alg.predict(X_test)
        a.append(metrics.accuracy_score(y_test, y_pred))
        value_1=cross_val_predict(alg, X_train, y_train, cv=5,method="predict_proba")
        a.append(metrics.precision_score(y_test, y_pred))
        a.append(metrics.recall_score(y_test, y_pred))
        fpr, tpr, thresholds = metrics.roc_curve(y_train, value_1[:,1])
        a.append(metrics.auc(fpr, tpr))
        a.append(metrics.f1_score(y_test,y_pred).mean())
        score_set.loc[i] = a
    return score_set

In [34]:
result1 = TestAccuracy(train, test, predictors, target, xgb_clf)
result2 = TestAccuracy(train, test, predictors, target, xgb_clf2)
result3 = TestAccuracy(train_scale, test_scale, predictors, target, xgb_clf_scale)
result4 = TestAccuracy(train_scale, test_scale, predictors, target, xgb_clf2_scale)

In [35]:
result1.describe()

Unnamed: 0,accuracy,precision,recall,auc,f1
count,177.0,177.0,177.0,177.0,177.0
mean,0.731021,0.733255,0.737756,0.671821,0.727223
std,0.054106,0.098052,0.091045,0.095314,0.059488
min,0.548589,0.433333,0.5,0.368863,0.54251
25%,0.695925,0.669421,0.682635,0.621566,0.686869
50%,0.736677,0.751381,0.746753,0.684007,0.735905
75%,0.768025,0.802632,0.804054,0.734307,0.768769
max,0.84953,0.9375,0.953488,0.866311,0.855422


In [36]:
result2.describe()

Unnamed: 0,accuracy,precision,recall,auc,f1
count,177.0,177.0,177.0,177.0,177.0
mean,0.731021,0.733255,0.737756,0.671821,0.727223
std,0.054106,0.098052,0.091045,0.095314,0.059488
min,0.548589,0.433333,0.5,0.368863,0.54251
25%,0.695925,0.669421,0.682635,0.621566,0.686869
50%,0.736677,0.751381,0.746753,0.684007,0.735905
75%,0.768025,0.802632,0.804054,0.734307,0.768769
max,0.84953,0.9375,0.953488,0.866311,0.855422


In [37]:
result3.describe()

Unnamed: 0,accuracy,precision,recall,auc,f1
count,177.0,177.0,177.0,177.0,177.0
mean,0.731021,0.733255,0.737756,0.671821,0.727223
std,0.054106,0.098052,0.091045,0.095314,0.059488
min,0.548589,0.433333,0.5,0.368863,0.54251
25%,0.695925,0.669421,0.682635,0.621566,0.686869
50%,0.736677,0.751381,0.746753,0.684007,0.735905
75%,0.768025,0.802632,0.804054,0.734307,0.768769
max,0.84953,0.9375,0.953488,0.866311,0.855422


In [38]:
result4.describe()

Unnamed: 0,accuracy,precision,recall,auc,f1
count,177.0,177.0,177.0,177.0,177.0
mean,0.731021,0.733255,0.737756,0.671821,0.727223
std,0.054106,0.098052,0.091045,0.095314,0.059488
min,0.548589,0.433333,0.5,0.368863,0.54251
25%,0.695925,0.669421,0.682635,0.621566,0.686869
50%,0.736677,0.751381,0.746753,0.684007,0.735905
75%,0.768025,0.802632,0.804054,0.734307,0.768769
max,0.84953,0.9375,0.953488,0.866311,0.855422


In [39]:
train_scale['005930'][predictors]

Unnamed: 0_level_0,Open,pct_change,his_vol_7,his_vol_21,close_ma_3,close_ma_10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-02-14,-1.547199,1.146068,-0.095740,0.043078,-1.585118,-1.576897
2012-02-15,-1.474475,-0.206907,-0.269062,0.047912,-1.592061,-1.591281
2012-02-16,-1.308249,3.015637,1.264249,0.762877,-1.465349,-1.561447
2012-02-17,-1.157608,-0.040662,1.223504,0.643058,-1.375088,-1.532144
2012-02-20,-1.058911,2.127267,1.613200,0.940932,-1.208453,-1.473540
2012-02-21,-1.074495,-0.091694,1.504042,0.911314,-1.139022,-1.419731
2012-02-22,-1.038133,0.214720,0.869090,0.511815,-1.060912,-1.373380
2012-02-23,-1.084884,0.823955,0.854185,0.305342,-1.024460,-1.317440
2012-02-24,-1.188775,-1.895749,1.808175,0.686823,-1.050497,-1.276949
2012-02-27,-1.079689,0.994073,0.793957,0.752897,-1.050497,-1.214083


In [41]:
trial = 0
for code in raw_data.keys():
    trial += 1
    """
    X
    """
    raw_data[code]["pct_change"] = raw_data[code].Close.pct_change().shift(1)
    for i in range(3, 31):
        raw_data[code]["close_ma_{}".format(i)] = raw_data[code].Close.rolling(window = i).mean().shift(1)
        raw_data[code]["volume_ma_{}".format(i)] =  raw_data[code].Volume.rolling(window = i).mean().shift(1)
    for i in [7, 21, 30, 60, 90, 91, 100, 120]:
        raw_data[code]["his_vol_{}".format(i)] = raw_data[code]["pct_change"].fillna(0).rolling(window = i).std()*(252**0.5)
    """
    Y
    """
    
    raw_data[code] = raw_data[code].assign(Y1 = raw_data[code].Close - raw_data[code].close_ma_7.shift(1))
    raw_data[code] = raw_data[code].assign(Y2 =
                                           raw_data[code]["pct_change"] - 
                                           raw_data[code]["pct_change"].rolling(window = 7).mean().shift(1))
    raw_data[code] = raw_data[code].assign(Y3 = raw_data[code].Close.rolling(window=7).mean() - 
                                           raw_data[code].Close.shift(1).rolling(window=7).mean())
    
    raw_data[code] = raw_data[code].assign(Y1 = np.where(raw_data[code].Y1>=0, 1, 0))
    raw_data[code] = raw_data[code].assign(Y2 = np.where(raw_data[code].Y2>=0, 1, 0))
    raw_data[code] = raw_data[code].assign(Y3 = np.where(raw_data[code].Y3>=0, 1, 0))
    """
    Drop NaN
    """
    raw_data[code] = raw_data[code].dropna(axis=0)
    print(trial)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177


In [42]:
train = {}
train_y = {}
test = {}
test_y = {}
for code, df in raw_data.items():
    X_train, y_train, X_test, y_test = train_test_split(df, 0.8, predictors, target, dtrain=True)
    train[code] = X_train
    train_y[code] = y_train
    test[code] = X_test
    test_y[code] = y_test

In [43]:
train_scale = {}
train_y_scale = {}
test_scale = {}
test_y_scale = {}
for code, df in raw_data.items():
    X_train, y_train, X_test, y_test = train_test_split(df, 0.8, predictors, target, dtrain=True)
    train_scale[code] = X_train
    test_scale[code] = X_test

In [44]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
li = ["Open","pct_change","his_vol_7","his_vol_21","close_ma_3","close_ma_10"]
for code in raw_data.keys():
    train_scale[code][li] = std_scaler.fit_transform(train_scale[code][li].values)
    test_scale[code][li] = std_scaler.fit_transform(test_scale[code][li].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [45]:
from xgboost.sklearn import XGBClassifier as XGBClassifier_wrapper
from xgboost import XGBClassifier

xgb_clf = XGBClassifier_wrapper(random_state=42)
xgb_clf2 = XGBClassifier(random_state=42)

xgb_clf_scale = XGBClassifier_wrapper(random_state=42)
xgb_clf2_scale = XGBClassifier(random_state=42)

In [46]:
result1b = TestAccuracy(train, test, predictors, target, xgb_clf)
result2b = TestAccuracy(train, test, predictors, target, xgb_clf2)
result3b = TestAccuracy(train_scale, test_scale, predictors, target, xgb_clf_scale)
result4b = TestAccuracy(train_scale, test_scale, predictors, target, xgb_clf2_scale)

In [47]:
result1b.describe()

Unnamed: 0,accuracy,precision,recall,auc,f1
count,177.0,177.0,177.0,177.0,177.0
mean,0.730869,0.723953,0.752776,0.662011,0.729516
std,0.058622,0.101416,0.090733,0.100549,0.063293
min,0.491694,0.4,0.427711,0.393748,0.529231
25%,0.697674,0.666667,0.70073,0.608651,0.687307
50%,0.734219,0.741722,0.764706,0.684688,0.742138
75%,0.774086,0.801653,0.811594,0.723572,0.777108
max,0.850498,0.92,0.971014,0.85589,0.83871


In [48]:
result1b.describe()

Unnamed: 0,accuracy,precision,recall,auc,f1
count,177.0,177.0,177.0,177.0,177.0
mean,0.730869,0.723953,0.752776,0.662011,0.729516
std,0.058622,0.101416,0.090733,0.100549,0.063293
min,0.491694,0.4,0.427711,0.393748,0.529231
25%,0.697674,0.666667,0.70073,0.608651,0.687307
50%,0.734219,0.741722,0.764706,0.684688,0.742138
75%,0.774086,0.801653,0.811594,0.723572,0.777108
max,0.850498,0.92,0.971014,0.85589,0.83871


In [49]:
result1b.describe()

Unnamed: 0,accuracy,precision,recall,auc,f1
count,177.0,177.0,177.0,177.0,177.0
mean,0.730869,0.723953,0.752776,0.662011,0.729516
std,0.058622,0.101416,0.090733,0.100549,0.063293
min,0.491694,0.4,0.427711,0.393748,0.529231
25%,0.697674,0.666667,0.70073,0.608651,0.687307
50%,0.734219,0.741722,0.764706,0.684688,0.742138
75%,0.774086,0.801653,0.811594,0.723572,0.777108
max,0.850498,0.92,0.971014,0.85589,0.83871


In [50]:
result1b.describe()

Unnamed: 0,accuracy,precision,recall,auc,f1
count,177.0,177.0,177.0,177.0,177.0
mean,0.730869,0.723953,0.752776,0.662011,0.729516
std,0.058622,0.101416,0.090733,0.100549,0.063293
min,0.491694,0.4,0.427711,0.393748,0.529231
25%,0.697674,0.666667,0.70073,0.608651,0.687307
50%,0.734219,0.741722,0.764706,0.684688,0.742138
75%,0.774086,0.801653,0.811594,0.723572,0.777108
max,0.850498,0.92,0.971014,0.85589,0.83871


# 결론
- 데이터 차이이다.