In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rc('font', family='Hancom Gothic')
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# KOSPI200 불러오기 & 데이터셋 생성

In [72]:
import pickle
with open('raw_data_20180811.pickle','rb') as handle:

    raw_data = pickle.load(handle)
    
kospi200 = pd.read_excel("C:\\Users\\axasd\\Downloads\\상장법인목록.xlsx")
kospi200 = kospi200[kospi200.columns[:2]]
kospi200["종목코드"] = kospi200["종목코드"].map('{:06d}'.format)

code_dict = {
    i : j for i, j in zip(kospi200["회사명"], kospi200["종목코드"])
}

In [73]:
for i in raw_data.keys():
    raw_data[i] = raw_data[i].assign(sum_close_7 = raw_data[i].Close.rolling(window =7).sum())
    raw_data[i] = raw_data[i].assign(ma_3 = raw_data[i].Close.rolling(window = 3).mean())
    raw_data[i] = raw_data[i].assign(ma_5 = raw_data[i].Close.rolling(window = 5).mean())
    raw_data[i] = raw_data[i].assign(ma_10 = raw_data[i].Close.rolling(window = 10).mean())
    raw_data[i] = raw_data[i].assign(ma_15 = raw_data[i].Close.rolling(window = 15).mean())

for i in raw_data.keys():
    raw_data[i] = raw_data[i].assign(pct_change7 = raw_data[i].sum_close_7.pct_change().fillna(0))
    raw_data[i] = raw_data[i].assign(sum_7_Y = np.where(raw_data[i]["pct_change7"].fillna(0)>=0, 1,0))
    
for i in raw_data.keys():
    raw_data[i] = raw_data[i].dropna(axis=0)

In [74]:
copy_keys = kospi200["종목코드"]
for i in copy_keys:
    if (len(raw_data[i]) < 252):
        del raw_data[i]

# 데이터 셋 : oepn, MA3, MA5, MA10
# Y : 7일 종가 평균 등락

### Train, Test Split

In [75]:
train6 = {}
train_idx6 = {}
test6 = {}
test_idx6 = {}
for code, df in raw_data.items():
    train6[code] = df.iloc[:int(len(df)*0.8)][["Open","ma_3","ma_5", "ma_10"]]
    train_idx6[code] = df.iloc[:int(len(df)*0.8)]["sum_7_Y"]
    test6[code] = df.iloc[int(len(df)*0.8):][["Open","ma_3","ma_5", "ma_10"]]
    test_idx6[code] = df.iloc[int(len(df)*0.8):]["sum_7_Y"]

### Scale 조정

In [76]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
li6 = ["Open","ma_3","ma_5", "ma_10"]
for code in raw_data.keys():
    train6[code][li6] = std_scaler.fit_transform(train6[code][li6].values)
    test6[code][li6] = std_scaler.fit_transform(test6[code][li6].values)

### model 호출

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

log_clf = LogisticRegression()
DTC_clf = DecisionTreeClassifier()
svm_clf = SVC()

### Predict

In [78]:
from sklearn.metrics import accuracy_score

score_list6 = pd.DataFrame(columns=('LR','DT','SVM'))


for i in raw_data.keys():
    a6=[]
    X_train6 = train6[i].values
    y_train6 = train_idx6[i].values
    X_test6 = test6[i].values
    y_test6 = test_idx6[i].values
    for clf in (log_clf, DTC_clf, svm_clf):
        clf.fit(X_train6, y_train6)
        y_pred6 = clf.predict(X_test6)
        a6.append(accuracy_score(y_test6, y_pred6))
    score_list6.loc[i] = a6

In [79]:
score_list6.ix["042660"]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


LR     0.428571
DT     0.763975
SVM    0.701863
Name: 042660, dtype: float64

In [42]:
X_train6 = train6["042660"].values
y_train6 = train_idx6["042660"].values
X_test6 = test6["042660"].values
y_test6 = test_idx6["042660"].values

# 대우조선해양 그리드서치(Decision Tree)

In [43]:
from sklearn.model_selection import GridSearchCV

params = {'max_leaf_nodes': list(range(2, 10)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(), params, n_jobs=-1, verbose=1)

grid_search_cv.fit(train6["042660"].values, train_idx6["042660"].values)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  57 out of  72 | elapsed:    4.1s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    4.1s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [44]:
grid_search_cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=8, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [45]:
tree_clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=8, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
tree_clf.fit(train6["042660"].values, train_idx6["042660"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=8, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [46]:
y_pred6 = tree_clf.predict(test6["042660"].values)
accuracy_score(test_idx6["042660"].values, y_pred6)

0.7080745341614907

# 대우조선해양 그리드서치(SVM)

In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [
    {'clf__C': param_range, 'clf__kernel': ['linear']},
    {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}]

grid_search_cv = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=1)

In [64]:
grid_search_cv.fit(train6["042660"].values, train_idx6["042660"].values)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'clf__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], 'clf__kernel': ['linear']}, {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], 'clf__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], 'clf__kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [65]:
grid_search_cv.best_estimator_

Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False))])

In [66]:
svm_clf = Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False))])
svm_clf.fit(train6["042660"].values, train_idx6["042660"])

Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False))])

In [67]:
y_pred6 = svm_clf.predict(test6["042660"].values)
accuracy_score(test_idx6["042660"].values, y_pred6)

0.9161490683229814