In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
# Plot styling for the whole notebook.
# 'Hancom Gothic' is presumably chosen so the Korean company names render;
# NOTE(review): the font must be installed on the machine running the notebook.
matplotlib.rc('font', family='Hancom Gothic')
plt.rcParams.update({
    # Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
    'axes.unicode_minus': False,
    # Font sizes for axis labels and tick labels.
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

In [2]:
# Load the constituent list (file is encoded in cp949, the Korean Windows code page).
kospi200 = pd.read_csv("./data/data.csv", encoding="cp949")
# Keep only the first two columns (stock code, company name). The explicit
# .copy() makes this an independent frame, so the column assignment below does
# not write into a slice of the full table (avoids SettingWithCopyWarning).
kospi200 = kospi200[kospi200.columns[:2]].copy()
# Codes are read as integers; zero-pad them to six-character strings.
kospi200["종목코드"] = kospi200["종목코드"].map('{:06d}'.format)
kospi200.head()

Unnamed: 0,종목코드,종목명
0,6840,AK홀딩스
1,27410,BGF
2,282330,BGF리테일
3,138930,BNK금융지주
4,1040,CJ


In [3]:
# Two-way lookup tables between company name and zero-padded stock code.
code_dict = dict(zip(kospi200["종목명"], kospi200["종목코드"]))  # name -> code
name_dict = dict(zip(kospi200["종목코드"], kospi200["종목명"]))  # code -> name

import pickle

# Load the cached price data. The following cells use raw_data as a dict that
# maps stock code -> DataFrame (they iterate .items() and read a `Close` column).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('raw_data_20180811.pickle', 'rb') as handle:
    raw_data = pickle.load(handle)

In [4]:
# Drop every stock whose price history has fewer than 252 rows
# (presumably about one year of trading days -- TODO confirm), then report
# the names and the number of stocks removed.
del_list = [key for key, df in raw_data.items() if len(df) < 252]
for code in del_list:
    raw_data.pop(code)
removed_names = [name_dict[code] for code in del_list]
print(removed_names, len(removed_names))

['BGF리테일', 'HDC현대산업개발', 'SK케미칼', '효성중공업'] 4


In [5]:
# Build the per-stock features and the binary target in a single pass.
# Every stock is processed independently, so one loop gives the same frames
# as running the rolling / label / dropna steps in separate loops.
for code in list(raw_data.keys()):
    prices = raw_data[code]
    close = prices.Close

    # 7-row rolling sum of Close and 3/5/10/15-row moving averages of Close.
    featured = (
        prices
        .assign(sum_close_7=close.rolling(window=7).sum())
        .assign(ma_3=close.rolling(window=3).mean())
        .assign(ma_5=close.rolling(window=5).mean())
        .assign(ma_10=close.rolling(window=10).mean())
        .assign(ma_15=close.rolling(window=15).mean())
    )

    # Row-over-row relative change of the 7-row sum; undefined values become 0.
    featured = featured.assign(
        pct_change7=featured.sum_close_7.pct_change().fillna(0)
    )
    # Target: 1 when the 7-row sum did not fall versus the previous row, else 0.
    featured = featured.assign(
        sum_7_Y=np.where(featured["pct_change7"].fillna(0) >= 0, 1, 0)
    )

    # Remove the warm-up rows where a rolling window is still incomplete (NaN).
    raw_data[code] = featured.dropna(axis=0)

In [6]:
# Per-stock 80/20 split by row position (no shuffling): the first 80% of the
# rows become the training set, the remaining rows the test set.
# NOTE(review): this is a chronological split only if each frame is sorted by
# date -- confirm against how raw_data was built.
train6 = {}
train_idx6 = {}
test6 = {}
test_idx6 = {}
for code, df in raw_data.items():
    # Compute the boundary once so all four slices are guaranteed to agree.
    split_at = int(len(df) * 0.8)
    head, tail = df.iloc[:split_at], df.iloc[split_at:]
    # Feature frames are explicit copies: a later cell assigns scaled values
    # into them, which must not write into (or warn about) slices of raw_data.
    train6[code] = head[["Open","ma_3","ma_5", "ma_10"]].copy()
    train_idx6[code] = head["sum_7_Y"]
    test6[code] = tail[["Open","ma_3","ma_5", "ma_10"]].copy()
    test_idx6[code] = tail["sum_7_Y"]

In [7]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns of every stock.
# The scaler is fitted on the TRAINING rows only and that same fit (training
# mean / std) is applied to the test rows. Re-fitting on the test set would
# scale test features with statistics the model never saw during training
# and leak test-set information into the evaluation.
std_scaler = StandardScaler()
li6 = ["Open","ma_3","ma_5", "ma_10"]
for code in raw_data.keys():
    # fit_transform re-fits the shared scaler for this stock's training data ...
    train6[code][li6] = std_scaler.fit_transform(train6[code][li6].values)
    # ... and transform reuses that fit for the matching test data.
    test6[code][li6] = std_scaler.transform(test6[code][li6].values)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Shared estimator instances; the cells below re-fit them for every stock.
# random_state is fixed where the estimator has a stochastic component so
# that Restart & Run All reproduces the same scores:
#  - LogisticRegression: only used by solvers that shuffle the data.
#  - DecisionTreeClassifier: features are permuted at each split, so ties
#    between equally good splits are otherwise broken at random.
# SVC with default settings (probability=False) does not use random_state.
log_clf = LogisticRegression(random_state=42)
DTC_clf = DecisionTreeClassifier(random_state=42)
svm_clf = SVC()

In [9]:
from sklearn.metrics import accuracy_score

# Hold-out accuracy of each classifier for each stock.
# Rows are collected in a plain list and the DataFrame is built once at the
# end: growing a frame row by row with .loc re-allocates on every insert and
# starts from an object-dtype empty frame, whereas a single construction
# yields a numeric table directly.
score_rows6 = []

for i in raw_data.keys():
    a6=[]
    X_train6 = train6[i].values
    y_train6 = train_idx6[i].values
    X_test6 = test6[i].values
    y_test6 = test_idx6[i].values
    # Order must match the column labels below: LR, DT, SVM.
    for clf in (log_clf, DTC_clf, svm_clf):
        clf.fit(X_train6, y_train6)
        y_pred6 = clf.predict(X_test6)
        a6.append(accuracy_score(y_test6, y_pred6))
    score_rows6.append(a6)

# One row per stock code, one column per classifier.
score_list6 = pd.DataFrame(score_rows6,
                           index=list(raw_data.keys()),
                           columns=['LR','DT','SVM'])

In [10]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Cross-validated diagnostics on the TRAINING split of every stock.
# CM: one row per stock, one column per classifier (filled in the order
#     log_clf, DTC_clf, svm_clf -> 'LR','DT','SVM').
# roc_auc_score: one row per stock; columns 'LR','SVM','DT' match the order in
#     which b2 is filled below (score1, score2, score3).
# NOTE(review): this frame has the same name as sklearn.metrics.roc_auc_score;
#     importing that function later in the notebook would replace this table.
CM = pd.DataFrame(columns=('LR','DT','SVM'))
roc_auc_score = pd.DataFrame(columns=('LR','SVM','DT'))
# Labels and scores pooled over all stocks (extended on every iteration).
y_train_5 = np.array([])
y_scores1 = np.array([])
y_scores2 = np.array([])
y_scores3 = np.array([])

for i in raw_data.keys():
    # b1 collects confusion matrices, b2 collects AUC values for this stock.
    b1=[]
    b2=[]
    X_train = train6[i].values
    y_train = train_idx6[i].values
    # X_test / y_test are assigned here but not used anywhere in this cell.
    X_test = test6[i].values
    y_test = test_idx6[i].values
    # Out-of-fold scores from 3-fold CV: decision_function for logistic
    # regression and SVM, class probabilities for the decision tree.
    score1=cross_val_predict(log_clf, X_train, y_train, cv=3,method="decision_function")
    score2=cross_val_predict(svm_clf, X_train, y_train, cv=3,method="decision_function")
    score3=cross_val_predict(DTC_clf, X_train, y_train, cv=3,method="predict_proba")
    # AUC per classifier; each value is wrapped in a one-element list before
    # being stored, so every appended entry of b2 is a list, not a bare float.
    fpr, tpr, thresholds = roc_curve(y_train, score1)
    b2.append([auc(fpr, tpr)])
    fpr, tpr, thresholds = roc_curve(y_train, score2)
    b2.append([auc(fpr, tpr)])
    # Column 1 of predict_proba is used as the tree's score -- presumably the
    # probability of label 1, assuming both labels occur in every training fold.
    fpr, tpr, thresholds = roc_curve(y_train, score3[:,1])
    b2.append([auc(fpr, tpr)])
    # Confusion matrices from a second round of 3-fold CV predictions
    # (each classifier is therefore cross-validated twice per stock).
    for clf in (log_clf, DTC_clf, svm_clf):
        y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
        # Wrapped in a list so each matrix is stored as a single entry.
        b1.append([confusion_matrix(y_train, y_train_pred)])
    CM.loc[i] = b1
    roc_auc_score.loc[i] = b2
    # Append this stock's labels and scores to the pooled arrays
    # (the pooled arrays start as empty float arrays).
    y_train_5 = np.concatenate((y_train_5,y_train),axis=0)
    y_scores1 = np.concatenate((y_scores1,score1),axis=0)
    y_scores2 = np.concatenate((y_scores2,score2),axis=0)
    y_scores3 = np.concatenate((y_scores3,score3[:,1]),axis=0)

In [15]:
# Inspect the logistic-regression decision scores left over from the final
# iteration of the loop above, i.e. for the last stock processed only.
score1


array([-0.16855581, -0.08172658,  1.02473578, ...,  1.34734406,
        1.00228911,  1.01792859])