In [2]:
# import library
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Configure a font that can render Korean (Hangul) text in figures.
import os
from matplotlib import font_manager, rc

# Malgun Gothic ships with Windows only. Asking matplotlib for the name of a
# font file that does not exist raises, so switch fonts only when it is there.
KOREAN_FONT_PATH = 'c:/Windows/Fonts/malgun.ttf'
if os.path.exists(KOREAN_FONT_PATH):
    font_name = font_manager.FontProperties(fname=KOREAN_FONT_PATH).get_name()
    rc('font', family=font_name)
else:
    font_name = None
    print('Korean font not found at {}; keeping the default font.'.format(KOREAN_FONT_PATH))

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Korean output: alternative font for machines that have it installed.
# plt.rcParams['font.family'] = 'NanumBarunGothic'
# Draw the minus sign as ASCII '-' rather than the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# main.py
from data_manager import *

if __name__ == '__main__':
    # Planned pipeline (not implemented yet):
    #   1. Read the stock price data
    #   2. Preprocess the data
    #   3. Generate features
    #   4. Make targets
    #   5. Train the model
    #   6. Invest according to the predictions and record the returns
    #   7. Print the final portfolio return
    pass  # placeholder: a block that contains only comments is a syntax error

In [1]:
import pickle
# Load the pickled price data from the working directory into `raw_data`.
# NOTE(review): presumably a dict of stock name -> price DataFrame, since later
# cells call raw_data.items() and read data.index[0].year — confirm.
# pickle.load can execute arbitrary code: only open files from a trusted source.
with open('raw_data_20190115.pickle', 'rb') as handle:
    raw_data = pickle.load(handle)

In [21]:
# Collect (and print) every stock whose first record falls after January 2013.
# Year and month are packed into one YYYYMM integer for the comparison.
del_stock = []
for stock_name, data in raw_data.items():
    first_date = data.index[0]
    first_yyyymm = first_date.year * 100 + first_date.month
    if first_yyyymm > 201301:
        print(stock_name)
        del_stock.append(stock_name)

BGF
BGF리테일
HDC현대산업개발
LIG넥스원
SK케미칼
넷마블
동아에스티
두산밥캣
만도
삼성물산
삼성바이오로직스
삼성에스디에스
오리온
이노션
종근당
코스맥스
쿠쿠홀딩스
현대로템
현대중공업지주
효성중공업


In [22]:
# Remove the stocks collected in del_stock from raw_data.
# pop(..., None) keeps the cell re-runnable: a plain `del` raises KeyError the
# second time because the key is already gone.
for stock_name in del_stock:
    raw_data.pop(stock_name, None)

In [24]:
# Number of stocks left in raw_data.
len(raw_data)

182

In [33]:
# Split every stock's history into a training window and a hold-out window.
# .loc label slices include BOTH endpoints, so the training slice stops the day
# before the test slice starts; otherwise a 2018-01-01 row would be in both sets.
train, test = {}, {}
for stock_name, prices in raw_data.items():
    train[stock_name] = prices.loc['2013-01-01':'2017-12-31']
    test[stock_name] = prices.loc['2018-01-01':]

In [73]:
# get_technical_indicator.py
def get_MACD(df, close='Adj Close', short_ma=12, long_ma=26, signal_period=9):
    """Add MACD, signal-line and histogram columns to ``df``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing the price column ``close``.
        close: Name of the price column.
        short_ma: Span of the fast exponential moving average.
        long_ma: Span of the slow exponential moving average.
        signal_period: Span of the EMA applied to the MACD line.

    Returns:
        ``df`` with 'MACD' (fast EMA - slow EMA), 'MACDSignal' (EMA of MACD)
        and 'MACDDiff' (MACD - signal). Every EMA uses
        ``min_periods = span - 1``, so earlier rows are NaN.
    """
    price = df[close]
    fast_ema = price.ewm(span=short_ma, min_periods=short_ma - 1).mean()
    slow_ema = price.ewm(span=long_ma, min_periods=long_ma - 1).mean()

    macd_line = fast_ema - slow_ema
    signal_line = macd_line.ewm(span=signal_period,
                                min_periods=signal_period - 1).mean()

    df['MACD'] = macd_line
    df['MACDSignal'] = signal_line
    df['MACDDiff'] = df['MACD'] - df['MACDSignal']
    return df

def get_Stochastic(df, close='Adj Close', high='High', low='Low', n=14):
    """Add the stochastic oscillator columns %K and %D to ``df`` (in place).

    Args:
        df: DataFrame with the ``close``, ``high`` and ``low`` columns.
        close, high, low: Column names of the close, high and low prices.
        n: Look-back length of the rolling high/low range.

    Returns:
        ``df`` with 'sto_K_<n>' (position of the close inside the n-row
        low/high range, scaled to 0-100) and 'sto_D_<n>' (3-row mean of %K).
    """
    k_col = 'sto_K_{}'.format(n)
    d_col = 'sto_D_{}'.format(n)

    lowest_low = df[low].rolling(n).min()
    highest_high = df[high].rolling(n).max()

    df[k_col] = 100 * ((df[close] - lowest_low) / (highest_high - lowest_low))
    df[d_col] = df[k_col].rolling(3).mean()
    return df

def get_RSI(df, close='Adj Close', n=14):
    """Add an RSI column based on simple rolling means of gains and losses.

    Args:
        df: DataFrame containing the price column ``close``.
        close: Name of the price column.
        n: Rolling window length (also the minimum number of observations).

    Returns:
        ``df`` (modified in place) with 'RSI_<n>' =
        100 * avg_gain / (avg_gain + avg_loss). Values are written back
        positionally, so the index of ``df`` does not matter.
    """
    delta = df[close].diff(1)

    # Split the one-step change into its upward and downward parts; the
    # leading NaN fails both comparisons and therefore counts as 0 in each.
    gains = np.where(delta > 0, delta, 0)
    losses = np.where(delta < 0, -delta, 0)

    avg_gain = pd.Series(gains).rolling(window=n, min_periods=n).mean()
    avg_loss = pd.Series(losses).rolling(window=n, min_periods=n).mean()

    rsi = (avg_gain / (avg_loss + avg_gain)) * 100
    df['RSI_{}'.format(n)] = rsi.to_numpy()
    return df

def get_RoC(df, close='Adj Close', periods=2):
    """Add the rate of change of ``close`` over ``periods`` rows.

    Args:
        df: DataFrame containing the price column ``close``.
        close: Name of the price column.
        periods: Number of rows to look back.

    Returns:
        ``df`` (modified in place) with the fractional change in 'roc_<periods>'.
    """
    column = 'roc_{}'.format(periods)
    rate_of_change = df[close].pct_change(periods=periods)
    df[column] = rate_of_change
    return df

def get_CV(df, close='Adj Close', window=10):
    """Add the rolling volatility of one-step log returns, scaled by sqrt(252).

    Args:
        df: DataFrame containing the price column ``close``.
        close: Name of the price column.
        window: Number of rows in the rolling standard deviation.

    Returns:
        ``df`` (modified in place) with the result in 'his_vol_<window>'.
    """
    price = df[close]
    log_returns = np.log(price / price.shift(1))
    # 252 is presumably the number of trading days per year — TODO confirm.
    scale = 252 ** 0.5
    df['his_vol_{}'.format(window)] = log_returns.rolling(window).std() * scale
    return df

def get_WilliamR(df, close='Adj Close', high='High', low='Low', n=14):
    """Add the Williams %R oscillator to ``df`` (in place).

    Args:
        df: DataFrame with the ``close``, ``high`` and ``low`` columns.
        close, high, low: Column names of the close, high and low prices.
        n: Look-back length of the rolling high/low range.

    Returns:
        ``df`` with 'WR_<n>' = -100 * (highest high - close) /
        (highest high - lowest low).
    """
    highest_high = df[high].rolling(n).max()
    lowest_low = df[low].rolling(n).min()

    distance_from_top = highest_high - df[close]
    price_range = highest_high - lowest_low

    df['WR_{}'.format(n)] = -100 * (distance_from_top / price_range)
    return df

def get_CCI(df, close='Adj Close', high='High', low='Low', window=10):
    """Add the Commodity Channel Index computed over ``window`` rows.

    CCI = (TP - SMA(TP, window)) / (0.015 * mean_abs_dev(TP, window)),
    where TP is the typical price (close + low + high) / 3.

    The moving average and the mean absolute deviation both use ``window``.
    The moving average used to be fixed at 20 rows, which made the result
    wrong for every ``window`` other than 20.

    Args:
        df: DataFrame with the ``close``, ``high`` and ``low`` columns.
        close, high, low: Column names of the close, high and low prices.
        window: Number of rows in the rolling statistics.

    Returns:
        ``df`` (modified in place) with the result in 'CCI_<window>'. Rows
        without a full window are NaN, including the case where the frame
        is shorter than ``window``.
    """
    typical_price = (df[close] + df[low] + df[high]) / 3
    rolling_tp = typical_price.rolling(window)

    # Mean absolute deviation of each window around that window's own mean.
    mean_deviation = rolling_tp.apply(
        lambda values: np.abs(values - values.mean()).mean(), raw=True)

    df['CCI_{}'.format(window)] = (
        (typical_price - rolling_tp.mean()) / (0.015 * mean_deviation))
    return df

def get_DI(df, close='Adj Close', window=10):
    """Add the percentage gap between ``close`` and its rolling mean.

    Args:
        df: DataFrame containing the price column ``close``.
        close: Name of the price column.
        window: Number of rows in the rolling mean.

    Returns:
        ``df`` (modified in place) with 'DI_<window>' =
        100 * (close - rolling mean) / rolling mean.
    """
    price = df[close]
    moving_average = price.rolling(window).mean()
    gap = price - moving_average
    df['DI_{}'.format(window)] = 100 * (gap / price.rolling(window).mean())
    return df

In [92]:
# data_manager.py
# from get_technical_indicator import *

# def load_chart_data(fpath):
#     chart_data = pd.read_csv(fpath, thousands=',')
#     return chart_data

def preprocess(chart_data, close='Adj Close', volume='Volume', windows=10):
    """Add rolling means of price and volume for every requested window.

    Args:
        chart_data: DataFrame with the ``close`` and ``volume`` columns. It is
            modified in place; pass a copy to keep the original untouched.
        close: Name of the price column.
        volume: Name of the volume column.
        windows: A single window length or an iterable of window lengths.
            The default is a bare int, which used to raise TypeError because
            the body iterates over it.

    Returns:
        Tuple ``(prep_data, n_columns)``: the same frame with
        'close_ma<w>' / 'volume_ma<w>' columns added, and its column count.
    """
    if np.ndim(windows) == 0:
        # Scalar window length: wrap it so the loop below can iterate.
        windows = [windows]

    prep_data = chart_data
    for window in windows:
        prep_data['close_ma{}'.format(window)] = prep_data[close].rolling(window).mean()
        prep_data['volume_ma{}'.format(window)] = (
            prep_data[volume].rolling(window).mean())
    return prep_data, len(prep_data.columns)

def build_feature(input_data, close='Adj Close', high='High', low='Low'):
    """Return a copy of ``input_data`` with all technical indicators added.

    Columns are appended in this order: volatility for windows 12-90, rate of
    change for 1-10 rows, then RSI / stochastic / Williams %R for look-backs
    3-90, CCI with a 20-row window, the disparity index for 5 and 10 rows, and
    finally MACD with its default settings.

    Args:
        input_data: DataFrame with the ``close``, ``high`` and ``low`` columns.
        close, high, low: Column names of the close, high and low prices.

    Returns:
        A new DataFrame; ``input_data`` itself is not modified.
    """
    features = input_data.copy()
    price_cols = dict(close=close, high=high, low=low)

    for vol_window in range(12, 91):
        features = get_CV(features, close=close, window=vol_window)

    for lag in range(1, 11):
        features = get_RoC(features, close=close, periods=lag)

    for lookback in range(3, 91):
        features = get_RSI(features, close=close, n=lookback)
        features = get_Stochastic(features, n=lookback, **price_cols)
        features = get_WilliamR(features, n=lookback, **price_cols)

    features = get_CCI(features, window=20, **price_cols)

    for ma_window in (5, 10):
        features = get_DI(features, close=close, window=ma_window)

    return get_MACD(features, close=close)

def make_target(df, use_fn='f', period=1, window=1,
                method='regress', thresh=.02, close='Adj Close'):
    """Build the prediction target: the log return ``period`` rows ahead.

    Args:
        df: DataFrame containing the price column ``close``.
        use_fn: 'f' measures the future price against the current price;
            any other value measures it against the ``window``-row rolling
            mean of the price.
        period: How many rows ahead the future price is taken from.
        window: Rolling-mean length; only used when ``use_fn != 'f'``.
        method: 'regress' keeps the raw log return. Any other value turns it
            into class labels: 1 above ``thresh``, -1 below ``-thresh``,
            0 in between.
        thresh: Class boundary; only used when ``method != 'regress'``.
        close: Name of the price column.

    Returns:
        A one-column DataFrame on ``df``'s index. The column is named
        'sh<period>_Y' for ``use_fn == 'f'`` and 'sh<period>_Y_<window>'
        otherwise (numbers zero-padded to at least two digits). Rows whose
        return is unknown stay NaN in both modes, so a later ``dropna``
        removes them instead of treating them as class 0; class labels are
        therefore stored as floats.
    """
    price = df[close]
    future_price = price.shift(-1 * period)

    if use_fn == 'f':
        # Built directly rather than by slicing a longer name, which cut the
        # suffix in the wrong place once ``period`` had three digits.
        target = 'sh{:02}_Y'.format(period)
        values = np.log(future_price / price)
    else:
        target = 'sh{:02}_Y_{:02}'.format(period, window)
        values = np.log(future_price / price.rolling(window).mean())

    if method != 'regress':
        labels = np.where(values > thresh, 1.0,
                          np.where(values < (-1 * thresh), -1.0, 0.0))
        # NaN fails both comparisons above and would otherwise become 0.
        values = pd.Series(np.where(values.isna(), np.nan, labels),
                           index=df.index)

    y_df = pd.DataFrame(index=df.index)
    y_df[target] = values
    return y_df

In [94]:
def concat_x_y(df, period, window, n_col=0, close='Adj Close',
               use_fn='f', thresh=.02, method='regress'):
    """Attach the target column produced by ``make_target`` to ``df``.

    Args:
        df: Feature DataFrame containing the price column ``close``.
        period, window, use_fn, thresh, method, close: Passed straight
            through to ``make_target``.
        n_col: Accepted for compatibility; currently unused.

    Returns:
        Tuple ``(compact_df, y)``: the features with the target appended as
        the last column, and the target's column name. No columns are trimmed
        and no NaN rows are dropped here.
    """
    target_df = make_target(df, use_fn=use_fn, period=period, window=window,
                            method=method, thresh=thresh, close=close)
    target_name = target_df.columns[0]
    return pd.concat([df, target_df], axis=1), target_name

def train_test_split(compact_df, start, end, date='date'):
    """Split ``compact_df`` by date and drop rows that contain NaN.

    Args:
        compact_df: DataFrame with a date column named ``date``.
        start: First date of the training period (inclusive).
        end: Last date of the training period (inclusive).
        date: Name of the date column.

    Returns:
        Tuple ``(train, test)``. ``train`` holds the rows dated from ``start``
        to ``end``; ``test`` holds the rows dated from the day after ``end``.
    """
    dates = compact_df[date]
    test_start = (pd.Timestamp(end) + pd.Timedelta('1 days')).strftime('%Y-%m-%d')

    in_train = (dates >= start) & (dates <= end)
    in_test = dates >= test_start

    return compact_df[in_train].dropna(), compact_df[in_test].dropna()

def correl_selection(df, y, corr_li):
    """Pick, per indicator family, the column most correlated with the target.

    Args:
        df: DataFrame holding the feature columns and the target column ``y``.
        y: Name of the target column.
        corr_li: Indicator name fragments (e.g. 'RSI'); a column belongs to a
            family when its name contains the fragment.

    Returns:
        Dict mapping the selected column name to its absolute correlation
        with ``y``, one entry per family that has at least one column.

    The target is removed from the ranking explicitly. Taking "the second
    entry after sorting" returned the wrong column whenever a feature tied
    with the target's self-correlation or that self-correlation was NaN, and
    raised IndexError when a family had no columns.
    """
    col_set = {}
    for col_name in corr_li:
        candidates = [col for col in df if col_name in col and col != y]
        if not candidates:
            # Nothing in this family to rank.
            continue
        ranking = (
            df[candidates + [y]].corr()[y]
            .drop(y)
            .abs()
            .sort_values(ascending=False)
        )
        col_set[ranking.index[0]] = ranking.iloc[0]
    return col_set

In [None]:
import xgboost as xgb
import sklearn.metrics as met
import pickle

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('raw_data_20190115.pickle', 'rb') as handle:
    raw_data = pickle.load(handle)

# Drop every stock whose first record falls after January 2013.
del_stock = []
for stock_name, data in raw_data.items():
    if int(str(data.index[0].year) + '{:02d}'.format(data.index[0].month)) > 201301:
        print(stock_name)
        del_stock.append(stock_name)

for stock_name in del_stock:
    del raw_data[stock_name]

# Turn the date index into a regular column; train_test_split below filters on
# the 'Date' column.
for stock_name, data in raw_data.items():
    raw_data[stock_name] = data.reset_index()

# Indicator families used for correlation-based feature selection. Defined
# here so the cell does not depend on a later cell having been run first.
corr_li = ['sto_K','sto_D','RSI','roc','his_vol','WR',
           'CCI','DI','MACD','MACDSignal','MACDDiff']

# Results for every stock. prepare_y_dict is rebuilt per stock, so on its own
# it only ever holds the stock processed last.
prepare_y_by_stock = {}

for stock_name in raw_data.keys():
    chart_data = raw_data[stock_name].copy()
    windows = [i for i in range(1, 90)]
    prep_data, n_col = preprocess(chart_data, windows=windows)
    training_data = build_feature(prep_data,
                      close='Adj Close', high='High', low='Low')
    del prep_data

    start, end = '2013-01-01', '2018-01-01' # training period

    thresh = .02 # Use only method=='Classification'
    # periods = [i for i in range(1, 91)]
    # windows = [i for i in range(1, 91, 3)]
    periods = [1, 5, 10, 20, 30, 40, 50, 60]
    windows = [i for i in range(1, 91, 3)]

    prepare_y_dict = {}
    for period in periods:
        pred_n = 'pred_{:02}'.format(period)
        print(pred_n + ' : (max precision 갱신 시 message 출력)')
        _prepare = {}
        # Best-so-far trackers are reset for every horizon; max_y stays None
        # until some window achieves a precision above zero.
        max_precision, max_prec_day, max_y = 0, 0, None

        # Baseline target: plain forward log return over `period` rows.
        compact_df, comp_y = concat_x_y(training_data, period=period, window=1,
                                   n_col=n_col, use_fn='f', close='Adj Close',
                                   thresh=thresh, method='regress')
        train, test = train_test_split(compact_df, start, end, date='Date')
        col_set = correl_selection(train, y=comp_y, corr_li=corr_li)
        # Validation always scores against the baseline target's sign.
        # NOTE(review): this assumes the baseline train rows line up one-to-one
        # with the train rows of every smoothed target below — confirm.
        y_valid = train[comp_y].values
        comp_y_tuple = (train, test, col_set)

        for window in windows:
            # Smoothed target: future price against the rolling mean price.
            compact_df, y = concat_x_y(training_data, period=period, window=window,
                                       n_col=n_col, use_fn='g', close='Adj Close',
                                       thresh=thresh, method='regress')
            train, test = train_test_split(compact_df, start, end, date='Date')
            col_set = correl_selection(train, y=y, corr_li=corr_li)
            n_cv = 5
            ix_cv = int(len(train) / n_cv)
            precision = 0
            # Walk forward: fit on chunk i, validate on chunk i + 1.
            for i in range(n_cv - 1):
                X_train = train[list(col_set.keys())].iloc[i*ix_cv:(i+1)*ix_cv]
                y_train = train[y].iloc[i*ix_cv:(i+1)*ix_cv]
                X_valid = train[list(col_set.keys())].iloc[(i+1)*ix_cv:(i+2)*ix_cv]
                y_true = y_valid[(i+1)*ix_cv:(i+2)*ix_cv]

                xgb_reg = xgb.XGBRegressor(
                     learning_rate =0.15,
                     n_estimators=100,
                     max_depth=5,
                     min_child_weight=2,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27,
                     eval_metric='map')
                xgb_reg.fit(X_train, y_train)

                y_pred = xgb_reg.predict(X_valid)
                precision += met.precision_score(
                                    np.where(y_true>0, 1, 0),
                                    np.where(y_pred>0, 1, 0))
            precision = precision / (n_cv - 1)
            if (precision > max_precision):
                max_prec_day = y[-2:]
                max_y = y
                max_precision = precision
                print('\t{} : {:.2%}'.format(y, max_precision), end='')
            _prepare[y] = (train, test, col_set)

        if max_y is None:
            # No window scored above zero: there is nothing to look up, and
            # the old y[:-2] + max_prec_day lookup would fail on the int 0.
            print('\n\t이 예측은 사용할 수 없습니다.')
            continue

        print('\n\tmax_precision_day : {}\n\tmax_precision : {:.2%}'.format(max_prec_day, max_precision))
        train, test, col_set = _prepare[max_y]
        prepare_y_dict[pred_n] = {comp_y : comp_y_tuple,
                                  max_y : (train, test, col_set, max_prec_day, max_precision)}

    prepare_y_by_stock[stock_name] = prepare_y_dict

In [83]:
# Work on a copy of one stock's frame (key 'KB금융') so the entry in raw_data
# is not modified by the steps that follow.
chart_data = raw_data['KB금융'].copy()

In [84]:
# Move the date index into a regular 'Date' column (later cells filter on it).
# Guarded so that re-running the cell does not reset the index a second time
# and add a stray 'index' column.
if 'Date' not in chart_data.columns:
    chart_data = chart_data.reset_index()

In [85]:
# Moving averages for windows 1..89, then the full technical-indicator set.
windows = list(range(1, 90))
prep_data, n_col = preprocess(chart_data, windows=windows)
training_data = build_feature(prep_data, close='Adj Close', high='High', low='Low')
# The intermediate name is no longer needed once the features are built.
del prep_data

In [104]:
# Training period (both dates inclusive).
start = '2013-01-01'
end = '2018-01-01'

# Class boundary; only used when method == 'Classification'.
thresh = .02

# Prediction horizons and rolling-mean windows to evaluate.
# Exhaustive alternative for the horizons: list(range(1, 91)).
periods = [1, 5, 10, 20, 30, 40, 50, 60]
windows = list(range(1, 91, 3))

# Indicator families; one column per family is picked by correlation.
corr_li = [
    'sto_K', 'sto_D', 'RSI', 'roc', 'his_vol', 'WR',
    'CCI', 'DI', 'MACD', 'MACDSignal', 'MACDDiff',
]

In [105]:
# For every prediction horizon, find the rolling-mean window whose smoothed
# target gives the best walk-forward precision, and keep its train/test data.
prepare_y_dict = {}
for period in periods:
    pred_n = 'pred_{:02}'.format(period)
    print(pred_n + ' : (max precision 갱신 시 message 출력)')
    _prepare = {}
    # Best-so-far trackers are reset for every horizon; max_y stays None until
    # some window achieves a precision above zero, so a value left over from
    # the previous horizon can never be stored by mistake.
    max_precision, max_prec_day, max_y = 0, 0, None

    # Baseline target: plain forward log return over `period` rows.
    compact_df, comp_y = concat_x_y(training_data, period=period, window=1,
                               n_col=n_col, use_fn='f', close='Adj Close',
                               thresh=thresh, method='regress')
    train, test = train_test_split(compact_df, start, end, date='Date')
    col_set = correl_selection(train, y=comp_y, corr_li=corr_li)
    # Validation always scores against the baseline target's sign.
    # NOTE(review): this assumes the baseline train rows line up one-to-one
    # with the train rows of every smoothed target below — confirm.
    y_valid = train[comp_y].values
    comp_y_tuple = (train, test, col_set)

    for window in windows:
        # Smoothed target: future price against the rolling mean price.
        compact_df, y = concat_x_y(training_data, period=period, window=window,
                                   n_col=n_col, use_fn='g', close='Adj Close',
                                   thresh=thresh, method='regress')
        train, test = train_test_split(compact_df, start, end, date='Date')
        col_set = correl_selection(train, y=y, corr_li=corr_li)
        n_cv = 5
        ix_cv = int(len(train) / n_cv)
        precision = 0
        # Walk forward: fit on chunk i, validate on chunk i + 1.
        for i in range(n_cv - 1):
            X_train = train[list(col_set.keys())].iloc[i*ix_cv:(i+1)*ix_cv]
            y_train = train[y].iloc[i*ix_cv:(i+1)*ix_cv]
            X_valid = train[list(col_set.keys())].iloc[(i+1)*ix_cv:(i+2)*ix_cv]
            y_true = y_valid[(i+1)*ix_cv:(i+2)*ix_cv]

            xgb_reg = xgb.XGBRegressor(
                 learning_rate =0.15,
                 n_estimators=100,
                 max_depth=5,
                 min_child_weight=2,
                 gamma=0,
                 subsample=0.8,
                 colsample_bytree=0.8,
                 nthread=4,
                 scale_pos_weight=1,
                 seed=27,
                 eval_metric='map')
            xgb_reg.fit(X_train, y_train)

            y_pred = xgb_reg.predict(X_valid)
            precision += met.precision_score(
                                np.where(y_true>0, 1, 0),
                                np.where(y_pred>0, 1, 0))
        precision = precision / (n_cv - 1)
        if (precision > max_precision):
            max_prec_day = y[-2:]
            max_y = y
            max_precision = precision
            print('\t{} : {:.2%}'.format(y, max_precision), end='')
        _prepare[y] = (train, test, col_set)

    if max_y is None:
        # No window scored above zero: there is nothing to look up, and the
        # old y[:-2] + max_prec_day lookup would fail on the int 0.
        print('\n\t이 예측은 사용할 수 없습니다.')
        continue

    print('\n\tmax_precision_day : {}\n\tmax_precision : {:.2%}'.format(max_prec_day, max_precision))
    train, test, col_set = _prepare[max_y]
    prepare_y_dict[pred_n] = {comp_y : comp_y_tuple,
                              max_y : (train, test, col_set, max_prec_day, max_precision)}

pred_01 : (max precision 갱신 시 message 출력)
	sh01_Y_01 : 50.27%	sh01_Y_22 : 50.45%
	max_precision_day : 22
	max_precision : 50.45%
pred_05 : (max precision 갱신 시 message 출력)
	sh05_Y_01 : 56.89%
	max_precision_day : 01
	max_precision : 56.89%
pred_10 : (max precision 갱신 시 message 출력)
	sh10_Y_01 : 53.87%	sh10_Y_04 : 55.31%	sh10_Y_07 : 57.75%
	max_precision_day : 07
	max_precision : 57.75%
pred_20 : (max precision 갱신 시 message 출력)
	sh20_Y_01 : 58.01%
	max_precision_day : 01
	max_precision : 58.01%
pred_30 : (max precision 갱신 시 message 출력)
	sh30_Y_01 : 57.09%
	max_precision_day : 01
	max_precision : 57.09%
pred_40 : (max precision 갱신 시 message 출력)
	sh40_Y_01 : 60.15%	sh40_Y_04 : 60.92%
	max_precision_day : 04
	max_precision : 60.92%
pred_50 : (max precision 갱신 시 message 출력)
	sh50_Y_01 : 62.04%	sh50_Y_04 : 68.31%
	max_precision_day : 04
	max_precision : 68.31%
pred_60 : (max precision 갱신 시 message 출력)
	sh60_Y_01 : 63.74%	sh60_Y_04 : 67.50%
	max_precision_day : 04
	max_precision : 67.50%


In [112]:
# Show the tuple stored for the 'pred_05' horizon under target 'sh05_Y_88'.
# NOTE(review): the selection cell stores entries under the baseline target
# name and the best window's name, and its recorded output reports window 01
# as best for pred_05, so this key may only exist in an older kernel state —
# confirm.
prepare_y_dict['pred_05']['sh05_Y_88']

(           Date     Open     High      Low    Close     Adj Close   Volume  \
 1136 2013-01-02  38300.0  39000.0  38050.0  38800.0  34356.855469  1404499   
 1137 2013-01-03  39000.0  39450.0  38400.0  39300.0  34799.597656  1858926   
 1138 2013-01-04  38900.0  39300.0  38800.0  39150.0  34666.773438  1025315   
 1139 2013-01-07  39350.0  39850.0  39200.0  39850.0  35286.613281  1153123   
 1140 2013-01-08  39450.0  39600.0  38750.0  38750.0  34312.582031  1103199   
 1141 2013-01-09  38500.0  38700.0  37950.0  38200.0  33825.562500  1214488   
 1142 2013-01-10  38000.0  38300.0  37550.0  38250.0  33869.832031   884770   
 1143 2013-01-11  38350.0  38550.0  37800.0  37850.0  33515.640625   590747   
 1144 2013-01-14  37650.0  38850.0  37650.0  38800.0  34356.855469  1065453   
 1145 2013-01-15  38300.0  39150.0  38150.0  38200.0  33825.562500   746685   
 1146 2013-01-16  38500.0  38900.0  38200.0  38500.0  34091.207031   783621   
 1147 2013-01-17  38800.0  38900.0  38500.0  38850.0

In [78]:


# Earlier variant of the all-stocks window-selection loop.
# NOTE(review): this cell reads corr_li, prepare_y_dict, time, start_time and
# check_point without defining them, so they must already exist in the kernel
# — confirm the intended run order.

# Turn each frame's index into a regular column.
# NOTE(review): running this twice resets the index again on frames that were
# already reset.
for stock_name, data in raw_data.items():
    raw_data[stock_name] = data.reset_index()
    
for stock_name in raw_data.keys():
    chart_data = raw_data[stock_name].copy()
    # `windows` first holds the moving-average lengths for preprocess ...
    windows = [i for i in range(1, 90)]
    prep_data, n_col = preprocess(chart_data, windows=windows)
    training_data = build_feature(prep_data, 
                      close='Adj Close', high='High', low='Low')
    del prep_data

    start, end = '2013-01-01', '2018-01-01' # training period

    thresh = .02 # Use only method=='Classification'
    # periods = [i for i in range(1, 91)]
    # windows = [i for i in range(1, 91, 3)]
    periods = [1, 5, 10, 20, 30, 40, 50, 60]
    # ... and is then reused for the target rolling-mean windows.
    windows = [i for i in range(1, 91, 3)]

    for period in periods:
        pred_n = 'pred_{:02}'.format(period)
        print(pred_n + ' : (max precision 갱신 시 message 출력)')
        _prepare = {}
        # rmse is initialised here but never updated or read in this cell.
        max_precision, max_prec_day, rmse = 0, 0, 0
        # Baseline target built with use_fn='f'.
        compact_df, comp_y = concat_x_y(training_data, period=period, window=1, 
                                   n_col=n_col, use_fn='f', 
                                   thresh=thresh, method='regress')
        # NOTE(review): no date= argument, so the split uses the helper's
        # default column name; the train_test_split defined in this notebook
        # defaults to 'date' while other cells pass date='Date' — confirm.
        train, test = train_test_split(compact_df, start, end)
        col_set = correl_selection(train, y=comp_y, corr_li=corr_li)
        # Validation labels always come from the baseline target.
        y_valid = train[comp_y].values
        comp_y_tuple = (train, test, col_set)
        for window in windows:
            # Smoothed target built with use_fn='g' for this window.
            compact_df, y = concat_x_y(training_data, period=period, window=window, 
                                       n_col=n_col, use_fn='g', 
                                       thresh=thresh, method='regress')
            train, test = train_test_split(compact_df, start, end)
            col_set = correl_selection(train, y=y, corr_li=corr_li)
            n_cv = 5
            ix_cv = int(len(train) / n_cv)
            precision = 0
            # Fit on chunk i and validate on chunk i + 1 (four rounds).
            for i in [0,1,2,3]:
                X_train = train[list(col_set.keys())].iloc[i*ix_cv:(i+1)*ix_cv]
                y_train = train[y].iloc[i*ix_cv:(i+1)*ix_cv]
                X_valid = train[list(col_set.keys())].iloc[(i+1)*ix_cv:(i+2)*ix_cv]
                y_true = y_valid[(i+1)*ix_cv:(i+2)*ix_cv]

                xgb_reg = xgb.XGBRegressor(
                     learning_rate =0.15,
                     n_estimators=100,
                     max_depth=5,
                     min_child_weight=2,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27,
                     eval_metric='map')
                xgb_reg.fit(X_train, y_train)

                y_pred = xgb_reg.predict(X_valid)
                # Precision of the predicted sign against the true sign.
                precision += met.precision_score(
                                    np.where(y_true>0, 1, 0), 
                                    np.where(y_pred>0, 1, 0))
            precision = precision / (n_cv - 1)
            if (precision > max_precision):
                # The last two characters of the target name identify the window.
                max_prec_day = y[-2:]
                max_precision = precision
            print('\t{} : {:.2%}'.format(y, precision), end='')
            _prepare[y] = (train, test, col_set)
        # Skip the horizon when no window achieved a precision above zero.
        if max_precision == 0:
            print('\t이 예측은 사용할 수 없습니다.')
            continue
        print('\tmax_precision_day : {}\n\tmax_precision : {:.2%}'.format(max_prec_day, max_precision))
        # Rebuild the best window's target name from the shared prefix.
        train, test, col_set = _prepare[y[:-2]+max_prec_day]
        # NOTE(review): the entry is keyed by `y`, which at this point is the
        # name of the LAST window tried, not of the best window — confirm.
        prepare_y_dict[pred_n] = {comp_y : comp_y_tuple,
                                  y : (train, test, col_set, max_prec_day, max_precision)}
        tot_time = time.time() - start_time
        print("\tDone '" + pred_n + "'\t\telapse:{:.2f} sec, TOT:{:.2f} min".format(
                    tot_time - check_point, tot_time / 60))
        check_point = tot_time


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,close_ma1,volume_ma1,close_ma2,volume_ma2,...,RSI_90,sto_K_90,sto_D_90,WR_90,CCI_20,DI_5,DI_10,MACD,MACDSignal,MACDDiff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-04,6000.0,6110.0,5660.0,6110.0,0.091443,74195000,0.091443,74195000.0,,,...,,,,,,,,,,
2000-01-05,5800.0,6060.0,5520.0,5580.0,0.083511,74680000,0.083511,74680000.0,0.087477,74437500.0,...,,,,,,,,,,
2000-01-06,5750.0,5780.0,5580.0,5620.0,0.084110,54390000,0.084110,54390000.0,0.083811,64535000.0,...,,,,,,,,,,
2000-01-07,5560.0,5670.0,5360.0,5540.0,0.082912,40305000,0.082912,40305000.0,0.083511,47347500.0,...,,,,,,,,,,
2000-01-10,5600.0,5770.0,5580.0,5770.0,0.086355,46880000,0.086355,46880000.0,0.084634,43592500.0,...,,,,,,0.804051,,,,
2000-01-11,5820.0,6100.0,5770.0,5770.0,0.086355,59745000,0.086355,59745000.0,0.086355,53312500.0,...,,,,,,2.015863,,,,
2000-01-12,5610.0,5740.0,5600.0,5720.0,0.085606,29220000,0.085606,29220000.0,0.085980,44482500.0,...,,,,,,0.632908,,,,
2000-01-13,5600.0,5740.0,5560.0,5710.0,0.085457,41190000,0.085457,41190000.0,0.085531,35205000.0,...,,,,,,0.140619,,,,
2000-01-14,5720.0,5880.0,5680.0,5830.0,0.087253,49375000,0.087253,49375000.0,0.086355,45282500.0,...,,,,,,1.215472,,,,
2000-01-17,6000.0,6180.0,5920.0,6100.0,0.091293,63505000,0.091293,63505000.0,0.089273,56440000.0,...,,,,,,4.702452,5.627130,,,


In [None]:
import time  # used for the timers below; not imported in any visible cell

start_time, check_point = time.time(), 0
print('#1 Generate Feature & Target and Correlation Selection\n')
print('#2 Max Precision Filter')
print(' - 각 예측 기간별 Time window에 대하여 Max Precision을 측정 및 선별)\n')

stock_code = '005930'
# NOTE(review): load_chart_data and build_training_data are not defined in the
# visible cells (load_chart_data is commented out in the data_manager cell),
# and the helpers default to an 'Adj Close' price column while this cell
# selects lowercase column names — confirm which helper versions are intended.
chart_data = load_chart_data('./stock_data/{}.csv'.format(stock_code))
prepare_y_dict = {}

windows = [i for i in range(1, 90)]
prep_data, n_col = preprocess(chart_data, windows=windows)
training_data = build_training_data(prep_data)
del prep_data

features_chart_data = ['date', 'open', 'high', 'low', 'close', 'volume']
chart_data = training_data[features_chart_data]

thresh = .02 # Use only method=='Classification'
# periods = [i for i in range(1, 91)]
# windows = [i for i in range(1, 91, 3)]
periods = [1, 5, 10, 20, 30, 40, 50, 60]
windows = [i for i in range(1, 91, 3)]
corr_li = ['sto_K','sto_D','RSI','roc','his_vol','WR',
                 'CCI','DI','MACD','MACDSignal','MACDDiff']
start, end = '2007-01-01', '2017-12-31' # training period
for period in periods:
    pred_n = 'pred_{:02}'.format(period)
    print(pred_n + ' : (max precision 갱신 시 message 출력)')
    _prepare = {}
    max_precision, max_prec_day = 0, 0
    # Baseline target: plain forward log return over `period` rows.
    compact_df, comp_y = concat_x_y(training_data, period=period, window=1,
                               n_col=n_col, use_fn='f',
                               thresh=thresh, method='regress')
    train, test = train_test_split(compact_df, start, end)
    col_set = correl_selection(train, y=comp_y, corr_li=corr_li)
    # Validation always scores against the baseline target.
    y_valid = train[comp_y].values
    comp_y_tuple = (train, test, col_set)
    for window in windows:
        compact_df, y = concat_x_y(training_data, period=period, window=window,
                                   n_col=n_col, use_fn='g',
                                   thresh=thresh, method='regress')
        train, test = train_test_split(compact_df, start, end)
        col_set = correl_selection(train, y=y, corr_li=corr_li)
        n_cv = 5
        ix_cv = int(len(train) / n_cv)
        # Both accumulators start from zero for every window. rmse used to be
        # initialised once per horizon, so each window's average also
        # contained the previous windows' values.
        precision, rmse = 0, 0
        # Walk forward: fit on chunk i, validate on chunk i + 1.
        for i in range(n_cv - 1):
            X_train = train[list(col_set.keys())].iloc[i*ix_cv:(i+1)*ix_cv]
            y_train = train[y].iloc[i*ix_cv:(i+1)*ix_cv]
            X_valid = train[list(col_set.keys())].iloc[(i+1)*ix_cv:(i+2)*ix_cv]
            y_true = y_valid[(i+1)*ix_cv:(i+2)*ix_cv]

            xgb_reg = xgb.XGBRegressor(
                 learning_rate =0.15,
                 n_estimators=100,
                 max_depth=5,
                 min_child_weight=2,
                 gamma=0,
                 subsample=0.8,
                 colsample_bytree=0.8,
                 nthread=4,
                 scale_pos_weight=1,
                 seed=27,
                 eval_metric='map')
            xgb_reg.fit(X_train, y_train)

            y_pred = xgb_reg.predict(X_valid)
            precision += met.precision_score(
                                np.where(y_true>0, 1, 0),
                                np.where(y_pred>0, 1, 0))
            rmse += met.mean_squared_error(y_true, y_pred) ** .5
        precision = precision / (n_cv - 1)
        rmse = rmse / (n_cv - 1)
        # A window only becomes the new best when its RMSE is also below a
        # bound that grows with the horizon.
        if (precision > max_precision) and (rmse <= 0.01 * (period ** .3)):
            max_prec_day = y[-2:]
            max_precision = precision
            argmax_rmse = rmse
        # Timing is refreshed for every window. It used to be set only inside
        # the branch above, so the print below raised NameError or reported a
        # stale time whenever the window was not a new best.
        tot_time = time.time() - start_time
        print('\t{} : {:.2%}, rmse={:.2%}'.format(y, precision, rmse), end='')
        print('\telapse:{:.2f} sec, TOT:{:.2f} min'.format(
            tot_time-check_point, tot_time / 60))
        check_point = tot_time
# Disabled in this variant: storing the best window's data per horizon.
#         _prepare[y] = (train, test, col_set)
#     if max_precision == 0:
#         print('\t이 예측은 사용할 수 없습니다.')
#         continue
#     print('\tmax_precision_day : {}\n\tmax_precision : {:.2%}'.format(max_prec_day, max_precision))
#     train, test, col_set = _prepare[y[:-2]+max_prec_day]
#     prepare_y_dict[pred_n] = {comp_y : comp_y_tuple,
#                               y : (train, test, col_set, max_prec_day, max_precision)}
#     tot_time = time.time() - start_time
#     print("\tDone '" + pred_n + "'\t\telapse:{:.2f} sec, TOT:{:.2f} min".format(
#                 tot_time - check_point, tot_time / 60))
#     check_point = tot_time