In [None]:
# import library
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Configure a font that can render Korean (Hangul) text in matplotlib figures.
import os
import warnings
from matplotlib import font_manager, rc

# Malgun Gothic as shipped with Windows. On machines without this file the
# original code raised while resolving the font name, which broke the very
# first cell of the notebook; fall back to matplotlib's default font instead.
KOREAN_FONT_PATH = 'c:/Windows/Fonts/malgun.ttf'
if os.path.exists(KOREAN_FONT_PATH):
    font_name = font_manager.FontProperties(fname=KOREAN_FONT_PATH).get_name()
    rc('font', family=font_name)
else:
    font_name = None
    warnings.warn(
        'Korean font not found at {!r}; keeping the default matplotlib font, '
        'so Hangul labels may not render.'.format(KOREAN_FONT_PATH))

# Default font sizes for axis labels and tick labels.
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Korean output: an alternative font family that was tried and left disabled.
# plt.rcParams['font.family'] = 'NanumBarunGothic'
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# get_technical_indicator.py
def get_MACD(df, close='Adj Close', short_ma=12, long_ma=26, signal_period=9):
    """Add MACD, MACDSignal and MACDDiff columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data; mutated in place and also returned.
    close : str
        Name of the price column the averages are computed on.
    short_ma, long_ma : int
        Spans of the fast and slow exponentially weighted means. Each mean
        uses ``min_periods = span - 1``.
    signal_period : int
        Span of the exponentially weighted mean applied to the MACD line
        (again with ``min_periods = span - 1``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three new columns.
    """
    price = df[close]
    fast_ema = price.ewm(span=short_ma, min_periods=short_ma - 1).mean()
    slow_ema = price.ewm(span=long_ma, min_periods=long_ma - 1).mean()

    macd_line = fast_ema - slow_ema
    df['MACD'] = macd_line

    signal_line = macd_line.ewm(
        span=signal_period, min_periods=signal_period - 1).mean()
    df['MACDSignal'] = signal_line
    df['MACDDiff'] = macd_line - signal_line
    return df

def get_Stochastic(df, close='Adj Close', high='High', low='Low', n=14):
    """Add stochastic oscillator columns ``sto_K_<n>`` and ``sto_D_<n>`` to ``df``.

    %K is ``100 * (close - lowest low) / (highest high - lowest low)`` over a
    rolling window of ``n`` rows; %D is the 3-row rolling mean of %K.
    ``df`` is mutated in place and returned.
    """
    lowest_low = df[low].rolling(n).min()
    highest_high = df[high].rolling(n).max()

    k_name = 'sto_K_{}'.format(n)
    d_name = 'sto_D_{}'.format(n)

    price_range = highest_high - lowest_low
    df[k_name] = 100 * ((df[close] - lowest_low) / price_range)
    df[d_name] = df[k_name].rolling(3).mean()
    return df

def get_RSI(df, close='Adj Close', n=14):
    """Add an ``RSI_<n>`` column to ``df`` (mutated in place and returned).

    Gains and losses are the positive and negative parts of the one-row
    price difference. The first row's undefined difference counts as zero in
    both series. RSI is ``100 * AU / (AU + AD)``, where AU and AD are simple
    rolling means of gains and losses over ``n`` rows.
    """
    delta = df[close].diff(1)
    gains = np.where(delta > 0, delta, 0)
    losses = np.where(delta < 0, delta * (-1), 0)

    avg_gain = pd.Series(gains).rolling(window=n, min_periods=n).mean()
    avg_loss = pd.Series(losses).rolling(window=n, min_periods=n).mean()

    rsi = avg_gain.div(avg_loss + avg_gain) * 100
    # Assign by position: the helper series carry a fresh 0..N-1 index that
    # need not match df's index.
    df['RSI_{}'.format(n)] = rsi.values
    return df

def get_RoC(df, close='Adj Close', periods=2):
    """Add a ``roc_<periods>`` column: fractional change of ``close`` over ``periods`` rows.

    ``df`` is mutated in place and returned.
    """
    column = 'roc_{}'.format(periods)
    rate_of_change = df[close].pct_change(periods=periods)
    df[column] = rate_of_change
    return df

def get_CV(df, close='Adj Close', window=10):
    """Add a ``his_vol_<window>`` column with rolling log-return volatility.

    The value is the rolling standard deviation, over ``window`` rows, of the
    one-row log return, scaled by ``252 ** 0.5`` (presumably annualising by
    trading days per year). ``df`` is mutated in place and returned.
    """
    price = df[close]
    log_return = np.log(price / price.shift(1))
    rolling_std = log_return.rolling(window).std()
    df['his_vol_{}'.format(window)] = rolling_std * (252 ** 0.5)
    return df

def get_WilliamR(df, close='Adj Close', high='High', low='Low', n=14):
    """Add a ``WR_<n>`` column (Williams %R) to ``df`` and return it.

    Computed as ``-100 * (highest high - close) / (highest high - lowest low)``
    with both extremes taken over a rolling window of ``n`` rows.
    """
    highest_high = df[high].rolling(n).max()
    lowest_low = df[low].rolling(n).min()

    distance_from_top = highest_high - df[close]
    df['WR_{}'.format(n)] = -100 * (distance_from_top / (highest_high - lowest_low))
    return df

def get_CCI(df, close='Adj Close', high='High', low='Low', window=10):
    """Add a ``CCI_<window>`` column (Commodity Channel Index) to ``df``.

    The typical price is the mean of close, low and high. The index is
    ``(TP - SMA(TP)) / (0.015 * mean absolute deviation of TP)``, where both
    the moving average and the mean deviation use the same rolling
    ``window``.

    Previously the moving average was hard-coded to 20 rows regardless of
    ``window``, so any ``window != 20`` mixed two different look-backs. Very
    short inputs also raised on an index-length mismatch. Both terms now use
    ``rolling(window)``, which yields NaN when the window is incomplete.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data; mutated in place and also returned.
    close, high, low : str
        Column names of the close, high and low prices.
    window : int
        Rolling look-back used for both the mean and the mean deviation.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new column added.
    """
    typical_price = (df[close] + df[low] + df[high]) / 3
    rolling_tp = typical_price.rolling(window)

    # Mean absolute deviation of each window around that window's own mean.
    mean_deviation = rolling_tp.apply(
        lambda values: np.abs(values - values.mean()).mean(), raw=True)

    df['CCI_{}'.format(window)] = (
        (typical_price - rolling_tp.mean()) / (0.015 * mean_deviation))
    return df

def get_DI(df, close='Adj Close', window=10):
    """Add a ``DI_<window>`` column: percent deviation of ``close`` from its rolling mean.

    Computed as ``100 * (close - MA) / MA`` with ``MA`` the simple rolling
    mean over ``window`` rows. ``df`` is mutated in place and returned.
    """
    price = df[close]
    moving_avg = price.rolling(window).mean()
    df['DI_{}'.format(window)] = 100 * ((price - moving_avg) / moving_avg)
    return df

In [None]:
# data_manager.py
# from get_technical_indicator import *

# def load_chart_data(fpath):
#     chart_data = pd.read_csv(fpath, thousands=',')
#     return chart_data

def preprocess(chart_data, close='Adj Close', volume='Volume', windows=10):
    """Add rolling-mean columns of price and volume for each window size.

    For every ``w`` in ``windows`` the columns ``close_ma<w>`` and
    ``volume_ma<w>`` are added to ``chart_data``, which is modified in place.

    Parameters
    ----------
    chart_data : pandas.DataFrame
        Input data containing the ``close`` and ``volume`` columns.
    close, volume : str
        Names of the price and volume columns.
    windows : int or iterable of int
        Rolling window size(s). A single integer, including the default
        ``10``, is treated as a one-element list. Previously the integer
        default raised ``TypeError`` because the body iterates over it.

    Returns
    -------
    (pandas.DataFrame, int)
        The same DataFrame object and its resulting number of columns.
    """
    if isinstance(windows, (int, np.integer)):
        windows = [windows]

    prep_data = chart_data
    for window in windows:
        prep_data['close_ma{}'.format(window)] = prep_data[close].rolling(window).mean()
        prep_data['volume_ma{}'.format(window)] = (
            prep_data[volume].rolling(window).mean())
    return prep_data, len(prep_data.columns)

def build_feature(input_data, close='Adj Close', high='High', low='Low'):
    """Return a copy of ``input_data`` extended with technical-indicator columns.

    Indicators are appended in this order:

    * historical volatility (``get_CV``) for windows 12..90,
    * rate of change (``get_RoC``) for periods 1..10,
    * RSI, stochastic and Williams %R for look-backs 3..90,
    * CCI with window 20,
    * disparity index (``get_DI``) for windows 5 and 10,
    * MACD with its default parameters.

    The input frame itself is not modified; the work is done on a copy.
    """
    features = input_data.copy()
    price_cols = {'close': close, 'high': high, 'low': low}

    for vol_window in range(12, 91):
        features = get_CV(features, close=close, window=vol_window)

    for lag in range(1, 11):
        features = get_RoC(features, close=close, periods=lag)

    for lookback in range(3, 91):
        features = get_RSI(features, close=close, n=lookback)
        features = get_Stochastic(features, n=lookback, **price_cols)
        features = get_WilliamR(features, n=lookback, **price_cols)

    features = get_CCI(features, window=20, **price_cols)

    for di_window in (5, 10):
        features = get_DI(features, close=close, window=di_window)

    features = get_MACD(features, close=close)
    return features

def make_target(df, use_fn='f', period=1, window=1,
                method='regress', thresh=.02, close='Adj Close'):
    """Build a one-column DataFrame holding the prediction target.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``close`` column.
    use_fn : str
        ``'f'``: target is ``log(close[t + period] / close[t])``.
        Anything else: target is
        ``log(close[t + period] / rolling_mean(close, window)[t])``.
    period : int
        How many rows ahead the future price is taken from.
    window : int
        Rolling-mean window used by the non-``'f'`` target.
    method : str
        ``'regress'`` keeps the raw log ratio. Any other value converts it to
        classes: 1 above ``thresh``, -1 below ``-thresh``, 0 otherwise.
    thresh : float
        Class threshold; only used when ``method != 'regress'``.
    close : str
        Name of the price column.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with a single column named
        ``sh<period>_Y_<window>`` (both zero-padded to at least two digits),
        truncated to its first six characters when ``use_fn == 'f'``.

    Notes
    -----
    Rows whose target is undefined (no future price, or an incomplete
    rolling window) stay NaN in classification mode. Previously the nested
    ``np.where`` turned them into class 0, so a later ``dropna()`` could not
    remove them.
    """
    y_df = pd.DataFrame(index=df.index)
    price = df[close]

    # '{:02}' pads to at least two digits, so windows above 99 still render
    # with three digits, the same string the old '{:03}' branch produced.
    target = 'sh{:02}_Y_{:02}'.format(period, window)
    future_price = price.shift(-1 * period)
    if use_fn == 'f':
        target = target[:6]
        y_df[target] = np.log(future_price / price)
    else:
        y_df[target] = np.log(future_price / price.rolling(window).mean())

    if method != 'regress':
        raw = y_df[target]
        labels = np.where(raw > thresh, 1.0,
                          np.where(raw < (-1 * thresh), -1.0, 0.0))
        # Keep undefined targets as NaN rather than labelling them "flat".
        y_df[target] = np.where(raw.isna(), np.nan, labels)
    return y_df

In [None]:
def concat_x_y(df, period, window, n_col=0, close='Adj Close',
               use_fn='f', thresh=.02, method='regress'):
    """Append the target column from ``make_target`` to the feature frame.

    Returns a tuple ``(combined_frame, target_column_name)``. No rows or
    columns are dropped here; ``n_col`` is accepted but currently unused.
    """
    target_frame = make_target(
        df,
        use_fn=use_fn,
        period=period,
        window=window,
        thresh=thresh,
        method=method,
        close=close,
    )
    target_name = target_frame.columns[0]
    combined = pd.concat([df, target_frame], axis=1)
    return combined, target_name

def train_test_split(compact_df, start, end, date='date'):
    """Split ``compact_df`` chronologically on the ``date`` column.

    The training set holds rows with ``start <= date <= end``. The test set
    holds rows dated on or after the day following ``end``. Rows containing
    any NaN are dropped from both parts.

    Returns ``(train, test)``.
    """
    dates = compact_df[date]

    in_train = (dates >= start) & (dates <= end)
    train = compact_df[in_train]

    # First test day, formatted as a 'YYYY-MM-DD' string like start/end.
    test_start = (pd.Timestamp(end) + pd.Timedelta('1 days')).strftime('%Y-%m-%d')
    test = compact_df[dates >= test_start]

    return train.dropna(), test.dropna()

def correl_selection(df, y, corr_li):
    """Pick, for each keyword, the column most correlated with the target.

    For every keyword in ``corr_li`` the columns whose name contains the
    keyword (plus any column whose name contains ``y``) are correlated with
    ``y``. Absolute correlations are sorted in descending order and the
    second entry is taken, the first normally being ``y`` itself.

    Returns a dict mapping the chosen column name to its absolute correlation.
    """
    selected = {}
    for keyword in corr_li:
        candidates = [name for name in df if (keyword in name) or (y in name)]
        ranked = df[candidates].corr()[y].abs().sort_values(ascending=False)
        runner_up = ranked.index[1]
        selected[runner_up] = ranked.iloc[1]
    return selected

In [None]:
import xgboost as xgb
import sklearn.metrics as met
import pickle
# Load the per-stock price frames from the pickle snapshot.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('raw_data_20190115.pickle', 'rb') as handle:
    raw_data = pickle.load(handle)

# Collect (and print) every stock whose first index entry is later than
# January 2013, compared as a YYYYMM integer.
del_stock = []
for stock_name, data in raw_data.items():
    first_date = data.index[0]
    if first_date.year * 100 + first_date.month > 201301:
        print(stock_name)
        del_stock.append(stock_name)

# Remove those stocks from the working set.
for stock_name in del_stock:
    raw_data.pop(stock_name)

# Move the index of each remaining frame into a regular column.
raw_data.update({name: frame.reset_index() for name, frame in raw_data.items()})

# Main experiment loop. For every stock, build features, then for each
# prediction horizon (`period`) search the rolling-mean window of the 'g'
# target that gives the best average validation precision.
# result_store maps stock_name -> prepare_y_dict (see the end of the loop).
result_store= {}
for stock_name in raw_data.keys():
    chart_data = raw_data[stock_name].copy()
    # Moving-average windows 1..89 for the close/volume rolling means.
    windows = [i for i in range(1, 90)]
    prep_data, n_col = preprocess(chart_data, windows=windows)
    training_data = build_feature(prep_data, 
                      close='Adj Close', high='High', low='Low')
    del prep_data

    start, end = '2013-01-01', '2018-01-01' # training period

    # thresh only takes effect in make_target when method != 'regress'.
    thresh = .02 # Use only method=='Classification'
    # periods = [i for i in range(1, 91)]
    # windows = [i for i in range(1, 91, 3)]
    periods = [1, 5, 10, 20, 30, 40, 50, 60]
    # From here on `windows` holds the candidate windows of the 'g' target
    # (1, 4, 7, ..., 88), no longer the moving-average windows above.
    windows = [i for i in range(1, 91, 3)]

    prepare_y_dict = {}
    for period in periods:
        pred_n = 'pred_{:02}'.format(period)
        print(pred_n + ' : (max precision 갱신 시 message 출력)')
        _prepare = {}
        max_precision, max_prec_day = 0, 0
        # Reference target: plain forward log return (use_fn='f').
        compact_df, comp_y = concat_x_y(training_data, period=period, window=1, 
                                   n_col=n_col, use_fn='f', close='Adj Close',
                                   thresh=thresh, method='regress')
        train, test = train_test_split(compact_df, start, end, date='Date')
        # NOTE(review): corr_li is not defined in any cell visible here; on a
        # fresh kernel this line raises NameError unless it is defined elsewhere.
        col_set = correl_selection(train, y=comp_y, corr_li=corr_li)
        # Ground truth for every validation fold below comes from this 'f'
        # target, not from the 'g' target the models are trained on.
        # NOTE(review): y_valid is sliced by position further down, which
        # assumes the 'f' and 'g' training frames keep the same rows after
        # dropna() -- confirm.
        y_valid = train[comp_y].values
        comp_y_tuple = (train, test, col_set)
        for window in windows:
            # Candidate target: forward price relative to the rolling mean (use_fn='g').
            compact_df, y = concat_x_y(training_data, period=period, window=window, 
                                       n_col=n_col, use_fn='g', close='Adj Close',
                                       thresh=thresh, method='regress')
            train, test = train_test_split(compact_df, start, end, date='Date')
            col_set = correl_selection(train, y=y, corr_li=corr_li)
            # Split the training rows into n_cv contiguous chunks of ix_cv rows;
            # fit on chunk i and validate on chunk i + 1.
            n_cv = 5
            ix_cv = int(len(train) / n_cv)
            precision = 0
            for i in range(n_cv - 1):
                X_train = train[list(col_set.keys())].iloc[i*ix_cv:(i+1)*ix_cv]
                y_train = train[y].iloc[i*ix_cv:(i+1)*ix_cv]
                X_valid = train[list(col_set.keys())].iloc[(i+1)*ix_cv:(i+2)*ix_cv]
                y_true = y_valid[(i+1)*ix_cv:(i+2)*ix_cv]

                # NOTE(review): scale_pos_weight and eval_metric='map' look like
                # classification/ranking settings on a regressor -- confirm intent.
                xgb_reg = xgb.XGBRegressor(
                     learning_rate =0.15,
                     n_estimators=100,
                     max_depth=5,
                     min_child_weight=2,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27,
                     eval_metric='map')
                xgb_reg.fit(X_train, y_train)

                y_pred = xgb_reg.predict(X_valid)
                # Score only the sign: positive prediction vs. positive
                # forward return.
                precision += met.precision_score(
                                    np.where(y_true>0, 1, 0), 
                                    np.where(y_pred>0, 1, 0))
            # Average precision over the n_cv - 1 validation chunks.
            precision = precision / (n_cv - 1)
            if (precision > max_precision):
                # Last two characters of the target name: the zero-padded
                # window (all windows used here are below 100).
                max_prec_day = y[-2:]
                max_y = y
                max_precision = precision
                print('\t{} : {:.2%}'.format(y, max_precision), end='')
            _prepare[y] = (train, test, col_set)

        print('\n\tmax_precision_day : {}\n\tmax_precision : {:.2%}'.format(max_prec_day, max_precision))
        # Rebuild the best target name from the last y's prefix plus the best
        # window suffix, and fetch its data.
        # NOTE(review): if no window ever scores above 0, max_prec_day is still
        # the integer 0 (str + int fails here) and max_y is unset or stale from
        # a previous period/stock.
        train, test, col_set = _prepare[y[:-2]+max_prec_day]
        # Per horizon: the reference 'f' target data and the best 'g' target data.
        prepare_y_dict[pred_n] = {comp_y : comp_y_tuple,
                                  max_y : (train, test, col_set, max_prec_day, max_precision)}
    result_store[stock_name] = prepare_y_dict