In [1]:
from db import DB
db = DB()

INTERVAL = 120

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.style.use('fivethirtyeight') # fivethirtyeight, fast
plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei'] # font to handle chinese
%matplotlib inline

In [35]:
#####################
### prepare data 
#####################
import pandas as pd
from talib import abstract
import utility as util

def prepare_stock_data(db):
    stock_data = {}
    for stock_id in db.get_stock_info().index:
        df_daily = db.get_daily_price(stock_id)
        df_monthly = db.get_by_stock_id(stock_id, 'monthly_revenue')
        df_quarterly = db.get_by_stock_id(stock_id, 'quarterly_report')
        
        ### fundamental
        def compared_ratio(s, num):
            return s / s.shift(num) - 1
        def compared_diff(s, num):
            return s - s.shift(num)
        
        df_monthly['3月營收'] = df_monthly.當月營收.rolling(3).mean()
        df_monthly['12月營收'] = df_monthly.當月營收.rolling(12).mean()
        df_monthly['當月營收_年增率'] = compared_ratio(df_monthly['當月營收'], 12)
        df_monthly['3月營收_年增率'] = compared_ratio(df_monthly['3月營收'], 12)
        df_monthly['12月營收_年增率'] = compared_ratio(df_monthly['12月營收'], 12)
        util.fill_short_interval_by_long_interval(df_daily, df_monthly, '當月營收_年增率')
        util.fill_short_interval_by_long_interval(df_daily, df_monthly, '3月營收_年增率')
        util.fill_short_interval_by_long_interval(df_daily, df_monthly, '12月營收_年增率')
        
        df_quarterly['毛利率'] = df_quarterly.毛利 / df_quarterly.營收
        df_quarterly['營益率'] = df_quarterly.營利 / df_quarterly.營收
        df_quarterly['淨利率'] = df_quarterly.稅後淨利 / df_quarterly.營收
        df_quarterly['毛利率_年增'] = compared_diff(df_quarterly['毛利率'], 4)
        df_quarterly['營益率_年增'] = compared_diff(df_quarterly['營益率'], 4)
        df_quarterly['淨利率_年增'] = compared_diff(df_quarterly['淨利率'], 4)
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, '毛利率_年增')
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, '營益率_年增')
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, '淨利率_年增')
    
        df_quarterly['EPS4季'] = df_quarterly.EPS.rolling(4).sum()
        df_quarterly['EPS4季_季增率'] = compared_ratio(df_quarterly['EPS4季'], 1)
        股數 = df_quarterly.股本 / 10
        df_quarterly['淨值/股'] = df_quarterly.權益 / 股數
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, 'EPS4季')
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, 'EPS4季_季增率')
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, '淨值/股')
        
        df_daily['本益比'] = df_daily.close / df_daily.EPS4季
        df_daily['本淨比'] = df_daily.close / df_daily['淨值/股']
        df_daily['眼光費'] = (df_daily.close - df_daily['淨值/股']) / df_daily.EPS4季
        
        fundamental_cols = ['當月營收_年增率', '3月營收_年增率', '12月營收_年增率',
                            '毛利率_年增', '營益率_年增', '淨利率_年增',
                            'EPS4季', 'EPS4季_季增率', '淨值/股', '本益比', '本淨比', '眼光費']
        
        ### technical
        close_5 = df_daily.close.rolling(5).mean()
        close_10 = df_daily.close.rolling(10).mean()
        close_20 = df_daily.close.rolling(20).mean()
        volume_5 = df_daily.volume.rolling(5).mean()
        volume_10 = df_daily.volume.rolling(10).mean()
        df_daily['close_1/5'] = df_daily.close / close_5
        df_daily['close_5/10'] = close_5 / close_10
        df_daily['close_10/20'] = close_10 / close_20
        df_daily['volume_1/5'] = df_daily.volume / volume_5
        df_daily['volume_1/10'] = df_daily.volume / volume_10
        
#         df_daily['close_5'] = df_daily.close.rolling(5).mean()
#         df_daily['close_10'] = df_daily.close.rolling(10).mean()
#         df_daily['close_20'] = df_daily.close.rolling(20).mean()
#         df_daily['volume_5'] = df_daily.volume.rolling(5).mean()
#         df_daily['volume_10'] = df_daily.volume.rolling(10).mean()
#         MACD = abstract.MACD(df_daily)
#         MACD.columns = ['DIFF', 'DEM', 'D-M']
#         df_daily[['DIFF', 'DEM', 'D-M']] = MACD
#         KD = abstract.STOCH(df_daily)
#         KD.columns = ['K', 'D']
#         df_daily[['K', 'D']] = KD
        
#         technical_cols = ['close', 'close_5', 'close_10', 'close_20', 'volume', 'volume_5', 'volume_10', 
#                           'DIFF', 'DEM', 'K', 'D']
        technical_cols = ['close', 'close_1/5', 'close_5/10', 'close_10/20', 'volume_1/5', 'volume_1/10']
        
        ### chip
#         buy_surplus = db.get_by_stock_id(stock_id, 'daily_buy_sell_surplus')
#         df_daily = pd.merge(df_daily, buy_surplus, left_index=True, right_index=True)
#         df_daily['foreign_5'] = df_daily.foreign.rolling(5).mean()
#         df_daily['foreign_10'] = df_daily.foreign.rolling(10).mean()
        
#         chip_cols = ['foreign_5', 'foreign_10']
        
        stock_data[stock_id] = df_daily[fundamental_cols + technical_cols].dropna()

    return stock_data

stock_data = prepare_stock_data(db)

In [77]:
#####################
### label 
#####################
for stock_id in stock_data:
    df = stock_data[stock_id].copy()
    max_close = df['close'].rolling(INTERVAL).max().shift(-INTERVAL)
    min_close = df['close'].rolling(INTERVAL).min().shift(-INTERVAL)
    buy_high_return = max_close / df.close - 1 > 0.1
    buy_low_risk = min_close / df.close - 1 > -0.05
    sell_high_return = min_close / df.close - 1 < -0.1
    sell_low_risk = max_close / df.close - 1 < 0.05
    
    df['label'] = 0
    for i in range(len(df.index) - INTERVAL):
        if buy_high_return.iloc[i] & buy_low_risk.iloc[i]:
            df['label'].iloc[i] = 1
        elif sell_high_return.iloc[i] & sell_low_risk.iloc[i]:
            df['label'].iloc[i] = -1
    
    df.to_csv('ml_data/' + stock_id + '.csv', index=True, index_label='date')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [23]:
db.get_stock_info().index

Index(['1101', '1102', '1303', '2105', '2002', '1402', '1434', '9904', '1216',
       '1227', '2377', '2357', '2301', '2354', '2385', '3702', '2347', '2303',
       '6239', '2317', '2382', '4938', '2356', '3231'],
      dtype='object', name='stock_id')

In [78]:
#####################
### read data
#####################
# TEST_STOCK_IDS = [] 
# TRAIN_STOCK_IDS = ['1102', '1303', '2002', '1402', '9904', '1216', '2377', '2357', 
#                    '2354', '2385', '2347', '2303', '2317', '2382', '2356', '3231']

train_dfs = []
stock_data = {}
for stock_id in db.get_stock_info().index:
    df = pd.read_csv('ml_data/' + stock_id + '.csv', index_col=['date'], parse_dates=['date'])
    stock_data[stock_id] = df
#     if stock_id not in TEST_STOCK_IDS:
#     if stock_id in TRAIN_STOCK_IDS:
    train_dfs.append(df.iloc[:-INTERVAL])

df_all = pd.concat(train_dfs).sort_index()
y = df_all[['label']]
X = df_all.drop(['label'], axis=1)

In [48]:
def accuracy_score(s1, s2, silence=True):
    result = []
    for label, predict in zip(s1, s2):
        if predict != 0:
            score = 1 if predict == label else 0
            result.append(score)
    if len(result) == 0:
        return None
    if not silence:
        print(np.sum(result), '/', len(result), '=')
    return np.mean(result)

In [84]:
#####################
### train
#####################
import numpy as np
import itertools
from sklearn.ensemble import RandomForestClassifier

def build_and_train(X, y, n_estimators=30, max_depth=10, silence=False):
    print('\nn_estimators', n_estimators, 'max_depth', max_depth)
    ### build model
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, max_features='sqrt', random_state=7)
        
    ### split train & test data
    day_step = 120
    dates = sorted(list(set(X.index)))
    dates_length = len(dates)
    accuracy_train = []
    accuracy_test = []
    
    for i_start in range(day_step * 5, dates_length, day_step):
        i_end = i_start + day_step
        if i_end >= dates_length:
            i_end = dates_length - 1
        if not silence:
            print('==============================')
            print('test data from', dates[i_start].date() , 'to', dates[i_end].date())
        X_train, X_test = (X.loc[dates[0] : dates[i_start]], X.loc[dates[i_start] : dates[i_end]])
        y_train, y_test = (np.ravel(y.loc[dates[0] : dates[i_start]]), np.ravel(y.loc[dates[i_start] : dates[i_end]]))
        
        ### train model
        model.fit(X_train, y_train)
    
        y_pred = model.predict(X_train)
        acc = accuracy_score(y_train, y_pred)
        accuracy_train.append(acc)
        if not silence:
            print('Train Accuracy:', acc)

        ### predict
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accuracy_test.append(acc)
        if not silence:
            print('Test Accuracy: ', acc)
    
    print('Average Train Accuracy:', np.mean(accuracy_train))
    print('Average Test Accuracy:', np.mean(accuracy_test))
    return model

# for n_estimators, max_depth in itertools.product([30, 20, 10], [8, 6, 4]):
#     build_and_train(X, y, n_estimators, max_depth, True)
model = build_and_train(X, y, 30, 10)


n_estimators 30 max_depth 10
test data from 2017-07-03 to 2017-12-21
Train Accuracy: 0.9724995291015257
Test Accuracy:  0.6344827586206897
test data from 2017-12-21 to 2018-06-26
Train Accuracy: 0.9690670214535174
Test Accuracy:  0.5540123456790124
test data from 2018-06-26 to 2018-12-13
Train Accuracy: 0.9582432586187279
Test Accuracy:  0.5489443378119002
test data from 2018-12-13 to 2019-06-18
Train Accuracy: 0.9592688257500497
Test Accuracy:  0.6405959031657356
Average Train Accuracy: 0.9647696587309551
Average Test Accuracy: 0.5945088363193345


In [80]:
feature_importances = pd.Series(data=model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importances)

close          0.113011
眼光費            0.108976
本淨比            0.095809
本益比            0.087469
EPS4季          0.084024
12月營收_年增率      0.080118
淨值/股           0.063787
3月營收_年增率       0.063101
毛利率_年增         0.055533
營益率_年增         0.055295
EPS4季_季增率      0.051245
當月營收_年增率       0.047016
淨利率_年增         0.045948
close_10/20    0.020204
close_5/10     0.012497
close_1/5      0.008425
volume_1/10    0.004287
volume_1/5     0.003254
dtype: float64


In [17]:
from sklearn.externals import joblib
### save model
# joblib.dump(model, 'ml_data/model.pkl')

### load model
# model = joblib.load('ml_data/model.pkl')

In [86]:
#####################
### predict & plot
#####################
def get_region(series, value):
    region = []
    get = False
    for i in range(len(series.index)):
        if series[i] == value:
            if not get:
                region.append([series.index[i]])
                get = True
            continue
        if get:
            region[-1].append(series.index[i - 1])
            get = False
    if len(region) > 0 and len(region[-1]) == 1:
        region[-1].append(series.index[-1])
    return region

def draw_region(region, ax, color):
    for xmin, xmax in region:
        ax.axvspan(xmin, xmax, color=color, alpha=0.5)

def compare_label(label, predict):
    label_buy = np.sum(label == 1)
    label_sell = np.sum(label == -1)
    predict_buy = np.sum(predict == 1)
    predict_sell = np.sum(predict == -1)
    print('buy', predict_buy, '/', label_buy, '=', predict_buy / label_buy)
    print('sell', predict_sell, '/', label_sell, '=', predict_sell / label_sell)
    
stock_info = db.get_stock_info()
for stock_id in stock_data:
# for stock_id in TRAIN_STOCK_IDS:
#     if stock_id not in TEST_STOCK_IDS:
#         continue
    stock_name = stock_info.loc[stock_id, 'stock_name']
    df_daily_ori = stock_data[stock_id]
    df_daily = df_daily_ori.copy()
    ### predict
    df_daily['predict'] = model.predict(df_daily_ori.drop(['label'], axis=1))
    compare_label(df_daily.label.iloc[:-INTERVAL], df_daily.predict.iloc[:-INTERVAL])
    print('Test Accuracy', stock_name, stock_id, 
          accuracy_score(df_daily.label.iloc[:-INTERVAL], df_daily.predict.iloc[:-INTERVAL], False), '\n')
    ### plot
#     region_buy = get_region(df_daily['predict'], 1)
#     ax = df_daily[['close']].plot(grid=True, title=stock_id + ',' + stock_name, figsize=(10,4))
#     draw_region(region_buy, ax, 'red')
#     region_sell = get_region(df_daily['predict'], -1)
#     draw_region(region_sell, ax, 'green')

buy 229 / 349 = 0.6561604584527221
sell 131 / 177 = 0.7401129943502824
358 / 360 =
Test Accuracy 台泥 1101 0.9944444444444445 

buy 303 / 389 = 0.7789203084832905
sell 243 / 292 = 0.8321917808219178
538 / 546 =
Test Accuracy 亞泥 1102 0.9853479853479854 

buy 273 / 317 = 0.861198738170347
sell 232 / 291 = 0.7972508591065293
478 / 505 =
Test Accuracy 南亞 1303 0.9465346534653465 

buy 73 / 140 = 0.5214285714285715
sell 411 / 459 = 0.8954248366013072
475 / 484 =
Test Accuracy 正新 2105 0.981404958677686 

buy 139 / 145 = 0.9586206896551724
sell 165 / 163 = 1.0122699386503067
301 / 304 =
Test Accuracy 中鋼 2002 0.9901315789473685 

buy 301 / 309 = 0.9741100323624595
sell 173 / 263 = 0.6577946768060836
460 / 474 =
Test Accuracy 遠東新 1402 0.9704641350210971 

buy 279 / 340 = 0.8205882352941176
sell 65 / 119 = 0.5462184873949579
287 / 344 =
Test Accuracy 福懋 1434 0.8343023255813954 

buy 32 / 150 = 0.21333333333333335
sell 172 / 327 = 0.5259938837920489
169 / 204 =
Test Accuracy 寶成 9904 0.82843137254901