In [1]:
from db import DB
db = DB()

INTERVAL = 120

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.style.use('fivethirtyeight') # fivethirtyeight, fast
plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei'] # font to handle chinese
%matplotlib inline

In [35]:
#####################
### prepare data 
#####################
import pandas as pd
from talib import abstract
import utility as util

def prepare_stock_data(db):
    stock_data = {}
    for stock_id in db.get_stock_info().index:
        df_daily = db.get_daily_price(stock_id)
        df_monthly = db.get_by_stock_id(stock_id, 'monthly_revenue')
        df_quarterly = db.get_by_stock_id(stock_id, 'quarterly_report')
        
        ### fundamental
        def compared_ratio(s, num):
            return s / s.shift(num) - 1
        def compared_diff(s, num):
            return s - s.shift(num)
        
        df_monthly['3月營收'] = df_monthly.當月營收.rolling(3).mean()
        df_monthly['12月營收'] = df_monthly.當月營收.rolling(12).mean()
        df_monthly['當月營收_年增率'] = compared_ratio(df_monthly['當月營收'], 12)
        df_monthly['3月營收_年增率'] = compared_ratio(df_monthly['3月營收'], 12)
        df_monthly['12月營收_年增率'] = compared_ratio(df_monthly['12月營收'], 12)
        util.fill_short_interval_by_long_interval(df_daily, df_monthly, '當月營收_年增率')
        util.fill_short_interval_by_long_interval(df_daily, df_monthly, '3月營收_年增率')
        util.fill_short_interval_by_long_interval(df_daily, df_monthly, '12月營收_年增率')
        
        df_quarterly['毛利率'] = df_quarterly.毛利 / df_quarterly.營收
        df_quarterly['營益率'] = df_quarterly.營利 / df_quarterly.營收
        df_quarterly['淨利率'] = df_quarterly.稅後淨利 / df_quarterly.營收
        df_quarterly['毛利率_年增'] = compared_diff(df_quarterly['毛利率'], 4)
        df_quarterly['營益率_年增'] = compared_diff(df_quarterly['營益率'], 4)
        df_quarterly['淨利率_年增'] = compared_diff(df_quarterly['淨利率'], 4)
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, '毛利率_年增')
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, '營益率_年增')
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, '淨利率_年增')
    
        df_quarterly['EPS4季'] = df_quarterly.EPS.rolling(4).sum()
        df_quarterly['EPS4季_季增率'] = compared_ratio(df_quarterly['EPS4季'], 1)
        股數 = df_quarterly.股本 / 10
        df_quarterly['淨值/股'] = df_quarterly.權益 / 股數
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, 'EPS4季')
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, 'EPS4季_季增率')
        util.fill_short_interval_by_long_interval(df_daily, df_quarterly, '淨值/股')
        
        df_daily['本益比'] = df_daily.close / df_daily.EPS4季
        df_daily['本淨比'] = df_daily.close / df_daily['淨值/股']
        df_daily['眼光費'] = (df_daily.close - df_daily['淨值/股']) / df_daily.EPS4季
        
        fundamental_cols = ['當月營收_年增率', '3月營收_年增率', '12月營收_年增率',
                            '毛利率_年增', '營益率_年增', '淨利率_年增',
                            'EPS4季', 'EPS4季_季增率', '淨值/股', '本益比', '本淨比', '眼光費']
        
        ### technical
        close_5 = df_daily.close.rolling(5).mean()
        close_10 = df_daily.close.rolling(10).mean()
        close_20 = df_daily.close.rolling(20).mean()
        volume_5 = df_daily.volume.rolling(5).mean()
        volume_10 = df_daily.volume.rolling(10).mean()
        df_daily['close_1/5'] = df_daily.close / close_5
        df_daily['close_5/10'] = close_5 / close_10
        df_daily['close_10/20'] = close_10 / close_20
        df_daily['volume_1/5'] = df_daily.volume / volume_5
        df_daily['volume_1/10'] = df_daily.volume / volume_10
        
#         df_daily['close_5'] = df_daily.close.rolling(5).mean()
#         df_daily['close_10'] = df_daily.close.rolling(10).mean()
#         df_daily['close_20'] = df_daily.close.rolling(20).mean()
#         df_daily['volume_5'] = df_daily.volume.rolling(5).mean()
#         df_daily['volume_10'] = df_daily.volume.rolling(10).mean()
#         MACD = abstract.MACD(df_daily)
#         MACD.columns = ['DIFF', 'DEM', 'D-M']
#         df_daily[['DIFF', 'DEM', 'D-M']] = MACD
#         KD = abstract.STOCH(df_daily)
#         KD.columns = ['K', 'D']
#         df_daily[['K', 'D']] = KD
        
#         technical_cols = ['close', 'close_5', 'close_10', 'close_20', 'volume', 'volume_5', 'volume_10', 
#                           'DIFF', 'DEM', 'K', 'D']
        technical_cols = ['close', 'close_1/5', 'close_5/10', 'close_10/20', 'volume_1/5', 'volume_1/10']
        
        ### chip
#         buy_surplus = db.get_by_stock_id(stock_id, 'daily_buy_sell_surplus')
#         df_daily = pd.merge(df_daily, buy_surplus, left_index=True, right_index=True)
#         df_daily['foreign_5'] = df_daily.foreign.rolling(5).mean()
#         df_daily['foreign_10'] = df_daily.foreign.rolling(10).mean()
        
#         chip_cols = ['foreign_5', 'foreign_10']
        
        stock_data[stock_id] = df_daily[fundamental_cols + technical_cols].dropna()

    return stock_data

stock_data = prepare_stock_data(db)

In [36]:
#####################
### label 
#####################
for stock_id in stock_data:
    df = stock_data[stock_id].copy()
    max_close = df['close'].rolling(INTERVAL).max().shift(-INTERVAL)
    min_close = df['close'].rolling(INTERVAL).min().shift(-INTERVAL)
    buy_high_return = max_close / df.close - 1 > 0.3
    buy_low_risk = min_close / df.close - 1 > -0.1
    sell_high_return = min_close / df.close - 1 < -0.15
    sell_low_risk = max_close / df.close - 1 < 0.1
    
    df['label'] = 0
    for i in range(len(df.index) - INTERVAL):
        if buy_high_return.iloc[i] & buy_low_risk.iloc[i]:
            df['label'].iloc[i] = 1
        elif sell_high_return.iloc[i] & sell_low_risk.iloc[i]:
            df['label'].iloc[i] = -1
    
    df.to_csv('ml_data/' + stock_id + '.csv', index=True, index_label='date')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [23]:
db.get_stock_info().index

Index(['1101', '1102', '1303', '2105', '2002', '1402', '1434', '9904', '1216',
       '1227', '2377', '2357', '2301', '2354', '2385', '3702', '2347', '2303',
       '6239', '2317', '2382', '4938', '2356', '3231'],
      dtype='object', name='stock_id')

In [57]:
#####################
### read data
#####################
TEST_STOCK_IDS = ['1101', '2105', '1434', '1227', '2301', '3702', '6239', '4938'] 

train_dfs = []
stock_data = {}
for stock_id in db.get_stock_info().index:
    df = pd.read_csv('ml_data/' + stock_id + '.csv', index_col=['date'], parse_dates=['date'])
    stock_data[stock_id] = df
    if stock_id not in TEST_STOCK_IDS:
        train_dfs.append(df.iloc[:-INTERVAL])

df_all = pd.concat(train_dfs).sort_index()
y = df_all[['label']]
X = df_all.drop(['label'], axis=1)

In [48]:
def accuracy_score(s1, s2, silence=True):
    result = []
    for label, predict in zip(s1, s2):
        if predict != 0:
            score = 1 if predict == label else 0
            result.append(score)
    if len(result) == 0:
        return None
    if not silence:
        print(np.sum(result), '/', len(result), '=')
    return np.mean(result)

In [61]:
#####################
### train
#####################
import numpy as np
import itertools
from sklearn.ensemble import RandomForestClassifier

def build_and_train(X, y, n_estimators=30, max_depth=10, silence=False):
    print('\nn_estimators', n_estimators, 'max_depth', max_depth)
    ### build model
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, max_features='sqrt', random_state=7)
        
    ### split train & test data
    day_step = 120
    dates = sorted(list(set(X.index)))
    dates_length = len(dates)
    accuracy_train = []
    accuracy_test = []
    
    for i_start in range(day_step * 5, dates_length, day_step):
        i_end = i_start + day_step
        if i_end >= dates_length:
            i_end = dates_length - 1
        if not silence:
            print('==============================')
            print('test data from', dates[i_start].date() , 'to', dates[i_end].date())
        X_train, X_test = (X.loc[dates[0] : dates[i_start]], X.loc[dates[i_start] : dates[i_end]])
        y_train, y_test = (np.ravel(y.loc[dates[0] : dates[i_start]]), np.ravel(y.loc[dates[i_start] : dates[i_end]]))

        ### train model
        model.fit(X_train, y_train)
    
        y_pred = model.predict(X_train)
        acc = accuracy_score(y_train, y_pred)
        accuracy_train.append(acc)
        if not silence:
            print('Train Accuracy:', acc)

        ### predict
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accuracy_test.append(acc)
        if not silence:
            print('Test Accuracy: ', acc)
    
    print('Average Train Accuracy:', np.mean(accuracy_train))
    print('Average Test Accuracy:', np.mean(accuracy_test))
    return model

# for n_estimators, max_depth in itertools.product([30, 20, 10], [8, 6, 4]):
#     build_and_train(X, y, n_estimators, max_depth, True)
model = build_and_train(X, y, 30, 10)


n_estimators 30 max_depth 10
test data from 2017-07-03 to 2017-12-21
Train Accuracy: 0.9915552427867699
Test Accuracy:  0.49645390070921985
test data from 2017-12-21 to 2018-06-26
Train Accuracy: 0.9915282954930532
Test Accuracy:  0.46255506607929514
test data from 2018-06-26 to 2018-12-13
Train Accuracy: 0.9855995569094433
Test Accuracy:  0.6495726495726496
test data from 2018-12-13 to 2019-06-18
Train Accuracy: 0.9877256317689531
Test Accuracy:  0.4140625
Average Train Accuracy: 0.9891021817395549
Average Test Accuracy: 0.5056610290902912


In [62]:
feature_importances = pd.Series(data=model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importances)

close          0.122406
12月營收_年增率      0.100256
本益比            0.096751
眼光費            0.091728
本淨比            0.088182
淨值/股           0.073414
3月營收_年增率       0.072788
EPS4季_季增率      0.071234
EPS4季          0.067441
當月營收_年增率       0.050056
毛利率_年增         0.046183
營益率_年增         0.043757
淨利率_年增         0.042328
close_10/20    0.014181
close_5/10     0.007895
close_1/5      0.005773
volume_1/10    0.003218
volume_1/5     0.002410
dtype: float64


In [17]:
from sklearn.externals import joblib
### save model
# joblib.dump(model, 'ml_data/model.pkl')

### load model
# model = joblib.load('ml_data/model.pkl')

In [63]:
#####################
### predict & plot
#####################
def get_region(series, value):
    region = []
    get = False
    for i in range(len(series.index)):
        if series[i] == value:
            if not get:
                region.append([series.index[i]])
                get = True
            continue
        if get:
            region[-1].append(series.index[i - 1])
            get = False
    if len(region) > 0 and len(region[-1]) == 1:
        region[-1].append(series.index[-1])
    return region

def draw_region(region, ax, color):
    for xmin, xmax in region:
        ax.axvspan(xmin, xmax, color=color, alpha=0.5)
    
stock_info = db.get_stock_info()
for stock_id in stock_data:
    if stock_id not in TEST_STOCK_IDS:
        continue
    stock_name = stock_info.loc[stock_id, 'stock_name']
    df_daily_ori = stock_data[stock_id]
    df_daily = df_daily_ori.copy()
    ### predict
    df_daily['predict'] = model.predict(df_daily_ori.drop(['label'], axis=1))
    print('Test Accuracy', stock_name, accuracy_score(df_daily.label.iloc[:-INTERVAL], df_daily.predict.iloc[:-INTERVAL], False))
    ### plot
#     region_buy = get_region(df_daily['predict'], 1)
#     ax = df_daily[['close']].plot(grid=True, title=stock_id + ',' + stock_name, figsize=(10,4))
#     draw_region(region_buy, ax, 'red')
#     region_sell = get_region(df_daily['predict'], -1)
#     draw_region(region_sell, ax, 'green')

48 / 168 =
Test Accuracy 台泥 0.2857142857142857
0 / 2 =
Test Accuracy 正新 0.0
102 / 449 =
Test Accuracy 福懋 0.22717149220489977
26 / 49 =
Test Accuracy 佳格 0.5306122448979592
0 / 105 =
Test Accuracy 光寶科 0.0
13 / 180 =
Test Accuracy 大聯大 0.07222222222222222
103 / 307 =
Test Accuracy 力成 0.3355048859934853
37 / 94 =
Test Accuracy 和碩 0.39361702127659576
