In [1]:
from fetch_data import fetch_stock

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) in inches for every plot in this notebook.
plt.rcParams['figure.figsize'] = (20,10)

In [2]:
# Ticker symbol handed to the project-local fetch_stock helper.
symbol = '600297.SS'
# fetch_stock comes from fetch_data; the following cells show it yields a
# DataFrame with timestamp/open/high/low/close/volume columns.
df = fetch_stock(symbol)

stock data fetching completed
reading data into dataframe


In [3]:
# Inspect column dtypes and non-null counts of the fetched frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4590 entries, 4589 to 0
Data columns (total 6 columns):
timestamp    4590 non-null datetime64[ns]
open         4590 non-null float64
high         4590 non-null float64
low          4590 non-null float64
close        4590 non-null float64
volume       4590 non-null int64
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 251.0 KB


In [4]:
# Peek at the last rows of the fetched frame.
df.tail()

Unnamed: 0,timestamp,open,high,low,close,volume
4,2019-05-16,4.56,4.61,4.53,4.56,13591453
3,2019-05-17,4.66,4.74,4.4,4.43,21757671
2,2019-05-20,4.31,4.36,4.14,4.27,17862636
1,2019-05-21,4.28,4.46,4.25,4.42,18984598
0,2019-05-22,4.42,4.54,4.39,4.53,12848630


In [60]:
class stock_data():
    """Feature builder and time-ordered train/test splitter for one price frame.

    Attributes
    ----------
    data : DataFrame
        The input frame indexed by ``timestamp`` with zero-volume rows removed.
    proc_data : DataFrame
        Copy of ``data`` to which feature and label columns are appended.
    skip : int
        Number of leading rows to exclude from training because the derived
        columns are not yet defined there (largest lag / window / label period
        requested so far).
    tar : str or None
        Name of the label column created by ``create_trend_label``; ``None``
        until that method has been called.
    data_xtrain, data_xtest, data_ytrain, data_ytest
        Set by ``create_train_test``.

    Notes
    -----
    ``shift`` and ``rolling`` operate on row order, so the rows are assumed to
    be in chronological order -- TODO confirm for every data source.
    """

    def __init__(self, df):
        """Index ``df`` by its ``timestamp`` column and drop rows whose ``volume`` is 0."""
        self.data = df.set_index(['timestamp']).query('volume!=0').copy()
        self.proc_data = self.data.copy()
        # Defined up front so create_train_test works (or fails with a clear
        # message) even when preprocess / create_trend_label were not called.
        self.skip = 0
        self.tar = None

    def next_n_return(self, n):
        """Add ``{col}_next_{n}_day_return`` for every original column.

        Despite the column name, the value at row t is the relative change
        from row t-n to row t, i.e. a trailing return that only uses past data.
        The first ``n`` rows are NaN.
        """
        for col in self.data.columns:
            previous = self.proc_data[col].shift(n)
            self.proc_data[f'{col}_next_{n}_day_return'] = \
                (self.proc_data[col] - previous) / previous

    def create_trend_label(self, period):
        """Add a +1 / -1 label column and remember its name in ``self.tar``.

        The label at row t is +1 when ``close`` rose versus ``period`` rows
        earlier and -1 otherwise (unchanged prices count as -1). The first
        ``period`` rows have no reference price and also come out as -1, so
        ``self.skip`` is raised to at least ``period`` to keep those
        placeholder labels out of the training slice.
        """
        label = f'next_{period}_return_label'
        change = self.proc_data.close - self.proc_data.close.shift(period)
        # Vectorised form of `1 if x > 0 else -1`; NaN > 0 is False, hence -1.
        self.proc_data[label] = change.gt(0).astype('int64') * 2 - 1
        self.tar = label
        self.skip = max(self.skip, period)

    def rolling_average(self, windows):
        """Add ``{col}_{windows}_rolling_mean`` for every original column."""
        for col in self.data.columns:
            self.proc_data[f'{col}_{windows}_rolling_mean'] = self.data[col].rolling(windows).mean()

    def rolling_std(self, windows):
        """Add ``{col}_{windows}_rolling_std`` for every original column."""
        for col in self.data.columns:
            self.proc_data[f'{col}_{windows}_rolling_std'] = self.data[col].rolling(windows).std()

    def preprocess(self, ns=None, windows=None):
        """Add return features for each lag in ``ns`` and rolling mean/std for each window.

        Parameters
        ----------
        ns : iterable of int, optional
            Lags passed to ``next_n_return``. ``None`` (or ``None`` entries,
            such as the former ``[None]`` default) means "no return features".
        windows : iterable of int, optional
            Window sizes passed to ``rolling_average`` and ``rolling_std``;
            ``None`` is handled the same way as for ``ns``.

        ``self.skip`` is raised to the largest lag or window seen so far. It is
        never lowered, because columns from earlier calls stay in
        ``proc_data``. Errors raised while building features now propagate
        instead of being reported as 'no ns' / 'no windows'.
        """
        lags = [] if ns is None else [n for n in ns if n is not None]
        spans = [] if windows is None else [w for w in windows if w is not None]

        if not lags:
            print('no ns')
        for n in lags:
            self.next_n_return(n)

        if not spans:
            print('no windows')
        for w in spans:
            self.rolling_average(w)
            self.rolling_std(w)

        self.skip = max([self.skip, *lags, *spans])

    def create_train_test(self, trn_len=None, ts=0.1, classifier=False):
        """Split ``proc_data`` by row position into train/test features and next-row targets.

        Features at row t are paired with the target at row t+1, so the model
        predicts one row ahead. The first ``self.skip`` rows are left out of
        the training slice.

        Parameters
        ----------
        trn_len : int, optional
            Row position at which the test targets start. When falsy it is
            computed as ``int(len(self.data) * ts)``.
        ts : float
            Fraction used to derive ``trn_len``.
            NOTE(review): as written, ``ts`` is the *training* fraction, so the
            default 0.1 trains on ~10% of the rows and tests on ~90%. If ``ts``
            was meant as a test-size fraction this should be ``1 - ts`` --
            confirm with the author before changing.
        classifier : bool
            Use the trend label (``self.tar``) as target instead of ``'close'``.

        Raises
        ------
        ValueError
            If ``classifier`` is true but ``create_trend_label`` was never called.
        """
        if classifier and self.tar is None:
            raise ValueError('call create_trend_label() before requesting a classifier split')

        trn_len = trn_len if trn_len else int(len(self.data) * ts)
        tar = self.tar if classifier else 'close'
        target = self.proc_data[tar]

        # Features stop one row early; targets start one row late (t -> t+1).
        self.data_xtrain = self.proc_data.iloc[self.skip:trn_len - 1]
        self.data_xtest = self.proc_data.iloc[trn_len - 1:-1]
        self.data_ytrain = target.iloc[self.skip + 1:trn_len]
        self.data_ytest = target.iloc[trn_len:]
        

In [61]:
# Wrap the fetched frame: stock_data indexes it by timestamp and drops zero-volume rows.
stock = stock_data(df)

In [67]:
# Sanity check: confirm which class the instance was built from.
type(stock)

__main__.stock_data

In [62]:
# Add 1- and 5-row return features plus 1/5/10-row rolling mean and std for every column.
# NOTE(review): with a window of 1 the rolling std is computed from a single
# observation, which pandas reports as NaN -- confirm those columns are wanted.
stock.preprocess(ns=[1, 5], windows=[1, 5, 10])

In [63]:
# Add the +1/-1 label derived from the 1-row change in close (this also sets stock.tar).
stock.create_trend_label(1)
# Preview the processed frame; leading rows are NaN for lagged/rolling features.
stock.proc_data.head()

Unnamed: 0_level_0,open,high,low,close,volume,open_next_1_day_return,high_next_1_day_return,low_next_1_day_return,close_next_1_day_return,volume_next_1_day_return,...,high_10_rolling_mean,low_10_rolling_mean,close_10_rolling_mean,volume_10_rolling_mean,open_10_rolling_std,high_10_rolling_std,low_10_rolling_std,close_10_rolling_std,volume_10_rolling_std,next_1_return_label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-11-16,5.1451,5.4926,4.9648,5.0211,172301855,,,,,,...,,,,,,,,,,-1
2000-11-17,4.9665,4.9764,4.6621,4.672,36627024,-0.034713,-0.093981,-0.060969,-0.069527,-0.787425,...,,,,,,,,,,-1
2000-11-20,4.6009,4.7613,4.5661,4.6654,13004598,-0.073613,-0.043224,-0.020592,-0.001413,-0.644945,...,,,,,,,,,,-1
2000-11-21,4.6654,4.6968,4.5843,4.6141,9329042,0.014019,-0.013547,0.003986,-0.010996,-0.282635,...,,,,,,,,,,-1
2000-11-22,4.5827,4.715,4.5744,4.6306,6109951,-0.017726,0.003875,-0.00216,0.003576,-0.345061,...,,,,,,,,,,1


In [64]:
# Build the row-ordered train/test split with default settings
# (classifier=False, so the target is the next row's 'close').
stock.create_train_test()

In [48]:
import xgboost as xgb

In [65]:
# Gradient-boosted regressor with 1000 trees, using all available cores.
xgb_reg = xgb.XGBRegressor(n_estimators=1000, n_jobs=-1)
# Train on the split produced by stock.create_train_test().
xgb_reg.fit(stock.data_xtrain, stock.data_ytrain, verbose=2)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [66]:
# Predict the held-out rows and report the mean absolute error.
preds = xgb_reg.predict(stock.data_xtest)
# scikit-learn metrics take (y_true, y_pred); MAE is symmetric, but keeping the
# documented order avoids surprises if an asymmetric metric is swapped in later.
mean_absolute_error(stock.data_ytest, preds)

1.1516041799947643

In [54]:
# Gradient-boosted classifier with 1000 trees, using all available cores.
# NOTE(review): despite the `xg_reg` name this is a classifier, and the name is
# one character away from the regressor `xgb_reg` -- consider renaming both uses.
xg_reg = xgb.XGBClassifier(n_estimators=1000, n_jobs=-1)


In [55]:
# The split built earlier used classifier=False, i.e. the target was the
# continuous 'close' price. Rebuild it with the trend label as target so the
# classifier is trained (and later scored) on +1/-1 classes.
stock.create_train_test(classifier=True)
xg_reg.fit(stock.data_xtrain, stock.data_ytrain, verbose=1)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=1000, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [57]:
from sklearn.metrics import accuracy_score

In [58]:
# Predict the held-out rows and report the share of correctly predicted labels.
preds = xg_reg.predict(stock.data_xtest)
# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, but the
# documented order matters for precision/recall-style metrics.
accuracy_score(stock.data_ytest, preds)

0.4967793880837359