# Training a tabular fast.ai NN for stock signal generation

## on a Technical Indicators and OHCV History

In [1]:
from fastai.tabular import *
from tqdm import tqdm_notebook as tqdm

In [2]:
import pandas as pd
from TechnicalIndicators import sma, ema, macd, bollinger_bands, obv

### Download backtesting data

In [3]:
rawdf = pd.read_csv('data_small.csv')
rawdf.head()

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,num_trades,tkbbav,tkqav,ign.
0,1501545600000,0.072459,0.075079,0.07237,0.074589,966.046,1501549199999,71.14797,593,491.626,36.116752,25543.757517
1,1501549200000,0.074399,0.074514,0.072721,0.073383,506.858,1501552799999,37.242746,527,252.594,18.56375,25590.694846
2,1501552800000,0.073357,0.073645,0.072588,0.073437,424.002,1501556399999,31.028191,590,187.941,13.754426,25588.67278
3,1501556400000,0.073579,0.074142,0.073064,0.073869,500.44,1501559999999,36.825663,538,247.589,18.211874,25576.311149
4,1501560000000,0.073895,0.076981,0.073869,0.075423,1011.387,1501563599999,76.215404,562,569.096,42.873766,25643.886224


In [4]:
dataset_df = rawdf[['open_time', 'open', 'high', 'low', 'close', 'volume' ]]
print(dataset_df.shape)
dataset_df.head()

(17464, 6)


Unnamed: 0,open_time,open,high,low,close,volume
0,1501545600000,0.072459,0.075079,0.07237,0.074589,966.046
1,1501549200000,0.074399,0.074514,0.072721,0.073383,506.858
2,1501552800000,0.073357,0.073645,0.072588,0.073437,424.002
3,1501556400000,0.073579,0.074142,0.073064,0.073869,500.44
4,1501560000000,0.073895,0.076981,0.073869,0.075423,1011.387


### Add Technical Indicators as features

In [5]:
price_df = dataset_df.open
volume_df = dataset_df.volume

ema10_list = ema(price_df.tolist(), n=10)
ema20_list = ema(price_df.tolist(), n=20)
ema50_list = ema(price_df.tolist(), n=20)

sma10_list = sma(price_df.tolist(), n=20)
sma20_list = sma(price_df.tolist(), n=20)
sma50_list = sma(price_df.tolist(), n=20)

bb10 = bollinger_bands(price_df.tolist(), 10, mult=2)
bb10_low = [x[0] for x in bb10]
bb10_mid = [x[1] for x in bb10]
bb10_up = [x[2] for x in bb10]

bb20 = bollinger_bands(price_df.tolist(), 20, mult=2)
bb20_low = [x[0] for x in bb20]
bb20_mid = [x[1] for x in bb20]
bb20_up = [x[2] for x in bb20]

macd_list = macd(price_df.tolist())

obv_list = obv(volume_df.tolist(), price_df.tolist())

### We have indicators for a recent slice of data, so we cut out previous candlesticks

In [6]:
ti_dict = {'ema10': ema10_list, 'ema20': ema20_list, 'ema50': ema50_list, 'sma10': sma10_list,
          'sma20': sma20_list, 'sma50': sma50_list, 'macd': macd_list, 'obv': obv_list,
          'bb10_low': bb10_low, 'bb10_mid': bb10_mid, 'bb10_up': bb10_up, 'bb20_low': bb20_low,
          'bb20_mid': bb20_mid, 'bb20_up': bb20_up}
l = ''
c = 100 ** 10
for label, data in ti_dict.items():
    if len(data) < c:
        l, c = label, len(data)
l, c

('macd', 17438)

In [7]:
# First cut down our TI's
for label, data in ti_dict.items():
    cut_amount = len(data) - len(macd_list)
    ti_dict[label] = data[cut_amount:]    

In [8]:
# Next cut down our dataset_df
dataset_df = dataset_df.iloc[dataset_df.shape[0]-len(macd_list):]
dataset_df.shape[0]

17438

In [9]:
for label, data in ti_dict.items():
    dataset_df[label] = data
dataset_df.head()

Unnamed: 0,open_time,open,high,low,close,volume,ema10,ema20,ema50,sma10,sma20,sma50,macd,obv,bb10_low,bb10_mid,bb10_up,bb20_low,bb20_mid,bb20_up
26,1501639200000,0.08303,0.083537,0.082225,0.083236,296.728,0.443623,0.483246,0.483246,0.08047,0.08047,0.08047,0.34272,3009.083,0.082662,0.085992,0.079332,0.08047,0.086044,0.074896
27,1501642800000,0.083236,0.083276,0.082689,0.083271,396.149,0.4462,0.520459,0.520459,0.080812,0.080812,0.080812,0.277345,3405.232,0.083066,0.085321,0.080811,0.080812,0.086125,0.075499
28,1501646400000,0.083191,0.083276,0.082163,0.082594,501.075,0.448264,0.554082,0.554082,0.08109,0.08109,0.08109,0.216324,2904.157,0.083321,0.084929,0.081712,0.08109,0.086298,0.075883
29,1501650000000,0.082453,0.082928,0.082192,0.082425,453.661,0.449215,0.583766,0.583766,0.081353,0.081353,0.081353,0.159415,2450.496,0.083512,0.084398,0.082625,0.081353,0.086427,0.076279
30,1501653600000,0.082327,0.082941,0.081902,0.082113,484.671,0.449866,0.610496,0.610496,0.0815,0.0815,0.0815,0.106433,1965.825,0.083429,0.084517,0.082341,0.0815,0.086522,0.076478


### Add previous candle data as features

In [25]:
# Add columns with default values into the DataFrame
features = ['prev_high', 'prev2_high', 'prev3_high', 'prev4_high', 'prev5_high', 'prev6_high',
            'prev7_high', 'prev8_high', 'prev9_high' ,'prev10_high', 'prev_low', 'prev2_low', 
            'prev3_low', 'prev4_low', 'prev5_low', 'prev6_low', 'prev7_low', 'prev8_low',
            'prev9_low', 'prev10_low', 
            'prev_open', 'prev2_open', 'prev3_open', 'prev4_open', 'prev5_open', 'prev6_open',
            'prev7_open', 'prev8_open', 'prev9_open' ,'prev10_open']

for f in features:
    dataset_df[f] = 'default'
    
new_features = []
    
# Add the features one by one
for feature in features:
    num = feature[4] # Get the number
    if num == '1': num = 10 
    if num == '_': num = 1
    num = int(num)
    
    if 'high' in feature:
        t = 'high'
    elif 'low' in feature:
        t = 'low'
    elif 'open' in feature:
        t = 'open'

    temp = []
    for i in tqdm(range(dataset_df.shape[0])):
        if i >= num:
            temp.append(dataset_df.iloc[i-num][t])
            
    new_features.append(temp)
            

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17438), HTML(value='')))

### Once again cut down our candlesticks, this time for the OPHCV data

In [26]:
x = len(new_features[0])
# determine shortest length
for feature in new_features:
    if len(feature) < x:
        x = len(feature)
x        

17428

In [27]:
# Cut down all datasets
for index, feature in enumerate(new_features):
    new_features[index] = feature[len(feature)-x:]

In [31]:
# Next cut down our dataset_df
dataset_df = dataset_df.iloc[dataset_df.shape[0]-x:]
dataset_df.shape[0]

17428

In [32]:
# Add the new columns into the dataset
for idx, feature in enumerate(features):
    dataset_df[feature] = new_features[idx]

In [39]:
print(dataset_df.shape)
dataset_df.to_csv('training_data.csv')

dataset_df.head()

(17428, 50)


Unnamed: 0,open_time,open,high,low,close,volume,ema10,ema20,ema50,sma10,...,prev_open,prev2_open,prev3_open,prev4_open,prev5_open,prev6_open,prev7_open,prev8_open,prev9_open,prev10_open
36,1501675200000,0.080901,0.08102,0.079679,0.079893,622.04,0.449574,0.722661,0.722661,0.082563,...,0.081359,0.081898,0.083004,0.082033,0.082113,0.082327,0.082453,0.083191,0.083236,0.08303
37,1501678800000,0.079893,0.082663,0.079893,0.082294,645.395,0.447726,0.733729,0.733729,0.082659,...,0.080901,0.081359,0.081898,0.083004,0.082033,0.082113,0.082327,0.082453,0.083191,0.083236
38,1501682400000,0.082294,0.082563,0.080099,0.08096,423.548,0.448616,0.746144,0.746144,0.082619,...,0.079893,0.080901,0.081359,0.081898,0.083004,0.082033,0.082113,0.082327,0.082453,0.083191
39,1501686000000,0.080988,0.081093,0.080187,0.080383,575.297,0.448037,0.756071,0.756071,0.08267,...,0.082294,0.079893,0.080901,0.081359,0.081898,0.083004,0.082033,0.082113,0.082327,0.082453
40,1501689600000,0.080383,0.081341,0.080244,0.081341,492.436,0.446959,0.764447,0.764447,0.082555,...,0.080988,0.082294,0.079893,0.080901,0.081359,0.081898,0.083004,0.082033,0.082113,0.082327


### Add our dependant variable - the trading signal

### Train the model

In [37]:
procs = [FillMissing, Categorify, Normalize]
valid_idx = range(int(0.7 * dataset_df.shape[0]), len(dataset_df))
valid_idx

range(12199, 17428)

In [None]:
dep_var = ''