In [None]:
%load_ext autoreload
%autoreload 2
import sys, os, time, json, re
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# from func_tools import import_px_data, standardize, fetch_s3_trade_files, cnn_data_reshaping, reshape_lob_levels, back_to_labels, intraday_vol_ret
import data_preprocessing as dp
import visualization_tools as viz_t
from labelling_class import Labels_Generator, cleaned_labels, label_insights, get_strategy_pnl

import inspect

import plotly_express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import tensorflow as tf
from keras.utils import np_utils
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Reshape, Conv2D, LSTM, Dense, MaxPooling2D, BatchNormalization, LeakyReLU, concatenate, add, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

from configuration import config

In [None]:
# Cap TensorFlow's memory use on the first physical GPU so the notebook does not
# grab the whole card. This has to run before anything else initializes the GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Expose the first GPU as a single virtual device limited to memory_limit=6024
  # (the TensorFlow docs define memory_limit in megabytes, so roughly 6 GB).
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=6024)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized;
    # the error is only printed, so the notebook carries on with the default config.
    print(e)
# Last expression of the cell: displays the GPU device name TensorFlow reports.
tf.test.gpu_device_name()

## Data

In [None]:
# TODO: assert that the start date is earlier than the end date
# assert lob levels

In [None]:
# import dask.dataframe as dd

In [None]:
# Experiment parameters shared by the data import, labelling and model cells below.
frequency = timedelta(seconds=60)  # sampling interval handed to dp.import_px_data / dp.get_trade_data
pair = 'USDT_BTC'
date_start = '2020-11-11'
date_end = '2021-03-31'
# NOTE(review): the raw order-book cell further down rebinds lob_depth to 100, so on a
# top-to-bottom run the model cells see 100, not this value.
lob_depth = 10
norm_type = 'dyn_z_score'
# Window passed to dp.import_px_data for the normalisation.
# NOTE(review): the previous trailing note read "10 mins" / "7200 * 6"; the unit of
# `roll` is not visible in this notebook - confirm against data_preprocessing.
roll = 7200

In [None]:
%%time
train_dyn_df, test_dyn_df, top_ob_train, top_ob_test = dp.import_px_data(frequency, pair, date_start, date_end, lob_depth, norm_type, roll)

In [None]:
train_dyn_df.shape, test_dyn_df.shape, top_ob_train.shape, top_ob_test.shape

## Labels

In [None]:
start_plot = 0
end_plot = 50000

In [None]:
# get mid price timeseries with clean index
px_ts_train = top_ob_train.reset_index()['Mid_Price']
px_ts_test = top_ob_test.reset_index()['Mid_Price']

In [None]:
# series with datetime index for plotting
mid_px_series =  pd.Series(data=px_ts_train.values, index=top_ob_train['Datetime'].values, name='Mid_Price')

In [None]:
window = 31
poly = 1
smoothed_px = Labels_Generator(mid_px_series, window, poly).get_smooth_px()
smooth_px_series = pd.Series(data=smoothed_px.values, index=top_ob_train['Datetime'].values)

In [None]:
# start_plot = 100000
# end_plot = start_plot + 10000
# viz_t.plot_timeseries(ts_list=[mid_px_series[start_plot:end_plot], smooth_px_series[start_plot:end_plot]], primary_axis=[True, True], legend=['Original price', f'Smoothed {window} {poly}'], sample_size=1, width=900, height=500)

In [None]:
# Build the cleaned labels for the training mid-price series.
# The smoothing settings reuse `window` / `poly` from the smoothing cell above instead of
# repeating the literals 31 and 1, so the smoothed price shown earlier and the labels
# cannot silently drift apart.
labels_train, smoothed_px_train, df_trades_train = cleaned_labels(
    px_ts_train,
    sg_window=window,
    sg_poly=poly,
    method='two_steps_50bps',
    print_details=True,
)

# One-hot encode into 3 classes. tf.keras.utils.to_categorical is the same helper that
# keras.utils.np_utils used to expose; np_utils has been removed from recent Keras releases.
# NOTE(review): if the labels are -1/0/1, the -1 class lands in the last column through
# negative indexing - confirm this is the intended class order.
encoded_train_labels = tf.keras.utils.to_categorical(labels_train.values, 3)

# get transaction df
strategy_df_train = get_strategy_pnl(px_ts_train, labels_train)

viz_t.plot_labels_line(
    px_ts_train[start_plot:end_plot],
    labels_train[start_plot:end_plot],
    title='Train Labels',
    smoothed_signal=smoothed_px_train[start_plot:end_plot],
)

In [None]:
# Build the cleaned labels for the test mid-price series.
# The smoothing settings reuse `window` / `poly` from the smoothing cell above instead of
# repeating the literals 31 and 1, so train, test and the smoothed-price plot stay in sync.
labels_test, smoothed_px_test, df_trades_test = cleaned_labels(
    px_ts_test,
    sg_window=window,
    sg_poly=poly,
    method='two_steps_50bps',
    print_details=True,
)

# One-hot encode into 3 classes. tf.keras.utils.to_categorical is the same helper that
# keras.utils.np_utils used to expose; np_utils has been removed from recent Keras releases.
# NOTE(review): if the labels are -1/0/1, the -1 class lands in the last column through
# negative indexing - confirm this is the intended class order.
encoded_test_labels = tf.keras.utils.to_categorical(labels_test.values, 3)

# get transaction df
strategy_df_test = get_strategy_pnl(px_ts_test, labels_test)

viz_t.plot_labels_line(
    px_ts_test[start_plot:end_plot],
    labels_test[start_plot:end_plot],
    title='Test Labels',
    smoothed_signal=smoothed_px_test[start_plot:end_plot],
)

## Visual checks

In [None]:
viz_t.plot_trades_distribution(df_trades_train[df_trades_train['cleaned_labels']!=0], bin_size=0.0001, metric='gross_returns', fig_width=900, fig_height=550)

viz_t.plot_trades_length_overview(df_trades_train[df_trades_train['cleaned_labels']!=0], x='trade_len',  y='gross_returns')

In [None]:
# Overlay the trade-length distributions of the train and test trades (bins of width 5).
fig = px.histogram()
fig.add_traces([
    go.Histogram(x=trades['trade_len'].values, name=split, autobinx=False, xbins={'size': 5})
    for split, trades in (('train', df_trades_train), ('test', df_trades_test))
])

# Draw both histograms on the same bars, semi-transparent so the overlap stays readable.
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# Reshape the normalised order-book frames into arrays for the model (open question from
# the author: is this step needed?).
# NOTE(review): this cell rebinds px_ts_train / px_ts_test from the Series built in the
# Labels section to one-column DataFrames; re-running the label cells afterwards would
# feed them a different type. Consider separate names.
# train
train_depth_dyn, train_dt_index_dyn = dp.reshape_lob_levels(train_dyn_df.reset_index(), output_type='array') # 1 train dataset
# Average of array columns 0 and 2 - presumably the best ask/bid prices; verify the column
# order returned by dp.reshape_lob_levels.
mid_px_train_dyn = pd.Series((train_depth_dyn[:,2] + train_depth_dyn[:,0]) / 2, index=train_dt_index_dyn) # 2
px_ts_train = top_ob_train.reset_index()[['Mid_Price']]

# test
test_depth_dyn, test_dt_index_dyn = dp.reshape_lob_levels(test_dyn_df.reset_index(), output_type='array') # 1 test dataset
# Same column-0/column-2 average as for the train array above.
mid_px_test_dyn = pd.Series((test_depth_dyn[:,2] + test_depth_dyn[:,0]) / 2, index=test_dt_index_dyn) # 2
px_ts_test = top_ob_test.reset_index()[['Mid_Price']]

In [None]:
viz_t.plot_timeseries(ts_list=[top_ob_train.set_index('Datetime')['Mid_Price'], top_ob_test.set_index('Datetime')['Mid_Price'], mid_px_train_dyn, mid_px_test_dyn], primary_axis=[True, True, False, False], legend=['train-px', 'test-px', 'train-dyn', 'test-dyn'], sample_size=180)

In [None]:
train_dyn_df[train_dyn_df['Level']==0].describe()

In [None]:
# import trades
raw_file_path = './Experiments/input/raw/trades/USDT_BTC/USDT_BTC-20210330.csv.gz'
day_data = pd.read_csv(raw_file_path, parse_dates=['date'])
day_data

In [None]:
# process trades
# Aggregate the raw trades into one row per (minute, side): total amount, mean rate,
# number of distinct orders and number of clips. String aliases replace the NumPy
# callables, which pandas deprecates inside agg().
# NOTE(review): `day_data` is rebound to order-book data by a later cell, so this cell only
# works while `day_data` still holds the trades loaded just above.
df_trades_grp = day_data.groupby([pd.Grouper(key='date', freq='1min'), 'type']).agg(
    {'amount': 'sum', 'rate': 'mean', 'orderNumber': 'nunique', 'globalTradeID': 'count'}
)

# Size-weighted average traded price per (minute, side). Only the two columns the lambda
# reads are selected, so apply() never operates on the grouping columns.
wtavg = lambda x: np.average(x['rate'], weights=x['amount'], axis=0)
dfwavg = day_data.groupby([pd.Grouper(key='date', freq='1min'), 'type'])[['rate', 'amount']].apply(wtavg)
dfwavg.name = 'wav_price'

df_trades_grp = pd.merge(df_trades_grp, dfwavg, left_index=True, right_index=True).reset_index()
df_trades_grp.rename(columns={'date':'Datetime', 'rate':'av_price', 'orderNumber':'unique_orders', 'globalTradeID':'clips'}, inplace=True)

# One row per minute, one column per (metric, side).
df_trades_piv = df_trades_grp.pivot(values=['amount', 'av_price', 'wav_price', 'unique_orders', 'clips'], columns='type',index='Datetime').reset_index()

df_trades_piv.columns = list(map("_".join, df_trades_piv.columns)) # "flatten" column names

# Joining ('Datetime', '') leaves a trailing underscore; restore the plain name.
df_trades_piv.rename(columns={'Datetime_':'Datetime'}, inplace=True)
df_trades_piv.set_index('Datetime', inplace=True)

In [None]:
import gzip

In [None]:
%%time
# import lob
# Load every gzipped snapshot file for one day into a single dictionary.
configuration = config()
raw_data_folder = configuration['folders']['raw_lob_data']

pair = 'USDT_BTC'
date_to_process = datetime.strptime('2021-03-30', '%Y-%m-%d')
day_folder = datetime.strftime(date_to_process, '%Y/%m/%d')
# NOTE(review): this rebinds the experiment-level lob_depth (10) used by the model cells
# further down; consider a dedicated name for the raw-data depth.
lob_depth = 100

raw_data = {} # empty dict to update with incoming json
day_path = f'{raw_data_folder}/{pair}/{day_folder}'
# sorted() makes the merge order (and therefore any key collision) deterministic.
for file_name in sorted(os.listdir(day_path)):

    try:
        with gzip.open(f'{day_path}/{file_name}', 'r') as f:
            json_string = f.read().decode('utf-8')
        frozen = json_string.count('"isFrozen": "1"')
        if frozen > 0:
            print(f'Frozen {frozen} snapshots')
        raw_data_temp = dp.load_lob_json(json_string)

    except Exception as e:
        # Not every exception carries an errno (e.g. decode or JSON errors), so read it
        # defensively instead of raising AttributeError inside the handler.
        print(f'Skipping {file_name} (errno={getattr(e, "errno", None)}): {e}')
        # Skip the merge: raw_data_temp is either undefined or left over from the previous file.
        continue

    raw_data.update(raw_data_temp)

In [None]:
%%time
# Turn the dictionary of raw snapshots into a per-second, per-level table.
# NOTE(review): the result is stored in `day_data`, the same name the trades cell uses for
# the raw trades frame; the trades cells cannot be re-run after this one.
processed_data = []
raw_data_frame = pd.DataFrame.from_dict(raw_data, orient='index')
raw_data_frame.reset_index(inplace=True)
# The last 15 characters of each key are parsed as the snapshot timestamp (YYYYMMDD_HHMMSS).
raw_data_frame['index'] = raw_data_frame['index'].str[-15:]
raw_data_frame['index'] = pd.to_datetime(raw_data_frame['index'], format='%Y%m%d_%H%M%S')
raw_data_frame.set_index('index',drop=True,inplace=True)
raw_data_frame.sort_index(inplace=True)
# Regular one-second grid covering the whole processed day.
idx_start = date_to_process
idx_end = date_to_process + timedelta(days=1) - timedelta(seconds=1)
idx = pd.date_range(idx_start, idx_end, freq='1s')
# Forward fill gaps and back fill the first item if missing.
# .bfill() replaces the deprecated fillna(method='bfill').
raw_data_frame = raw_data_frame.reindex(idx).ffill().bfill()

# Convert hierarchical json data in to tabular format
levels = list(range(lob_depth))
for row in raw_data_frame.itertuples():

    # Each side is a list of (price, volume) pairs; keep the first lob_depth levels.
    ask_price, ask_volume = zip(* row.asks[0:lob_depth])
    bid_price, bid_volume = zip(* row.bids[0:lob_depth])
    sequences = [row.seq] * lob_depth
    datetimes = [row.Index] * lob_depth

    processed_data.append(list(zip(
        ask_price,
        ask_volume,
        bid_price,
        bid_volume,
        levels,
        sequences,
        datetimes
    )))

# unravel nested structure and force data types
day_data = pd.DataFrame([y for x in processed_data for y in x], #flatten the list of lists structure
                columns = ['Ask_Price', 'Ask_Size', 'Bid_Price', 'Bid_Size','Level', 'Sequence','Datetime'])

day_data['Ask_Price'] = day_data['Ask_Price'].astype('float64')
day_data['Bid_Price'] = day_data['Bid_Price'].astype('float64')
day_data['Sequence'] = day_data['Sequence'].astype('int64')

In [None]:
day_data.head()

In [None]:
%%time
day_data.sort_values(by=['Datetime', 'Level'], inplace=True)
day_data['Mid_Price'] = (day_data['Ask_Price'] + day_data['Bid_Price'])/2

In [None]:
# Per-snapshot features. 'first' picks the first row of each Datetime group; day_data was
# sorted by ['Datetime', 'Level'] in the previous cell, so that row is Level 0.
day_data['Prevailing_Mid'] = day_data.groupby('Datetime')['Mid_Price'].transform('first')
# Cumulative size across levels within each snapshot. The native GroupBy.cumsum() gives the
# same result as transform(np.cumsum) without the deprecated callable.
day_data['Bid_Cum_Size'] = day_data.groupby('Datetime')['Bid_Size'].cumsum()
day_data['Ask_Cum_Size'] = day_data.groupby('Datetime')['Ask_Size'].cumsum()

# Relative distance of each level from the prevailing mid; the bid side is sign-flipped.
day_data['Bid_Spread'] = (day_data['Bid_Price'] - day_data['Prevailing_Mid']) / day_data['Prevailing_Mid'] * -1
day_data['Ask_Spread'] = (day_data['Ask_Price'] - day_data['Prevailing_Mid']) / day_data['Prevailing_Mid']

In [None]:
# Per-minute, per-level mean of every spread / size / price column.
# The 'mean' alias replaces np.mean, which pandas deprecates inside agg().
day_data.groupby([pd.Grouper(key='Datetime', freq='1min'), 'Level']).agg(
    dict.fromkeys(
        [
            'Bid_Spread',
            'Ask_Spread',
            'Bid_Cum_Size',
            'Ask_Cum_Size',
            'Bid_Price',
            'Ask_Price',
            'Bid_Size',
            'Ask_Size',
        ],
        'mean',
    )
)

In [None]:
day_data.head(103)#['Bid_Spread'].min()

In [None]:
# Compare the ask price at the top of the book (Level 0) with a deeper level (Level 10).
df_plot1 = day_data[day_data['Level']==0].copy()
df_plot1.set_index('Datetime', inplace=True)
# NOTE(review): despite its name, df_plot100 holds Level 10, not Level 100.
df_plot100 = day_data[day_data['Level']==10].copy()
df_plot100.set_index('Datetime', inplace=True)
fig = px.line(render_mode='svg')
# Distinct trace names: both traces used to be called 'ask', so the legend could not tell them apart.
fig.add_scatter(y=df_plot1['Ask_Price'], x=df_plot1.index, name='ask L0')
fig.add_scatter(y=df_plot100['Ask_Price'], x=df_plot100.index, name='ask L10')
fig.show()

In [None]:
px.line(day_data.set_index('Datetime')['Ask_Price'])

In [None]:
datetime.strptime('2021-03-30', '%Y-%m-%d')

In [None]:
px.line(df_trades_piv['wav_price_buy'], render_mode='svg')

In [None]:
px.line(df_trades_piv['av_price_buy'], render_mode='svg')

In [None]:
df_trades_piv

In [None]:
dask_trades = dp.get_trade_data(pair, date_start, date_end, frequency=frequency)
processed_trades = dask_trades.compute()

In [None]:
# Parse the timestamps and move them to the index.
# The guard keeps the cell idempotent: once 'Datetime' has become the index it is no longer
# a column, and an unguarded second run would fail with a KeyError.
if 'Datetime' in processed_trades.columns:
    processed_trades['Datetime'] = pd.to_datetime(processed_trades['Datetime'])
    processed_trades.set_index('Datetime', inplace=True)

In [None]:
# noticed that before we were taking last of the aggregation
# savitzky golay to trades
# savitzky golay to training set, one filter per time series
# size weighted average price when grouping is better than simple average
# add number of trades as well
# add trade spread from prevailing mid?


# volume weighted average price for trades
# no Savitzky-Golay filter on the inputs: it would leak information
# no order book levels, but feed aggregated spread weighted, like +/- bps

In [None]:
# Overlay price, smoothed price and a 5-period rolling mean of traded size.
# NOTE: start_plot / end_plot are rebound here and stay at these values for the evaluation
# plots further down.
start_plot = 100000
end_plot = start_plot + 10000
# NOTE(review): the variable name and the legend say "ask size" but the column read is
# 'Bid_Size' - confirm which side is intended.
trades_ask_size = processed_trades.sort_values('Datetime')['Bid_Size'].rolling(5).mean()
# NOTE(review): the trades slice is shifted by a literal 7200, which equals `roll` above;
# presumably it compensates for the normalisation warm-up - verify, and use `roll` if so.
viz_t.plot_timeseries(ts_list=[mid_px_series[start_plot:end_plot], smooth_px_series[start_plot:end_plot], trades_ask_size[start_plot+7200:end_plot+7200]], primary_axis=[True, True, False], legend=['Original price', f'Smoothed {window} {poly}', 'Trades ask size'], sample_size=1, width=900, height=500)

In [None]:
px.line(processed_trades.sort_values('Datetime')['Ask_Size'].rolling(60).mean())

In [None]:
processed_trades.sort_values('Datetime')['Ask_Price'][:140000].rolling(60).mean().plot()

In [None]:
processed_trades.sort_values('Datetime')['Bid_Size'][:140000].rolling(60).mean().plot()

In [None]:
# reminder: 'amount_buy':'Ask_Size', 'amount_sell':'Bid_Size', 'rate_buy':'Ask_Price', 'rate_sell':'Bid_Price'
df_trading_activity = train_dyn_df[train_dyn_df['Level']==-1]

In [None]:
px.line(y=df_trading_activity['Ask_Size'].values, x=df_trading_activity['Datetime'].values)

In [None]:
df_trading_activity

In [None]:
test_depth_dyn.shape

## Model Setup

In [None]:
def create_light_deeplob(T, lob_depth):
    """Build and compile a small convolution-only light DeepLOB classifier.

    Three Conv2D + LeakyReLU + BatchNormalization stages collapse the feature axis,
    the result is flattened and fed to a 3-way softmax layer.

    Args:
        T: number of time steps in each input window.
        lob_depth: number of order-book levels; the input width is lob_depth * 4.

    Returns:
        A compiled Keras Model taking (T, lob_depth * 4, 1) inputs, trained with
        categorical cross-entropy and reporting accuracy.
    """
    ## just a test
    # Keep the double-hash comment above as the first such line in this function:
    # the notebook derives the experiment's short description from it.

    input_lmd = Input(shape=(T, lob_depth * 4, 1))
    # Kernel and stride of (1, 2) halve the feature axis: lob_depth * 4 -> lob_depth * 2.
    conv_first1 = Conv2D(16, (1, 2), strides=(1, 2))(input_lmd)
    conv_first1 = LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = BatchNormalization()(conv_first1)
    # conv_first1 = Dropout(.5)(conv_first1)

    # note on learnable parameters: a Conv2D layer has
    # (kernel_height * kernel_width * input_channels + 1) * filters weights, so this (1, 2)
    # layer with 16 input channels and 16 filters has (1 * 2 * 16 + 1) * 16 = 528.
    # Second halving of the feature axis: lob_depth * 2 -> lob_depth.
    conv_first1 = Conv2D(16, (1, 2), strides=(1, 2))(conv_first1)
    conv_first1 = LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = BatchNormalization()(conv_first1)

    # A kernel spanning all remaining lob_depth columns collapses the feature axis to 1.
    conv_first1 = Conv2D(16, (1, lob_depth))(conv_first1)
    conv_first1 = LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = BatchNormalization()(conv_first1)
    print(conv_first1.shape)

    # Flatten (time, 1, filters) into a single vector of length time * filters.
    convfirst_output = Reshape((int(conv_first1.shape[1])* int(conv_first1.shape[3]),))(conv_first1)
    print(convfirst_output.shape)
    # note on learnable parameters: a Dense layer has (inputs * units) + units weights.
    out = Dense(3, activation='softmax')(convfirst_output)
    print(out.shape)
    model = Model(inputs=input_lmd, outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases reject.
    adam = Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Capture the model source so it can be stored with the experiment, and derive a short
# description from the first line of that source containing a double hash.
model_code = inspect.getsource(create_light_deeplob)
lines_with_short_desription = [line for line in model_code.split('\n') if "##" in line]
# Non-word characters collapse to underscores so the text is safe inside a folder name.
# Raises IndexError if the function source contains no double-hash line.
short_description = re.sub(r'\W+', '_', lines_with_short_desription[0])

# NOTE(review): `length` is not assigned in any cell visible in this notebook, and
# `lob_depth` was rebound to 100 by the raw order-book cell - confirm both before running.
create_light_deeplob(length, lob_depth).summary()

In [None]:
# Create the results folder for this run and persist its configuration, model source and
# model summary next to each other.
# NOTE(review): `length`, `experiments_folder` and `label_technique` are not assigned in any
# cell visible in this notebook; they must be defined before this cell can run.
date_time_now = datetime.now().strftime("%y%m%d-%H%M%S")
experiment_id = f'{date_time_now}-{pair}-{frequency.seconds}s-{lob_depth}l-{length}-{date_start}-{date_end}{short_description}'
results_folder = f'{experiments_folder}/{pair}/{experiment_id}'
os.makedirs(results_folder, exist_ok=True)
batch_size=256

# Named experiment_config (not `config`) so it does not shadow the `config()` factory
# imported from `configuration`, which the raw order-book cell calls.
experiment_config = {
  'pair': pair,
  'frequency': frequency.seconds,
  'lob_depth': lob_depth,
  'length': length,
  'date_start': date_start,
  'date_end': date_end,
  'norm_type': norm_type,
  'roll': roll,
  'batch_size': batch_size,
  'label_technique': label_technique
#   'min_profit': min_profit,
#   'k_plus': k_plus,
#   'k_minus': k_minus,
#   'alpha': alpha,
#   'trading_fee': trading_fee,

#   'input': input_file_name,
#   'normalized_train_file': normalized_train_file,
#   'normalized_test_file':   normalized_test_file,
#   'top_ob_train_file': top_ob_train_file,
#   'top_ob_test_file': top_ob_test_file
}

# default=str serialises anything json cannot handle natively.
with open(f'{results_folder}/config.json', 'w') as fp:
    json.dump(experiment_config, fp, default=str)

with open(f'{results_folder}/model_code.py', 'w') as fp:
    fp.write(model_code)

light_deeplob = create_light_deeplob(length, lob_depth)
with open(f'{results_folder}/model_summary.txt', 'w') as fp:
    light_deeplob.summary(print_fn=lambda x: fp.write(x + '\n'))


In [None]:
# try to train the model on smoother version of the data

## Training

In [None]:
# Fresh, untrained network for this run, plus the path the best model is saved to.
light_deeplob = create_light_deeplob(length, lob_depth)
model_checkpoint_path = f'{results_folder}/{experiment_id}.h5'

# Sliding-window generators: each sample is `length` consecutive rows and its target.
generator_train = TimeseriesGenerator(
    data=train_depth_dyn,
    targets=encoded_train_labels,
    length=length,
    batch_size=batch_size,
    shuffle=True,
)

# The test set currently doubles as validation data - to be replaced with a validation split?
generator_test = TimeseriesGenerator(
    data=test_depth_dyn,
    targets=encoded_test_labels,
    length=length,
    batch_size=batch_size,
    shuffle=True,
)

# Multiply the learning rate by `factor` when val_loss has not improved for `patience` epochs.
lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20)

# Save the full model (not just the weights) at the end of an epoch, keeping only the best one.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_checkpoint_path,
    save_best_only=True,
    save_weights_only=False,
    verbose=2,
    save_freq='epoch',
)

# Stop after 50 epochs without progress and roll back to the best weights seen.
es_callback = tf.keras.callbacks.EarlyStopping(patience=50, restore_best_weights=True)

# Write TensorBoard logs into the experiment folder.
tb_callback = tf.keras.callbacks.TensorBoard(log_dir=results_folder)


# This may generate warnings related to saving the state of the optimizer.
# These warnings (and similar warnings throughout this notebook)
# are in place to discourage outdated usage, and can be ignored.

In [None]:
# model_name = '/home/federico/Python_vsc_dir/RL_Trader/Experiments/USDT_BTC/210119-184504-USDT_BTC-10s-10l-300-2020_04_04-2021_01_03_binary_classification_full_inception_lighter_deep_lob_model_with_longer_timesteps_300_/210119-184504-USDT_BTC-10s-10l-300-2020_04_04-2021_01_03_binary_classification_full_inception_lighter_deep_lob_model_with_longer_timesteps_300_.h5'
# loaded_light_deep_lob = tf.keras.models.load_model(model_name)


In [None]:
# Train the model
# Fit for up to 200 epochs. verbose=0 silences the per-epoch progress output; the
# checkpoint callback was created with its own verbose=2.
# Validation runs on generator_test, so the callbacks passed here are fed test-set metrics.
light_deeplob.fit(generator_train, 
            epochs=200, 
            verbose=0,
            validation_data=generator_test,
            callbacks=[lr_callback, cp_callback, es_callback, tb_callback])

## Evaluating

In [None]:
# NOTE(review): hard-coded absolute path on one machine. The file name refers to an earlier
# run (10s bars, window 100, 2020_04_04 to 2021_01_03), not to the model trained above and
# saved at model_checkpoint_path - confirm which model should be evaluated.
model_name = '/home/federico/Python_vsc_dir/RL_Trader/Experiments/USDT_BTC/210221-200759-USDT_BTC-10s-10l-100-2020_04_04-2021_01_03_big_lr_big_batch_size_16_filter_size_shuffle/210221-200759-USDT_BTC-10s-10l-100-2020_04_04-2021_01_03_big_lr_big_batch_size_16_filter_size_shuffle.h5'

In [None]:
# Load the previously saved model from `model_name` and rebuild the test generator
# for evaluation.
deep_lob_loaded = tf.keras.models.load_model(model_name)
# Rebinds generator_test with shuffle=False so predictions come out in time order;
# the training cell created it with shuffle=True.
generator_test = TimeseriesGenerator(
    test_depth_dyn,
    encoded_test_labels,
    length,
    batch_size=batch_size,
    shuffle=False
)

def evaluate_model(model, generator=None):
    """Evaluate `model` on a data generator and print its accuracy.

    Args:
        model: object exposing `evaluate(generator, verbose=...)` that returns a
            (loss, accuracy) pair, e.g. a compiled Keras model with one metric.
        generator: data to evaluate on. Defaults to the notebook-level
            `generator_test`, which keeps the original one-argument call working.

    Returns:
        None. The accuracy is printed as a percentage.
    """
    if generator is None:
        # Resolved at call time so the function uses whichever generator_test is current.
        generator = generator_test
    # Re-evaluate the model
    loss, acc = model.evaluate(generator, verbose=2)
    print("Restored model, accuracy: {:5.2f}%".format(100*acc))

#evaluate_model(deep_lob_loaded)

In [None]:
# Get predicted labels: class probabilities -> most likely class -> original label values.
predictions_prob = deep_lob_loaded.predict(generator_test, verbose=1)
# NOTE(review): `back_to_labels` is not imported anywhere in the visible notebook (its
# func_tools import is commented out in the first cell), so this raises NameError on a
# fresh kernel.
map_labels = np.vectorize(back_to_labels) # vectorize back to labels from func_tools
predicted_labels = pd.Series(map_labels(np.argmax(predictions_prob,axis=1)), name='predicted_labels')

In [None]:
# Experimental: predicted labels from a 10-step rolling mean of the class probabilities.
# The first 9 rows of the rolling mean are NaN, and np.argmax returns 0 for an all-NaN row,
# so those rows always map to class 0.
predictions_prob_wa = pd.DataFrame(predictions_prob).rolling(window=10).mean().values
# NOTE(review): `back_to_labels` is not imported anywhere in the visible notebook (its
# func_tools import is commented out in the first cell).
map_labels = np.vectorize(back_to_labels) # vectorize back to labels from func_tools
predicted_labels_wa = pd.Series(map_labels(np.argmax(predictions_prob_wa,axis=1)), name='predicted_labels_wa') # back to original 1,0,-1

In [None]:
print('##### Predicted labels #####')
label_insights(predicted_labels)


In [None]:
print('##### Weighted average predicted labels #####')
label_insights(predicted_labels_wa)

In [None]:
predicted_labels.shape, test_depth_dyn.shape

In [None]:
# dangerous assigning offset here, wrap it into a function
# Align the predictions with the test series: the generator's first sample uses rows
# 0..length-1, so the first prediction belongs to row `length`.
index_range = np.arange(length, predicted_labels.shape[0] + length) # offset ts length fed to ts generator
predicted_labels.index = index_range

# generate timeseries with buy, sell, zero prob
buy_prob = pd.Series(predictions_prob[:,1], index=index_range)
sell_prob = pd.Series(predictions_prob[:,2], index=index_range)
zero_prob = pd.Series(predictions_prob[:,0], index=index_range)

buy_prob_wa = pd.Series(predictions_prob_wa[:,1], index=index_range)

# NOTE(review): the [start_plot:end_plot] slices below are positional, while the prediction
# series carry an index shifted by `length`; confirm that plot_labels_line aligns on the index.
viz_t.plot_labels_line(top_ob_test['Mid_Price'][start_plot:end_plot],
    labels_test[start_plot:end_plot], # original labels
    title='Test Set Labels', # this plots test-set prices and labels; it used to say 'Train Set Labels'
    #smoothed_signal=smoothed_px_test[start_plot:end_plot],
    predicted_labels=predicted_labels[start_plot:end_plot],
    buy_prob_labels=buy_prob[start_plot:end_plot],
    #sell_prob_labels=sell_prob[start:end],
    predictions_prob_wa=buy_prob_wa[start_plot:end_plot],
    width=1100, height=600
    )

In [None]:
### DOUBLE CHECK that labels and px_ts are correctly aligned
px_ts = top_ob_test['Mid_Price']# adjust prediction offsset
datetime_ts = top_ob_test['Datetime']
trades_timeseries = get_strategy_pnl(px_ts, predicted_labels)
df_trades = trades_timeseries.dropna(subset=['gross_returns'])

In [None]:
df_trades

In [None]:
# px_ts = top_ob_test['Mid_Price'][length:].reset_index()['Mid_Price']# adjust prediction offsset
# datetime_ts = top_ob_test['Datetime'][length:].reset_index()['Datetime']
# trades_timeseries = get_strategy_pnl(px_ts, predicted_labels)
# df_trades = trades_timeseries.dropna(subset=['gross_returns'])

In [None]:
### to do:
# look for patterns in the prediction probabilities
# day vs night - weekday vs weekend - model certainty before long trades vs short trades
# determine if predictions are naive