In [None]:
%load_ext autoreload
%autoreload 2
import sys, os, time, json, re
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# from func_tools import import_px_data, standardize, fetch_s3_trade_files, cnn_data_reshaping, reshape_lob_levels, back_to_labels, intraday_vol_ret
import data_preprocessing as dp
import visualization_tools as viz_t
from labelling_class import Labels_Generator, cleaned_labels, label_insights, get_strategy_pnl

import inspect

import plotly_express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import tensorflow as tf
from keras.utils import np_utils
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Reshape, Conv2D, LSTM, Dense, MaxPooling2D, BatchNormalization, LeakyReLU, concatenate, add, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

In [None]:
# Cap TensorFlow's allocation on the first GPU (memory_limit=6024) and report the devices found.
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        first_gpu_limit = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=6024)
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [first_gpu_limit])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as runtime_err:
        # Raised when the virtual device is configured after the GPUs were already initialised.
        print(runtime_err)
tf.test.gpu_device_name()

## Data

In [None]:
# assert that the start date is earlier than the end date
# assert lob levels

In [None]:
# import dask.dataframe as dd

In [None]:
# Run parameters for the USDT_BTC pair; passed to dp.import_px_data in the next cell.
# NOTE(review): a later parameter cell reassigns `pair` and `date_end` (BTC_AAVE), so
# running the notebook top to bottom loads this pair and then replaces it.
frequency = timedelta(minutes=10)
pair = 'USDT_BTC'
date_start = '2020-11-11'
date_end = '2021-05-15'
lob_depth = 10
norm_type = 'dyn_z_score'
# presumably the rolling-window length used by the normalisation — TODO confirm units
roll = 720#10 mins#7200 * 6

In [None]:
%%time
# Load the price data for the parameters set above; the result is unpacked into the
# train/test frames and the train/test top-of-book frames used by later cells.
train_dyn_df, test_dyn_df, top_ob_train, top_ob_test = dp.import_px_data(frequency, pair, date_start, date_end, lob_depth, norm_type, roll)

In [None]:
# Run parameters for the BTC_AAVE pair; passed to dp.import_px_data in the next cell.
# NOTE(review): this overrides the USDT_BTC parameters assigned in an earlier cell.
frequency = timedelta(minutes=10)
pair = 'BTC_AAVE'
date_start = '2020-11-11'
date_end = '2021-05-08'
lob_depth = 10
norm_type = 'dyn_z_score'
# presumably the rolling-window length used by the normalisation — TODO confirm units
roll = 720#10 mins#7200 * 6

In [None]:
%%time
# Load the price data for the parameters set above. This rebinds train_dyn_df, test_dyn_df,
# top_ob_train and top_ob_test, replacing anything loaded by an earlier cell.
train_dyn_df, test_dyn_df, top_ob_train, top_ob_test = dp.import_px_data(frequency, pair, date_start, date_end, lob_depth, norm_type, roll)

In [None]:
# check different file sizes between previous run and latest

In [None]:
# Load the project configuration; later cells read folder paths from configuration['folders'].
from configuration import config
configuration = config()

In [None]:
# Read the resampled file for 2020-11-28 at the current pair / depth / frequency.
resampled_data_folder = configuration['folders']['resampled_data']
processed_file_path = '/'.join([
    f'{resampled_data_folder}',
    f'{pair}',
    f'{lob_depth}_levels',
    f'{int(frequency.total_seconds())}s',
    '2020-11-28.csv.gz',
])
resmpld_small = pd.read_csv(processed_file_path)

In [None]:
# Read the resampled file for 2020-12-09 at the current pair / depth / frequency.
resampled_data_folder = configuration['folders']['resampled_data']
processed_file_path = '/'.join([
    f'{resampled_data_folder}',
    f'{pair}',
    f'{lob_depth}_levels',
    f'{int(frequency.total_seconds())}s',
    '2020-12-09.csv.gz',
])
resmpld_big = pd.read_csv(processed_file_path)

In [None]:
# Inspect the first row of the 2020-11-28 resampled file.
resmpld_small.iloc[0]

In [None]:
# 'Sequence' value of the first row of the 2020-12-09 resampled file.
resmpld_big.iloc[0]['Sequence']

In [None]:
# Read the original-frequency file for 2021-01-03 and parse its 'Datetime' column.
resampled_data_folder = configuration['folders']['resampled_data']
processed_file_path = '/'.join([
    f'{resampled_data_folder}',
    f'{pair}',
    f'{lob_depth}_levels',
    'original_frequency',
    '2021-01-03.csv.gz',
])
original_freq = pd.read_csv(processed_file_path).assign(
    Datetime=lambda raw: pd.to_datetime(raw['Datetime'], format='%Y-%m-%d %H:%M:%S')
)

In [None]:
# Per-'Level' column means over the first 6000 rows of the original-frequency data.
original_freq[0:6000].groupby('Level').mean()

In [None]:
# Preview a manual resample: bucket rows by `frequency` and by 'Level', averaging prices
# and sizes and keeping the last 'Sequence' of every bucket.
original_freq['Datetime'] = pd.to_datetime(original_freq['Datetime'])
bucket_keys = [pd.Grouper(key='Datetime', freq=frequency), pd.Grouper(key='Level')]
column_aggregations = {
    'Ask_Price': 'mean',
    'Ask_Size': 'mean',
    'Bid_Price': 'mean',
    'Bid_Size': 'mean',
    'Sequence': 'last',
}
original_freq.groupby(bucket_keys).agg(column_aggregations).reset_index().head(20)

In [None]:
# Reshape the LOB frames into arrays (is this step needed?) and build the mid-price
# series consumed by the labelling and plotting cells.
# NOTE(review): columns 0 and 2 of the reshaped arrays are presumably the level-1 ask and
# bid prices — confirm against dp.reshape_lob_levels.

# train
train_lob_frame = train_dyn_df.reset_index()
train_depth_dyn, train_dt_index_dyn = dp.reshape_lob_levels(train_lob_frame, output_type='array')
mid_px_train_dyn = pd.Series(
    (train_depth_dyn[:, 2] + train_depth_dyn[:, 0]) / 2,
    index=train_dt_index_dyn,
)
px_ts_train = top_ob_train.reset_index()[['Mid_Price']]

# test
test_lob_frame = test_dyn_df.reset_index()
test_depth_dyn, test_dt_index_dyn = dp.reshape_lob_levels(test_lob_frame, output_type='array')
mid_px_test_dyn = pd.Series(
    (test_depth_dyn[:, 2] + test_depth_dyn[:, 0]) / 2,
    index=test_dt_index_dyn,
)
px_ts_test = top_ob_test.reset_index()[['Mid_Price']]

## Labels

In [None]:
# Positional window [start_plot:end_plot] used to slice every series passed to the label plots.
start_plot = 0
end_plot = 20000

In [None]:
# Train labels: derive them from the train mid price, one-hot encode them, plot a window.
mid_px_train = px_ts_train['Mid_Price']
labels_train, smoothed_px_train, df_trades_train = cleaned_labels(
    mid_px_train,
    method='three_steps',
    print_details=False,
)

# One-hot encode into 3 columns.
# NOTE(review): to_categorical indexes columns by the raw label value, so a -1 label
# (if cleaned_labels emits one) lands in the last column — confirm.
encoded_train_labels = np_utils.to_categorical(labels_train.values, num_classes=3)

# (transaction frame via get_strategy_pnl(mid_px_train, labels_train) is currently disabled)

train_plot_window = slice(start_plot, end_plot)
viz_t.plot_labels_line(
    mid_px_train[train_plot_window],
    labels_train[train_plot_window],
    title='Train Labels',
    smoothed_signal=smoothed_px_train[train_plot_window],
)

In [None]:
# Test labels: derive them from the test mid price, one-hot encode them, plot a window.
mid_px_test = px_ts_test['Mid_Price']
labels_test, smoothed_px_test, df_trades_test = cleaned_labels(
    mid_px_test,
    method='three_steps',
    print_details=False,
)

# One-hot encode into 3 columns.
# NOTE(review): to_categorical indexes columns by the raw label value, so a -1 label
# (if cleaned_labels emits one) lands in the last column — confirm.
encoded_test_labels = np_utils.to_categorical(labels_test.values, num_classes=3)

# (transaction frame via get_strategy_pnl(mid_px_test, labels_test) is currently disabled)

test_plot_window = slice(start_plot, end_plot)
viz_t.plot_labels_line(
    mid_px_test[test_plot_window],
    labels_test[test_plot_window],
    title='Test Labels',
    smoothed_signal=smoothed_px_test[test_plot_window],
)

## Visual checks

In [None]:
# Return distribution and length overview for train trades whose cleaned label is non-zero.
# The frame is filtered separately for each call, as before, so neither plot sees the other's input.
has_position = df_trades_train['cleaned_labels'] != 0

viz_t.plot_trades_distribution(
    df_trades_train[has_position],
    bin_size=0.0001,
    metric='gross_returns',
    fig_width=900,
    fig_height=550,
)

viz_t.plot_trades_length_overview(df_trades_train[has_position], x='trade_len', y='gross_returns')

In [None]:
# Overlay the trade-length histograms (bin size 5) of the train and test trades.
fig = px.histogram()
for trace_name, trades_frame in (('train', df_trades_train), ('test', df_trades_test)):
    fig.add_trace(
        go.Histogram(
            x=trades_frame['trade_len'].values,
            name=trace_name,
            autobinx=False,
            xbins={'size': 5},
        )
    )

# Draw the histograms on top of one another, semi-transparent so both stay visible.
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# Top-of-book mid prices (primary axis) against the mid prices rebuilt from the
# reshaped LOB arrays (secondary axis), for train and test.
train_px_by_time = top_ob_train.set_index('Datetime')['Mid_Price']
test_px_by_time = top_ob_test.set_index('Datetime')['Mid_Price']
viz_t.plot_timeseries(
    ts_list=[train_px_by_time, test_px_by_time, mid_px_train_dyn, mid_px_test_dyn],
    primary_axis=[True, True, False, False],
    legend=['train-px', 'test-px', 'train-dyn', 'test-dyn'],
    sample_size=180,
)

## Model Setup

In [None]:
def create_light_deeplob(T, lob_depth, learning_rate=0.1):
    """Build and compile a small convolutional classifier over order-book windows.

    The network takes inputs of shape (T, lob_depth * 4, 1), applies two
    (1, 2)-kernel, stride-(1, 2) convolutions and one (1, lob_depth) convolution
    (each followed by LeakyReLU and batch normalisation), flattens the result and
    ends in a 3-way softmax.

    Args:
        T: number of time steps in each input window.
        lob_depth: number of order-book levels; the input has lob_depth * 4 columns
            (presumably ask/bid price and size per level — confirm with the data prep).
        learning_rate: Adam learning rate. Defaults to 0.1, the value that used to
            be hard-coded.

    Returns:
        A compiled Keras Model (Adam, categorical cross-entropy, accuracy metric).
    """
    # The double-hash line below is read back via inspect.getsource by the notebook to
    # build the experiment's short description: keep it the first such line.
    ## just a test

    input_lmd = Input(shape=(T, lob_depth * 4, 1))
    conv_first1 = Conv2D(16, (1, 2), strides=(1, 2))(input_lmd)
    conv_first1 = LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = BatchNormalization()(conv_first1)
    # conv_first1 = Dropout(.5)(conv_first1)

    # Learnable parameters of a Conv2D layer:
    # (kernel_height * kernel_width * input_channels + 1) * filters.
    # For this layer: (1 * 2 * 16 + 1) * 16 = 528.
    conv_first1 = Conv2D(16, (1, 2), strides=(1, 2))(conv_first1)
    conv_first1 = LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = BatchNormalization()(conv_first1)

    # The two stride-2 convolutions shrink the width from lob_depth * 4 to lob_depth,
    # so a (1, lob_depth) kernel collapses the width axis to 1.
    conv_first1 = Conv2D(16, (1, lob_depth))(conv_first1)
    conv_first1 = LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = BatchNormalization()(conv_first1)
    print(conv_first1.shape)

    # Flatten (time, 1, channels) into a single vector of time * channels features.
    convfirst_output = Reshape((int(conv_first1.shape[1]) * int(conv_first1.shape[3]),))(conv_first1)
    print(convfirst_output.shape)
    # Learnable parameters of a Dense layer: inputs * units + units (weights plus biases).
    out = Dense(3, activation='softmax')(convfirst_output)
    print(out.shape)
    model = Model(inputs=input_lmd, outputs=out)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias that
    # newer Keras releases reject.
    adam = Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Capture the model-building source; a later cell writes it to model_code.py in the results folder.
model_code = inspect.getsource(create_light_deeplob)
# Source lines containing "##" serve as the model's short description. The first one is
# made file-name safe by collapsing every run of non-word characters into "_".
lines_with_short_desription = [line for line in model_code.split('\n') if "##" in line]
short_description = re.sub(r'\W+', '_', lines_with_short_desription[0])

# NOTE(review): `length` is not assigned in any visible cell — it must be defined before this runs.
create_light_deeplob(length, lob_depth).summary()

In [None]:
# Create the results folder for this experiment and archive its configuration,
# the model source and the model summary there.
# NOTE(review): `length`, `experiments_folder` and `label_technique` are not assigned in any
# visible cell; they must be defined before this cell runs — TODO confirm intended values.
date_time_now = datetime.now().strftime("%y%m%d-%H%M%S")
# total_seconds() matches how the data paths are built; timedelta.seconds silently drops
# whole days for frequencies of one day or more.
frequency_seconds = int(frequency.total_seconds())
experiment_id = f'{date_time_now}-{pair}-{frequency_seconds}s-{lob_depth}l-{length}-{date_start}-{date_end}{short_description}'
results_folder = f'{experiments_folder}/{pair}/{experiment_id}'
os.makedirs(results_folder, exist_ok=True)
batch_size = 256

config = {
  'pair': pair,
  'frequency': frequency_seconds,
  'lob_depth': lob_depth,
  'length': length,
  'date_start': date_start,
  'date_end': date_end,
  'norm_type': norm_type,
  'roll': roll,
  'batch_size': batch_size,
  'label_technique': label_technique
#   'min_profit': min_profit,
#   'k_plus': k_plus,
#   'k_minus': k_minus,
#   'alpha': alpha,
#   'trading_fee': trading_fee,

#   'input': input_file_name,
#   'normalized_train_file': normalized_train_file,
#   'normalized_test_file':   normalized_test_file,
#   'top_ob_train_file': top_ob_train_file,
#   'top_ob_test_file': top_ob_test_file
}

# default=str serialises any value json cannot encode natively.
with open(f'{results_folder}/config.json', 'w') as fp:
    json.dump(config, fp, default=str)

with open(f'{results_folder}/model_code.py', 'w') as fp:
    fp.write(model_code)

# summary() emits one line per call to print_fn; append the newline it leaves out.
light_deeplob = create_light_deeplob(length, lob_depth)
with open(f'{results_folder}/model_summary.txt', 'w') as fp:
    light_deeplob.summary(print_fn=lambda x: fp.write(x + '\n'))


In [None]:
# try to train the model on smoother version of the data

## Training

In [None]:
# Fresh model plus the callbacks and batch generators used by the training cell.
light_deeplob = create_light_deeplob(length, lob_depth)

model_checkpoint_path = f'{results_folder}/{experiment_id}.h5'

keras_callbacks = tf.keras.callbacks

# Multiply the learning rate by `factor` when val_loss has not improved for `patience` epochs.
lr_callback = keras_callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20)

# Save the full model (not only the weights) at the end of an epoch, keeping only the best one.
cp_callback = keras_callbacks.ModelCheckpoint(
    filepath=model_checkpoint_path,
    save_best_only=True,
    save_weights_only=False,
    verbose=2,
    save_freq='epoch',
)

# Stop after 50 epochs without improvement and roll back to the best weights seen.
es_callback = keras_callbacks.EarlyStopping(patience=50, restore_best_weights=True)

# TensorBoard logs go to the experiment's results folder.
tb_callback = keras_callbacks.TensorBoard(log_dir=results_folder)

# Both generators use the same window length and batch size and are shuffled.
shared_generator_options = dict(length=length, batch_size=batch_size, shuffle=True)

generator_train = TimeseriesGenerator(
    data=train_depth_dyn,
    targets=encoded_train_labels,
    **shared_generator_options,
)

# to be replaced with validation?
generator_test = TimeseriesGenerator(
    data=test_depth_dyn,
    targets=encoded_test_labels,
    **shared_generator_options,
)


# This may generate warnings related to saving the state of the optimizer.
# These warnings (and similar warnings throughout this notebook)
# are in place to discourage outdated usage, and can be ignored.

In [None]:
# model_name = '/home/federico/Python_vsc_dir/RL_Trader/Experiments/USDT_BTC/210119-184504-USDT_BTC-10s-10l-300-2020_04_04-2021_01_03_binary_classification_full_inception_lighter_deep_lob_model_with_longer_timesteps_300_/210119-184504-USDT_BTC-10s-10l-300-2020_04_04-2021_01_03_binary_classification_full_inception_lighter_deep_lob_model_with_longer_timesteps_300_.h5'
# loaded_light_deep_lob = tf.keras.models.load_model(model_name)


In [None]:
# Train the model for up to 200 epochs, validating on generator_test.
# verbose=0 silences the per-epoch output; the callbacks passed below are
# lr_callback, cp_callback, es_callback and tb_callback.
light_deeplob.fit(generator_train, 
            epochs=200, 
            verbose=0,
            validation_data=generator_test,
            callbacks=[lr_callback, cp_callback, es_callback, tb_callback])

## Evaluating

In [None]:
# Path of the saved model evaluated below.
# NOTE(review): hard-coded absolute path on one machine, pointing at a USDT_BTC run whose
# name contains "-100-" (presumably a window length of 100 — confirm it matches `length`
# and the data currently loaded).
model_name = '/home/federico/Python_vsc_dir/RL_Trader/Experiments/USDT_BTC/210221-200759-USDT_BTC-10s-10l-100-2020_04_04-2021_01_03_big_lr_big_batch_size_16_filter_size_shuffle/210221-200759-USDT_BTC-10s-10l-100-2020_04_04-2021_01_03_big_lr_big_batch_size_16_filter_size_shuffle.h5'

In [None]:
# Load the saved model and rebuild the test generator without shuffling, so that
# predictions come out in time order.
deep_lob_loaded = tf.keras.models.load_model(model_name)
ordered_generator_options = dict(length=length, batch_size=batch_size, shuffle=False)
generator_test = TimeseriesGenerator(
    data=test_depth_dyn,
    targets=encoded_test_labels,
    **ordered_generator_options,
)

def evaluate_model(model, generator=None):
    """Evaluate `model` on a batch generator and print its accuracy.

    Args:
        model: object exposing `evaluate(data, verbose=...)` that returns
            `(loss, accuracy)`, e.g. a Keras model compiled with the accuracy metric.
        generator: data to evaluate on. Defaults to the notebook-level
            `generator_test`, looked up when the function is called.

    Returns:
        Tuple `(loss, acc)` as returned by `model.evaluate`.
    """
    if generator is None:
        generator = generator_test
    # Re-evaluate the model
    loss, acc = model.evaluate(generator, verbose=2)
    print("Restored model, accuracy: {:5.2f}%".format(100*acc))
    return loss, acc

#evaluate_model(deep_lob_loaded)

In [None]:
# Get predicted labels
predictions_prob = deep_lob_loaded.predict(generator_test, verbose=1)

# Map the arg-max class index back to the original labels. `back_to_labels` is not imported
# anywhere in this notebook, so the mapping is spelled out here. It inverts the one-hot
# encoding used for the targets: to_categorical(labels, 3) puts label 0 in column 0,
# label 1 in column 1 and label -1 in column 2 (negative index).
# NOTE(review): assumes cleaned_labels yields labels in {-1, 0, 1} — confirm.
class_index_to_label = {0: 0, 1: 1, 2: -1}
map_labels = np.vectorize(class_index_to_label.get)
predicted_labels = pd.Series(map_labels(np.argmax(predictions_prob, axis=1)), name='predicted_labels')

In [None]:
# Experimental: predicted labels on a 10-step rolling average of the class probabilities.
# The first 9 rows of the rolling mean are NaN; np.argmax returns 0 for an all-NaN row,
# so those rows end up with label 0.
predictions_prob_wa = pd.DataFrame(predictions_prob).rolling(window=10).mean().values

# Map the arg-max class index back to the original 1, 0, -1 labels. `back_to_labels` is not
# imported anywhere in this notebook, so the mapping is spelled out here. It inverts the
# one-hot encoding used for the targets: to_categorical(labels, 3) puts label 0 in
# column 0, label 1 in column 1 and label -1 in column 2 (negative index).
# NOTE(review): assumes cleaned_labels yields labels in {-1, 0, 1} — confirm.
class_index_to_label = {0: 0, 1: 1, 2: -1}
map_labels = np.vectorize(class_index_to_label.get)
predicted_labels_wa = pd.Series(map_labels(np.argmax(predictions_prob_wa, axis=1)), name='predicted_labels_wa')

In [None]:
# Summary of the raw arg-max predictions, as produced by label_insights.
print('##### Predicted labels #####')
label_insights(predicted_labels)


In [None]:
# Summary of the predictions taken from the 10-step rolling mean of the probabilities.
print('##### Weighted average predicted labels #####')
label_insights(predicted_labels_wa)

In [None]:
# Compare the number of predictions with the shape of the test input array; the predictions
# are expected to be shorter by `length` rows (the generator's look-back window).
predicted_labels.shape, test_depth_dyn.shape

In [None]:
# dangerous assigning offset here, wrap it into a function
# Align predictions with the price series: the first prediction belongs to row `length`,
# because the generator needs `length` rows of history before it can emit a sample.
index_range = np.arange(length, predicted_labels.shape[0] + length) # offset ts length fed to ts generator
predicted_labels.index = index_range

# Probability time series per class column: 0 = zero, 1 = buy, 2 = sell.
buy_prob = pd.Series(predictions_prob[:,1], index=index_range)
sell_prob = pd.Series(predictions_prob[:,2], index=index_range)
zero_prob = pd.Series(predictions_prob[:,0], index=index_range)

buy_prob_wa = pd.Series(predictions_prob_wa[:,1], index=index_range)

# Everything plotted here comes from the test set, so the title says so
# (it previously read 'Train Set Labels').
viz_t.plot_labels_line(top_ob_test['Mid_Price'][start_plot:end_plot], 
    labels_test[start_plot:end_plot], # original labels
    title='Test Set Labels', 
    #smoothed_signal=smoothed_px_test[start_plot:end_plot],
    predicted_labels=predicted_labels[start_plot:end_plot],
    buy_prob_labels=buy_prob[start_plot:end_plot],
    #sell_prob_labels=sell_prob[start:end],
    predictions_prob_wa=buy_prob_wa[start_plot:end_plot],
    width=1100, height=600
    )

In [None]:
### DOUBLE CHECK that labels and px_ts are correctly aligned
# NOTE(review): px_ts keeps the full test series, while predicted_labels carries an index
# shifted by `length` — confirm how get_strategy_pnl aligns the two.
px_ts = top_ob_test['Mid_Price']# adjust prediction offset
datetime_ts = top_ob_test['Datetime']
trades_timeseries = get_strategy_pnl(px_ts, predicted_labels)
# Keep only the rows that have a 'gross_returns' value.
df_trades = trades_timeseries.dropna(subset=['gross_returns'])

In [None]:
# Display the rows of the strategy output that have a gross return.
df_trades

In [None]:
# px_ts = top_ob_test['Mid_Price'][length:].reset_index()['Mid_Price']# adjust prediction offsset
# datetime_ts = top_ob_test['Datetime'][length:].reset_index()['Datetime']
# trades_timeseries = get_strategy_pnl(px_ts, predicted_labels)
# df_trades = trades_timeseries.dropna(subset=['gross_returns'])

In [None]:
### to do:
# look for patterns in prediction probability
# day vs night - weekday vs weekend - model certainty before long trades vs short trades
# determine if predictions are naive