In [None]:
# import libraries
import pandas as pd
import numpy as np
import math
import datetime
from datetime import timedelta

import scipy.stats

# charting libraries
import plotly.offline as py
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
import ndjson

In [None]:
#################
# Definitions
#################

# Where to read data from
# S3 bucket holding the gzipped CSV extracts loaded by the Citadel / JPM cells
s3_bucket    = 'mfx-sagemaker-dev'

# S3 object keys of the single-dealer feeds (AUDUSD, 2020-02-12 06:00-18:00).
# NOTE: the loading cells rebind `citadel` and `jpm` to DataFrames.
citadel = 'go-data/AUDUSD_20200212-060000_20200212-180000/2020-02-12T06:00:00.051Z-2020-02-12T18:00:00.560Z-Input_CITADEL_top5-1.csv.gz'
jpm = 'go-data/AUDUSD_20200212-060000_20200212-180000/2020-02-12T06:00:00.681Z-2020-02-12T17:59:59.970Z-Input_JPM_top5-1.csv.gz'

# S3 object key of the aggregated multi-venue book.
# NOTE(review): the market-loading cell assigns `market_1` from a hard-coded local USDJPY path
# and never reads this key - confirm which dataset is intended.
market_1 = 'go-data/AUDUSD_20200212-060000_20200212-180000/2020-02-12T06:00:00.045Z-2020-02-12T18:00:00.897Z-Input_FASTMATCH-JPM-UBS-CITADEL-JEFFERIES-STATESTREET-XTX-HC_TECH-GOLDMAN_SACHS_top5-1.csv.gz'


# Bar size (pandas offset alias handed to `resample` when building the HLOC bars).
# Lower-case 's' is used because the upper-case 'S' alias is deprecated in recent pandas.
resample_period = '30s'

# The settings below are not referenced in the visible part of the notebook.

# Chart settings
chart_padding_secs = 10

# Breakout settings
breakout_sigma = 1.5

# MR settings
mr_sigma = 25

# Hedging level
hedge_level = 3

# Hedger interest to fill time in ms
interest_to_fill = 10000

# n samples for moving average
ma_samples = 3

# optimise exits
optimise_exit = True

In [None]:
# Aggregated multi-venue book, indexed by the parsed timestamp column 't'.
# NOTE(review): this is a local USDJPY extract (2020-05-11..15); the Citadel/JPM keys in the
# definitions cell are AUDUSD for 2020-02-12 - confirm the intended dataset.
market_1 = pd.read_csv(
    'go-data/USDJPY_20200511-000001_20200515-220000/2020-05-11T00:00:01.050Z-2020-05-15T21:00:10.021Z-Input_FASTMATCH-JPM-UBS-CITADEL-JEFFERIES-STATESTREET-XTX-HC_TECH-GOLDMAN_SACHS_top5-1.csv.gz',
    index_col='t',
    parse_dates=['t'],
)
# market_1 = market_1.resample('100ms').last().ffill()

# Level-0 prices, their spread and the simple mid
market_1['bid'] = market_1['Bid0']
market_1['offer'] = market_1['Offer0']
market_1['spread'] = market_1['offer'] - market_1['bid']
market_1['market_mid'] = (market_1['bid'] + market_1['offer']) / 2


In [None]:
# Rows whose `Offer0Mkt` equals "HOTSPOT" (presumably the venue showing the best offer - confirm)
hotspot = market_1.loc[market_1['Offer0Mkt'] == "HOTSPOT"]

In [None]:
# Eyeball the last rows of the loaded book
market_1.tail()

In [None]:
# Level-0 offer size on the HOTSPOT rows, restricted to 06:00-06:10 (time of day, across all dates in the index)
hotspot['Offer0Qty'].between_time('06:00', '06:10').plot()

In [None]:
#################
# Citadel data
#################

# `citadel` starts out as the S3 object key and is rebound to the loaded DataFrame below.
# Stash the key under its own name on the first run so the cell can be re-executed
# without formatting a DataFrame into the URL.
if isinstance(citadel, str):
    citadel_key = citadel
citadel = pd.read_csv('s3://{}/{}'.format(s3_bucket, citadel_key), index_col='t', parse_dates=['t'])
# Regular 100ms grid: last quote per bucket, empty buckets carry the previous quote forward
citadel = citadel.resample("100ms").last().ffill()
# Level-0 prices, their spread and the simple mid
citadel['bid'] = citadel['Bid0']
citadel['offer'] = citadel['Offer0']
citadel['spread'] = citadel['offer'] - citadel['bid']
citadel['citadel_mid'] = (citadel['bid'] + citadel['offer']) / 2


In [None]:
# market_1 = pd.read_csv('s3://{}/{}'.format(s3_bucket, market_5))


In [None]:
# market_1.to_csv("market_5.csv")

In [None]:
#################
# JPM data
#################

# `jpm` starts out as the S3 object key and is rebound to the loaded DataFrame below.
# Stash the key under its own name on the first run so the cell can be re-executed
# without formatting a DataFrame into the URL.
if isinstance(jpm, str):
    jpm_key = jpm
jpm = pd.read_csv('s3://{}/{}'.format(s3_bucket, jpm_key), index_col='t', parse_dates=['t'])
# Regular 100ms grid: last quote per bucket, empty buckets carry the previous quote forward
jpm = jpm.resample("100ms").last().ffill()
# Level-0 prices, their spread and the simple mid
jpm['bid'] = jpm['Bid0']
jpm['offer'] = jpm['Offer0']
jpm['spread'] = jpm['offer'] - jpm['bid']
jpm['jpm_mid'] = (jpm['bid'] + jpm['offer']) / 2

In [None]:
# Market, Citadel and JPM mids side by side, aligned on the timestamp index.
# NOTE(review): market_1 is not resampled while citadel/jpm sit on a 100ms grid, and the
# source files cover different instruments/dates - confirm the indexes actually overlap.
mids = pd.concat([market_1['market_mid'], citadel['citadel_mid'], jpm['jpm_mid']], axis=1)

In [None]:
# Relative deviation of each dealer's mid from the market mid (index-aligned division)
cit_skew = citadel['citadel_mid'] / market_1['market_mid'] -1
jpm_skew = jpm['jpm_mid'] / market_1['market_mid'] -1

In [None]:
##### Descriptive stats for skew
# Summary statistics of the absolute Citadel skew
np.abs(cit_skew).describe()

In [None]:
# Summary statistics of the absolute JPM skew
np.abs(jpm_skew).describe()

In [None]:
# Summary statistics of Citadel's level-0 spread
citadel['spread'].describe()

In [None]:
#################
# Citadel's skew and its predictive value
#################
# Market-mid return per row, and Citadel's mid relative to the market mid
mids['return'] = mids['market_mid'].pct_change()
mids['citadel_skew'] = mids['citadel_mid'].div(mids['market_mid']) - 1
# Follow Citadel: earn the market return when Citadel's previous mid was above the
# previous market mid, otherwise earn its negative
mids['citadel_return'] = np.where(
    mids['citadel_mid'].shift(1).gt(mids['market_mid'].shift(1)),
    mids['return'],
    -mids['return'],
)
mids['citadel_return_sum'] = mids['citadel_return'].cumsum()
mids['citadel_return_sum'].plot()

In [None]:
#################
# Examine the order book and effect of VWM
#################

# Read the order book
# NOTE: this binds a second name to the same DataFrame (no copy), so every column
# added to df_orderbook below also appears on market_1.
df_orderbook = market_1

In [None]:
# Best bidders
# Number of rows with a Bid0Qty value for each distinct Bid0Mkt
pd.pivot_table(df_orderbook,index=["Bid0Mkt"],values=["Bid0Qty",],aggfunc='count')

In [None]:
# Best offers
# Number of rows with an Offer0Qty value for each distinct Offer0Mkt
pd.pivot_table(df_orderbook,index=["Offer0Mkt"],values=["Offer0Qty",],aggfunc='count')

In [None]:
# How often there is a choice market or better
# Fraction of rows where the level-0 offer is at or below the level-0 bid
np.sum(df_orderbook["Offer0"] <= df_orderbook["Bid0"])/len(df_orderbook["Offer0"])

In [None]:
#############################
# Volume weighted mids 
#############################


# TODO
# vwm where amount on bid or offer > x
# vwm where spread < y
# vwm using top 5
# thickening of the orderbook - exponential weighting of the levels 
# threshold to get in, threshold to change 

# Plain mid of the level-0 prices
df_orderbook['MID'] = 0.5 * df_orderbook["Offer0"] + 0.5 * df_orderbook["Bid0"]
# Cross-weighted mid: the offer price is weighted by the bid size and the bid price by the
# offer size, so a larger bid size pulls the value towards the offer
df_orderbook['vwm'] = (df_orderbook["Bid0Qty"] * df_orderbook["Offer0"] + df_orderbook["Bid0"] * df_orderbook["Offer0Qty"])  / (df_orderbook["Bid0Qty"] + df_orderbook["Offer0Qty"])
# Relative distance of the weighted mid from the plain mid
df_orderbook['vwm_diff'] = (df_orderbook['vwm'] / df_orderbook['MID'] -1)

In [None]:
# Positive order book imbalance
# "Positive": the volume-weighted mid sits above the plain mid
df_orderbook["Positive"] =  df_orderbook['vwm'] > df_orderbook['MID']
# "Uptick": the next row's MID is greater than OR EQUAL to the current one (unchanged mids count)
df_orderbook["Uptick"] = df_orderbook['MID'] <= df_orderbook['MID'].shift(-1)
# Fraction of Positive rows that are followed by an Uptick
np.sum(df_orderbook["Positive"] * df_orderbook["Uptick"]) / np.sum(df_orderbook["Positive"])

In [None]:
# Negative order book imbalance
# "DownTick": the next row's MID is less than OR EQUAL to the current one
df_orderbook["DownTick"] = df_orderbook['MID'] >= df_orderbook['MID'].shift(-1)
# Fraction of non-Positive rows followed by a DownTick.
# Note that ~Positive also covers rows where vwm equals MID or is NaN.
print(np.sum(df_orderbook["DownTick"] *~df_orderbook["Positive"]) / np.sum(~df_orderbook["Positive"]))

In [None]:
#How often does bid improve if the signal is positive 
# "Bid_improved": the next row's Bid0 is greater than or equal to the current one
df_orderbook["Bid_improved"] = df_orderbook['Bid0'] <= df_orderbook['Bid0'].shift(-1)
# Fraction of Positive rows followed by an improved (or unchanged) bid
np.sum(df_orderbook["Positive"] * df_orderbook["Bid_improved"]) / np.sum(df_orderbook["Positive"])

In [None]:
# "Offer_improved": the next row's Offer0 is less than or equal to the current one
df_orderbook["Offer_improved"] = df_orderbook['Offer0'] >= df_orderbook['Offer0'].shift(-1)
# Fraction of non-Positive rows followed by an improved (or unchanged) offer
np.sum(~df_orderbook["Positive"] * df_orderbook["Offer_improved"]) / np.sum(~df_orderbook["Positive"])

In [None]:
# Size imbalance at level 0: (bid size - offer size) / (bid size + offer size)
df_orderbook['obs'] = (df_orderbook["Bid0Qty"] - df_orderbook["Offer0Qty"])  / (df_orderbook["Bid0Qty"] + df_orderbook["Offer0Qty"])

# Tail probability used for both cut-offs (upper = 1 - threshold quantile, lower = threshold quantile)
threshold = 0.1
bid_qquantile  = df_orderbook['obs'].quantile(1 - threshold) # the market is really bid
low_qquantile  = df_orderbook['obs'].quantile(threshold) # the market is really offered

In [None]:
####################
# Trading models 
# Trend and VWM
####################

# calculate returns
df_orderbook['return'] = df_orderbook['MID'].pct_change()


# simple trend following model for mid generation
# +1 when MID is above its 50-row rolling mean, otherwise -1.
# Rows where the rolling mean is still NaN compare False and therefore get -1.
df_orderbook['trend_sig'] = np.where(df_orderbook['MID'] > df_orderbook['MID'].rolling(50).mean(), 1, -1)
# Lag by one row so the signal only uses information available before the return it is applied to
df_orderbook['trend_sig'] = df_orderbook['trend_sig'].shift(1)
df_orderbook['trend_return'] = df_orderbook['trend_sig'] * df_orderbook['return']

# ma crossover
# +1 when the 10-row mean is above the 200-row mean, otherwise -1 (same NaN behaviour as above)
df_orderbook['crossover_sig'] = np.where(df_orderbook['MID'].rolling(10).mean() > df_orderbook['MID'].rolling(200).mean(), 1, -1)
df_orderbook['crossover_sig'] = df_orderbook['crossover_sig'].shift(1)
df_orderbook['crossover_return'] = df_orderbook['crossover_sig'] * df_orderbook['return']


# # ma crossover filtered
# df_orderbook['crossover_filtered'] = np.where((df_orderbook['crossover_sig'] == 1) & (df_orderbook['obs'].shift(1)> low_qquantile), df_orderbook['return'], 0)
# df_orderbook['crossover_filtered'] = np.where((df_orderbook['crossover_sig'] == -1) & (df_orderbook['obs'].shift(1)< bid_qquantile), -df_orderbook['return'], df_orderbook['crossover_filtered'])


# # simple VWM mids model
# df_orderbook['vwm_return'] = np.where(df_orderbook['vwm'].shift(1) > df_orderbook['MID'].shift(1), df_orderbook['return'], -df_orderbook['return'])
# df_orderbook['vwm_return_sum'] = df_orderbook['vwm_return'].cumsum()


# # trading model that adds trend
# df_orderbook['vwm_trend_long_signal'] = np.where((df_orderbook['vwm'].shift(1) > df_orderbook['MID'].shift(1)) & (df_orderbook['trend_sig'] == 1), 1, 0)
# df_orderbook['vwm_trend_short_signal'] = np.where((df_orderbook['vwm'].shift(1) < df_orderbook['MID'].shift(1)) & (df_orderbook['trend_sig'] == -1), -1, 0)

# df_orderbook['vwm_trend_long_return'] = df_orderbook['vwm_trend_long_signal'] * df_orderbook['return']
# df_orderbook['vwm_trend_short_return'] = df_orderbook['vwm_trend_short_signal'] * df_orderbook['return']

# # this version uses 2* the trend element and 1* the vwm - seems close to Citadel
# df_orderbook['vwm_trend_return'] = (df_orderbook['trend_return']*2 + df_orderbook['vwm_return']) / 3

# # vwm signal - where vwm > mean(diff)
# df_orderbook['vwm_long_signal'] = np.where(df_orderbook['vwm_diff'].shift(1) >= 0.000001, 1, 0)
# df_orderbook['vwm_short_signal'] = np.where(df_orderbook['vwm_diff'].shift(1) <= 0.000001, -1, 0)

# # filter bigger diffs
# df_orderbook['vwm_diff_long_return'] = df_orderbook['vwm_long_signal'] * df_orderbook['return']
# df_orderbook['vwm_diff_short_return'] = df_orderbook['vwm_short_signal'] * df_orderbook['return']
# df_orderbook['vwm_diff_total_return'] = df_orderbook['vwm_diff_long_return'] + df_orderbook['vwm_diff_short_return']
# df_orderbook['vwm_diff_total_return_sum'] = df_orderbook['vwm_diff_total_return'].cumsum()


In [None]:
# Cumulative crossover return over the 07:00-18:00 rows, sampled once a minute for plotting.
# "1min" replaces the "1T" offset alias, which recent pandas deprecates.
df_orderbook['crossover_return'].between_time('07:00', '18:00').cumsum().resample("1min").last().plot()


In [None]:
# Inspect the last rows, including the signal/return columns added above
df_orderbook.tail()

In [None]:
# Scratch check: density of a Normal(mean=100, sd=12) evaluated at 98.
# The result is not used elsewhere in the visible notebook; scipy.stats is already imported in the first cell.
import scipy.stats
scipy.stats.norm(100, 12).pdf(98)

In [None]:
# Scratch cell: only displays the distribution object's repr
scipy.stats.norm

In [None]:
#################
# Choice market, go the way of greater volume or greater trend
# Trend seems to work best 
#################

In [None]:
# 1 when the previous row's book was choice or crossed (Offer0 <= Bid0), otherwise 0
df_orderbook['choice_signal'] = np.where((df_orderbook["Offer0"].shift(1) <= df_orderbook["Bid0"].shift(1)), 1, 0)


In [None]:
# +1 when the level-0 bid size exceeds the offer size, otherwise -1 (not lagged).
# Not referenced by any other cell in the visible notebook.
df_orderbook['volume_signal'] = np.where(df_orderbook["Bid0Qty"] > df_orderbook["Offer0Qty"], 1, -1)

In [None]:
# Trade in the trend direction only when the previous book was choice/crossed.
# Both choice_signal and trend_sig are already lagged by one row, so they are applied to the
# current row's return. Multiplying by return.shift(1) paired each signal with the return
# that helped produce it (look-ahead), unlike trend_return and crossover_return.
df_orderbook['choice_volume_strategy'] = (df_orderbook['choice_signal'] * df_orderbook['trend_sig']) * df_orderbook['return']

In [None]:
# df_orderbook['choice_volume_strategy'].cumsum().plot()

In [None]:
# Mean strategy return over rows with a non-zero result (zeros are treated as "no trade").
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.
print(f"pnl per trade {df_orderbook['choice_volume_strategy'].replace(0, np.nan).mean()}")

In [None]:
# Start a PnL comparison frame with the per-row trend-model return as its first column
pnl = pd.DataFrame(df_orderbook['trend_return'])


In [None]:
#############################
# order flow imbalance  
#############################

# Level-0 prices and sizes under the column names the single-level ofi() expects
quotes = df_orderbook[['Bid0', 'Offer0', 'Bid0Qty', 'Offer0Qty']].rename(
    columns={
        'Bid0': 'bid_price',
        'Offer0': 'ask_price',
        'Bid0Qty': 'bid_size',
        'Offer0Qty': 'ask_size',
    }
)

In [None]:
def ofi(quotes):
    """Compute per-row order flow imbalance (OFI) from top-of-book quotes.

    Parameters
    ----------
    quotes : DataFrame with columns 'bid_price', 'ask_price', 'bid_size', 'ask_size'.
        The input is not modified.

    Returns
    -------
    DataFrame: a filled copy of `quotes` plus 'mid_change' (pct change of the mid),
    the previous-row columns 'prev_bidprice', 'prev_bidsize', 'prev_askprice',
    'prev_asksize', and 'ofi'.

    For each row the 'ofi' value is
        + bid_size      if bid_price >= previous bid_price
        - prev_bidsize  if bid_price <= previous bid_price
        + prev_asksize  if ask_price >= previous ask_price
        - ask_size      if ask_price <= previous ask_price
    (both terms of a side apply when the price is unchanged).
    """
    qdf = quotes.copy()

    qdf['mid_change'] = ((qdf['bid_price'] + qdf['ask_price']) / 2.0).pct_change()
    qdf['prev_bidprice'] = qdf['bid_price'].shift()
    qdf['prev_bidsize'] = qdf['bid_size'].shift()
    qdf['prev_askprice'] = qdf['ask_price'].shift()
    qdf['prev_asksize'] = qdf['ask_size'].shift()

    # Fix any missing/invalid data: infinities become NaN, then fill forward and finally
    # backward (the backward fill gives the first row its own values as "previous").
    # -np.inf / .ffill() / .bfill() replace np.NINF and fillna(method=...), which newer
    # NumPy / pandas releases no longer accept.
    qdf = qdf.replace([np.inf, -np.inf], np.nan).ffill().bfill()

    bid_geq = qdf['bid_price'] >= qdf['prev_bidprice']
    bid_leq = qdf['bid_price'] <= qdf['prev_bidprice']
    ask_geq = qdf['ask_price'] >= qdf['prev_askprice']
    ask_leq = qdf['ask_price'] <= qdf['prev_askprice']

    # Build the column in one vectorised expression. The previous chained form
    # (qdf['ofi'].loc[mask] += ...) writes to a temporary under pandas Copy-on-Write
    # and leaves the column at zero.
    qdf['ofi'] = (
        qdf['bid_size'].where(bid_geq, 0.0)
        - qdf['prev_bidsize'].where(bid_leq, 0.0)
        + qdf['prev_asksize'].where(ask_geq, 0.0)
        - qdf['ask_size'].where(ask_leq, 0.0)
    )

    return qdf

In [None]:
# run the order flow imbalance function 
# NOTE: this uses the single-argument ofi(quotes); the name is redefined with a `level`
# argument further down, so this cell must run before that redefinition.
order_flow = ofi(quotes)
# simple model that uses ofi to go long or short for the next tick 
# (+1 when the 50-row mean OFI is positive, otherwise -1, lagged by one row)
order_flow['signal'] = np.where(order_flow['ofi'].rolling(50).mean() > 0, 1, -1)
order_flow['signal'] = order_flow['signal'].shift(1)

order_flow['pnl'] = order_flow['signal'] * order_flow['mid_change']

# Size imbalance at level 0 and its outer-decile cut-offs
# (these rebind threshold / bid_qquantile / low_qquantile from the earlier cell)
order_flow['obs'] = (order_flow["bid_size"] - order_flow["ask_size"])  / (order_flow["bid_size"] + order_flow["ask_size"])
threshold = 0.1
bid_qquantile  = order_flow['obs'].quantile(1 - threshold) # the market is really bid
low_qquantile  = order_flow['obs'].quantile(threshold) # the market is really offered


# ofi filtered
# Longs are kept unless the previous imbalance was in the bottom decile;
# shorts are kept unless it was in the top decile. Everything else earns 0.
order_flow['pnl_filtered'] = np.where((order_flow['signal'] == 1) & (order_flow['obs'].shift(1)> low_qquantile), order_flow['mid_change'], 0)
order_flow['pnl_filtered'] = np.where((order_flow['signal'] == -1) & (order_flow['obs'].shift(1)< bid_qquantile), -order_flow['mid_change'], order_flow['pnl_filtered'])

# "1min" replaces the "1T" offset alias, which recent pandas deprecates
order_flow['pnl'].cumsum().resample("1min").last().plot()

In [None]:
# Standard deviation of the non-zero trend-model returns
# (np.nan is used because the np.NaN alias no longer exists in NumPy 2)
df_orderbook['trend_return'].replace(0, np.nan).std()


In [None]:
###############
# plots the pnl of various order book models 
# vwm is volume weighted mid
# ofi is order flow imbalance
###############
# pnl = pd.DataFrame(mids['citadel_return_sum'])
# pnl['ofi'] = order_flow['pnl'].cumsum()

# Note: 'crossover' is a cumulative sum, while the existing 'trend_return' column holds per-row returns
pnl['crossover'] = df_orderbook['crossover_return'].cumsum()

# One row per minute (last value); "1min" replaces the deprecated "1T" alias
pnl_resample = pnl.resample("1min").last()


In [None]:
# Pearson correlation between the columns of `pnl`
pnl.corr(method='pearson')

In [None]:
########################
# OFI using top x levels 
########################

# Multi-level book from a local CSV; 't' is parsed leniently (unparseable values become NaT)
# and becomes the index
df_all = pd.read_csv('/Volumes/GoogleDrive/Shared drives/Data/fx_tick/data.csv')
df_all['t'] = pd.to_datetime(df_all['t'], errors='coerce')
df_all = df_all.set_index("t")

# Last observation in each 100ms bucket (no forward fill here, so empty buckets stay NaN)
df_all = df_all.resample("100ms").last()

# Cast the price and size columns of levels 0-4 on both sides to float
df_all = df_all.astype({
    f'{side}{level}{suffix}': 'float'
    for level in range(5)
    for side in ('Offer', 'Bid')
    for suffix in ('', 'Qty')
})

In [None]:

def ofi(quotes,level):
    """Compute per-row order flow imbalance (OFI) for one book level.

    Parameters
    ----------
    quotes : DataFrame with columns 'Bid<level>', 'Offer<level>', 'Bid<level>Qty'
        and 'Offer<level>Qty'. The input is not modified.
    level : value appended to the column prefixes (e.g. 1 selects 'Bid1', 'Offer1', ...).

    Returns
    -------
    Series named 'ofi', indexed like `quotes`. For each row:
        + bid size           if bid price >= previous bid price
        - previous bid size  if bid price <= previous bid price
        + previous ask size  if ask price >= previous ask price
        - ask size           if ask price <= previous ask price
    (both terms of a side apply when the price is unchanged).
    """
    bid_price_label = 'Bid' + str(level)
    offer_price_label = 'Offer' + str(level)
    bid_qty_label = 'Bid' +str(level) + 'Qty'
    offer_qty_label = 'Offer' + str(level)+'Qty'

    # Only this level's four columns feed the result, so copy and fill just those
    # instead of the whole frame (filling is per column, so the outcome is the same).
    qdf = quotes[[bid_price_label, bid_qty_label, offer_price_label, offer_qty_label]].copy()

    qdf['prev_bidprice'] = qdf[bid_price_label].shift()
    qdf['prev_bidsize'] = qdf[bid_qty_label].shift()
    qdf['prev_askprice'] = qdf[offer_price_label].shift()
    qdf['prev_asksize'] = qdf[offer_qty_label].shift()

    # Fix any missing/invalid data: infinities become NaN, then fill forward and backward.
    # -np.inf / .ffill() / .bfill() replace np.NINF and fillna(method=...), which newer
    # NumPy / pandas releases no longer accept.
    qdf = qdf.replace([np.inf, -np.inf], np.nan).ffill().bfill()

    bid_geq = qdf[bid_price_label] >= qdf['prev_bidprice']
    bid_leq = qdf[bid_price_label] <= qdf['prev_bidprice']
    ask_geq = qdf[offer_price_label] >= qdf['prev_askprice']
    ask_leq = qdf[offer_price_label] <= qdf['prev_askprice']

    # One vectorised expression. The previous chained form (qdf['ofi'].loc[mask] += ...)
    # writes to a temporary under pandas Copy-on-Write and leaves the column at zero.
    flow = (
        qdf[bid_qty_label].where(bid_geq, 0.0)
        - qdf['prev_bidsize'].where(bid_leq, 0.0)
        + qdf['prev_asksize'].where(ask_geq, 0.0)
        - qdf[offer_qty_label].where(ask_leq, 0.0)
    )
    return flow.rename('ofi')




In [None]:
# OFI with levels 1, 2 and 3 (works better than 0,1,2)
# 100 period MA works well

# Combined imbalance across the three levels
df_all['ofi'] = ofi(df_all, 1) + ofi(df_all, 2) + ofi(df_all, 3)
# +1 when the 100-row mean imbalance is positive, otherwise -1; lagged one row
df_all['ofi_signal'] = pd.Series(
    np.where(df_all['ofi'].rolling(100).mean() > 0, 1, -1),
    index=df_all.index,
).shift(1)
# Level-0 mid and its row-to-row percentage change
df_all['mid'] = (df_all['Bid0'] + df_all['Offer0']) / 2.0
df_all['mid_change'] = df_all['mid'].pct_change()
df_all['pnl'] = df_all['ofi_signal'] * df_all['mid_change']

print("Cumulative PnL " + str(df_all['pnl'].cumsum().iloc[-1]))

In [None]:
# Combined OFI for the rows between 07:00 and 07:01
(df_all['ofi']).between_time('07:00', '07:01').plot()

In [None]:
# Cumulative OFI-model return, one point per minute ("1min" replaces the deprecated "1T" alias)
df_all['pnl'].cumsum().resample("1min").last().plot()

In [None]:
# Mean of the non-zero per-row returns
# (np.nan is used because the np.NaN alias no longer exists in NumPy 2)
per_trade_avg = df_all['pnl'].replace(0, np.nan).mean()
# NOTE(review): the factor 34 is not explained anywhere in the visible notebook - confirm its meaning
per_trade_avg = per_trade_avg * 34
print(per_trade_avg)

In [None]:
# Same cumulative OFI-model plot as two cells earlier ("1min" replaces the deprecated "1T" alias)
df_all['pnl'].cumsum().resample("1min").last().plot()

In [None]:
# Cross-weighted mid of level 1: offer price weighted by bid size, bid price by offer size
df_all['vwm'] = (df_all["Bid1Qty"] * df_all["Offer1"] + df_all["Bid1"] * df_all["Offer1Qty"])  / (df_all["Bid1Qty"] + df_all["Offer1Qty"])

In [None]:
# +1 when the weighted mid is above the level-0 mid, otherwise -1 (NaN comparisons give -1)
df_all['vwm_signal'] = np.where(df_all['vwm'] > df_all['mid'], 1, -1)
# Lag one row so the signal precedes the return it is applied to
df_all['vwm_signal'] = df_all['vwm_signal'].shift(1)
df_all['vwm_pnl'] = df_all['vwm_signal'] * df_all['mid_change']

In [None]:
# Majority vote of three +/-1 signals: the sign of their sum.
# NOTE(review): crossover_sig comes from df_orderbook, which is loaded from a different file
# than df_all; the addition aligns on the index - confirm the two frames share timestamps.
df_all['combined_signal'] = np.sign(df_all['vwm_signal'] + df_orderbook['crossover_sig'] + df_all['ofi_signal'])
df_all['combined_pnl'] = df_all['combined_signal'] * df_all['mid_change']

In [None]:
# Cumulative combined-signal return over the 07:00-17:00 rows, one point per minute
# ("1min" replaces the deprecated "1T" alias)
df_all['combined_pnl'].between_time("07:00", "17:00").cumsum().resample("1min").last().plot()

In [None]:
# Inspect the last rows of df_all with the columns added so far
df_all.tail()

In [None]:
# Threshold on the absolute 10-row cumulative mid change
spike = 0.000001
df_all['rolling_change'] = df_all['mid_change'].rolling(10).sum()
# Step 1: zero the VWM signal where the previous row's |rolling change| reached the threshold
df_all['vwm_adj'] = np.where(np.abs(df_all['rolling_change'].shift(1)) >= spike, 0, df_all['vwm_signal'])
# Step 2: wherever the result is 0, substitute the previous row's vwm_signal
df_all['vwm_adj'] = np.where(df_all['vwm_adj'] == 0, df_all['vwm_signal'].shift(1), df_all['vwm_adj'])

In [None]:
# Overwrite vwm_pnl with the return of the spike-adjusted signal
df_all['vwm_pnl'] = df_all['vwm_adj'] * df_all['mid_change']
# Cumulative return over the 07:00-17:00 rows, one point per minute
# ("1min" replaces the deprecated "1T" alias)
df_all['vwm_pnl'].between_time("07:00", "17:00").cumsum().resample("1min").last().plot()

In [None]:
# Per-row returns of the VWM and OFI models side by side
pnl2 = pd.DataFrame(df_all['vwm_pnl'])
pnl2['ofi_pnl'] = df_all['pnl']

In [None]:
# Pearson correlation between the two models' per-row returns
pnl2.corr(method='pearson')

In [None]:
# Equal-weight average of the VWM and OFI per-row returns
pnl2['combined'] = (df_all['vwm_pnl'] + df_all['pnl'])/2

In [None]:
# Crossover-model returns for the 06:00-18:00 rows
pnl3 = pd.DataFrame(df_orderbook['crossover_return'].between_time('06:00', '18:00'))

In [None]:
# OFI-model returns for the same time-of-day window, aligned onto pnl3's index.
# NOTE(review): df_all and df_orderbook come from different files - confirm the timestamps overlap.
pnl3['ofi'] = df_all['pnl'].between_time('06:00', '18:00')

In [None]:
# Cumulative returns of both models, one point per minute ("1min" replaces the deprecated "1T" alias)
pnl3.cumsum().resample("1min").last().plot()

In [None]:
# Pearson correlation between the crossover and OFI per-row returns
pnl3.corr(method='pearson')

In [None]:
# Sum of the combined per-row return within each business-day bucket
h = pnl2['combined'].resample("B").sum() 

In [None]:
# NOTE(review): `dr` is not used by any other visible cell, and the 24 * 252 scaling of an
# already-daily sum is unexplained - confirm intent.
dr = h * 24 *252

In [None]:
# Mean daily sum scaled by 252 (presumably trading days per year - confirm)
m = h.mean() * 252


In [None]:
# Standard deviation of the daily sums scaled by sqrt(252)
std = h.std() * np.sqrt(252)

In [None]:
# Ratio of the scaled mean to the scaled standard deviation (a Sharpe-style ratio with no risk-free rate)
m/std

In [None]:
# Mean of the non-zero filtered OFI returns.
# 'pnl_filtered' is only ever created on `order_flow` (df_all has no such column, so the
# previous df_all lookup raised KeyError); np.nan replaces the removed np.NaN alias.
order_flow['pnl_filtered'].replace(0, np.nan).mean()

In [None]:
# Summary statistics of the absolute row-to-row mid change
np.abs(df_all['mid_change']).describe()

In [None]:
#############################
# build a regression to explain Citadel's mid
#############################
import statsmodels.api as sm

# Weighted mid, Citadel mid and plain market mid aligned on the timestamp index
cit_regression = pd.concat([df_orderbook['vwm'], citadel['citadel_mid'], df_orderbook['MID']], axis=1)

# Lead / lag helper columns for the Citadel mid
cit_regression['citadel_mid_next'] = cit_regression['citadel_mid'].shift(-1)
cit_regression['citadel_mid_prev'] = cit_regression['citadel_mid'].shift(1)
cit_regression['citadel_mid_next_change'] = cit_regression['citadel_mid'].shift(-1).pct_change()

# 50-row rolling mean of the market mid and row-to-row changes
cit_regression['trend'] = cit_regression['MID'].rolling(50).mean()
cit_regression['trend_change'] = cit_regression['trend'].pct_change()
cit_regression['vwm_change'] = cit_regression['vwm'].pct_change()

## Sign features (lagged one row) and sign target
cit_regression['vwm_sign'] = np.sign(cit_regression['vwm'] - cit_regression['MID']).shift(1)
cit_regression['citadel_skew'] = (cit_regression['citadel_mid'] / cit_regression['MID'] -1)
cit_regression['citadel_skew_sign'] = np.sign(cit_regression['citadel_mid'] / cit_regression['MID'] -1)
cit_regression['trend_sign'] = np.sign(cit_regression['MID'] - cit_regression['trend']).shift(1)

# Clean the data to remove infs and nans.
# Negative infinity is now included (it was previously left in place), np.nan replaces the
# removed np.NaN alias, and the second, redundant replace call is gone.
cit_regression = cit_regression.replace([np.inf, -np.inf, np.nan], 0)

X = cit_regression[['trend_sign', 'vwm_sign']]
y = cit_regression['citadel_skew_sign']
## fit an OLS model of the skew sign on the two lagged signs, without an intercept
# X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print(est.summary())

In [None]:
# trend following predicting the next mid
df_orderbook['trend_return'].cumsum().plot()

In [None]:
# trend and VWM predicting the next mid
df_orderbook['vwm_return_sum'].plot()

In [None]:
# average "return" trend model
df_orderbook['trend_return'].replace(0, np.NaN).mean()

In [None]:
# Inspect the last rows of the order book frame
df_orderbook.tail()

In [None]:
def _level_vwm(book, level):
    """Cross-weighted mid of one level: offer price weighted by bid size, bid price by offer size."""
    bid, offer = book[f'Bid{level}'], book[f'Offer{level}']
    bid_qty, offer_qty = book[f'Bid{level}Qty'], book[f'Offer{level}Qty']
    return (bid_qty * offer + bid * offer_qty) / (bid_qty + offer_qty)


def _add_depth_weighted_columns(book, depth):
    """Add size-weighted bid/offer, their average and the summed sizes for levels 0..depth-1."""
    def _accumulate(side):
        # Running sums built left to right, level 0 first
        value = book[f'{side}0Qty'] * book[f'{side}0']
        size = book[f'{side}0Qty']
        for level in range(1, depth):
            value = value + book[f'{side}{level}Qty'] * book[f'{side}{level}']
            size = size + book[f'{side}{level}Qty']
        return value, size

    bid_value, bid_size = _accumulate('Bid')
    offer_value, offer_size = _accumulate('Offer')
    book[f'weighted_bid_{depth}'] = bid_value / bid_size
    book[f'weighted_offer_{depth}'] = offer_value / offer_size
    book[f'conventionally_weighted_mid_{depth}'] = (book[f'weighted_bid_{depth}'] + book[f'weighted_offer_{depth}']) / 2
    book[f'weighted_bid_notional_{depth}'] = bid_size
    book[f'weighted_offer_notional_{depth}'] = offer_size


# Cross-weighted mids of levels 0 and 1, and the gap between them
df_all['vwm0'] = _level_vwm(df_all, 0)
df_all['vwm1'] = _level_vwm(df_all, 1)
df_all['vwm_diff'] = df_all['vwm1'] - df_all['vwm0']

# two, three, four and five levels
for _depth in range(2, 6):
    _add_depth_weighted_columns(df_all, _depth)

# Level-1 weighted mid drives the signal below
df_all['vwm'] = df_all['vwm1']
# df_all['vwm'] = df_all['vwm1']

# +1 when the weighted mid is above the level-0 mid, otherwise -1; lagged one row
df_all['vwm_signal'] = pd.Series(
    np.where(df_all['vwm'] > df_all['mid'], 1, -1),
    index=df_all.index,
).shift(1)
df_all['vwm_pnl'] = df_all['vwm_signal'] * df_all['mid_change']

# df_all.to_csv('stack.csv')

In [None]:
# Cumulative VWM-model return, one point per minute ("1min" replaces the deprecated "1T" alias)
df_all['vwm_pnl'].cumsum().resample("1min").last().plot()

In [None]:
# Dave skip 0
# Variant: size-weighted bid/offer over levels 1-4 only (level 0 left out).
# Overwrites the *_5 columns, 'vwm' and 'signal' written by earlier cells.
df_all['weighted_bid_notional_5'] =  df_all["Bid1Qty"] + df_all["Bid2Qty"] + df_all["Bid3Qty"] + df_all["Bid4Qty"]
df_all['weighted_offer_notional_5'] =  df_all["Offer1Qty"] + df_all["Offer2Qty"] + df_all["Offer3Qty"] + df_all["Offer4Qty"]
df_all['weighted_bid_5'] = (df_all["Bid1Qty"] * df_all["Bid1"] + df_all["Bid2Qty"] * df_all["Bid2"] + df_all["Bid3Qty"] * df_all["Bid3"] + df_all["Bid4Qty"] * df_all["Bid4"]) / df_all['weighted_bid_notional_5'] 
df_all['weighted_offer_5'] = ( df_all["Offer1Qty"] * df_all["Offer1"] + df_all["Offer2Qty"] * df_all["Offer2"] + df_all["Offer3Qty"] * df_all["Offer3"] + df_all["Offer4Qty"] * df_all["Offer4"]) / df_all['weighted_offer_notional_5'] 
df_all['conventionally_weighted_mid_5'] = (df_all['weighted_bid_5'] + df_all['weighted_offer_5']) / 2
df_all['vwm'] = df_all['conventionally_weighted_mid_5']
# Unlagged +1/-1 flag; the PnL cell further down rebuilds 'vwm_signal' from 'vwm' rather than using 'signal'
df_all['signal'] = np.where(df_all['vwm'] >= df_all['mid'], 1, -1)


In [None]:
#VWM - level 0 inverse
# Variant: level-0 cross-weighted mid with the flag inverted (-1 when vwm >= mid, +1 otherwise).
# Overwrites 'vwm' and 'signal'.
df_all['vwm'] = (df_all["Bid0Qty"] * df_all["Offer0"] + df_all["Bid0"] * df_all["Offer0Qty"])  / (df_all["Bid0Qty"] + df_all["Offer0Qty"])
df_all['signal'] = np.where(df_all['vwm'] >= df_all['mid'], -1, 1)


In [None]:
#VWM - level 1 
# Variant: level-1 cross-weighted mid (+1 when vwm >= mid, -1 otherwise).
# Overwrites 'vwm' and 'signal'; whichever variant cell ran last determines 'vwm' in the next cell.
df_all['vwm'] = (df_all["Bid1Qty"] * df_all["Offer1"] + df_all["Bid1"] * df_all["Offer1Qty"])  / (df_all["Bid1Qty"] + df_all["Offer1Qty"])
df_all['signal'] = np.where(df_all['vwm'] >= df_all['mid'], 1, -1)

In [None]:
# Rebuild the lagged +1/-1 signal from whichever 'vwm' variant is currently stored, then its PnL
df_all['vwm_signal'] = np.where(df_all['vwm'] > df_all['mid'], 1, -1)
df_all['vwm_signal'] = df_all['vwm_signal'].shift(1)
df_all['vwm_pnl'] = df_all['vwm_signal'] * df_all['mid_change']
# One point per minute ("1min" replaces the deprecated "1T" alias)
df_all['vwm_pnl'].cumsum().resample("1min").last().plot()

In [None]:
###############
# Counts the number of ticks between changes in the signal
# Calculates pnl per trade 
###############

# these are the df and column for the signals 
# NOTE: `signal_df` is not used by the call further down, which passes df_all directly
signal_df = df_all
signal_column = 'vwm_adj'

# the df and column for the pnl
# (despite the name this is the PnL Series itself, not a column label)
pnl_column = df_all['vwm_pnl']


def SignalPersisenceFast(df,column_name): 
    """Return the length, in rows, of every consecutive run of one non-zero signal value.

    Parameters
    ----------
    df : object supporting df[column_name].values (e.g. a DataFrame).
    column_name : label of the numeric signal column.

    Returns
    -------
    list of int: one entry per run, in order of appearance. Rows equal to 0 or NaN are
    treated as "flat": they end the current run and contribute no entry.

    Compared with the earlier loop this also counts the final run (previously never
    appended), emits a single entry per run even when flat rows follow it (previously the
    stale count was appended once per flat row), and no longer counts NaN rows as
    one-row runs.
    """
    signal = np.asarray(df[column_name].values, dtype=float)
    if signal.size == 0:
        return []
    # NaN never equals itself, so map it to 0 ("flat") before comparing neighbours
    signal = np.where(np.isnan(signal), 0.0, signal)

    # A run starts at row 0 and wherever the value differs from the previous row
    run_starts = np.flatnonzero(np.r_[True, signal[1:] != signal[:-1]])
    run_lengths = np.diff(np.r_[run_starts, signal.size])

    # Keep only the runs that carry an actual position
    in_trade = signal[run_starts] != 0
    return run_lengths[in_trade].tolist()

# Run lengths of the chosen signal column, then summary figures.
# "Average Trade PnL" divides the final cumulative PnL by the number of run lengths returned.
Times = SignalPersisenceFast(df_all,signal_column)
print("Mean time in trade " + str(np.mean(Times)))
print("Number of trades " + str(len(Times)))
print("Cumulative PnL " + str(pnl_column.cumsum().iloc[-1]))
print("Average Trade PnL " + str(pnl_column.cumsum().iloc[-1] / len(Times)))

In [None]:
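# Pasted output of the trade-statistics cell (kept for reference; commented out so the cell is valid Python)
# Mean time in trade 3.7650935337087468
# Number of trades 57359
# Cumulative PnL 0.10923349792949288
# Average Trade PnL 1.9043828855017152e-06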

In [None]:
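# VWM
# Pasted output of the trade-statistics cell (kept for reference; commented out so the cell is valid Python)
# Mean time in trade 4.6480225730355516
# Number of trades 743542
# Cumulative PnL 0.4734955060643815
# Average Trade PnL 6.368107061395072e-07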

In [None]:
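# Dave
# Pasted output of the trade-statistics cell (kept for reference; commented out so the cell is valid Python)
# Mean time in trade 4.722161267006164
# Number of trades 731867
# Cumulative PnL 0.622094526730822
# Average Trade PnL 8.50010352606173e-07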

In [None]:
# have a look at the average length of each tick
# Copy the timestamp index into a regular column
df_all['time'] = df_all.index

In [None]:
# data needs cleaning 
# Gap between consecutive rows of the order book, taken straight from its timestamp index.
# The 'time' helper column is only ever added to df_all, so the previous
# df_orderbook['time'] lookup raised KeyError.
df_orderbook.index.to_series().diff().describe()

In [None]:
#############################
# Citadel update price time 
#############################

In [None]:
# Work on a separately named copy of the Citadel quotes with 't' as a regular column.
# The cell used to rebind `df_orderbook` itself, which left later cells that expect the
# time-indexed market book (with 'vwm' / 'MID') pointing at the Citadel frame.
citadel_ticks = citadel.reset_index()

citadel_ticks['t'] = pd.to_datetime(citadel_ticks['t'])

#check for new levels
# True where either level-0 price differs from the previous row (the first row always counts)
citadel_ticks['newlevel'] = ((citadel_ticks["Offer0"] != citadel_ticks["Offer0"].shift(1)) |  (citadel_ticks["Bid0"] != citadel_ticks["Bid0"].shift(1)))
df_orderbook_nupdates = citadel_ticks.loc[citadel_ticks['newlevel']]
# Milliseconds between consecutive price updates.
# NOTE: `citadel` was resampled to a 100ms grid when loaded, so these gaps are multiples of 100ms.
(df_orderbook_nupdates['t'].diff() / np.timedelta64(1, 'ms')).describe()

In [None]:
######################
# Rolling regressions
######################

In [None]:
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS

In [None]:
# Dependent variable (return) and regressors (signal, one-row-lagged return, 'c_ma_diff').
# NOTE(review): `df` and its 'signal' / 'c_ma_diff' columns are not defined anywhere in the
# visible notebook - confirm which frame this is meant to be.
endog = pd.DataFrame(df['return'])

exog = pd.DataFrame(df['signal'])
exog['lagged'] = df['return'].shift(1)
exog['diff'] = df['c_ma_diff'] 

In [None]:
# Rolling OLS over a 240-row window (no constant column is added to exog above)
rols = RollingOLS(endog, exog, window=240)

In [None]:
# Fit the rolling model and show the most recent coefficient estimates
rres = rols.fit()
params = rres.params
print(params.tail())


In [None]:
# Earn the return where the rolling 'signal' coefficient is positive, otherwise its negative.
# The resulting frame gets a default integer index because np.where returns a plain array.
new_mod = pd.DataFrame(np.where(params.signal>0, df['return'], -df['return']))

In [None]:
# Plot the rolling coefficient estimates
fig = rres.plot_recursive_coefficient()

In [None]:
#############################
# resample the vwm - try to reduce the noise 
#############################

In [None]:
# 5-second bars: mean weighted mid and last plain mid per bar, both forward-filled.
# Requires df_orderbook to be the time-indexed market book carrying 'vwm' and 'MID'.
df_vwm_resampled = pd.DataFrame(df_orderbook['vwm'].resample("5s").mean()).ffill()
df_vwm_resampled['mid'] = df_orderbook['MID'].resample("5s").last().ffill()
df_vwm_resampled['return'] = df_vwm_resampled['mid'].pct_change()
# Relative distance of the bar's weighted mid from its plain mid
df_vwm_resampled['diff'] = (df_vwm_resampled['vwm'] / df_vwm_resampled['mid'] -1)
# Earn the bar return when the previous bar's weighted mid was above its plain mid, else the negative
df_vwm_resampled['vwm_return'] = np.where(df_vwm_resampled['vwm'].shift(1) > df_vwm_resampled['mid'].shift(1), df_vwm_resampled['return'], -df_vwm_resampled['return'])
df_vwm_resampled['vwm_return_sum'] = df_vwm_resampled['vwm_return'].cumsum()

In [None]:
# Long-only flag: 1 when the previous bar's relative diff was at least 0.000007, otherwise 0
# (the column name contains a double underscore)
df_vwm_resampled['vwm__long_signal'] = np.where(df_vwm_resampled['diff'].shift(1) >= 0.000007, 1, 0)

In [None]:
# filter bigger diffs
# Bar return earned only while the long-only flag is set
df_vwm_resampled['vwm_diff_long_return'] = df_vwm_resampled['vwm__long_signal'] * df_vwm_resampled['return']
# Accumulate the filtered return computed on the line above. This previously summed the
# unfiltered 'vwm_return', which merely duplicated 'vwm_return_sum'.
df_vwm_resampled['vwm_diff_long_return_sum'] = df_vwm_resampled['vwm_diff_long_return'].cumsum()

In [None]:
# Plot the stored cumulative long-only series
df_vwm_resampled['vwm_diff_long_return_sum'].plot()

In [None]:
#################
# Aggregate tick data
#################
# Level-0 mid per tick, grouped into bars of length `resample_period`.
# NOTE(review): `tick_data` is not defined in the visible part of the notebook - confirm its source.
mid_price_series = (tick_data['Bid0'] + tick_data['Offer0']) / 2
bar_sampler = mid_price_series.resample(resample_period)
hloc = pd.DataFrame({
    'high': bar_sampler.max(),
    'low': bar_sampler.min(),
    'open': bar_sampler.first(),  # **FIXME** First tick in bar not last tick in previous bar?
    'close': bar_sampler.last(),
})

In [None]:
#################
# Derived columns
#################
# Relative moves from the bar open to its high / low, and close-to-close change
hloc['o_to_h'] = hloc['high'].div(hloc['open']) - 1
hloc['o_to_l'] = hloc['low'].div(hloc['open']) - 1
hloc['c_to_c'] = hloc['close'].pct_change()
# Rolling 12-bar standard deviation of each move
for _move in ('o_to_h', 'o_to_l', 'c_to_c'):
    hloc[f'{_move}_vol'] = hloc[_move].rolling(12).std()