## Trade Distribution

In this notebook we look at the distribution of the number of trades, aggregated across the futures contracts (May expiry, June expiry, ...), over different time scales.

In [1]:
import pandas as pd
import numpy as np
import calendar
from datetime import datetime

import matplotlib.pyplot as plt
import mpld3

In [125]:
# Tickers of the futures contracts whose trade dumps are combined below.
futures = "BTCH0,BTCJ0,BTCK0,BTCM0,BTCN0,BTCQ0,BTCU0,BTCV0,BTCX0".split(",")

# Load every contract, keep only the rows whose conditionCodes is "TSUM" or
# "ST", and stack them into one frame indexed by UTC-localized `time`.
# The per-contract frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration.
contract_frames = []
for ticker in futures:
    contract = pd.read_csv(f"../data_dumps/{ticker}_Curncy.csv", parse_dates=["time"])
    # Keep the original CSV row number as a column before `time` becomes the index.
    contract["index"] = contract.index
    contract = contract.set_index("time")
    # tz_localize is used on the parsed timestamps, so they are treated as naive UTC.
    contract.index = contract.index.tz_localize("UTC")
    contract_frames.append(contract[contract["conditionCodes"].isin(["TSUM", "ST"])])

df = pd.concat(contract_frames, axis=0)
        

In [126]:
# Total `size` per calendar day (index is the start of each UTC day).
daily_bucket = pd.Grouper(freq="D")
df[["size"]].groupby(daily_bucket).sum()

Unnamed: 0_level_0,size
time,Unnamed: 1_level_1
2019-10-01 00:00:00+00:00,9
2019-10-02 00:00:00+00:00,5
2019-10-03 00:00:00+00:00,24
2019-10-04 00:00:00+00:00,0
2019-10-05 00:00:00+00:00,0
...,...
2020-07-26 00:00:00+00:00,675
2020-07-27 00:00:00+00:00,9518
2020-07-28 00:00:00+00:00,4744
2020-07-29 00:00:00+00:00,4449


In [128]:
# Linear-scale histogram of the `size` column, rendered interactively via mpld3.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(df["size"])
mpld3.display(fig)

In [131]:
# Histogram of `size` over 250 logarithmically spaced bin edges between
# 10**1.5 and 10**8, drawn as bars on a log-scaled x axis.
# NOTE(review): the title says "counts at the 1 day level", but this cell
# histograms df["size"] directly without aggregating here — confirm that `df`
# holds daily totals when this cell is run.
log_edges = np.logspace(1.5, 8, num=250)
val_bins, bins = np.histogram(df["size"], bins=log_edges)

# Each bar starts at its bin's left edge and spans the bin's width.
left_edges = bins[:-1]
bar_widths = bins[1:] - left_edges

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(x=left_edges, height=val_bins, width=bar_widths)
ax.set_xscale("log")
ax.set(
    xlabel="trade size (log scale)",
    ylabel="counts",
    title="trade size distributions (counts at the 1 day level)",
)

mpld3.display(fig)

In [38]:
# ---------------testing---------------
# Rows strictly between 2020-02-01 00:00 and 2020-02-28 00:00 (GMT); both
# bounds are exclusive.
feb_start = pd.Timestamp("2020-02-01", tz="GMT")
feb_end = pd.Timestamp("2020-02-28", tz="GMT")
in_feb_window = (df.index > feb_start) & (df.index < feb_end)
df_feb = df[in_feb_window]
df_feb

time
2020-02-02 00:00:00+00:00     0
2020-02-03 00:00:00+00:00     0
2020-02-04 00:00:00+00:00     0
2020-02-05 00:00:00+00:00    14
2020-02-06 00:00:00+00:00     4
2020-02-07 00:00:00+00:00     8
2020-02-08 00:00:00+00:00     0
2020-02-09 00:00:00+00:00     9
2020-02-10 00:00:00+00:00    27
2020-02-11 00:00:00+00:00    11
2020-02-12 00:00:00+00:00     3
2020-02-13 00:00:00+00:00     3
2020-02-14 00:00:00+00:00     4
2020-02-15 00:00:00+00:00     0
2020-02-16 00:00:00+00:00     0
2020-02-17 00:00:00+00:00     4
2020-02-18 00:00:00+00:00     3
2020-02-19 00:00:00+00:00     7
2020-02-20 00:00:00+00:00     4
2020-02-21 00:00:00+00:00     1
2020-02-22 00:00:00+00:00     0
2020-02-23 00:00:00+00:00     0
2020-02-24 00:00:00+00:00     1
2020-02-25 00:00:00+00:00     7
2020-02-26 00:00:00+00:00    35
2020-02-27 00:00:00+00:00     7
Name: type, dtype: int64

In [13]:
# BTCM0 dump: keep the CSV row number as a column, index by UTC-localized
# `time`, then keep only rows with type "TRADE" and conditionCodes "TSUM".
dfa = pd.read_csv("../data_dumps/BTCM0_Curncy.csv", parse_dates=["time"])
dfa["index"] = dfa.index
dfa = dfa.set_index("time")
dfa.index = dfa.index.tz_localize("UTC")

is_trade = dfa["type"] == "TRADE"
is_tsum = dfa["conditionCodes"] == "TSUM"
dfa = dfa[is_trade & is_tsum]
dfa

Unnamed: 0_level_0,type,value,size,conditionCodes,index
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-03 14:42:08+00:00,TRADE,7490.0,1,TSUM,28175
2020-01-03 16:27:56+00:00,TRADE,7650.0,1,TSUM,30151
2020-01-07 23:40:38+00:00,TRADE,8470.0,1,TSUM,71570
2020-01-09 07:48:11+00:00,TRADE,8195.0,1,TSUM,117305
2020-01-10 08:40:52+00:00,TRADE,8005.0,1,TSUM,142223
...,...,...,...,...,...
2020-06-26 14:58:05+00:00,TRADE,9090.0,1,TSUM,7672229
2020-06-26 14:58:05+00:00,TRADE,9090.0,1,TSUM,7672233
2020-06-26 14:59:20+00:00,TRADE,9090.0,1,TSUM,7672243
2020-06-26 14:59:41+00:00,TRADE,9085.0,1,TSUM,7672248


In [15]:
# BTCN0 dump: keep the CSV row number as a column, index by UTC-localized
# `time`, then keep only rows with type "TRADE" and conditionCodes "TSUM".
dfb = pd.read_csv("../data_dumps/BTCN0_Curncy.csv", parse_dates=["time"])
dfb["index"] = dfb.index
dfb = dfb.set_index("time")
dfb.index = dfb.index.tz_localize("UTC")

is_trade = dfb["type"] == "TRADE"
is_tsum = dfb["conditionCodes"] == "TSUM"
dfb = dfb[is_trade & is_tsum]
dfb

Unnamed: 0_level_0,type,value,size,conditionCodes,index
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-11 15:38:37+00:00,TRADE,10685.0,1,TSUM,24715
2020-02-11 19:02:32+00:00,TRADE,10920.0,1,TSUM,45611
2020-02-13 16:05:23+00:00,TRADE,11060.0,1,TSUM,102778
2020-02-27 15:08:32+00:00,TRADE,9180.0,1,TSUM,117449
2020-02-27 15:08:32+00:00,TRADE,9180.0,1,TSUM,117455
...,...,...,...,...,...
2020-07-21 01:12:39+00:00,TRADE,9220.0,1,TSUM,5197887
2020-07-21 01:12:39+00:00,TRADE,9220.0,1,TSUM,5197892
2020-07-21 01:12:39+00:00,TRADE,9220.0,1,TSUM,5197897
2020-07-21 01:13:23+00:00,TRADE,9220.0,1,TSUM,5197945


In [85]:
# Restrict `dfb` to rows strictly between 2020-02-01 and 2020-02-28 (GMT),
# then sum per `type` and per day.
window_start = pd.Timestamp("2020-02-01", tz="GMT")
window_end = pd.Timestamp("2020-02-28", tz="GMT")
in_window = (dfb.index > window_start) & (dfb.index < window_end)
dfb_feb = dfb[in_window]

b = dfb_feb.groupby("type").resample("1d").sum()
b

Unnamed: 0_level_0,Unnamed: 1_level_0,value,size,index
type,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TRADE,2020-02-11 00:00:00+00:00,21605.0,2,70326
TRADE,2020-02-12 00:00:00+00:00,0.0,0,0
TRADE,2020-02-13 00:00:00+00:00,11060.0,1,102778
TRADE,2020-02-14 00:00:00+00:00,0.0,0,0
TRADE,2020-02-15 00:00:00+00:00,0.0,0,0
TRADE,2020-02-16 00:00:00+00:00,0.0,0,0
TRADE,2020-02-17 00:00:00+00:00,0.0,0,0
TRADE,2020-02-18 00:00:00+00:00,0.0,0,0
TRADE,2020-02-19 00:00:00+00:00,0.0,0,0
TRADE,2020-02-20 00:00:00+00:00,0.0,0,0


In [87]:
# Restrict `dfa` to rows strictly between 2020-02-11 and 2020-02-28 (GMT),
# then sum per `type` and per day.
window_start = pd.Timestamp("2020-02-11", tz="GMT")
window_end = pd.Timestamp("2020-02-28", tz="GMT")
in_window = (dfa.index > window_start) & (dfa.index < window_end)
dfa_feb = dfa[in_window]

a = dfa_feb.groupby("type").resample("1d").sum()
a

Unnamed: 0_level_0,Unnamed: 1_level_0,value,size,index
type,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TRADE,2020-02-11 00:00:00+00:00,95470.0,11,5959109
TRADE,2020-02-12 00:00:00+00:00,33110.0,3,2154681
TRADE,2020-02-13 00:00:00+00:00,21890.0,2,1454800
TRADE,2020-02-14 00:00:00+00:00,44045.0,4,3093060
TRADE,2020-02-15 00:00:00+00:00,0.0,0,0
TRADE,2020-02-16 00:00:00+00:00,0.0,0,0
TRADE,2020-02-17 00:00:00+00:00,41120.0,4,3119528
TRADE,2020-02-18 00:00:00+00:00,31100.0,4,2366116
TRADE,2020-02-19 00:00:00+00:00,70060.0,7,5570522
TRADE,2020-02-20 00:00:00+00:00,40055.0,4,3212397


In [91]:
# Stack the two per-day frames and add up `size` for matching `time` values.
# min_count=1 makes a group with no valid values sum to NaN instead of 0;
# sort=False keeps the groups in order of first appearance.
stacked = pd.concat([a, b], axis=0)
size_per_day = stacked.groupby("time", sort=False)["size"].sum(min_count=1)
fd = size_per_day.reset_index()
fd

Unnamed: 0,time,size
0,2020-02-11 00:00:00+00:00,13
1,2020-02-12 00:00:00+00:00,3
2,2020-02-13 00:00:00+00:00,3
3,2020-02-14 00:00:00+00:00,4
4,2020-02-15 00:00:00+00:00,0
5,2020-02-16 00:00:00+00:00,0
6,2020-02-17 00:00:00+00:00,4
7,2020-02-18 00:00:00+00:00,4
8,2020-02-19 00:00:00+00:00,7
9,2020-02-20 00:00:00+00:00,4
