# Get Tick-by-tick and snapshot data to the same format 

## Imports

In [92]:
import datetime
import logging

import pandas as pd

import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint
import im_v2.common.data.client.im_raw_data_client as imvcdcimrdc
import copy

In [93]:
! sudo /venv/bin/pip install sortedcontainers



In [94]:
from sortedcontainers import SortedDict

In [95]:
hdbg.init_logger(verbosity=logging.INFO)
log_level = logging.INFO

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

INFO  # Git
  branch_name='CmampTask8666_Collect_1_day_of_orderbook_data_using_our_script_maintain_local_orderbook_copy.py'
  hash='4a5277f28'
  # Last commits:
    * 4a5277f28 jsmerix  Bug fix                                                           (   3 hours ago) Thu Jun 20 16:45:45 2024  (HEAD -> CmampTask8666_Collect_1_day_of_orderbook_data_using_our_script_maintain_local_orderbook_copy.py, origin/CmampTask8666_Collect_1_day_of_orderbook_data_using_our_script_maintain_local_orderbook_copy.py)
    * d92508cbf jsmerix  Add functionality to store snapshots to a file                    (  30 hours ago) Wed Jun 19 14:05:47 2024           
    * 68fead01d Neha Pathak CmTask8626 Smoke tests for get_ccxt_total_balance.py script. (#8633) (  31 hours ago) Wed Jun 19 12:50:36 2024  (origin/master, origin/HEAD, master)
# Machine info
  system=Linux
  node name=3bfe3a23d5d2
  release=5.15.0-1058-aws
  version=#64~20.04.1-Ubuntu SMP Tue Apr 9 11:12:27 UTC 2024
  machine=x86_64
  processor=x86

## Load Binance Native historical bid/ask data

Refer to `im_v2/binance/notebooks/CmampTask8259_Analysis_of_historical_bidask_data.ipynb` for more details

In [96]:
symbol = "BTCUSDT"

In [97]:
snap_csv = pd.read_csv(f"/app/{symbol}_T_DEPTH_2024-05-06_depth_snap.csv")
update_csv = pd.read_csv(f"/app/{symbol}_T_DEPTH_2024-05-06_depth_update.csv")

In [98]:
snap_csv.shape

(105738, 9)

In [99]:
snap_csv.head()

Unnamed: 0,symbol,timestamp,trans_id,first_update_id,last_update_id,side,update_type,price,qty
0,BTCUSDT,1714953565827,1714643037593395250,4556869216769,4556869216769,a,snap,66583.4,0.002
1,BTCUSDT,1714953565827,1714643037593395250,4556869216769,4556869216769,a,snap,66592.3,0.002
2,BTCUSDT,1714953565827,1714643037593395250,4556869216769,4556869216769,a,snap,66729.4,0.002
3,BTCUSDT,1714953565827,1714643037593395250,4556869216769,4556869216769,a,snap,66791.4,0.002
4,BTCUSDT,1714953565827,1714643037593395250,4556869216769,4556869216769,a,snap,66869.2,0.002


Confirm the snapshot contains unique timestamp

In [100]:
snap_csv.timestamp.nunique()

1

In [101]:
start_timestamp = pd.to_datetime(snap_csv.timestamp.min(), unit="ms")
start_timestamp

Timestamp('2024-05-05 23:59:25.827000')

### Reconstruct order book from snapshot DataFrame

In [141]:
bids = sorted(snap_csv[snap_csv["side"] == "b"][["price", "qty"]].values.tolist(), reverse=True)

In [142]:
asks = sorted(snap_csv[snap_csv["side"] == "a"][["price", "qty"]].values.tolist())

In [144]:
_ORDER_BOOK_SNAPSHOT = { 
    "timestamp": 1714953565827,
    # We only keep top 250 levels
    # Bids will need to be accessed in reverse
    "b": SortedDict(lambda x: -x, {float(price): qty for price, qty in bids[:250]}),
    "a": SortedDict({price: qty for price, qty in asks[:250]})
}

In [106]:
pd.to_datetime(snap_csv.timestamp.max(), unit="ms")

Timestamp('2024-05-05 23:59:25.827000')

In [107]:
update_csv.shape

(712296, 9)

In [108]:
update_csv.head()

Unnamed: 0,symbol,timestamp,trans_id,first_update_id,last_update_id,side,update_type,price,qty
0,BTCUSDT,1714953565827,1714953565827866380,4556869216794,4556869216794,b,set,63930.7,0.444
1,BTCUSDT,1714953565831,1714953565831574540,4556869216931,4556869216931,a,set,64009.1,0.236
2,BTCUSDT,1714953565833,1714953565833602752,4556869217011,4556869217011,b,set,63971.1,2.744
3,BTCUSDT,1714953565834,1714953565834150447,4556869217023,4556869217023,b,set,63968.0,0.118
4,BTCUSDT,1714953565835,1714953565835679966,4556869217064,4556869217064,a,set,64002.4,1.047


In [109]:
update_csv["update_type"].unique()

array(['set', 's'], dtype=object)

In [110]:
pd.to_datetime(update_csv.timestamp.min(), unit="ms")

Timestamp('2024-05-05 23:59:25.827000')

In [111]:
end_timestamp = pd.to_datetime(update_csv.timestamp.max(), unit="ms")
end_timestamp

Timestamp('2024-05-06 00:06:35.690000')

## Load our data

We will use archived data from our realtime websocket collection script
- each timestamp contain information about top of the book

In [112]:
signature = "periodic_daily.airflow.archived_200ms.parquet.bid_ask.futures.v8.binance.binance.v1_0_0"
reader = imvcdcimrdc.RawDataReader(signature, stage="preprod", add_suffix="tokyo")
data = reader.read_data(
    start_timestamp, end_timestamp, currency_pairs=["BTC_USDT"]
)
_LOG.log(log_level, hpandas.df_to_str(data, log_level=log_level))

INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/amp/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price,exchange_id,level,end_download_timestamp,knowledge_timestamp,currency_pair,year,month,day
2024-05-05 23:59:26.686000+00:00,1714953566686,5.303,63983.9,0.564,63984.0,binance,1,2024-05-05 23:59:26.728295+00:00,2024-05-05 23:59:26.743708+00:00,BTC_USDT,2024,5,5
2024-05-05 23:59:26.838000+00:00,1714953566838,5.228,63983.9,0.564,63984.0,binance,1,2024-05-05 23:59:26.930693+00:00,2024-05-05 23:59:26.945667+00:00,BTC_USDT,2024,5,5
2024-05-05 23:59:27.104000+00:00,1714953567104,1.055,63983.9,4.873,63984.0,binance,1,2024-05-05 23:59:27.132223+00:00,2024-05-05 23:59:27.148182+00:00,BTC_USDT,2024,5,5
,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-06 00:06:35.506000+00:00,1714953995506,1.457,64044.6,7.715,64044.7,binance,1,2024-05-06 00:06:35.548971+00:00,2024-05-06 00:06:35.590357+00:00,BTC_USDT,2024,5,6
2024-05-06 00:06:35.204000+00:00,1714953995204,3.027,64050.0,1.515,64050.1,binance,1,2024-05-06 00:06:35.311070+00:00,2024-05-06 00:06:35.344582+00:00,BTC_USDT,2024,5,6
2024-05-06 00:06:35.608000+00:00,1714953995608,0.081,64044.1,13.408,64044.2,binance,1,2024-05-06 00:06:35.715801+00:00,2024-05-06 00:06:35.741555+00:00,BTC_USDT,2024,5,6


INFO  None


In [113]:
data.shape

(2455, 13)

In [114]:
archived_200ms_data = data

## Transform tick-by-tick data to snapshots

Confirm all timestamp from snapshost are present in tick by tick data

In [115]:
len(set(archived_200ms_data.timestamp))

2455

In [116]:
len(set(update_csv.timestamp))

176567

In [117]:
set(archived_200ms_data.timestamp) - set(update_csv.timestamp)

set()

In [146]:
def transform_tick_by_tick_to_snapshot(tick_by_tick_df: pd.DataFrame, snapshot_df: pd.DataFrame) -> pd.DataFrame:
    """
    TODO(Juraj): Add docstring
    """
    reconstructed_df = []
    
    for i, row in tick_by_tick_df.iterrows():
        curr_side = row["side"]
        if row["price"] in _ORDER_BOOK_SNAPSHOT[curr_side]:
            if row["qty"] == 0:
                _ORDER_BOOK_SNAPSHOT[curr_side].pop(row["price"])
            else:
                _ORDER_BOOK_SNAPSHOT[curr_side][row["price"]] = row["qty"]
        else:
            if row["qty"] != 0:
                _ORDER_BOOK_SNAPSHOT[curr_side][row["price"]] = row["qty"]
                if len(_ORDER_BOOK_SNAPSHOT[curr_side]) > 250:
                    _ORDER_BOOK_SNAPSHOT[curr_side].popitem(-1)
                
        if row["timestamp"] in set(snapshot_df.timestamp):
            reconstructed_df.append({
                'timestamp': row["timestamp"],
                'bid_size': _ORDER_BOOK_SNAPSHOT["b"].peekitem(0)[1],
                'bid_price': _ORDER_BOOK_SNAPSHOT["b"].peekitem(0)[0],
                'ask_size': _ORDER_BOOK_SNAPSHOT["a"].peekitem(0)[1],
                'ask_price': _ORDER_BOOK_SNAPSHOT["a"].peekitem(0)[0]
            })
    return pd.DataFrame(reconstructed_df)

In [None]:
transformed_df = transform_tick_by_tick_to_snapshot(update_csv, archived_200ms_data)

In [148]:
transformed_df.to_csv(f"/app/{symbol}_T_DEPTH_2024-05-06_transformed.csv")

In [150]:
transformed_df.head()

Unnamed: 0,timestamp,bid_size,bid_price,ask_size,ask_price
0,1714953565849,5.14,63983.9,1.115,63984.0
1,1714953565849,5.14,63983.9,1.115,63984.0
2,1714953565849,5.14,63983.9,1.115,63984.0
3,1714953566108,5.213,63983.9,1.086,63984.0
4,1714953566218,5.279,63983.9,1.086,63984.0


In [152]:
# Timestamp is also in column, will be easier to merge with the other dataset
archived_200ms_data = archived_200ms_data.reset_index(drop=True)

## Compare the data

In [155]:
# Merge the reconstructed data with the third dataset based on timestamp
merged_df = pd.merge(transformed_df, archived_200ms_data, on='timestamp', suffixes=('_reconstructed', '_rt_archived'))

# Calculate the deviation percentage for each column
for column in ['bid_size', 'bid_price', 'ask_size', 'ask_price']:
    merged_df[f'{column}_deviation'] = abs(merged_df[f'{column}_reconstructed'] - merged_df[f'{column}_rt_archived']) / merged_df[f'{column}_rt_archived'] * 100


In [156]:
merged_df.head()

Unnamed: 0,timestamp,bid_size_reconstructed,bid_price_reconstructed,ask_size_reconstructed,ask_price_reconstructed,bid_size_rt_archived,bid_price_rt_archived,ask_size_rt_archived,ask_price_rt_archived,exchange_id,level,end_download_timestamp,knowledge_timestamp,currency_pair,year,month,day,bid_size_deviation,bid_price_deviation,ask_size_deviation,ask_price_deviation
0,1714953565849,5.14,63983.9,1.115,63984.0,5.14,63983.9,1.115,63984.0,binance,1,2024-05-05 23:59:25.923611+00:00,2024-05-05 23:59:25.938259+00:00,BTC_USDT,2024,5,5,0.0,0.0,0.0,0.0
1,1714953565849,5.14,63983.9,1.115,63984.0,5.14,63983.9,1.115,63984.0,binance,1,2024-05-05 23:59:25.923611+00:00,2024-05-05 23:59:25.938259+00:00,BTC_USDT,2024,5,5,0.0,0.0,0.0,0.0
2,1714953565849,5.14,63983.9,1.115,63984.0,5.14,63983.9,1.115,63984.0,binance,1,2024-05-05 23:59:25.923611+00:00,2024-05-05 23:59:25.938259+00:00,BTC_USDT,2024,5,5,0.0,0.0,0.0,0.0
3,1714953566108,5.213,63983.9,1.086,63984.0,5.213,63983.9,1.086,63984.0,binance,1,2024-05-05 23:59:26.120215+00:00,2024-05-05 23:59:26.142422+00:00,BTC_USDT,2024,5,5,0.0,0.0,0.0,0.0
4,1714953566218,5.279,63983.9,1.086,63984.0,5.279,63983.9,1.086,63984.0,binance,1,2024-05-05 23:59:26.325117+00:00,2024-05-05 23:59:26.341712+00:00,BTC_USDT,2024,5,5,0.0,0.0,0.0,0.0


In [159]:
merged_df[["bid_size_deviation", "bid_price_deviation", "ask_size_deviation", "ask_price_deviation"]].describe()

Unnamed: 0,bid_size_deviation,bid_price_deviation,ask_size_deviation,ask_price_deviation
count,7023.0,7023.0,7023.0,7023.0
mean,22.42008,0.000125,15.287032,0.000114
std,277.873941,0.000951,216.252026,0.000977
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,6450.0,0.009989,4450.0,0.015764


The conclusion is that we are able to match the orderbook snapshost to the tick by tick history reasonably well.