In [20]:
import os
import pickle as pkl

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [22]:
# Location of the pickled DataFrame. The default is the original local path;
# set the BLOCKHOUSE_DF_PATH environment variable to run the notebook on
# another machine without editing this cell.
RAW_DF_PATH = os.environ.get(
    "BLOCKHOUSE_DF_PATH",
    "/Users/jakesharadin/Desktop/blockhouse/df.pkl",
)

# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
df = pd.read_pickle(RAW_DF_PATH)

In [23]:
# Preview the first five rows to check that the frame loaded as expected.
df.head()

Unnamed: 0,ts_event,rtype,publisher_id,instrument_id,action,side,depth,price,size,flags,...,bid_ct_08,ask_ct_08,bid_px_09,ask_px_09,bid_sz_09,ask_sz_09,bid_ct_09,ask_ct_09,symbol,date
0,2024-12-02 09:00:00.006056124+00:00,10,2,12463,A,N,0,164.97,100,130,...,0,0,,,0,0,0,0,PEP,2024-12-02
1,2024-12-02 09:00:00.009936778+00:00,10,2,17709,A,N,0,117.18,100,130,...,0,0,,,0,0,0,0,XOM,2024-12-02
2,2024-12-02 09:00:00.009975632+00:00,10,2,17709,A,A,0,118.62,100,128,...,0,0,,,0,0,0,0,XOM,2024-12-02
3,2024-12-02 09:00:00.073883926+00:00,10,2,16244,A,N,0,960.0,25,128,...,0,0,,,0,0,0,0,TSLA,2024-12-02
4,2024-12-02 09:00:00.074013500+00:00,10,2,11667,A,N,0,132.11,500,130,...,0,0,,,0,0,0,0,NVDA,2024-12-02


In [24]:
df.shape # 28,420,218 rows and 74 columns. Data from the week of 12/02/2024 - 12/06/2024

(28420218, 74)

In [7]:
# List the column labels available in df.
df.columns

Index(['ts_event', 'rtype', 'publisher_id', 'instrument_id', 'action', 'side',
       'depth', 'price', 'size', 'flags', 'ts_in_delta', 'sequence',
       'bid_px_00', 'ask_px_00', 'bid_sz_00', 'ask_sz_00', 'bid_ct_00',
       'ask_ct_00', 'bid_px_01', 'ask_px_01', 'bid_sz_01', 'ask_sz_01',
       'bid_ct_01', 'ask_ct_01', 'bid_px_02', 'ask_px_02', 'bid_sz_02',
       'ask_sz_02', 'bid_ct_02', 'ask_ct_02', 'bid_px_03', 'ask_px_03',
       'bid_sz_03', 'ask_sz_03', 'bid_ct_03', 'ask_ct_03', 'bid_px_04',
       'ask_px_04', 'bid_sz_04', 'ask_sz_04', 'bid_ct_04', 'ask_ct_04',
       'bid_px_05', 'ask_px_05', 'bid_sz_05', 'ask_sz_05', 'bid_ct_05',
       'ask_ct_05', 'bid_px_06', 'ask_px_06', 'bid_sz_06', 'ask_sz_06',
       'bid_ct_06', 'ask_ct_06', 'bid_px_07', 'ask_px_07', 'bid_sz_07',
       'ask_sz_07', 'bid_ct_07', 'ask_ct_07', 'bid_px_08', 'ask_px_08',
       'bid_sz_08', 'ask_sz_08', 'bid_ct_08', 'ask_ct_08', 'bid_px_09',
       'ask_px_09', 'bid_sz_09', 'ask_sz_09', 'bid_ct_09', '

In [31]:
# Link to MBP-10 data schema variable description
# (https://databento.com/docs/schemas-and-data-formats/mbp-10?historical=python&live=python&reference=python)

In [32]:
# Variables of interest

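# ts_event: the matching-engine-received timestamp expressed as the number of nanoseconds since the UNIX epoch (shown in this DataFrame as a UTC datetime, e.g. 2024-12-02 09:00:00.006056124+00:00). ts_recv is a different field in the schema (the capture-server-received timestamp) and is not one of this DataFrame's columns.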
# action: the event action. Can be Add, Cancel, Modify, cleaR book, Trade, or Fill. See Action.
# side: the side that initiates the event. Can be Ask for the sell aggressor in a trade, Bid for the buy aggressor in a trade, or None where no side is specified by the original trade or the record was not a trade.
# depth: the book level where the update event occurred.
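# price: the order price. In the raw fixed-point encoding every 1 unit corresponds to 1e-9, i.e. 1/1,000,000,000 or 0.000000001; the prices in this DataFrame are already decimal values (e.g. 164.97), so no rescaling is needed.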
# size: the order quantity.
# bid_px_N: the bid price at level N (top level if N = 00).
# ask_px_N: the ask price at level N (top level if N = 00).
# bid_sz_N: the bid size at level N (top level if N = 00).
# ask_sz_N: the ask size at level N (top level if N = 00).

In [25]:
# Event-level fields first, then price and size for both sides of the five
# best book levels (00-04), ordered level by level as
# bid_px, ask_px, bid_sz, ask_sz.
columns_of_interest = [
    'symbol', 'ts_event', 'action', 'side', 'depth', 'price', 'size',
    *(
        f'{book_side}_{field}_{level:02d}'
        for level in range(5)
        for field in ('px', 'sz')
        for book_side in ('bid', 'ask')
    ),
]

# Keep only the columns of interest; everything else is dropped from df.
df = df.loc[:, columns_of_interest]
df.head()

Unnamed: 0,symbol,ts_event,action,side,depth,price,size,bid_px_00,ask_px_00,bid_sz_00,...,bid_sz_02,ask_sz_02,bid_px_03,ask_px_03,bid_sz_03,ask_sz_03,bid_px_04,ask_px_04,bid_sz_04,ask_sz_04
0,PEP,2024-12-02 09:00:00.006056124+00:00,A,N,0,164.97,100,,164.97,0,...,0,0,,,0,0,,,0,0
1,XOM,2024-12-02 09:00:00.009936778+00:00,A,N,0,117.18,100,117.18,,100,...,0,0,,,0,0,,,0,0
2,XOM,2024-12-02 09:00:00.009975632+00:00,A,A,0,118.62,100,117.18,118.62,100,...,0,0,,,0,0,,,0,0
3,TSLA,2024-12-02 09:00:00.073883926+00:00,A,N,0,960.0,25,,960.0,0,...,0,0,,,0,0,,,0,0
4,NVDA,2024-12-02 09:00:00.074013500+00:00,A,N,0,132.11,500,132.11,,500,...,0,0,,,0,0,,,0,0


In [40]:
# Count the records for each action code (value_counts omits missing values by default).
# NOTE(review): the saved counts below total 6,555,034, far fewer than the
# 28,420,218 rows reported by df.shape -- the outputs were probably produced on
# different data or in a different session; re-run the notebook top to bottom to confirm.
df["action"].value_counts()

action
A    3135234
C    2819291
T     600508
F          1
Name: count, dtype: int64

In [41]:
# Action labels

# A (add): insert a new order into the book
# C (cancel): fully or partially cancel an order from the book.
# T (trade): an aggressing order traded. Does not affect the book.
# F (fill): a resting order was filled. Does not affect the book. 

In [42]:
# Count the records for each side code (value_counts omits missing values by default).
df["side"].value_counts()

side
A    3190794
B    3182841
N     181399
Name: count, dtype: int64

In [43]:
# Side labels: The side that initiates the event

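# A (ask): a sell order, or the sell aggressor in a trade
# B (bid): a buy order, or the buy aggressor in a trade
# N (none): no side is specified for the record
# Note: A/B are not limited to trades. The counts above show about 6.37M A/B records but only about 0.6M trades (action T), so add and cancel records also carry a side (presumably the side of the order they affect).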

In [26]:
# Save df to a pickle file
with open('df.pkl', 'wb') as f:
    pkl.dump(df, f)