#### **LOB Factors**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from datetime import datetime
import glob 
from pathlib import Path
import csv

plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.figsize'] = (8,5) 

%config InlineBackend.figure_format = 'svg'

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')  

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display
pd.set_option('expand_frame_repr', False)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.width', 180)


parquet_dir = Path("/Users/roddy/ORIE5640P2/lob_preprocessed_parquet")
files = sorted(glob.glob(str(parquet_dir/"part.*.parquet")))


dfs = []
for f in files:
    print("Loading", f)
    df = pd.read_parquet(f)
    dfs.append(df)
lob_pre = pd.concat(dfs, ignore_index=True)
print("Combined shape:", lob_pre.shape)


lob_pre["time_of_day_s"] = (
    lob_pre["timestamp"].dt.hour   * 3600
  + lob_pre["timestamp"].dt.minute *   60
  + lob_pre["timestamp"].dt.second
)

lob_pre["time_str"] = lob_pre["timestamp"].dt.strftime("%H:%M:%S")


Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.0.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.1.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.10.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.11.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.12.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.13.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.14.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.15.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.16.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.17.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.18.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.19.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parquet/part.2.parquet
Loading /Users/roddy/ORIE5640P2/lob_preprocessed_parqu

In [2]:
lob_pre

Unnamed: 0,timestamp,Price,Quantity,BidOrAsk,OrderbookPosition,QuantityDifference,time_of_day_s,time_str
0,2024-12-28 00:00:00.425,94256.8,0.001,Ask,0,1.748,0,00:00:00
1,2024-12-28 00:00:00.924,94251.1,0.810,Ask,0,-0.799,0,00:00:00
2,2024-12-28 00:00:01.424,94251.1,0.713,Ask,0,-0.712,1,00:00:01
3,2024-12-28 00:00:01.924,94251.1,0.864,Ask,0,-0.863,1,00:00:01
4,2024-12-28 00:00:02.424,94251.1,0.864,Ask,0,-0.863,2,00:00:02
...,...,...,...,...,...,...,...,...
85174495,2024-12-02 23:59:57.524,95881.6,0.001,Bid,9,-0.049,86397,23:59:57
85174496,2024-12-02 23:59:57.924,95881.6,0.001,Bid,9,-0.049,86397,23:59:57
85174497,2024-12-02 23:59:58.464,95881.6,0.001,Bid,9,-0.049,86398,23:59:58
85174498,2024-12-02 23:59:59.084,95881.6,0.001,Bid,9,-0.049,86399,23:59:59


In [None]:
qty_wide = lob_pre.pivot_table(
    index="timestamp",
    columns=["BidOrAsk","OrderbookPosition"],
    values="Quantity",
    aggfunc="first"   
).fillna(0)


best = pd.DataFrame(index=qty_wide.index)
for side in ["Bid","Ask"]:
    for lvl, w in zip([0,1,2], [0.6,0.3,0.1]):
        best[f"{side}{lvl+1}_qty"] = qty_wide[(side, lvl)]
        

best["WQ_B"] = sum(w*best[f"Bid{i+1}_qty"] for i,w in enumerate([0.6,0.3,0.1]))
best["WQ_A"] = sum(w*best[f"Ask{i+1}_qty"] for i,w in enumerate([0.6,0.3,0.1]))
best["DepthRatio"] = best["WQ_A"] / best["WQ_B"]
best["OBI"]        = (best["WQ_B"] - best["WQ_A"]) / (best["WQ_B"] + best["WQ_A"])

price_wide = lob_pre.pivot_table(
    index="timestamp",
    columns=["BidOrAsk","OrderbookPosition"],
    values="Price",
    aggfunc="first"
).fillna(method="ffill") 

best["Ask0_price"] = price_wide[("Ask",0)]
best["Bid0_price"] = price_wide[("Bid",0)]
best["spread"]     = best["Ask0_price"] - best["Bid0_price"]
best["midprice"]   = 0.5*(best["Ask0_price"] + best["Bid0_price"])


# online_normalization
def online_normalize(df, cols, lookback):
    roll_mean = df[cols].rolling(window=lookback, min_periods=1).mean()
    roll_std  = df[cols].rolling(window=lookback, min_periods=1).std().replace(0,1)
    df_norm = (df[cols] - roll_mean) / roll_std
    return df_norm

feature_cols = [
    "Bid1_qty","Bid2_qty","Bid3_qty",
    "Ask1_qty","Ask2_qty","Ask3_qty",
    "DepthRatio","OBI",
    "spread","midprice"
]

L = 300  # window 
best_norm = best.copy()
best_norm[feature_cols] = online_normalize(best, feature_cols, lookback=L)

print(best_norm[feature_cols].head(10).round(3))
print(best_norm[feature_cols].rolling(window=L, min_periods=L).mean().dropna().head())
print(best_norm[feature_cols].rolling(window=L, min_periods=L).std().dropna().head())

                         Bid1_qty  Bid2_qty  Bid3_qty  Ask1_qty  Ask2_qty  Ask3_qty  DepthRatio    OBI  spread  midprice
timestamp                                                                                                               
2024-11-27 14:26:09.185       NaN       NaN       NaN       NaN       NaN       NaN         NaN    NaN     NaN       NaN
2024-11-27 14:26:09.684     0.707     0.707    -0.707    -0.707    -0.707     0.707      -0.707  0.707  -0.707    -0.707
2024-11-27 14:26:10.205     0.655    -1.154    -0.577    -0.902    -0.577     0.381      -0.589  0.673  -0.577    -0.577
2024-11-27 14:26:10.704     0.960    -0.845    -0.500     0.813    -0.404    -0.934      -0.507  0.560   0.061    -1.141
2024-11-27 14:26:11.204     1.054    -0.715    -0.447    -1.193    -0.365    -0.777      -0.485  0.811   0.056    -0.908
2024-11-27 14:26:11.724     0.780    -0.633    -0.408    -0.873     1.057    -0.706      -0.437  0.664  -0.875    -1.052
2024-11-27 14:26:12.185     0.70

In [4]:
best_norm

Unnamed: 0_level_0,Bid1_qty,Bid2_qty,Bid3_qty,Ask1_qty,Ask2_qty,Ask3_qty,WQ_B,WQ_A,DepthRatio,OBI,Ask0_price,Bid0_price,spread,midprice
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2024-11-27 14:26:09.185,,,,,,,0.0726,0.7217,,,94525.3,94510.6,,
2024-11-27 14:26:09.684,0.707107,0.707107,-0.707107,-0.707107,-0.707107,0.707107,2.2054,0.6844,-0.707107,0.707107,94512.2,94508.0,-0.707107,-0.707107
2024-11-27 14:26:10.205,0.654711,-1.154449,-0.577350,-0.901667,-0.577350,0.381299,2.3992,0.4407,-0.588627,0.673276,94512.2,94508.0,-0.577350,-0.577350
2024-11-27 14:26:10.704,0.960083,-0.844652,-0.500000,0.813274,-0.403847,-0.933895,3.3193,0.6401,-0.507401,0.560313,94508.1,94500.0,0.060560,-1.140673
2024-11-27 14:26:11.204,1.054074,-0.715184,-0.447214,-1.193039,-0.365201,-0.777053,3.9457,0.0281,-0.485153,0.811100,94508.1,94500.0,0.055915,-0.907907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-02 23:59:57.804,-0.284899,0.238093,-0.445291,-1.179328,-0.288414,-0.205605,0.0497,0.0090,-0.687406,1.753243,96961.2,96951.2,1.945732,2.254213
2025-01-02 23:59:58.324,-0.262386,-0.040339,-0.231067,-1.179302,-0.288130,-0.205550,0.0511,0.0090,-0.687406,1.751846,96961.2,96952.0,1.528174,2.309058
2025-01-02 23:59:58.804,-0.262800,-0.311024,-0.334743,-1.185094,-0.299145,-0.209001,0.0354,0.0040,-0.687796,1.876015,96962.2,96952.0,2.021643,2.380279
2025-01-02 23:59:59.324,-0.192360,-0.032660,-0.346241,-1.185021,-0.298820,-0.208934,0.0659,0.0040,-0.688112,1.985794,96962.2,96954.4,0.802981,2.571506


In [6]:
best_norm.to_parquet("best_norm.parquet", compression="snappy")
print("Saved best_fill.parquet")

Saved best_fill.parquet


In [None]:
nan_counts = best_norm.isna().sum()
print("NaN per column:\n", nan_counts[nan_counts>0])

best_fill = best_norm.copy()
best_fill = best_fill.ffill()   
best_fill = best_fill.bfill()   

print("Remaining NaNs:", best_fill.isna().sum().sum())


NaN per column:
 Bid1_qty      1
Bid2_qty      1
Bid3_qty      1
Ask1_qty      1
Ask2_qty      1
Ask3_qty      1
DepthRatio    1
OBI           1
spread        1
midprice      1
dtype: int64
Remaining NaNs: 0
