# Feature Investigation
Analyze orderbook features and model behavior using saved tick data.

In [None]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress

# Apply seaborn's "whitegrid" theme to the plots drawn below.
sns.set_style("whitegrid")
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline

## 1. Load Tick Data

In [None]:
# Locate every saved tick file; sorting the paths makes the load order deterministic.
tick_files = sorted(glob.glob("data/ticks/ticks_*.parquet"))
print(f"Found {len(tick_files)} tick files")

if not tick_files:
    raise FileNotFoundError("No tick data found. Run pipeline.py first to collect data.")

# Stack all files into one frame and order events by exchange timestamp.
# A stable sort (mergesort) keeps events that share a timestamp in their
# original file/row order; the default quicksort may reorder such ties, which
# would perturb the tick-count windows and shifts used later in the notebook.
# The index is then rebuilt so it runs 0..n-1 in time order.
df_all = (
    pd.concat((pd.read_parquet(path) for path in tick_files), ignore_index=True)
    .sort_values("ts_exchange", kind="mergesort")
    .reset_index(drop=True)
)

print(f"Total events: {len(df_all):,}")
print(f"Markets: {df_all['market_id'].nunique()}")
print(f"Event types: {df_all['type'].value_counts().to_dict()}")

## 2. Extract Features from Quote Data

In [None]:
# Keep only QUOTE events; other event types are not used for these features.
df_quotes = df_all[df_all["type"] == "QUOTE"].copy()

# Pick the market with the most quotes (value_counts sorts in descending order).
market_counts = df_quotes["market_id"].value_counts()
if market_counts.empty:
    # Fail with a clear message instead of an IndexError on index[0].
    raise ValueError("No QUOTE events found in tick data.")
target_market = market_counts.index[0]
print(f"Analyzing market: {target_market} ({market_counts.iloc[0]:,} quotes)")

# Explicit copy: feature columns are added to this frame below.
df = df_quotes[df_quotes["market_id"] == target_market].copy()

# Convert prices to probabilities.
# NOTE(review): assumes prices are quoted on a 0-100 scale and that the best
# NO price implies the YES ask as (100 - no_price) - confirm against the feed.
df["best_bid_prob"] = df["best_yes_price"] / 100.0
df["best_ask_prob"] = (100 - df["best_no_price"]) / 100.0

df["midpoint"] = 0.5 * (df["best_bid_prob"] + df["best_ask_prob"])
df["spread"] = df["best_ask_prob"] - df["best_bid_prob"]

# Depth imbalance in [-1, 1]; the small epsilon avoids 0/0 when both sizes are zero.
total_size = df["best_yes_size"] + df["best_no_size"]
df["depth_imb"] = (df["best_yes_size"] - df["best_no_size"]) / (total_size + 1e-9)

# Drop unusable quotes: crossed or locked books (spread <= 0) and midpoints
# outside the open interval (0, 1). Rows with NaN spread/midpoint fail these
# comparisons and are dropped as well.
is_valid = (df["spread"] > 0) & (df["midpoint"] > 0) & (df["midpoint"] < 1)
# Copy the filtered rows so that columns added to `df` in later cells are
# written to an independent frame (avoids pandas' SettingWithCopyWarning).
df = df[is_valid].copy()

print(f"\nValid quotes: {len(df):,}")
df[["midpoint", "spread", "depth_imb", "best_yes_size", "best_no_size"]].head()

## 3. Feature Health Check

In [None]:
# 2x2 dashboard: two marginal distributions on top, joint views underneath.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax_spread, ax_imb = axes[0]
ax_scatter, ax_corr = axes[1]

# Top-left: how wide the book usually is.
sns.histplot(df["spread"], ax=ax_spread, bins=30, kde=True, color="orange")
ax_spread.set(title="Spread Distribution", xlabel="Spread (prob)")

# Top-right: how lopsided the top-of-book sizes are.
sns.histplot(df["depth_imb"], ax=ax_imb, bins=30, kde=True, color="teal")
ax_imb.set(
    title="Depth Imbalance Distribution",
    xlabel="-1 (Sell Heavy) ... +1 (Buy Heavy)",
)

# Bottom-left: spread against imbalance on a random subsample of at most
# 2000 rows, which keeps the scatter readable.
n_points = min(2000, len(df))
scatter_sample = df.sample(n_points)
sns.scatterplot(
    data=scatter_sample,
    x="spread",
    y="depth_imb",
    alpha=0.3,
    ax=ax_scatter,
)
ax_scatter.set(title="Spread vs Depth Imbalance")

# Bottom-right: pairwise correlation of the three features.
feature_cols = ["midpoint", "spread", "depth_imb"]
corr = df[feature_cols].corr()
sns.heatmap(corr, ax=ax_corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
ax_corr.set(title="Feature Correlation")

plt.tight_layout()
plt.show()

## 4. Calibrate Uncertainty (Tau)
Regress realized volatility (forward-looking standard deviation of the midpoint) on the current spread to fit a linear mapping from spread to tau.

In [None]:
# Realized volatility: rolling std of the midpoint over a forward-looking
# window of WINDOW rows (positions are tick counts, not clock time).
WINDOW = 20
forward_window = pd.api.indexers.FixedForwardWindowIndexer(window_size=WINDOW)
df["realized_vol"] = df["midpoint"].rolling(window=forward_window).std()

# Keep rows that have both a spread and a realized-vol value, and a
# strictly positive spread.
valid = df.dropna(subset=["spread", "realized_vol"]).loc[lambda d: d["spread"] > 0]

if valid["spread"].nunique() >= 2:
    # Ordinary least squares: realized_vol ~ slope * spread + intercept.
    fit = linregress(valid["spread"], valid["realized_vol"])
    slope, intercept, r_value = fit.slope, fit.intercept, fit.rvalue

    print(f"\n--- TAU CALIBRATION ---")
    print(f"Correlation: {r_value:.4f}")
    print(f"Formula: tau = {slope:.4f} * spread + {intercept:.4f}")

    # Scatter a random subsample (at most 3000 points) with the fitted line
    # drawn across the subsample's spread range.
    sample = valid.sample(min(3000, len(valid)))
    x_vals = np.linspace(sample["spread"].min(), sample["spread"].max(), 100)
    fitted = slope * x_vals + intercept

    plt.figure(figsize=(10, 6))
    plt.scatter(
        sample["spread"],
        sample["realized_vol"],
        alpha=0.2,
        color="gray",
        label="Data",
    )
    plt.plot(x_vals, fitted, "r-", linewidth=3, label="Calibrated Fit")

    plt.xlabel("Current Spread")
    plt.ylabel(f"Realized Volatility ({WINDOW} ticks)")
    plt.title("Calibrating Uncertainty Parameter (Tau)")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.show()
else:
    # A regression needs at least two distinct spread values.
    print("⚠️  Spread is constant - cannot calibrate tau. Market may be illiquid.")

## 5. Price Dynamics
Analyze how depth imbalance relates to future midpoint changes at 10- and 50-tick horizons.

In [None]:
# Forward midpoint change over each horizon, measured in ticks (rows).
horizons = (10, 50)
for horizon in horizons:
    df[f"future_ret_{horizon}"] = df["midpoint"].shift(-horizon) - df["midpoint"]

# One panel per horizon: future return grouped by depth-imbalance bucket.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, horizon in zip(axes, horizons):
    ret_col = f"future_ret_{horizon}"
    usable = df.dropna(subset=["depth_imb", ret_col])
    if usable.empty:
        # Nothing to plot for this horizon; leave the panel blank.
        continue

    # Split depth imbalance into five equal-width buckets.
    imb_bucket = pd.cut(usable["depth_imb"], bins=5)
    sns.boxplot(x=imb_bucket, y=usable[ret_col], ax=ax)
    ax.set_title(f"Depth Imbalance → Future Return ({horizon} ticks)")
    ax.set_xlabel("Depth Imbalance")
    ax.set_ylabel("Future Return")
    ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()

## 6. Summary Statistics

In [None]:
# Pull the two series once so the report lines below stay short.
spread_series = df["spread"]
imbalance_series = df["depth_imb"]

# Descriptive statistics for the three engineered features.
print("\n=== FEATURE SUMMARY ===")
print(df[["midpoint", "spread", "depth_imb"]].describe())

# Headline book-quality numbers: spread level/dispersion and imbalance skew.
print("\n=== BOOK QUALITY ===")
print(f"Mean spread: {spread_series.mean():.4f}")
print(f"Median spread: {spread_series.median():.4f}")
print(f"Spread std: {spread_series.std():.4f}")
print(f"\nMean depth imbalance: {imbalance_series.mean():.4f}")
# Share of quotes whose imbalance magnitude exceeds 0.5.
strongly_imbalanced = imbalance_series.abs() > 0.5
print(f"Abs depth imbalance > 0.5: {strongly_imbalanced.mean():.1%}")