# Model Evaluation
Evaluate trained models on held-out tick data.

In [None]:
import sys
import os
import glob
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader, TensorDataset

# Add the current working directory to the import path so that `models`
# resolves (presumably a project-local module - confirm the notebook is
# launched from the directory that contains it).
sys.path.append(os.getcwd())
from models import MarketAgent, make_default_grid

# Global seaborn plot style, plus the IPython magic that renders
# matplotlib figures inline in the notebook output.
sns.set_style("whitegrid")
%matplotlib inline

## 1. Load Tick Data and Extract Features

In [None]:
# Configuration: directories for tick data, model checkpoints and
# normalization artifacts (the latter two are passed to agent.load later).
DATA_DIR = "data/ticks"
MODEL_DIR = "models_ckpts"
NORM_DIR = "normalization"

# Load tick data; sorted() makes the file order deterministic.
tick_files = sorted(glob.glob(os.path.join(DATA_DIR, "ticks_*.parquet")))
if not tick_files:
    raise FileNotFoundError(f"No tick files found in {DATA_DIR}")

print(f"Loading {len(tick_files)} tick files...")
dfs = [pd.read_parquet(f) for f in tick_files]
df_all = pd.concat(dfs, ignore_index=True).sort_values("ts_exchange")

# Select a market with sufficient data: value_counts() sorts descending,
# so the first index entry is the market with the most quote events.
df_quotes = df_all[df_all["type"] == "QUOTE"].copy()
market_counts = df_quotes["market_id"].value_counts()
if market_counts.empty:
    # Without this guard, index[0] below fails with an opaque IndexError.
    raise ValueError(f"No QUOTE events found in tick files under {DATA_DIR}")
MARKET_ID = market_counts.index[0]

print(f"\nEvaluating market: {MARKET_ID}")
print(f"Quote events: {market_counts.iloc[0]:,}")

In [None]:
# Extract features from structured tick data for the selected market.
df = df_quotes[df_quotes["market_id"] == MARKET_ID].copy()

# Compute orderbook features. Prices are divided by 100 to express them as
# probabilities; the ask is derived from the NO side as (100 - no_price).
df["best_bid_prob"] = df["best_yes_price"] / 100.0
df["best_ask_prob"] = (100 - df["best_no_price"]) / 100.0
df["midpoint"] = 0.5 * (df["best_bid_prob"] + df["best_ask_prob"])
df["spread"] = df["best_ask_prob"] - df["best_bid_prob"]
# The 1e-9 term keeps the ratio finite when both sizes are zero.
df["depth_imb"] = (df["best_yes_size"] - df["best_no_size"]) / \
                  (df["best_yes_size"] + df["best_no_size"] + 1e-9)

# Filter invalid quotes (crossed/locked books and midpoints outside (0, 1)).
df = df[(df["spread"] > 0) & (df["midpoint"] > 0) & (df["midpoint"] < 1)].reset_index(drop=True)

# Build feature matrix (simplified - matches 9 features from compute_micro_features)
# For full evaluation, use compute_micro_features with proper TickStore replay
#
# The matrix is assembled in NumPy first and converted once: handing
# torch.tensor() a Python list of ndarrays goes through a slow per-element
# conversion path that PyTorch itself warns about.
zero_col = np.zeros(len(df))
feature_matrix = np.column_stack([
    df["midpoint"].to_numpy(dtype=np.float64),
    df["spread"].to_numpy(dtype=np.float64),
    df["depth_imb"].to_numpy(dtype=np.float64),
    zero_col,  # trade_imbalance (requires full replay)
    zero_col,  # quote_velocity
    zero_col,  # q_stale
    zero_col,  # t_stale
    zero_col,  # time_to_start
    zero_col,  # rating_diff
])
X_features = torch.from_numpy(feature_matrix).float()  # shape (n_rows, 9)

# Target: future price (N ticks ahead)
HORIZON = 50
df["target_future"] = df["midpoint"].shift(-HORIZON)
# The last HORIZON rows have no future value and are dropped from evaluation.
valid_idx = df["target_future"].notna()

# Index the tensor with an explicit boolean tensor rather than a pandas Series.
X_eval = X_features[torch.from_numpy(valid_idx.to_numpy())]
df_eval = df[valid_idx].reset_index(drop=True)

print(f"\nValid evaluation samples: {len(X_eval):,}")

## 2. Load Trained Model

In [None]:
# Build the agent for the chosen market and load its saved artifacts from
# the checkpoint and normalization directories. A falsy return value from
# load() is treated as "nothing to evaluate" and aborts the notebook.
agent = MarketAgent(MARKET_ID)
load_succeeded = agent.load(base_dir=MODEL_DIR, norm_dir=NORM_DIR)
if not load_succeeded:
    raise ValueError(f"Could not load model for {MARKET_ID}. Train it first using train_offline.py!")

print(f"Loaded model for {MARKET_ID}")

## 3. Run Inference

In [None]:
print("Running inference...")

if len(X_eval) == 0:
    # torch.cat() below rejects an empty list, so fail early with a clear message.
    raise ValueError("No evaluation samples available to run inference on.")

agent.model.eval()
all_probs = []
all_means = []
all_vars = []

with torch.no_grad():
    X_norm = agent.normalize(X_eval)
    loader = DataLoader(TensorDataset(X_norm), batch_size=1024)

    for (xb,) in loader:
        # Model outputs logits over the grid; softmax turns them into a
        # probability mass function per sample.
        logits = agent.model(xb)
        probs = torch.softmax(logits, dim=-1)

        # First and second moments of the predicted distribution.
        grid_vals = agent.grid.values.to(probs.device)
        means = (probs * grid_vals).sum(dim=-1)
        # E[x^2] - E[x]^2 can land slightly below zero through float
        # rounding when the distribution is nearly a point mass; clamp so
        # the sqrt taken after the loop never yields NaN.
        vars_ = ((probs * (grid_vals ** 2)).sum(dim=-1) - means ** 2).clamp_min(0.0)

        all_probs.append(probs.cpu())
        all_means.append(means.cpu())
        all_vars.append(vars_.cpu())

probs_tensor = torch.cat(all_probs)
df_eval["pred_mean"] = torch.cat(all_means).numpy()
df_eval["pred_std"] = torch.sqrt(torch.cat(all_vars)).numpy()

print("Inference complete.")

## 4. Evaluation Metrics

In [None]:
# Scalar metrics on the point forecast (mean of the predicted distribution).
residuals = df_eval["pred_mean"] - df_eval["target_future"]
rmse = np.sqrt(np.mean(residuals ** 2))
mae = np.mean(np.abs(residuals))

print(f"=== Scalar Accuracy ({HORIZON}-tick horizon) ===")
print(f"MAE:  {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

# Probabilistic score (Negative Log-Likelihood): look up the probability
# mass the model assigned to the grid point nearest the realised target.
# detach()/cpu() keep the conversion working even if the grid tensor lives
# on an accelerator or tracks gradients (plain .numpy() would raise).
grid_vals = agent.grid.values.detach().cpu().numpy()
targets = df_eval["target_future"].to_numpy()
target_indices = np.abs(targets[:, None] - grid_vals[None, :]).argmin(axis=1)
pred_probs_at_target = probs_tensor.numpy()[np.arange(len(probs_tensor)), target_indices]
# Floor at 1e-6 so a zero-probability bin cannot produce an infinite loss.
pred_probs_at_target = np.clip(pred_probs_at_target, 1e-6, 1.0)
nll = -np.mean(np.log(pred_probs_at_target))

print("\n=== Probabilistic Score ===")
print(f"NLL: {nll:.4f}")

## 5. Prediction vs Reality

In [None]:
# Choose the window to plot: rows 1000-1499 when enough data exists,
# otherwise the final (up to) 500 rows.
if len(df_eval) > 1500:
    slice_df = df_eval.iloc[1000:1500]
else:
    slice_df = df_eval.tail(500)

timestamps = slice_df["ts_exchange"]
forecast = slice_df["pred_mean"]
two_sigma = 2 * slice_df["pred_std"]

fig_ts, ax_ts = plt.subplots(figsize=(14, 7))

# Realised future price against the model's mean forecast.
ax_ts.plot(timestamps, slice_df["target_future"],
           color="black", label="True Future Price", linewidth=1.5)
ax_ts.plot(timestamps, forecast,
           color="#FF8C00", linestyle="--", label="Model Prediction")

# Shaded band of +/- two predicted standard deviations around the forecast.
ax_ts.fill_between(timestamps, forecast - two_sigma, forecast + two_sigma,
                   color="#FF8C00", alpha=0.2, label="95% Confidence")

ax_ts.set_title(f"Forecast vs Reality: {MARKET_ID}")
ax_ts.set_xlabel("Timestamp")
ax_ts.set_ylabel("Probability")
ax_ts.set_ylim(0, 1)
ax_ts.legend()
ax_ts.grid(alpha=0.3)
fig_ts.tight_layout()
plt.show()

## 6. Distribution Quality Checks

In [None]:
# Compute distribution statistics
from scipy.stats import entropy

# Shannon entropy of every predicted distribution, computed in one
# vectorised call along the grid axis instead of a Python loop over rows.
df_eval["entropy"] = entropy(probs_tensor.numpy(), axis=1)
df_eval["width"] = df_eval["pred_std"]  # Already computed

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# A. Spread vs Entropy
axes[0, 0].scatter(df_eval["spread"], df_eval["entropy"], s=5, alpha=0.3)
axes[0, 0].set_xlabel("Spread")
axes[0, 0].set_ylabel("Distribution Entropy")
axes[0, 0].set_title("Uncertainty vs Spread")
axes[0, 0].grid(alpha=0.3)

# B. Mean Prediction vs current market midpoint (dashed line is y = x)
axes[0, 1].scatter(df_eval["midpoint"], df_eval["pred_mean"], s=5, alpha=0.3)
axes[0, 1].plot([0, 1], [0, 1], 'k--', alpha=0.5)
axes[0, 1].set_xlabel("Market Midpoint")
axes[0, 1].set_ylabel("Model Prediction")
axes[0, 1].set_title("Calibration")
axes[0, 1].grid(alpha=0.3)

# C. Prediction Error Distribution (red line marks zero error)
errors = df_eval["pred_mean"] - df_eval["target_future"]
axes[1, 0].hist(errors, bins=50, edgecolor="black", alpha=0.7)
axes[1, 0].axvline(0, color="red", linestyle="--", linewidth=2)
axes[1, 0].set_xlabel("Prediction Error")
axes[1, 0].set_ylabel("Frequency")
axes[1, 0].set_title(f"Error Distribution (mean={errors.mean():.4f})")
axes[1, 0].grid(alpha=0.3)

# D. Uncertainty vs Error
abs_errors = np.abs(errors)
axes[1, 1].scatter(df_eval["pred_std"], abs_errors, s=5, alpha=0.3)
# The label previously contained mis-decoded bytes in place of the sigma sign.
axes[1, 1].set_xlabel("Model Uncertainty (σ)")
axes[1, 1].set_ylabel("Absolute Error")
axes[1, 1].set_title("Uncertainty Calibration")
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Sample Probability Distributions

In [None]:
# Overlay five predicted distributions taken at evenly spaced positions
# across the evaluation set, each labelled with its timestamp and the
# realised target value.
sample_indices = np.linspace(0, len(df_eval)-1, 5, dtype=int)

fig_pmf, ax_pmf = plt.subplots(figsize=(12, 6))
for row_pos in sample_indices:
    row = df_eval.iloc[row_pos]
    truth = row["target_future"]
    curve_label = f"t={row['ts_exchange']:.0f}, truth={truth:.2f}"
    ax_pmf.plot(grid_vals, probs_tensor[row_pos].numpy(), label=curve_label)

ax_pmf.set_xlabel("Probability Grid")
ax_pmf.set_ylabel("Probability Mass")
ax_pmf.set_title("Example Predicted Distributions")
ax_pmf.legend()
ax_pmf.grid(alpha=0.3)
fig_pmf.tight_layout()
plt.show()