# Week 4 – Baseline Model Performance Evaluation

## Objective
Establish a **BSM chooser baseline** with quantitative error metrics and limitation analysis.

## Practical Data Strategy
Direct public CME chooser transaction prices are not readily available. For a reproducible baseline, we use a **historical realized proxy**:

1. At date $t$, compute predicted chooser price using BSM (Rubinstein closed form).
2. Use realized JPM prices at $t+T_1$ and $t+T_2$ to compute ex-post chooser payoff.
3. Discount payoff to $t$ as proxy actual value.
4. Compute MAE, RMSE, and MAPE between the prediction and the proxy actual value.

This provides a robust baseline for Week 5–6 ML comparisons.

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import yaml
import matplotlib.pyplot as plt

# Resolve the repository root: step up one level when the working directory
# is notebooks/, otherwise treat the working directory itself as the root.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd if _cwd.name != "notebooks" else _cwd.parent

# Put the root at the front of sys.path (only once) so `src.*` imports resolve.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src.models.bsm_chooser import (
    rubinstein_chooser,
    compute_error_metrics,
    realized_proxy_pv,
    vix_regime_label,
    summarize_metrics_by_regime,
)

# Plotting style: apply the "seaborn-v0_8-whitegrid" style sheet to every figure drawn below.
# NOTE(review): this style name presumably needs matplotlib >= 3.6 — confirm against the environment.
plt.style.use("seaborn-v0_8-whitegrid")

In [None]:
# Load configuration (safe_load: the YAML is parsed without constructing arbitrary objects)
with open(PROJECT_ROOT / "config" / "model_params.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Model parameters from the "model" section of the config file
params = cfg["model"]
S0_CFG, K = params["s0"], params["k"]
R_CFG, Q_CFG = params["r"], params["q"]
SIGMA_CFG, T1, T2 = params["sigma"], params["t1"], params["t2"]

# Load market data
jpm = pd.read_parquet(PROJECT_ROOT / "data" / "raw" / "yahoo_finance" / "JPM_daily_ohlcv.parquet")
vix = pd.read_parquet(PROJECT_ROOT / "data" / "raw" / "yahoo_finance" / "VIX_daily.parquet")

# Optional: FRED 10Y Treasury (dgs10 stays None when the file is absent)
fred_path = PROJECT_ROOT / "data" / "raw" / "fred" / "DGS10.parquet"
if fred_path.exists():
    dgs10 = pd.read_parquet(fred_path)
else:
    dgs10 = None


def _normalize_daily_index(df):
    """Replace ``df.index`` in place with a tz-naive, midnight-normalized datetime index.

    All frames must share the same index convention, otherwise the later
    ``reindex`` calls against the price index cannot match any dates.
    """
    idx = pd.to_datetime(df.index)
    # Drop timezone info (keeping wall-clock time) before truncating to dates.
    if getattr(idx, "tz", None) is not None:
        idx = idx.tz_localize(None)
    df.index = idx.normalize()
    return df


# Normalize index (the optional FRED frame gets the same treatment as prices)
for df in [jpm, vix]:
    _normalize_daily_index(df)

if dgs10 is not None:
    _normalize_daily_index(dgs10)

print("Loaded data shapes:")
print("  JPM:", jpm.shape)
print("  VIX:", vix.shape)
print("  DGS10:", dgs10.shape if dgs10 is not None else "N/A")

In [None]:
# Build aligned frame: one row per JPM trading date
frame = pd.DataFrame(index=jpm.index.copy())
frame["close"] = jpm["Close"].astype(float)
# VIX aligned to JPM dates; ffill covers gaps, bfill only fills dates that
# precede the first available VIX observation.
frame["vix"] = vix["Close"].reindex(frame.index).ffill().bfill().astype(float)

# Risk-free rate at valuation date (fallback to config if FRED missing)
if dgs10 is not None:
    dgs = dgs10.iloc[:, 0].reindex(frame.index).ffill().bfill().astype(float)
    # DGS10 is usually quoted in percent (e.g. 4.25). Decide the unit ONCE for
    # the whole series: an element-wise `> 1.0` test would leave percent-quoted
    # yields below 1.00 (e.g. 0.60 meaning 0.60%) unconverted, i.e. read them
    # as a 60% rate. A decimal-quoted yield series never exceeds 1.0.
    quoted_in_percent = bool(dgs.max() > 1.0)
    frame["r"] = dgs / 100.0 if quoted_in_percent else dgs
else:
    frame["r"] = R_CFG

# Rolling realized volatility for prediction (252d annualized)
frame["log_ret"] = np.log(frame["close"] / frame["close"].shift(1))
frame["sigma_252d"] = frame["log_ret"].rolling(252, min_periods=200).std() * np.sqrt(252)

# Sentiment proxy from VIX (inverse rolling min-max): 1 when VIX sits at its
# rolling low, 0 when it sits at its rolling high.
vix_min = frame["vix"].rolling(252, min_periods=63).min()
vix_max = frame["vix"].rolling(252, min_periods=63).max()
# A zero range would divide by zero; map it to NaN instead.
spread = (vix_max - vix_min).replace(0, np.nan)
frame["sentiment_proxy"] = 1 - (frame["vix"] - vix_min) / spread

# Keep only rows with every pricing input available (sentiment may stay NaN).
frame = frame.dropna(subset=["close", "sigma_252d", "r", "vix"]).copy()
print("Usable rows after feature prep:", len(frame))
frame.head()

## Backtest Construction

We approximate the 0.5-year and 1.0-year horizons using trading-day offsets (assuming 252 trading days per year):
- $T_1 \approx 126$ trading days
- $T_2 \approx 252$ trading days

For each valuation date $t$:
- Predict the chooser price using `rubinstein_chooser(s_t, K, r_t, q, sigma_t, T1, T2)`, i.e. with inputs $S_t$, $K$, $r_t$, $q$, $\sigma_t$, $T_1$, $T_2$
- Compute realized proxy PV from observed $S_{t+126}$ and $S_{t+252}$

In [None]:
# Trading-day offsets used to look up realized prices, and the VIX regime cut-off.
# NOTE(review): the BSM price below uses T1/T2 from the config while the realized
# proxy uses these fixed offsets; they presumably agree only when the config
# horizons are 0.5 and 1.0 years — confirm.
T1_DAYS = 126
T2_DAYS = 252
VIX_THRESHOLD = 30.0

dates = frame.index.to_list()

# Pull each column out once as a float array; indexing `frame.iloc[i][col]`
# inside the loop would rebuild a full row Series on every access.
close_arr = frame["close"].to_numpy(dtype=float)
sigma_arr = frame["sigma_252d"].to_numpy(dtype=float)
r_arr = frame["r"].to_numpy(dtype=float)
vix_arr = frame["vix"].to_numpy(dtype=float)
senti_arr = frame["sentiment_proxy"].to_numpy(dtype=float)

rows = []

# Only dates with a realized price T2_DAYS rows ahead can be evaluated.
for i in range(len(dates) - T2_DAYS):
    sigma_t = float(sigma_arr[i])
    # Skip dates whose volatility input is missing or non-positive.
    if not np.isfinite(sigma_t) or sigma_t <= 0:
        continue

    i_t1 = i + T1_DAYS
    i_t2 = i + T2_DAYS

    s_t = float(close_arr[i])
    s_t1 = float(close_arr[i_t1])
    s_t2 = float(close_arr[i_t2])
    r_t = float(r_arr[i])

    # Model price at the valuation date.
    pred_price = rubinstein_chooser(s_t, K, r_t, Q_CFG, sigma_t, T1, T2)
    # Proxy "actual" value built from the realized prices at the two horizons.
    actual_proxy = realized_proxy_pv(
        s_t1_realized=s_t1,
        s_t2_realized=s_t2,
        k=K,
        r=r_t,
        t2=T2,
        use_proper_rule=False,
    )

    rows.append(
        {
            "date": dates[i],
            "t1_date": dates[i_t1],
            "t2_date": dates[i_t2],
            "s_t": s_t,
            "s_t1": s_t1,
            "s_t2": s_t2,
            "sigma_t": sigma_t,
            "r_t": r_t,
            "vix_t": float(vix_arr[i]),
            # May be NaN while the rolling VIX window is still warming up.
            "sentiment_proxy": float(senti_arr[i]),
            "pred_price": pred_price,
            "actual_proxy_pv": actual_proxy,
        }
    )

# Fail with a clear message instead of an opaque KeyError on a column-less frame.
if not rows:
    raise ValueError(
        f"No backtest samples were produced: {len(dates)} usable rows, "
        f"but more than {T2_DAYS} are required."
    )

bt = pd.DataFrame(rows)
bt["error"] = bt["pred_price"] - bt["actual_proxy_pv"]
bt["abs_error"] = bt["error"].abs()
bt["sq_error"] = bt["error"] ** 2
bt["vix_regime"] = bt["vix_t"].apply(lambda x: vix_regime_label(x, VIX_THRESHOLD))
# NaN sentiment compares False against 0.2 and therefore lands in "normal_sentiment".
bt["sentiment_regime"] = np.where(bt["sentiment_proxy"] < 0.2, "low_sentiment", "normal_sentiment")

print(f"Backtest samples: {len(bt)}")
print(f"Date range: {bt['date'].min().date()} to {bt['date'].max().date()}")
bt.head()

## Error Metrics (Overall + Regime Split)

In [None]:
# Headline accuracy of the model price against the realized proxy.
y_true = bt["actual_proxy_pv"].values
y_pred = bt["pred_price"].values
overall = compute_error_metrics(y_true, y_pred)

print("Overall metrics:")
print(f"  MAE : {overall['mae']:.4f}")
print(f"  RMSE: {overall['rmse']:.4f}")
print(f"  MAPE: {overall['mape']:.2%}")

# The same metrics, broken down by each regime label column.
_metric_cols = {"true_col": "actual_proxy_pv", "pred_col": "pred_price"}
regime_metrics = summarize_metrics_by_regime(bt, regime_col="vix_regime", **_metric_cols)
sent_metrics = summarize_metrics_by_regime(bt, regime_col="sentiment_regime", **_metric_cols)

for heading, table in (
    ("\nMetrics by VIX regime:", regime_metrics),
    ("\nMetrics by sentiment regime:", sent_metrics),
):
    print(heading)
    display(table)

## Visualization

In [None]:
# 2x2 diagnostic figure: scatter, error over time, rolling MAE, MAE by regime.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Both time-series panels use the same date-sorted view of the backtest.
bt_sorted = bt.sort_values("date").copy()

# (1) Predicted vs actual scatter, with a 45-degree reference line
ax = axes[0, 0]
ax.scatter(bt["actual_proxy_pv"], bt["pred_price"], alpha=0.35, s=16)
min_v = min(bt["actual_proxy_pv"].min(), bt["pred_price"].min())
max_v = max(bt["actual_proxy_pv"].max(), bt["pred_price"].max())
ax.plot([min_v, max_v], [min_v, max_v], "r--", lw=1)
ax.set_title("Predicted vs Proxy Actual")
ax.set_xlabel("Proxy Actual PV")
ax.set_ylabel("Predicted Price")

# (2) Error over time
ax = axes[0, 1]
ax.plot(bt_sorted["date"], bt_sorted["error"], lw=1)
ax.axhline(0, color="black", ls="--", lw=1)
ax.set_title("Pricing Error Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Pred - Actual")

# (3) Rolling MAE over 60 samples (at least 20 needed before a value is shown)
ax = axes[1, 0]
bt_sorted["rolling_mae_60"] = bt_sorted["abs_error"].rolling(60, min_periods=20).mean()
ax.plot(bt_sorted["date"], bt_sorted["rolling_mae_60"], color="tab:orange", lw=1.5)
ax.set_title("60-Day Rolling MAE")
ax.set_xlabel("Date")
ax.set_ylabel("MAE")

# (4) Regime MAE bars
ax = axes[1, 1]
ax.bar(regime_metrics["regime"], regime_metrics["mae"], alpha=0.8)
ax.set_title("MAE by VIX Regime")
ax.set_xlabel("Regime")
ax.set_ylabel("MAE")

plt.tight_layout()
out_png = PROJECT_ROOT / "notebooks" / "week4_validation_plots.png"
# Make sure the target directory exists before saving the figure.
out_png.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(out_png, dpi=150, bbox_inches="tight")
plt.show()
print(f"Saved: {out_png}")

In [None]:
# Write the overall and per-VIX-regime metrics to CSV files under docs/.
summary_pairs = [
    ("MAE", overall["mae"]),
    ("RMSE", overall["rmse"]),
    ("MAPE", overall["mape"]),
    ("samples", len(bt)),
]
metrics_table = pd.DataFrame(
    [{"metric": label, "value": val} for label, val in summary_pairs]
)

# Create the output directory on first use.
out_dir = PROJECT_ROOT / "docs"
out_dir.mkdir(parents=True, exist_ok=True)
metrics_csv = out_dir / "week4_metrics_summary.csv"
regime_csv = out_dir / "week4_regime_metrics.csv"

# Write both tables first, then report the paths.
for table, target in ((metrics_table, metrics_csv), (regime_metrics, regime_csv)):
    table.to_csv(target, index=False)

for target in (metrics_csv, regime_csv):
    print(f"Saved: {target}")
metrics_table

## Findings Snapshot

- Baseline metrics are computed from `pred_price` vs `actual_proxy_pv`.
- Regime split highlights larger errors in high-VIX periods (if present in sample).
- Sentiment proxy split provides a first-pass view of the model gap under stress-like sentiment.

These outputs are used directly in:
- `docs/week4_validation_report.md`
- `docs/bsm_benchmark.md`