# Multi-Strategy Benchmark

Compare **7 alpha extraction strategies** under identical walk-forward conditions.

| Strategy | Description |
|----------|-------------|
| A | LSTM Baseline (price-only temporal) |
| B | NLP Only (sentiment MLP) |
| C | Late Ensemble (A + B with ridge) |
| D | Residual Sentiment (market-residualized NLP) |
| E | Gated Hybrid (gated price-NLP fusion) |
| F | Cross-Sectional Attention LSTM |
| G | Short Horizon NLP (5D only) |

**Goal:** Determine which structure produces real, production-surviving alpha.

In [None]:
# Install the third-party packages listed below (presumably those the cloned
# project imports — verify against its requirements); -q quiets pip's output.
!pip install -q yfinance lightgbm torch optuna pyarrow scikit-learn scipy pandas numpy matplotlib

In [None]:
import os
os.chdir("/content")
!rm -rf AI-stock-investment-tool

REPO = "https://github.com/kevin6598/AI-stock-investment-tool.git"
ret = os.system("git clone %s 2>/dev/null" % REPO)
if ret != 0:
    from getpass import getpass
    token = getpass("GitHub token (repo scope): ")
    os.system("git clone https://%s@github.com/kevin6598/AI-stock-investment-tool.git" % token)
    del token

os.chdir("/content/AI-stock-investment-tool")
!git log --oneline -3

In [None]:
import sys
import torch

# Report interpreter and PyTorch versions, then whether CUDA is usable.
print("Python: %s" % sys.version)
print("PyTorch: %s" % torch.__version__)

cuda_ok = torch.cuda.is_available()
print("CUDA: %s" % cuda_ok)
if cuda_ok:
    # Device index 0 is the one queried for its name.
    print("GPU: %s" % torch.cuda.get_device_name(0))

## 1. Global Configuration

A single configuration block: every strategy runs under these same constraints.
No strategy receives model-specific tuning that would make the comparison unfair.

In [None]:
# Benchmark settings shared by every strategy in this notebook.
CONFIG = dict(
    horizons=[5, 21],
    walk_forward=dict(
        train_years=2,
        test_months=4,
        step_months=4,
        val_months=2,
        embargo_days=5,
    ),
    max_epochs=15,
    early_stop_patience=3,
    ranking_weight=0.5,
    max_params=1_500_000,
)

# Echo the headline settings so the run output records what was used.
for _template, _key in (
    ("Horizons: %s", "horizons"),
    ("Walk-forward: %s", "walk_forward"),
    ("Max epochs: %d", "max_epochs"),
    ("Max params: %d", "max_params"),
):
    print(_template % CONFIG[_key])

# Data span check (will be computed after loading data)
# With train=24mo + val=2mo + test=4mo + embargo ~= 30mo minimum
# Dataset ~42 months -> expect ~3-4 folds with step=4mo

## 2. Standardized Data Pipeline

All strategies use the SAME processed dataset.

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset and artifact paths
# defined below all live under it.
drive.mount('/content/drive')

DRIVE_DIR = "/content/drive/MyDrive/ai_stock_tool"
ARTIFACT_DIR = os.path.join(DRIVE_DIR, "artifacts")
DATA_PATH = os.path.join(DRIVE_DIR, "dataset.parquet")

# Create the artifact folder on first use; a no-op when it already exists.
os.makedirs(ARTIFACT_DIR, exist_ok=True)
print("Data: %s" % DATA_PATH)

In [None]:
import pandas as pd
import numpy as np

# Load the shared panel. Index level 0 is used as the date axis and level 1
# as the ticker axis throughout this cell.
panel = pd.read_parquet(DATA_PATH)
valid_tickers = panel.index.get_level_values(1).unique().tolist()
print("Panel: %s" % str(panel.shape))
print("Tickers: %d" % len(valid_tickers))

date_min = panel.index.get_level_values(0).min()
date_max = panel.index.get_level_values(0).max()
# 30.44 is the average month length in days (365.25 / 12).
data_span_months = (date_max - date_min).days / 30.44
print("Date range: %s to %s (%.0f months)" % (
    date_min.date(), date_max.date(), data_span_months))

# Estimate fold count
wf = CONFIG["walk_forward"]
train_months = wf["train_years"] * 12
val_months = wf.get("val_months", 3)
test_months = wf["test_months"]
step_months = wf["step_months"]
# One extra month of slack on top of train + val + test.
min_needed = train_months + val_months + test_months + 1
available_after_first = data_span_months - min_needed
# Floor division, not int(a / b): int() truncates toward zero, so a span that
# falls short of min_needed by less than one step would be reported as
# 1 fold instead of 0 and the warning below would never fire.
est_folds = max(0, 1 + int(available_after_first // step_months))
print("\nFold estimate: train=%dmo + val=%dmo + test=%dmo = %dmo min" % (
    train_months, val_months, test_months, min_needed))
print("Expected folds: ~%d (with step=%dmo)" % (est_folds, step_months))
if est_folds == 0:
    print("WARNING: No folds possible! Reduce train_years or test_months.")

# Feature columns (same for ALL strategies): everything except target-like
# columns (matched by prefix) and the two bookkeeping columns.
_target_prefixes = ("fwd_return_", "residual_return_", "ranked_target_")
feature_cols = [
    c for c in panel.columns
    if not c.startswith(_target_prefixes)
    and c not in ("_close", "ticker_id")
]

# Split by the "nlp_" prefix: NLP features vs. everything else.
price_cols = [c for c in feature_cols if not c.startswith("nlp_")]
nlp_cols = [c for c in feature_cols if c.startswith("nlp_")]
print("\nFeatures: %d total (%d price, %d NLP)" % (
    len(feature_cols), len(price_cols), len(nlp_cols)))

# Check required targets: one forward-return column per configured horizon.
for h in CONFIG["horizons"]:
    tc = "fwd_return_%dd" % h
    if tc in panel.columns:
        non_null = panel[tc].notna().sum()
        print("  %s: %d non-null" % (tc, non_null))
    else:
        print("  WARNING: %s not found!" % tc)

In [None]:
# Precompute derived features for all strategies


def _first_present(candidates, columns):
    """Return the first name in *candidates* found in *columns*, else None."""
    return next((name for name in candidates if name in columns), None)


# Sentiment residual (for Strategy D): candidates are tried in listed order.
market_feat = _first_present(
    ["market_return", "market_return_21d", "spy_return_21d"], panel.columns)
if not market_feat:
    print("WARNING: No market return feature found for residualization")
else:
    print("Market return feature: %s" % market_feat)

# Volatility proxy: same first-match selection.
vol_feat = _first_present(
    ["volatility_21d", "realized_vol_21d", "atr_pct"], panel.columns)
if not vol_feat:
    print("WARNING: No volatility feature found")
else:
    print("Volatility feature: %s" % vol_feat)

# Data quality check: mean of the per-column NaN fractions, as a percentage.
per_column_nan = panel[feature_cols].isna().mean()
nan_pct = per_column_nan.mean() * 100
print("\nNaN rate: %.2f%%" % nan_pct)
print("Data pipeline ready.")

## 3. Strategy Definitions

All 7 strategies expose an identical interface: `train()`, `predict()`, `num_parameters()`.

In [None]:
from training.strategy_benchmark import (
    BenchmarkConfig, BenchmarkEvaluator, STRATEGY_REGISTRY,
    save_benchmark_results, run_integrity_checks,
)

# Convert the notebook-level CONFIG dict into the project's config object.
config = BenchmarkConfig.from_dict(CONFIG)

# List all strategies, in registry-key order.
print("Strategies to benchmark:")
print("-" * 40)
for key, cls in sorted(STRATEGY_REGISTRY.items()):
    sh = getattr(cls, 'supported_horizons', None)
    if sh is None:
        # No restriction declared on the class.
        h_info = "all horizons"
    else:
        h_info = "%s only" % sh
    print("  %s: %s (%s)" % (key, cls.name, h_info))

# Dry run: confirm each strategy class can be constructed with no arguments.
# Only instantiation is exercised here; no parameter count is read.
print("\nParam count check (instantiation only):")
for key, cls in sorted(STRATEGY_REGISTRY.items()):
    s = cls()
    print("  %s: OK" % cls.name)

## 4. Run Benchmark

Walk-forward evaluation for every strategy at every horizon.

Expected runtime: ~15-30 min on T4 GPU.

In [None]:
import logging
import time

# Timestamped INFO-level logging for the duration of the run.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%H:%M:%S",
)

evaluator = BenchmarkEvaluator(panel, feature_cols, config)
strategy_classes = list(STRATEGY_REGISTRY.values())

banner = "=" * 60
n_strategies = len(strategy_classes)
n_horizons = len(config.horizons)
print("Running %d strategies x %d horizons..." % (n_strategies, n_horizons))
print(banner)

# Wall-clock timing around the whole benchmark.
t0 = time.time()
results = evaluator.run_all(strategy_classes)
total_time = time.time() - t0

print("\n" + banner)
print("Benchmark complete in %.1f min" % (total_time / 60))
print("Total evaluations: %d" % len(results))

## 5. Strategy Comparison Table

In [None]:
import math


def _result_row(r):
    """Flatten one benchmark result into a rounded, display-ready table row."""
    return {
        "Strategy": r.name,
        "Horizon": r.horizon,
        "IC": round(r.ic_mean, 4),
        "ICIR": round(r.icir, 2),
        "Sharpe": round(r.sharpe, 2),
        "IC_std": round(r.ic_std, 4),
        "Prod IC": round(r.prod_ic, 4),
        "Overfit": round(r.overfit_score, 3),
        "Composite": round(r.composite, 4),
        "Params": r.param_count,
        "Time(s)": round(r.train_time, 1),
        "Status": r.status,
    }


# Build comparison table; rows keep the order of `results` (no sorting here).
rows = [_result_row(r) for r in results]

df_results = pd.DataFrame(rows)
print("\nStrategy Benchmark Results (sorted by Composite):")
print("=" * 100)
display(df_results)

# Summary by status: count and list the strategy/horizon pairs per status.
print("\nStatus Summary:")
for status in ("PASS", "WARN", "FAIL"):
    names = [r.name + "/" + r.horizon for r in results if r.status == status]
    count = len(names)
    print("  %s (%d): %s" % (status, count, ", ".join(names) if names else "none"))

## 6. Visualization Panel

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# 2x3 dashboard canvas shared by the six panels.
fig = plt.figure(figsize=(20, 16))
gs = gridspec.GridSpec(2, 3, hspace=0.35, wspace=0.3)

# --- Panel 1: IC Comparison Bar Chart ---
ax1 = fig.add_subplot(gs[0, 0])

# PASS -> green, WARN -> amber, any other status -> red.
status_colors = {"PASS": "#4CAF50", "WARN": "#FFC107"}
labels, ics, colors = [], [], []
for r in results:
    # Label: name after its first "_" (max 12 chars), then the horizon.
    labels.append("%s\n%s" % (r.name.split("_", 1)[-1][:12], r.horizon))
    ics.append(r.ic_mean)
    colors.append(status_colors.get(r.status, "#F44336"))

bar_slots = range(len(results))
ax1.barh(bar_slots, ics, color=colors, edgecolor="white")
ax1.set_yticks(bar_slots)
ax1.set_yticklabels(labels, fontsize=8)
ax1.set_xlabel("Cross-Sectional IC")
ax1.set_title("IC Comparison", fontweight="bold")
ax1.axvline(x=0, color="gray", linewidth=0.5)
# First result at the top of the chart.
ax1.invert_yaxis()

# --- Panel 2: IC Stability (fold ICs per strategy) ---
ax2 = fig.add_subplot(gs[0, 1])
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap is the supported way to look a colormap up by name.
cmap = plt.get_cmap("tab10")
for i, r in enumerate(results):
    if r.fold_metrics:
        fold_ics = [f.ic for f in r.fold_metrics]
        # tab10 has ten colours, so the index wraps with % 10.
        color = cmap(i % 10)
        # One dot per fold, all stacked at this result's x position.
        ax2.scatter([i] * len(fold_ics), fold_ics, color=color, s=40, zorder=3, alpha=0.7)
        # Vertical segment spanning the min..max fold IC.
        ax2.plot([i, i], [min(fold_ics), max(fold_ics)],
                 color=color, linewidth=2, alpha=0.5)
ax2.set_xticks(range(len(results)))
ax2.set_xticklabels([r.name.split("_")[0] + "/" + r.horizon for r in results],
                     rotation=45, fontsize=7, ha="right")
ax2.axhline(y=0, color="gray", linewidth=0.5, linestyle="--")
ax2.set_ylabel("IC per fold")
ax2.set_title("IC Stability (per fold)", fontweight="bold")
ax2.grid(axis="y", alpha=0.3)

# --- Panel 3: Composite Score Bar ---
ax3 = fig.add_subplot(gs[0, 2])

# Infinite composite scores are drawn as 0 so they do not break the axis.
composites = [0 if math.isinf(r.composite) else r.composite for r in results]

bar_positions = range(len(results))
ax3.barh(bar_positions, composites, color=colors, edgecolor="white")
ax3.set_yticks(bar_positions)
ax3.set_yticklabels(labels, fontsize=8)
ax3.set_xlabel("Composite Score")
ax3.set_title("Composite Score", fontweight="bold")
ax3.axvline(x=0, color="gray", linewidth=0.5)
# First result at the top of the chart.
ax3.invert_yaxis()

# --- Panel 4: Gate Activation (Strategy E only) ---
ax4 = fig.add_subplot(gs[1, 0])

# Only results that carry non-empty gate statistics are plotted.
gate_data = [(r.name, r.horizon, r.gate_stats) for r in results if r.gate_stats]
if not gate_data:
    ax4.text(0.5, 0.5, "No gate stats\navailable",
             transform=ax4.transAxes, ha="center", va="center", fontsize=14, color="gray")
    ax4.set_title("Gate Distribution", fontweight="bold")
else:
    g_labels, g_means, g_stds = [], [], []
    for strat_name, strat_horizon, stats in gate_data:
        g_labels.append("%s/%s" % (strat_name, strat_horizon))
        # Missing keys fall back to 0.
        g_means.append(stats.get("gate_mean", 0))
        g_stds.append(stats.get("gate_std", 0))
    # Bar = mean gate value, error bar = its standard deviation.
    ax4.barh(g_labels, g_means, xerr=g_stds, color="#2196F3", capsize=5)
    ax4.set_xlabel("Gate Activation")
    ax4.set_title("Gate Distribution (Hybrid)", fontweight="bold")
    ax4.set_xlim(0, 1)

# --- Panel 5: Ensemble Weights (Strategy C only) ---
ax5 = fig.add_subplot(gs[1, 1])

# Only results that carry non-empty ensemble weights are plotted.
ens_data = [(r.name, r.horizon, r.ensemble_weights) for r in results if r.ensemble_weights]
if not ens_data:
    ax5.text(0.5, 0.5, "No ensemble weights\navailable",
             transform=ax5.transAxes, ha="center", va="center", fontsize=14, color="gray")
    ax5.set_title("Ensemble Weights", fontweight="bold")
else:
    for idx, (ens_name, ens_horizon, weights) in enumerate(ens_data):
        # Two bars per entry (LSTM at x=0, NLP at x=1), shifted 0.3 per entry.
        shift = idx * 0.3
        ax5.bar(
            [0 + shift, 1 + shift],
            [weights.get("lstm", 0), weights.get("nlp", 0)],
            width=0.25,
            label="%s/%s" % (ens_name.split("_")[0], ens_horizon),
        )
    ax5.set_xticks([0, 1])
    ax5.set_xticklabels(["LSTM weight", "NLP weight"])
    ax5.set_title("Ensemble Weights", fontweight="bold")
    ax5.legend(fontsize=8)
    ax5.axhline(y=0, color="gray", linewidth=0.5, linestyle="--")

# --- Panel 6: Horizon Sensitivity ---
def _horizon_sort_key(label):
    """Sort key ordering horizon labels by their leading number ("5D" before "21D").

    Labels without a leading number sort last, alphabetically.
    """
    text = str(label)
    digits = ""
    for ch in text:
        if not ch.isdigit():
            break
        digits += ch
    return (int(digits) if digits else float("inf"), text)


ax6 = fig.add_subplot(gs[1, 2])
strategy_names = sorted(set(r.name for r in results))
# Plain string sorting would put "21D" before "5D"; order numerically instead.
horizon_labels = sorted(set(r.horizon for r in results), key=_horizon_sort_key)
x_pos = np.arange(len(strategy_names))
# 0.35 for up to two horizons (unchanged); narrower when there are more so
# the grouped bars of neighbouring strategies never overlap.
width = min(0.35, 0.8 / max(len(horizon_labels), 1))

# First result per (strategy, horizon) pair, built in one pass over results.
ic_by_pair = {}
for r in results:
    ic_by_pair.setdefault((r.name, r.horizon), r.ic_mean)

for j, h in enumerate(horizon_labels):
    # A strategy with no result at this horizon is drawn as a zero-height bar.
    h_ics = [ic_by_pair.get((sn, h), 0) for sn in strategy_names]
    # Centre the group of bars on each strategy's x position.
    offset = (j - len(horizon_labels) / 2 + 0.5) * width
    ax6.bar(x_pos + offset, h_ics, width, label=h)

ax6.set_xticks(x_pos)
ax6.set_xticklabels([n.split("_")[0] for n in strategy_names],
                     rotation=45, fontsize=8, ha="right")
ax6.set_ylabel("IC")
# Title lists the horizons actually present (e.g. "5D vs 21D").
ax6.set_title("Horizon Sensitivity (%s)" % " vs ".join(str(h) for h in horizon_labels),
              fontweight="bold")
ax6.legend(fontsize=9)
ax6.axhline(y=0, color="gray", linewidth=0.5, linestyle="--")
ax6.grid(axis="y", alpha=0.3)

fig.suptitle("Multi-Strategy Benchmark Dashboard", fontsize=16, fontweight="bold", y=1.01)
plt.savefig(os.path.join(ARTIFACT_DIR, "strategy_benchmark.png"), dpi=150, bbox_inches="tight")
plt.show()
print("Dashboard saved.")

## 7. Diagnostic Output

In [None]:
# Persist the full benchmark results as JSON in the artifact directory.
results_json = os.path.join(ARTIFACT_DIR, "strategy_benchmark_results.json")
save_path = save_benchmark_results(results, path=results_json)
print("Results saved to: %s" % save_path)

# Per-strategy detail dump, one section per (strategy, horizon) result.
section_rule = "=" * 70
print("\n" + section_rule)
print("DETAILED RESULTS")
print(section_rule)
for r in results:
    print("\n--- %s / %s [%s] ---" % (r.name, r.horizon, r.status))
    print("  IC: %.4f +/- %.4f  ICIR: %.2f" % (r.ic_mean, r.ic_std, r.icir))
    print("  Sharpe: %.2f  MaxDD: %.4f" % (r.sharpe, r.max_drawdown))
    print("  Overfit: %.3f  Composite: %.4f" % (r.overfit_score, r.composite))
    print("  Prod IC: %.4f  Params: %d  Time: %.1fs" % (
        r.prod_ic, r.param_count, r.train_time))

    # Optional sections: printed only when the result carries the data.
    if r.fold_metrics:
        print("  Fold ICs: %s" % [round(f.ic, 4) for f in r.fold_metrics])

    gate = r.gate_stats
    if gate:
        print("  Gate: mean=%.3f std=%.3f" % (
            gate.get("gate_mean", 0), gate.get("gate_std", 0)))

    weights = r.ensemble_weights
    if weights:
        print("  Ensemble: lstm=%.3f nlp=%.3f intercept=%.4f" % (
            weights.get("lstm", 0), weights.get("nlp", 0), weights.get("intercept", 0)))

## 8. Experimental Integrity Checks

In [None]:
# Project-level integrity checks; each returned item is printed as a warning.
warnings = run_integrity_checks(results)

if not warnings:
    print("All integrity checks passed.")
else:
    print("INTEGRITY WARNINGS (%d):" % len(warnings))
    for w in warnings:
        print("  [!] %s" % w)

# Additional checks
print("\nConsistency Checks:")

# Same number of folds across strategies at same horizon: group the fold
# counts by horizon in one pass, then compare within each group.
fold_counts_by_horizon = {}
for r in results:
    fold_counts_by_horizon.setdefault(r.horizon, []).append(len(r.fold_metrics))

for h in sorted(fold_counts_by_horizon):
    fold_counts = fold_counts_by_horizon[h]
    if len(set(fold_counts)) == 1:
        print("  [OK] %s: %d folds for all strategies" % (h, fold_counts[0]))
    else:
        print("  [!] Inconsistent fold counts at %s: %s" % (h, fold_counts))

# Param limit check: only violations are reported.
param_limit = CONFIG["max_params"]
for r in results:
    if r.param_count > param_limit:
        print("  [!] %s: %d params > %d limit" % (r.name, r.param_count, param_limit))

print("\nIntegrity check complete.")

## Summary & Decision

This benchmark answers:

1. **Does sentiment add incremental alpha?** Compare A vs C/E
2. **Is hybrid destructive or additive?** Compare A vs E
3. **Is ensemble safer than fusion?** Compare C vs E
4. **Is NLP short-horizon only?** Compare B/5D vs B/21D (the automated summary compares Strategy B at both horizons; G is the dedicated 5D-only variant)
5. **Is cross-sectional modeling necessary?** Compare A vs F
6. **Which structure survives production retrain?** Check Prod IC column

In [None]:
# Automated decision summary
header_rule = "=" * 60
print(header_rule)
print("DECISION SUMMARY")
print(header_rule)

# Best overall: take the first PASS result, else the first WARN result.
# This relies on `results` already being ordered by composite score.
passing = [r for r in results if r.status == "PASS"]
warning_results = [r for r in results if r.status == "WARN"]

if not passing:
    print("\nNo strategy achieved PASS status.")
    if warning_results:
        best = warning_results[0]
        print("Best WARN strategy: %s/%s" % (best.name, best.horizon))
        print("  Composite: %.4f  IC: %.4f" % (best.composite, best.ic_mean))
else:
    best = passing[0]
    print("\nBest PASS strategy: %s/%s" % (best.name, best.horizon))
    print("  Composite: %.4f  IC: %.4f  Prod IC: %.4f" % (
        best.composite, best.ic_mean, best.prod_ic))

# Key questions
def _pick(prefix, horizon):
    """Results whose name starts with *prefix* at *horizon*, in `results` order."""
    return [r for r in results if r.name.startswith(prefix) and r.horizon == horizon]


print("\n--- Key Questions ---")

# Gather every (strategy, horizon) slice the questions below rely on.
a_21 = _pick("A_", "21D")
c_21 = _pick("C_", "21D")
e_21 = _pick("E_", "21D")
f_21 = _pick("F_", "21D")
b_5 = _pick("B_", "5D")
b_21 = _pick("B_", "21D")
# NOTE(review): g_5 is collected but none of the checks below read it.
g_5 = _pick("G_", "5D")

# 1. Sentiment alpha: ensemble (C) must beat the LSTM baseline (A) by > 0.005 IC.
if a_21 and c_21:
    delta = c_21[0].ic_mean - a_21[0].ic_mean
    if delta > 0.005:
        verdict = "YES (+%.4f IC)" % delta
    else:
        verdict = "NO (delta=%.4f)" % delta
    print("1. Sentiment adds alpha? %s" % verdict)

# 2. Hybrid vs baseline: any positive IC difference counts as additive.
if a_21 and e_21:
    delta = e_21[0].ic_mean - a_21[0].ic_mean
    if delta > 0:
        verdict = "ADDITIVE (+%.4f)" % delta
    else:
        verdict = "DESTRUCTIVE (%.4f)" % delta
    print("2. Hybrid vs LSTM? %s" % verdict)

# 3. Ensemble vs fusion: the winner's IC is printed first.
if c_21 and e_21:
    ens_ic = c_21[0].ic_mean
    fusion_ic = e_21[0].ic_mean
    if ens_ic > fusion_ic:
        print("3. Ensemble vs Fusion? ENSEMBLE safer (IC %.4f vs %.4f)" % (
            ens_ic, fusion_ic))
    else:
        print("3. Ensemble vs Fusion? FUSION better (IC %.4f vs %.4f)" % (
            fusion_ic, ens_ic))

# 4. Short horizon NLP: Strategy B at 5D must beat B at 21D by > 0.005 IC.
if b_5 and b_21:
    short_ic = b_5[0].ic_mean
    long_ic = b_21[0].ic_mean
    if short_ic > long_ic + 0.005:
        template = "4. NLP short-horizon only? YES (5D IC=%.4f > 21D IC=%.4f)"
    else:
        template = "4. NLP short-horizon only? NO (5D IC=%.4f, 21D IC=%.4f)"
    print(template % (short_ic, long_ic))

# 5. Cross-sectional attention: F must beat A by > 0.005 IC.
if a_21 and f_21:
    delta = f_21[0].ic_mean - a_21[0].ic_mean
    if delta > 0.005:
        verdict = "YES (+%.4f)" % delta
    else:
        verdict = "NO (delta=%.4f)" % delta
    print("5. Cross-sectional attention helps? %s" % verdict)

# 6. Production survival
# For every result with a positive walk-forward IC, report how much of it the
# production IC retains. A ratio >= 0.9 counts as SURVIVED.
print("\n--- Production Survival ---")
for r in results:
    if not r.ic_mean > 0:
        # No positive walk-forward IC to preserve (also skips NaN).
        continue
    if r.prod_ic > 0:
        # Guard against dividing by a vanishingly small walk-forward IC.
        ratio = r.prod_ic / r.ic_mean if r.ic_mean > 1e-8 else 0
        survived = "SURVIVED" if ratio >= 0.9 else "DEGRADED"
        print("  %s/%s: WF IC=%.4f -> Prod IC=%.4f (%.0f%%) [%s]" % (
            r.name, r.horizon, r.ic_mean, r.prod_ic, ratio * 100, survived))
    else:
        # Previously these rows were silently omitted, hiding exactly the
        # strategies whose edge did not carry over to production.
        print("  %s/%s: WF IC=%.4f -> Prod IC=%.4f [NO POSITIVE PROD IC]" % (
            r.name, r.horizon, r.ic_mean, r.prod_ic))

# Final recommendation: PASS winner first, then the first WARN result,
# otherwise no deployment. Branch order matters: `warning_results` is only
# consulted when there is no PASS result.
print("\n" + "=" * 60)
if passing:
    recommendation = "RECOMMENDATION: Deploy %s" % best.name
elif warning_results:
    recommendation = (
        "RECOMMENDATION: Cautiously deploy %s (WARN status)" % warning_results[0].name
    )
else:
    recommendation = "RECOMMENDATION: No viable strategy. Review data/features."
print(recommendation)
print("=" * 60)