# KTND-Finance: Full Experiment Pipeline

**Run everything in 2 cells:**
1. **Cell 1** (Setup) - Install deps + clone repo (~2 min)
2. **Cell 2** (Run All) - Downloads data, trains models, runs baselines/robustness/rolling, generates figures (~2-3 hours)

Set runtime to **GPU (T4)** before running: Runtime > Change runtime type > T4 GPU

Then hit **Runtime > Run all** and walk away.

In [None]:
#@title 1. Setup (install + clone + verify) - ~2 min

# Install missing dependencies (torch/numpy/pandas/scipy/sklearn/matplotlib are pre-installed)
!pip install -q yfinance>=1.0.0 hmmlearn>=0.3.0 statsmodels>=0.14.0 arch>=6.0.0 pyyaml>=6.0

# Clone repo
import os, sys
REPO_URL = "https://github.com/keshavkrishnan08/kind_finance.git"
REPO_DIR = "/content/ktnd_finance"

if os.path.exists(REPO_DIR):
    !cd {REPO_DIR} && git pull
else:
    !git clone {REPO_URL} {REPO_DIR}

os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)

# Verify
import torch, numpy as np
from src.model.vampnet import NonEquilibriumVAMPNet
print(f"Python {sys.version.split()[0]} | PyTorch {torch.__version__} | "
      f"CUDA: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print("Setup complete.")

In [None]:
#@title 2. Run ENTIRE experiment pipeline (~2-3 hours) - just run this and walk away

import subprocess, time, json, os, sys

# Interpreter used to launch every pipeline stage as a child process.
python = sys.executable

# Repository checkout and the two output folders derived from it.
REPO_DIR = "/content/ktnd_finance"
RESULTS_DIR, FIGURES_DIR = (
    os.path.join(REPO_DIR, "outputs", leaf) for leaf in ("results", "figures")
)

# All stage commands use repo-relative paths, so work from the repo root.
os.chdir(REPO_DIR)

def run(name, cmd, cwd=None):
    """Run one pipeline stage through the shell and report whether it succeeded.

    Prints a banner with the stage name, executes ``cmd``, then prints the
    outcome together with the elapsed wall-clock time in minutes.

    Args:
        name: Human-readable stage label used in the banner and status line.
        cmd: Shell command line to execute.
        cwd: Working directory for the command; defaults to ``REPO_DIR``.

    Returns:
        True if the command exited with status 0, otherwise False.
    """
    import shutil

    if cwd is None:
        cwd = REPO_DIR

    # Under plain /bin/sh a pipeline's exit status is that of its LAST command,
    # so "pytest ... | tail -5" would be reported as OK even when pytest fails.
    # When bash is available, run under it with pipefail so a failure in any
    # pipeline member marks the stage as failed. Without bash, fall back to
    # the default shell (executable=None) and the previous behaviour.
    bash = shutil.which("bash")
    if bash is not None:
        cmd = f"set -o pipefail; {cmd}"

    print(f"\n{'='*70}")
    print(f"  STAGE: {name}")
    print(f"{'='*70}")
    t0 = time.time()
    result = subprocess.run(cmd, shell=True, cwd=cwd, executable=bash)
    elapsed = time.time() - t0
    succeeded = result.returncode == 0
    status = 'OK' if succeeded else 'FAILED'
    print(f"  >> {name}: {status} ({elapsed/60:.1f} min)")
    return succeeded

pipeline_start = time.time()

# Ordered stage table: (key in `results`, banner label, shell command).
stages = [
    # Stage 1: Quick tests (skip slow synthetic tests)
    ('tests', 'Quick tests',
     f'{python} -m pytest tests/ -q --tb=short -k "not test_synthetic" 2>&1 | tail -5'),
    # Stage 2: Download data
    ('download', 'Download data',
     f'{python} data/download.py --mode all'),
    # Stage 3: Train univariate (SPY, 1993-2025)
    ('train_uni', 'Train univariate (SPY)',
     f'{python} experiments/run_main.py --config config/univariate.yaml --mode univariate --seed 42'),
    # Stage 4: Train multiasset (11 ETFs, 2007-2025)
    ('train_multi', 'Train multiasset (11 ETFs)',
     f'{python} experiments/run_main.py --config config/multiasset.yaml --mode multiasset --seed 42'),
    # Stage 5: Baselines (HMM, DMD, PCA, VIX threshold)
    ('baselines', 'Baselines',
     f'{python} experiments/run_baselines.py --config config/default.yaml'),
    # Stage 6: Robustness tests (CK, bootstrap, permutation, etc.)
    ('robustness', 'Robustness tests (univariate)',
     f'{python} experiments/run_robustness.py --config config/default.yaml --mode univariate'),
    # Stage 7: Rolling spectral analysis
    ('rolling', 'Rolling spectral analysis',
     f'{python} experiments/run_rolling.py --config config/default.yaml --mode univariate'),
    # Stage 8: Generate figures (using absolute paths)
    ('figures', 'Generate figures',
     f'{python} experiments/run_figures.py --results-dir {RESULTS_DIR} --figures-dir {FIGURES_DIR}'),
]

# Run every stage in order; a failed stage is recorded but does not stop the
# ones after it. Filling the dict incrementally keeps partial results if the
# cell is interrupted.
results = {}
for key, label, command in stages:
    results[key] = run(label, command)

# --- Debug: list what the pipeline actually wrote to disk ---
print(f"\n{'='*70}")
print(f"  DEBUG: Files in {RESULTS_DIR}")
print(f"{'='*70}")
if not os.path.exists(RESULTS_DIR):
    print("    (directory does not exist)")
else:
    # One line per entry: size right-aligned with thousands separators, then name.
    for entry in sorted(os.listdir(RESULTS_DIR)):
        n_bytes = os.path.getsize(os.path.join(RESULTS_DIR, entry))
        print(f"    {n_bytes:>10,} bytes  {entry}")

print(f"\n  Files in {FIGURES_DIR}")
if not os.path.exists(FIGURES_DIR):
    print("    (directory does not exist)")
else:
    for entry in sorted(os.listdir(FIGURES_DIR)):
        print(f"    {entry}")
    # Supplemental figures live one level down; list them with a path prefix.
    sup_dir = os.path.join(FIGURES_DIR, "supplemental")
    if os.path.exists(sup_dir):
        for entry in sorted(os.listdir(sup_dir)):
            print(f"    supplemental/{entry}")

# --- Final report: per-stage status plus headline metrics ---
total_min = (time.time() - pipeline_start) / 60
n_total = len(results)
n_ok = sum(results.values())  # True counts as 1, so this is the number of passed stages

print(f"\n\n{'='*70}")
print(f"  PIPELINE COMPLETE: {n_ok}/{n_total} stages passed ({total_min:.1f} min total)")
print(f"{'='*70}")
for stage_name, passed in results.items():
    flag = 'OK' if passed else 'FAIL'
    print(f"  {flag:6s}  {stage_name}")

# Print key metrics (only when the analysis results file exists)
rpath = os.path.join(RESULTS_DIR, 'analysis_results.json')
if os.path.exists(rpath):
    with open(rpath) as f:
        r = json.load(f)
    lookup = r.get  # missing keys fall back to a placeholder instead of raising

    entropy = lookup('entropy_empirical', 'N/A')
    ci_lower = lookup('entropy_ci_lower', '?')
    ci_upper = lookup('entropy_ci_upper', '?')
    n_complex = lookup('n_complex_modes', 'N/A')
    n_modes = lookup('n_modes', 'N/A')

    print(f"\n  Key Metrics:")
    print(f"    Spectral gap:         {lookup('spectral_gap', 'N/A')}")
    print(f"    Entropy (empirical):  {entropy} [{ci_lower}, {ci_upper}] 95% CI")
    print(f"    Mean irreversibility: {lookup('mean_irreversibility', 'N/A')}")
    print(f"    Irrev method:         {lookup('irrev_method', 'N/A')}")
    print(f"    DB violation:         {lookup('detailed_balance_violation', 'N/A')}")
    print(f"    Complex modes:        {n_complex}/{n_modes}")
    print(f"    FT ratio:             {lookup('fluctuation_theorem_ratio', 'N/A')}")
print(f"{'='*70}")


In [None]:
#@title 3. View figures (run after pipeline finishes)

import glob, os, json
import numpy as np
import pandas as pd
from IPython.display import Image, display

REPO_DIR = "/content/ktnd_finance"
RESULTS_DIR = os.path.join(REPO_DIR, "outputs", "results")
FIGURES_DIR = os.path.join(REPO_DIR, "outputs", "figures")


def _show_figures(paths):
    """Print a count line, then a header and inline image for each PNG in ``paths``."""
    print(f"Generated {len(paths)} figures:\n")
    for p in paths:
        print(f"--- {os.path.basename(p)} ---")
        display(Image(filename=p, width=800))
        print()


# Check for existing figures
pngs = sorted(glob.glob(f"{FIGURES_DIR}/*.png"))
sup_pngs = sorted(glob.glob(f"{FIGURES_DIR}/supplemental/*.png"))
all_pngs = pngs + sup_pngs

if all_pngs:
    _show_figures(all_pngs)
else:
    # Figures weren't generated by run_figures.py — generate inline
    print("No pre-generated figures found. Generating inline from results...\n")
    import matplotlib
    matplotlib.use("Agg")  # render off-screen; figures are shown from the saved PNGs
    import matplotlib.pyplot as plt

    os.makedirs(FIGURES_DIR, exist_ok=True)
    generated = []

    def _save_figure(fig, filename):
        """Tighten layout, write ``fig`` into FIGURES_DIR at 300 dpi, close it, record the path."""
        fig.tight_layout()
        path = os.path.join(FIGURES_DIR, filename)
        fig.savefig(path, dpi=300, bbox_inches="tight")
        plt.close(fig)
        generated.append(path)

    # Load results. Bound to `analysis` rather than `results` so the
    # stage-status dict created by the pipeline cell is not overwritten.
    analysis_path = os.path.join(RESULTS_DIR, "analysis_results.json")
    if os.path.exists(analysis_path):
        with open(analysis_path) as f:
            analysis = json.load(f)

        # Fig 1: Eigenvalue spectrum
        eig_real = analysis.get("eigenvalues_real")
        eig_imag = analysis.get("eigenvalues_imag")
        if eig_real and eig_imag:
            eig_real, eig_imag = np.array(eig_real), np.array(eig_imag)
            magnitudes = np.sqrt(eig_real**2 + eig_imag**2)
            fig, ax = plt.subplots(1, 1, figsize=(7, 7))
            # Unit circle for reference.
            theta = np.linspace(0, 2*np.pi, 300)
            ax.plot(np.cos(theta), np.sin(theta), "k--", lw=0.8, alpha=0.5, label="|$\\lambda$|=1")
            sc = ax.scatter(eig_real, eig_imag, c=magnitudes, cmap="viridis",
                            edgecolors="k", linewidths=0.4, s=80, zorder=3)
            plt.colorbar(sc, ax=ax, label="|$\\lambda$|")
            # Label the (up to) five largest-magnitude eigenvalues by rank.
            order = np.argsort(-magnitudes)
            for rank, idx in enumerate(order[:5]):
                ax.annotate(f"$\\lambda_{{{rank}}}$", (eig_real[idx], eig_imag[idx]),
                            textcoords="offset points", xytext=(8, 8), fontsize=9)
            ax.set_xlabel("Re($\\lambda$)")
            ax.set_ylabel("Im($\\lambda$)")
            ax.set_title("Koopman Eigenvalue Spectrum")
            ax.set_aspect("equal")
            ax.legend(loc="upper left")
            ax.grid(True, alpha=0.3)
            _save_figure(fig, "fig1_eigenvalue_spectrum.png")

        # Fig: Singular values
        svs = analysis.get("singular_values")
        if svs:
            svs = np.array(svs)
            fig, ax = plt.subplots(figsize=(8, 5))
            ax.bar(range(len(svs)), svs, color="steelblue", edgecolor="navy", lw=0.3)
            ax.axhline(y=1.0, color="red", ls="--", lw=0.8)
            ax.set_xlabel("Mode $k$")
            ax.set_ylabel("$\\sigma_k$")
            ax.set_title("Koopman Singular Value Spectrum")
            ax.grid(True, alpha=0.3, axis="y")
            _save_figure(fig, "fig_singular_values.png")
    else:
        print(f"  WARNING: {analysis_path} not found")

    # Fig: Eigenvalues bar chart
    eig_csv = os.path.join(RESULTS_DIR, "eigenvalues.csv")
    if os.path.exists(eig_csv):
        edf = pd.read_csv(eig_csv)
        if "magnitude" in edf.columns:
            fig, ax = plt.subplots(figsize=(8, 5))
            ax.bar(range(len(edf)), edf["magnitude"], color="steelblue", edgecolor="navy", lw=0.3)
            ax.axhline(y=1.0, color="red", ls="--", lw=0.8)
            ax.set_xlabel("Mode $k$")
            ax.set_ylabel("|$\\lambda_k$|")
            ax.set_title("Eigenvalue Magnitudes")
            ax.grid(True, alpha=0.3, axis="y")
            _save_figure(fig, "fig_eigenvalue_magnitudes.png")

    # Fig: Entropy decomposition
    entropy_csv = os.path.join(RESULTS_DIR, "entropy_decomposition.csv")
    if os.path.exists(entropy_csv):
        edf = pd.read_csv(entropy_csv)
        if "entropy_production" in edf.columns and "entropy_fraction" in edf.columns:
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
            ax1.bar(edf["mode"], edf["entropy_production"], color="coral", edgecolor="darkred", lw=0.5)
            ax1.set_xlabel("Mode $k$")
            ax1.set_ylabel("$\\sigma_k$")
            ax1.set_title("Per-mode Entropy Production")
            ax1.grid(True, alpha=0.3, axis="y")
            ax2.bar(edf["mode"], np.cumsum(edf["entropy_fraction"]),
                    color="steelblue", edgecolor="navy", lw=0.5)
            ax2.set_xlabel("Mode $k$")
            ax2.set_ylabel("Cumulative Fraction")
            ax2.set_title("Cumulative Entropy Fraction")
            ax2.set_ylim(0, 1.05)
            ax2.grid(True, alpha=0.3, axis="y")
            fig.suptitle("Spectral Entropy Decomposition", fontsize=14, y=1.02)
            _save_figure(fig, "fig_entropy_decomposition.png")

    # Fig: Irreversibility field
    irrev_path = os.path.join(RESULTS_DIR, "irreversibility_field.npy")
    if os.path.exists(irrev_path):
        # NOTE(review): allow_pickle=True can execute arbitrary code from a
        # crafted file. Kept as-is because the file comes from this pipeline;
        # switch to allow_pickle=False if the array is a plain numeric one.
        irrev = np.load(irrev_path, allow_pickle=True)
        fig, ax = plt.subplots(figsize=(14, 4))
        ax.fill_between(range(len(irrev)), irrev, alpha=0.4, color="darkorange")
        ax.plot(irrev, lw=0.5, color="darkorange")
        ax.set_xlabel("Time index")
        ax.set_ylabel("$I(x)$")
        ax.set_title("Irreversibility Field")
        ax.grid(True, alpha=0.3)
        _save_figure(fig, "fig_irreversibility_field.png")

    # Fig: Rolling spectral gap vs VIX
    rolling_csv = os.path.join(RESULTS_DIR, "spectral_gap_timeseries.csv")
    if os.path.exists(rolling_csv):
        rdf = pd.read_csv(rolling_csv)
        if "spectral_gap" in rdf.columns:
            fig, ax1 = plt.subplots(figsize=(14, 5))
            # Fall back to a plain integer index when no date column is present.
            dates = pd.to_datetime(rdf["center_date"]) if "center_date" in rdf.columns else np.arange(len(rdf))
            ax1.plot(dates, rdf["spectral_gap"], color="steelblue", lw=1.0, label="Spectral Gap")
            ax1.set_xlabel("Date")
            ax1.set_ylabel("Spectral Gap", color="steelblue")
            handles, labels = ax1.get_legend_handles_labels()
            vix_file = os.path.join(REPO_DIR, "data", "vix.csv")
            if os.path.exists(vix_file):
                vdf = pd.read_csv(vix_file, index_col=0, parse_dates=True)
                vc = "Close" if "Close" in vdf.columns else vdf.columns[0]
                ax2 = ax1.twinx()
                ax2.plot(vdf.index, vdf[vc], color="firebrick", lw=0.8, alpha=0.6, label="VIX")
                ax2.set_ylabel("VIX", color="firebrick")
                # The VIX line lives on the twin axis, so its legend entry must
                # be merged in explicitly or it never appears.
                vix_handles, vix_labels = ax2.get_legend_handles_labels()
                handles = handles + vix_handles
                labels = labels + vix_labels
            ax1.set_title("Spectral Gap vs VIX")
            ax1.legend(handles, labels, loc="upper left")
            _save_figure(fig, "fig_spectral_gap_vix.png")

    # Fig: Baseline comparison
    baseline_csv = os.path.join(RESULTS_DIR, "baseline_comparison.csv")
    if os.path.exists(baseline_csv):
        bdf = pd.read_csv(baseline_csv)
        metrics = [m for m in ["nber_accuracy", "nber_f1", "nber_precision", "nber_recall"] if m in bdf.columns]
        if metrics and "method" in bdf.columns:
            fig, ax = plt.subplots(figsize=(10, 6))
            x = np.arange(len(bdf))
            w = 0.8 / len(metrics)  # grouped bars share 80% of each slot
            colors = ["steelblue", "coral", "seagreen", "orchid"]
            for i, m in enumerate(metrics):
                ax.bar(x + i*w, bdf[m].astype(float), w, label=m.replace("nber_", "").title(),
                       color=colors[i % len(colors)], edgecolor="black", lw=0.3)
            # Centre each tick under its group of bars.
            ax.set_xticks(x + w*(len(metrics)-1)/2)
            ax.set_xticklabels(bdf["method"], rotation=15, ha="right")
            ax.set_ylabel("Score")
            ax.set_title("Baseline Comparison")
            ax.legend()
            ax.set_ylim(0, 1.05)
            ax.grid(True, alpha=0.3, axis="y")
            _save_figure(fig, "fig_baseline_comparison.png")

    # Fig: Bootstrap CI
    stat_path = os.path.join(RESULTS_DIR, "statistical_tests.json")
    if os.path.exists(stat_path):
        with open(stat_path) as f:
            stat = json.load(f)
        bootstrap = stat.get("bootstrap_eigenvalue_ci", {})
        modes_data = bootstrap.get("modes", [])
        if modes_data:
            mode_idx = [m["mode"] for m in modes_data]
            means = np.array([m["mean_magnitude"] for m in modes_data])
            ci_lo = np.array([m["ci_lower"] for m in modes_data])
            ci_hi = np.array([m["ci_upper"] for m in modes_data])
            fig, ax = plt.subplots(figsize=(10, 5))
            # errorbar expects distances from the mean, not absolute bounds.
            ax.errorbar(mode_idx, means,
                        yerr=[means - ci_lo, ci_hi - means],
                        fmt="o", capsize=4, color="steelblue", markersize=6)
            ax.axhline(y=1.0, color="red", ls="--", lw=0.8, label="|$\\lambda$|=1")
            ax.set_xlabel("Mode $k$")
            ax.set_ylabel("|$\\lambda_k$|")
            ax.set_title("Bootstrap Eigenvalue CIs (95%)")
            ax.legend()
            ax.grid(True, alpha=0.3)
            _save_figure(fig, "fig_bootstrap_ci.png")

        # CK consistency
        ck = stat.get("chapman_kolmogorov", {})
        ck_errors = ck.get("ck_errors", [])
        if ck_errors:
            fig, ax = plt.subplots(figsize=(8, 5))
            ax.bar([e["n"] for e in ck_errors], [e["error"] for e in ck_errors],
                   color="steelblue", edgecolor="navy", lw=0.5)
            ax.set_xlabel("$n$ (multiples of $\\tau$)")
            ax.set_ylabel("Mean |$\\lambda$ error|")
            # Only apply the float format to a real number: formatting the
            # "N/A" fallback (or a JSON null) with :.4f would raise.
            p_value = ck.get("p_value")
            p_text = f"{p_value:.4f}" if isinstance(p_value, (int, float)) else "N/A"
            ax.set_title(f"Chapman-Kolmogorov Consistency (p={p_text})")
            ax.grid(True, alpha=0.3, axis="y")
            _save_figure(fig, "fig_chapman_kolmogorov.png")

    # Display all generated figures, or explain why there are none.
    if generated:
        _show_figures(generated)
    else:
        print("No result files found to generate figures from.")
        print(f"Check that pipeline stages completed and results exist in: {RESULTS_DIR}")
        if os.path.exists(RESULTS_DIR):
            print(f"Files found: {os.listdir(RESULTS_DIR)}")


In [None]:
#@title 4. Download all results as zip

!cd /content/ktnd_finance && zip -rq /content/ktnd_results.zip outputs/
from google.colab import files
files.download('/content/ktnd_results.zip')
print("Download started.")

In [None]:
#@title 5. Ablation study (~1-2 hours with 3 seeds) - RECOMMENDED for PRE submission

import subprocess, time, sys

python = sys.executable

# Announce what is about to run.
for line in (
    "Running 13 ablation variants x 3 seeds...",
    "This tests sensitivity to: architecture, n_modes, lag, embedding, dropout,",
    "window size, shared weights, loss components, standardization, linear features.\n",
):
    print(line)

# Launch the ablation script from the repo root and time it.
ablation_cmd = f'{python} experiments/run_ablations.py --config config/default.yaml --n-seeds 3 --n-jobs 1'
started = time.time()
proc = subprocess.run(ablation_cmd, shell=True, cwd="/content/ktnd_finance")
minutes = (time.time() - started) / 60
if proc.returncode == 0:
    status = 'OK'
else:
    status = 'FAILED'
print(f"\nAblations: {status} ({minutes:.1f} min)")

# Show summary (only if the ablation script wrote its summary CSV)
import pandas as pd, os
summary_path = "/content/ktnd_finance/outputs/results/ablation_summary.csv"
summary_cols = ['name', 'vamp2_mean', 'spectral_gap_mean', 'entropy_total_mean']
if os.path.exists(summary_path):
    df = pd.read_csv(summary_path)
    print(f"\n{len(df)} ablation variants completed:")
    print(df[summary_cols].to_string(index=False))
