# KTND-Finance: Full Experiment Pipeline

**Run everything in 2 cells:**
1. **Cell 1** (Setup) - Install deps + clone repo (~2 min)
2. **Cell 2** (Run All) - Downloads data, trains models, runs baselines/robustness/rolling, generates figures (~2-3 hours)

Set runtime to **GPU (T4)** before running: Runtime > Change runtime type > T4 GPU

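Then hit **Runtime > Run all** and walk away. Note that *Run all* also executes the optional cells after Cell 2 in order (figure viewer, zip download, then the ~1-2 hour ablation study), so the zip produced by Cell 4 is created before the Cell 5 ablation results exist.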

In [None]:
#@title 1. Setup (install + clone + verify) - ~2 min

# Install missing dependencies (torch/numpy/pandas/scipy/sklearn/matplotlib are pre-installed)
!pip install -q yfinance>=1.0.0 hmmlearn>=0.3.0 statsmodels>=0.14.0 arch>=6.0.0 pyyaml>=6.0

# Clone repo
import os, sys
REPO_URL = "https://github.com/keshavkrishnan08/kind_finance.git"
REPO_DIR = "/content/ktnd_finance"

if os.path.exists(REPO_DIR):
    !cd {REPO_DIR} && git pull
else:
    !git clone {REPO_URL} {REPO_DIR}

os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)

# Verify
import torch, numpy as np
from src.model.vampnet import NonEquilibriumVAMPNet
print(f"Python {sys.version.split()[0]} | PyTorch {torch.__version__} | "
      f"CUDA: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print("Setup complete.")

In [None]:
#@title 2. Run ENTIRE experiment pipeline (~2-3 hours) - just run this and walk away

import subprocess, time, json, os, sys, glob

REPO_DIR = "/content/ktnd_finance"
OUTPUT_DIR = "/content/ktnd_finance/outputs"
RESULTS_DIR = "/content/ktnd_finance/outputs/results"
MODELS_DIR = "/content/ktnd_finance/outputs/models"
FIGURES_DIR = "/content/ktnd_finance/outputs/figures"
DATA_DIR = "/content/ktnd_finance/data"

# Check for the checkout BEFORE creating the output directories. They live
# inside REPO_DIR, so os.makedirs would otherwise create REPO_DIR itself: the
# "Repo dir exists" line below would then always print True, and the setup cell
# would later find the directory and run `git pull` in it instead of cloning.
if not os.path.isdir(REPO_DIR):
    raise FileNotFoundError(
        f"{REPO_DIR} does not exist - run Cell 1 (Setup) before this cell.")

for d in [OUTPUT_DIR, RESULTS_DIR, MODELS_DIR, FIGURES_DIR]:
    os.makedirs(d, exist_ok=True)

os.chdir(REPO_DIR)
# Interpreter used to launch every stage, so subprocesses share this kernel's environment.
python = sys.executable

# Verify paths exist
print(f"Python: {python}")
print(f"CWD: {os.getcwd()}")
print(f"Repo dir exists: {os.path.exists(REPO_DIR)}")
print(f"src/ exists: {os.path.isdir(os.path.join(REPO_DIR, 'src'))}")
print(f"experiments/ exists: {os.path.isdir(os.path.join(REPO_DIR, 'experiments'))}")
print(f"config/ exists: {os.path.isdir(os.path.join(REPO_DIR, 'config'))}")

def run(name, cmd, check_files=None):
    """Execute one pipeline stage through the shell and report the outcome.

    The command runs with ``REPO_DIR`` as working directory and its output is
    captured, so everything is printed only after the command has finished.

    Args:
        name: Human-readable stage title used in the printed banner and verdict.
        cmd: Shell command line to execute.
        check_files: Optional list of paths that must exist once the command
            has succeeded.

    Returns:
        True when the command exits with code 0 and every path in
        ``check_files`` exists; False otherwise.
    """
    rule = "=" * 70

    def emit(text, prefix, last=None):
        # Print `text` line by line behind `prefix`; `last` keeps only the tail.
        rows = text.strip().split('\n')
        if last is not None:
            rows = rows[-last:]
        for row in rows:
            print(f"{prefix}{row}")

    print(f"\n{rule}")
    print(f"  STAGE: {name}")
    print(f"  CMD: {cmd}")
    print(rule)

    started = time.time()
    proc = subprocess.run(cmd, shell=True, cwd=REPO_DIR,
                          capture_output=True, text=True)
    minutes = (time.time() - started) / 60

    # Complete stdout, never truncated.
    if proc.stdout:
        emit(proc.stdout, "  ")

    # Non-zero exit: dump all of stderr and give up on this stage.
    if proc.returncode != 0:
        print("\n  === STDERR ===")
        if proc.stderr:
            emit(proc.stderr, "  ! ")
        print(f"  >> {name}: FAILED (exit code {proc.returncode}, {minutes:.1f} min)")
        return False

    # On success only surface stderr lines that look like real errors, so
    # ordinary INFO/WARNING logging stays out of the report.
    if proc.stderr:
        markers = ('Error', 'Exception', 'Traceback')
        flagged = [row for row in proc.stderr.strip().split('\n')
                   if any(marker in row for marker in markers)]
        if flagged:
            print("  === STDERR (errors) ===")
            for row in flagged:
                print(f"  ! {row}")

    # Confirm the stage produced what it was supposed to produce.
    expected = check_files or []
    absent = [path for path in expected if not os.path.exists(path)]
    if absent:
        print("  WARNING: Missing expected output files:")
        for path in absent:
            print(f"    MISSING: {path}")
        # Tail of stderr (last 30 lines) as a debugging aid.
        if proc.stderr:
            print("  === FULL STDERR ===")
            emit(proc.stderr, "  ! ", last=30)
        print(f"  >> {name}: INCOMPLETE ({minutes:.1f} min)")
        return False

    for path in expected:
        size = os.path.getsize(path)
        print(f"  OK: {os.path.basename(path)} ({size:,} bytes)")

    print(f"  >> {name}: OK ({minutes:.1f} min)")
    return True

pipeline_start = time.time()
results = {}


def _experiment(script, *args):
    """Build the shell command that runs experiments/<script> with the given CLI chunks."""
    return " ".join([python, f"{REPO_DIR}/experiments/{script}", *args])


# Checkpoint written by the univariate training stage and reused by later stages.
_univariate_ckpt = f"{MODELS_DIR}/vampnet_univariate.pt"

# One row per stage, executed top to bottom:
# (key in `results`, printed title, shell command, files expected afterwards or None)
_stages = [
    # --- Stage 1: Quick tests ---
    ("tests", "Quick tests",
     f'{python} -m pytest tests/ -q --tb=short -k "not test_synthetic"',
     None),
    # --- Stage 2: Download data ---
    ("download", "Download data",
     f"{python} {REPO_DIR}/data/download.py --mode all",
     [f"{DATA_DIR}/prices.csv", f"{DATA_DIR}/vix.csv"]),
    # --- Stage 3: Train univariate (explicit --output-dir) ---
    ("train_uni", "Train univariate (SPY)",
     _experiment("run_main.py",
                 "--config config/univariate.yaml", "--mode univariate",
                 "--seed 42", f"--output-dir {OUTPUT_DIR}"),
     [
         f"{RESULTS_DIR}/analysis_results.json",
         f"{RESULTS_DIR}/eigenvalues.csv",
         f"{RESULTS_DIR}/entropy_decomposition.csv",
         f"{RESULTS_DIR}/irreversibility_field.npy",
         _univariate_ckpt,
     ]),
    # --- Stage 4: Train multiasset (explicit --output-dir) ---
    ("train_multi", "Train multiasset (11 ETFs)",
     _experiment("run_main.py",
                 "--config config/multiasset.yaml", "--mode multiasset",
                 "--seed 42", f"--output-dir {OUTPUT_DIR}"),
     [f"{RESULTS_DIR}/analysis_results.json",
      f"{MODELS_DIR}/vampnet_multiasset.pt"]),
    # --- Stage 5: Baselines (explicit --output-dir) ---
    ("baselines", "Baselines",
     _experiment("run_baselines.py",
                 "--config config/default.yaml",
                 f"--output-dir {RESULTS_DIR}"),
     [f"{RESULTS_DIR}/baseline_comparison.csv"]),
    # --- Stage 6: Robustness (explicit paths) ---
    ("robustness", "Robustness tests",
     _experiment("run_robustness.py",
                 "--config config/default.yaml", "--mode univariate",
                 f"--checkpoint {_univariate_ckpt}",
                 f"--output-dir {RESULTS_DIR}"),
     [f"{RESULTS_DIR}/statistical_tests.json"]),
    # --- Stage 7: Rolling (explicit paths) ---
    ("rolling", "Rolling spectral analysis",
     _experiment("run_rolling.py",
                 "--config config/default.yaml", "--mode univariate",
                 f"--checkpoint {_univariate_ckpt}",
                 f"--output-dir {RESULTS_DIR}"),
     [f"{RESULTS_DIR}/spectral_gap_timeseries.csv"]),
    # --- Stage 8: Generate figures via run_figures.py ---
    ("figures", "Generate figures (script)",
     _experiment("run_figures.py",
                 f"--results-dir {RESULTS_DIR}",
                 f"--figures-dir {FIGURES_DIR}"),
     None),
]

# A failed stage does not stop the pipeline; its False verdict is simply recorded.
for _key, _title, _command, _expected in _stages:
    results[_key] = run(_title, _command, check_files=_expected)

# --- Stage 9: Generate figures INLINE as fallback ---
_rule = "=" * 70
print(f"\n{_rule}")
print("  GENERATING FIGURES INLINE (FALLBACK)")
print(_rule)

import matplotlib
matplotlib.use("Agg")  # file-only backend: figures are saved, never shown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Running count of figures written by the inline fallback.
n_figs = 0
analysis_path = f"{RESULTS_DIR}/analysis_results.json"

if not os.path.exists(analysis_path):
    print(f"  analysis_results.json NOT FOUND at {analysis_path}")
else:
    with open(analysis_path) as handle:
        analysis = json.load(handle)

    # Eigenvalue spectrum: points in the complex plane against the unit circle.
    real_part = analysis.get("eigenvalues_real")
    imag_part = analysis.get("eigenvalues_imag")
    if real_part and imag_part:
        real_part = np.array(real_part)
        imag_part = np.array(imag_part)
        magnitude = np.sqrt(real_part**2 + imag_part**2)

        fig, ax = plt.subplots(figsize=(7, 7))
        theta = np.linspace(0, 2*np.pi, 300)
        ax.plot(np.cos(theta), np.sin(theta), "k--", lw=0.8, alpha=0.5)
        points = ax.scatter(real_part, imag_part, c=magnitude, cmap="viridis",
                            edgecolors="k", linewidths=0.4, s=80, zorder=3)
        plt.colorbar(points, ax=ax, label="|$\\lambda$|")
        # Label the five eigenvalues of largest magnitude, ranked from 0.
        for rank, pos in enumerate(np.argsort(-magnitude)[:5]):
            ax.annotate(f"$\\lambda_{rank}$", (real_part[pos], imag_part[pos]),
                        textcoords="offset points", xytext=(8, 8), fontsize=9)
        ax.set_xlabel("Re($\\lambda$)")
        ax.set_ylabel("Im($\\lambda$)")
        ax.set_title("Koopman Eigenvalue Spectrum")
        ax.set_aspect("equal")
        ax.grid(True, alpha=0.3)
        fig.savefig(f"{FIGURES_DIR}/fig1_eigenvalue_spectrum.png", dpi=300, bbox_inches="tight")
        plt.close(fig)
        n_figs += 1

    # Singular values as a bar chart with a reference line at 1.0.
    singular_values = analysis.get("singular_values")
    if singular_values:
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.bar(range(len(singular_values)), singular_values,
               color="steelblue", edgecolor="navy", lw=0.3)
        ax.axhline(y=1.0, color="red", ls="--", lw=0.8)
        ax.set_xlabel("Mode")
        ax.set_ylabel("$\\sigma_k$")
        ax.set_title("Singular Value Spectrum")
        ax.grid(True, alpha=0.3, axis="y")
        fig.savefig(f"{FIGURES_DIR}/fig_singular_values.png", dpi=300, bbox_inches="tight")
        plt.close(fig)
        n_figs += 1

# One bar chart per results CSV: (file name, plot title, column plotted on the y axis).
# A file that is missing, or that lacks the y column, is skipped silently.
for csv_name, title, ycol in [
    ("eigenvalues.csv", "Eigenvalue Magnitudes", "magnitude"),
    ("entropy_decomposition.csv", "Entropy Decomposition", "entropy_production"),
]:
    p = f"{RESULTS_DIR}/{csv_name}"
    if not os.path.exists(p):
        continue
    df = pd.read_csv(p)
    if ycol not in df.columns:
        continue
    # Use the "mode" column for the x axis when the file has one; otherwise fall
    # back to the row position instead of raising KeyError, which would abort
    # the rest of the cell (remaining figures and the final report).
    modes = df["mode"] if "mode" in df.columns else range(len(df))
    fig, ax = plt.subplots(figsize=(8,5))
    ax.bar(modes, df[ycol], color="coral" if "entropy" in csv_name else "steelblue", edgecolor="black", lw=0.3)
    ax.set_xlabel("Mode")
    ax.set_ylabel(ycol)
    ax.set_title(title)
    ax.grid(True, alpha=0.3, axis="y")
    fig.savefig(f"{FIGURES_DIR}/fig_{csv_name.replace('.csv','')}.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
    n_figs += 1

# Irreversibility field: filled line plot of the saved array against its index.
irp = f"{RESULTS_DIR}/irreversibility_field.npy"
if os.path.exists(irp):
    # NOTE(review): allow_pickle=True is kept as-is; the file is expected to come
    # from this pipeline's own training stage, not from an external source.
    field = np.load(irp, allow_pickle=True)
    fig, ax = plt.subplots(figsize=(14, 4))
    ax.fill_between(range(len(field)), field, alpha=0.4, color="darkorange")
    ax.plot(field, lw=0.5, color="darkorange")
    ax.set_xlabel("Time")
    ax.set_ylabel("$I(x)$")
    ax.set_title("Irreversibility Field")
    ax.grid(True, alpha=0.3)
    fig.savefig(f"{FIGURES_DIR}/fig_irreversibility_field.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
    n_figs += 1

# Rolling spectral gap over time, read from the rolling-analysis CSV.
rcp = f"{RESULTS_DIR}/spectral_gap_timeseries.csv"
if os.path.exists(rcp):
    rolling = pd.read_csv(rcp)
    if "spectral_gap" in rolling.columns:
        # Prefer real dates on the x axis; fall back to the row position.
        if "center_date" in rolling.columns:
            x = pd.to_datetime(rolling["center_date"])
        else:
            x = range(len(rolling))
        fig, ax = plt.subplots(figsize=(14, 5))
        ax.plot(x, rolling["spectral_gap"], color="steelblue", lw=1.0)
        ax.set_xlabel("Date")
        ax.set_ylabel("Spectral Gap")
        ax.set_title("Rolling Spectral Gap")
        ax.grid(True, alpha=0.3)
        fig.savefig(f"{FIGURES_DIR}/fig_spectral_gap.png", dpi=300, bbox_inches="tight")
        plt.close(fig)
        n_figs += 1

# Grouped bar chart: one group per baseline method, one bar per available NBER metric.
bcp = f"{RESULTS_DIR}/baseline_comparison.csv"
if os.path.exists(bcp):
    baselines = pd.read_csv(bcp)
    wanted = ["nber_accuracy", "nber_f1", "nber_precision", "nber_recall"]
    metrics = [col for col in wanted if col in baselines.columns]
    if metrics and "method" in baselines.columns:
        palette = ["steelblue", "coral", "seagreen", "orchid"]
        fig, ax = plt.subplots(figsize=(10, 6))
        x = np.arange(len(baselines))
        w = 0.8/len(metrics)  # the bars of one group share 0.8 of a unit
        for i, metric in enumerate(metrics):
            ax.bar(x+i*w, baselines[metric].astype(float), w,
                   label=metric.replace("nber_", "").title(),
                   color=palette[i%4], edgecolor="black", lw=0.3)
        # Centre each tick under its group of bars.
        ax.set_xticks(x+w*(len(metrics)-1)/2)
        ax.set_xticklabels(baselines["method"], rotation=15, ha="right")
        ax.set_ylabel("Score")
        ax.set_title("Baseline Comparison")
        ax.legend()
        ax.set_ylim(0, 1.05)
        fig.savefig(f"{FIGURES_DIR}/fig_baseline_comparison.png", dpi=300, bbox_inches="tight")
        plt.close(fig)
        n_figs += 1

print(f"  Generated {n_figs} figures inline")

# --- List ALL output files ---
# Walk the whole output tree and print each file's size and its path relative to OUTPUT_DIR.
_listing_rule = "=" * 70
print(f"\n{_listing_rule}")
print("  ALL OUTPUT FILES")
print(_listing_rule)
for folder, _subfolders, names in os.walk(OUTPUT_DIR):
    for entry in sorted(names):
        full_path = os.path.join(folder, entry)
        n_bytes = os.path.getsize(full_path)
        shown_path = os.path.relpath(full_path, OUTPUT_DIR)
        print(f"  {n_bytes:>10,} bytes  {shown_path}")

# --- Final report ---
# Per-stage verdicts, followed by headline metrics from analysis_results.json when present.
total_min = (time.time() - pipeline_start) / 60
n_ok = sum(results.values())  # verdicts are booleans, so the sum counts passes
n_total = len(results)
_report_rule = "=" * 70

print(f"\n{_report_rule}")
print(f"  PIPELINE COMPLETE: {n_ok}/{n_total} stages passed ({total_min:.1f} min total)")
print(_report_rule)
for stage_name, passed in results.items():
    verdict = 'OK' if passed else 'FAIL'
    print(f"  {verdict:6s}  {stage_name}")

if not os.path.exists(analysis_path):
    print("\n  WARNING: No analysis_results.json found!")
    print("  Check STDERR output above for the actual error.")
else:
    with open(analysis_path) as handle:
        metrics = json.load(handle)
    # Missing keys show up as 'N/A' (or '?' for the CI bounds) rather than failing.
    metric_lines = [
        f"    Spectral gap:         {metrics.get('spectral_gap', 'N/A')}",
        f"    Entropy (empirical):  {metrics.get('entropy_empirical', 'N/A')} "
        f"[{metrics.get('entropy_ci_lower', '?')}, {metrics.get('entropy_ci_upper', '?')}] 95% CI",
        f"    Mean irreversibility: {metrics.get('mean_irreversibility', 'N/A')}",
        f"    Irrev method:         {metrics.get('irrev_method', 'N/A')}",
        f"    DB violation:         {metrics.get('detailed_balance_violation', 'N/A')}",
        f"    Complex modes:        {metrics.get('n_complex_modes', 'N/A')}/{metrics.get('n_modes', 'N/A')}",
        f"    FT ratio:             {metrics.get('fluctuation_theorem_ratio', 'N/A')}",
    ]
    print("\n  Key Metrics:")
    for metric_line in metric_lines:
        print(metric_line)
print(_report_rule)


In [None]:
#@title 3. View figures (run after Cell 2 finishes)

import glob, os
from IPython.display import Image, display

FIGURES_DIR = "/content/ktnd_finance/outputs/figures"

# Main figures first, then anything in the optional "supplemental" subfolder.
pngs = sorted(glob.glob(f"{FIGURES_DIR}/*.png"))
sup_dir = os.path.join(FIGURES_DIR, "supplemental")
if os.path.exists(sup_dir):
    pngs.extend(sorted(glob.glob(f"{sup_dir}/*.png")))

if not pngs:
    # Nothing to show: report what was checked and what result files exist.
    print("No figures found. Make sure Cell 2 has finished running first.")
    print(f"Checked: {FIGURES_DIR}")
    results_dir = "/content/ktnd_finance/outputs/results"
    if not os.path.exists(results_dir):
        print("No results directory found - Cell 2 needs to run first.")
    else:
        available = os.listdir(results_dir)
        print(f"Result files available ({len(available)}): {available}")
else:
    print(f"Found {len(pngs)} figures:\n")
    for png_path in pngs:
        print(f"--- {os.path.basename(png_path)} ---")
        display(Image(filename=png_path, width=800))
        print()


In [None]:
#@title 4. Download all results as zip

# Zip the outputs/ directory, addressed relative to the repo root, into
# /content/ktnd_results.zip. The zip is built from whatever outputs/ contains
# at the moment this cell runs.
!cd /content/ktnd_finance && zip -rq /content/ktnd_results.zip outputs/
# Colab-only helper module; this import is unavailable outside Google Colab.
from google.colab import files
# Hand the archive to Colab's download helper.
files.download('/content/ktnd_results.zip')
print("Download started.")

In [None]:
#@title 5. Ablation study (~1-2 hours with 3 seeds) - RECOMMENDED for PRE submission

import subprocess, time, sys

python = sys.executable
for _intro_line in (
    "Running 13 ablation variants x 3 seeds...",
    "This tests sensitivity to: architecture, n_modes, lag, embedding, dropout,",
    "window size, shared weights, loss components, standardization, linear features.\n",
):
    print(_intro_line)

# Output is not captured here, so the ablation script's output goes to the cell as it runs.
ablation_cmd = f'{python} experiments/run_ablations.py --config config/default.yaml --n-seeds 3 --n-jobs 1'
started = time.time()
proc = subprocess.run(ablation_cmd, shell=True, cwd="/content/ktnd_finance")
minutes = (time.time() - started) / 60
if proc.returncode == 0:
    outcome = 'OK'
else:
    outcome = 'FAILED'
print(f"\nAblations: {outcome} ({minutes:.1f} min)")

# Show summary (only if the summary CSV exists, whatever the exit status was)
import pandas as pd, os
summary_path = "/content/ktnd_finance/outputs/results/ablation_summary.csv"
if os.path.exists(summary_path):
    summary = pd.read_csv(summary_path)
    shown_columns = ['name', 'vamp2_mean', 'spectral_gap_mean', 'entropy_total_mean']
    print(f"\n{len(summary)} ablation variants completed:")
    print(summary[shown_columns].to_string(index=False))
