# KTND-Finance v1.6.0

1. **Cell 1** -- Setup (~2 min)
2. **Cell 2** -- Full pipeline + multi-seed (~2-3 hours)
3. **Cell 3** -- Ablations (~6 hours, resume-safe)
4. **Cell 4** -- View figures
5. **Cell 5** -- Download zip

**GPU required**: Runtime -> Change runtime type -> T4 GPU

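**Resume**: If Colab disconnects, set `RESUME_FROM` in Cell 2 to the stage you want to restart from. Earlier stages are skipped, but a stage whose expected output files are missing is re-run anyway: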
- `1` = start fresh (default)
- `4` = skip to multiasset (tests/download/univariate done)
- `5` = skip to baselines (both models trained)
- `7` = skip to robustness (baselines+rolling done)
- `10` = skip to entropy calibration
- `11` = skip to figures only
- `12` = skip to multi-seed only

In [None]:
#@title 1. Setup (install + clone + verify) - ~2 min

# Install missing dependencies (torch/numpy/pandas/scipy/sklearn/matplotlib are pre-installed)
!pip install -q yfinance>=1.0.0 hmmlearn>=0.3.0 statsmodels>=0.14.0 arch>=6.0.0 pyyaml>=6.0

# Clone repo
import os, sys
REPO_URL = "https://github.com/keshavkrishnan08/kind_finance.git"
REPO_DIR = "/content/ktnd_finance"

if os.path.exists(REPO_DIR):
    !cd {REPO_DIR} && git pull
else:
    !git clone {REPO_URL} {REPO_DIR}

os.chdir(REPO_DIR)
sys.path.insert(0, REPO_DIR)

# Verify
import torch, numpy as np
from src.model.vampnet import NonEquilibriumVAMPNet
print(f"Python {sys.version.split()[0]} | PyTorch {torch.__version__} | "
      f"CUDA: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print("Setup complete.")

In [None]:
#@title 2. Run pipeline + multi-seed + figures (~2-3 hours)

import subprocess, time, json, os, sys, glob

# ==========================================================================
# RESUME CONTROL -- change this to skip completed stages
# ==========================================================================
RESUME_FROM = 1  #@param {type:"integer"}
# 1=fresh, 4=multiasset, 5=baselines, 7=robustness, 10=entropy calibration,
# 11=figures, 12=multi-seed.
# Stages numbered below RESUME_FROM are skipped, but only when the output
# files listed for them already exist (see skip()).

# ==========================================================================
# PATHS
# ==========================================================================
REPO_DIR = "/content/ktnd_finance"
OUTPUT_DIR = "/content/ktnd_finance/outputs"
RESULTS_DIR = "/content/ktnd_finance/outputs/results"
MODELS_DIR = "/content/ktnd_finance/outputs/models"
FIGURES_DIR = "/content/ktnd_finance/outputs/figures"
DATA_DIR = "/content/ktnd_finance/data"

# Create the output tree up front so every stage can write into it.
for d in [OUTPUT_DIR, RESULTS_DIR, MODELS_DIR, FIGURES_DIR]:
    os.makedirs(d, exist_ok=True)

os.chdir(REPO_DIR)
# Interpreter used to launch every stage as a subprocess.
python = sys.executable

def run(name, cmd, check_files=None):
    """Execute one pipeline stage as a shell command and report the outcome.

    Output is captured and only its tail is echoed once the command finishes.

    Args:
        name: Stage label used in the banner and in the final status line.
        cmd: Command string, run through the shell with REPO_DIR as cwd.
        check_files: Optional paths that must exist after the command ran.

    Returns:
        True if the command exited with status 0 and no checked file is
        missing, otherwise False.
    """
    def echo_tail(text, count, prefix):
        # Print the last `count` lines of `text`, each behind `prefix`.
        for entry in text.strip().split('\n')[-count:]:
            print(f"{prefix}{entry}")

    rule = '=' * 70
    print(f"\n{rule}")
    print(f"  STAGE: {name}")
    print(f"{rule}")

    started = time.time()
    proc = subprocess.run(cmd, shell=True, cwd=REPO_DIR,
                          capture_output=True, text=True)
    minutes = (time.time() - started) / 60

    if proc.stdout:
        echo_tail(proc.stdout, 40, "  ")

    if proc.returncode != 0:
        print(f"\n  === STDERR (last 30 lines) ===")
        if proc.stderr:
            echo_tail(proc.stderr, 30, "  ! ")
        print(f"  >> {name}: FAILED (exit {proc.returncode}, {minutes:.1f} min)")
        return False

    if check_files:
        absent = [path for path in check_files if not os.path.exists(path)]
        if absent:
            for path in absent:
                print(f"    MISSING: {path}")
            # A zero exit code with missing outputs: stderr may explain why.
            if proc.stderr:
                echo_tail(proc.stderr, 15, "  ! ")
            print(f"  >> {name}: INCOMPLETE ({minutes:.1f} min)")
            return False
        for path in check_files:
            print(f"  OK: {os.path.basename(path)} ({os.path.getsize(path):,} bytes)")

    print(f"  >> {name}: OK ({minutes:.1f} min)")
    return True

def skip(stage_num, name, check_files=None):
    """Decide whether a stage may be skipped on resume.

    A stage is skipped only when its number is below the module-level
    RESUME_FROM and every path in ``check_files`` (if any) already exists.

    Returns:
        True if the stage is skipped, False if it has to run.
    """
    if stage_num < RESUME_FROM:
        absent = [f for f in (check_files or []) if not os.path.exists(f)]
        if not absent:
            print(f"  STAGE {stage_num} ({name}): SKIPPED (RESUME_FROM={RESUME_FROM})")
            return True
        # Earlier outputs are gone, so the stage has to be redone after all.
        print(f"\n  STAGE {stage_num} ({name}): Cannot skip -- missing files: {[os.path.basename(f) for f in absent]}")
    return False

pipeline_start = time.time()
results = {}  # stage key -> True/False, reported at the end of the cell

if RESUME_FROM > 1:
    print(f"RESUMING FROM STAGE {RESUME_FROM} (skipping stages 1-{RESUME_FROM-1})")

# Each stage below is recorded as True when it is skipped, otherwise as the
# outcome of actually running it.

# ======================================================================
# STAGE 1: Quick tests
# ======================================================================
results['tests'] = skip(1, 'Quick tests') or run(
    'Quick tests',
    f'{python} -m pytest tests/ -q --tb=short -k "not test_synthetic"')

# ======================================================================
# STAGE 2: Download data
# ======================================================================
raw_data_files = [f'{DATA_DIR}/prices.csv', f'{DATA_DIR}/vix.csv']
results['download'] = skip(2, 'Download data', raw_data_files) or run(
    'Download data',
    f'{python} {REPO_DIR}/data/download.py --mode all',
    check_files=raw_data_files)

# ======================================================================
# STAGE 3: Train univariate
# ======================================================================
# Files the univariate training run is expected to leave behind.
uni_files = [f'{RESULTS_DIR}/{fname}' for fname in (
    'analysis_results.json',
    'analysis_results_univariate.json',
    'eigenvalues.csv',
    'entropy_decomposition.csv',
    'irreversibility_field.npy',
)]
uni_files.append(f'{MODELS_DIR}/vampnet_univariate.pt')
uni_files.append(f'{RESULTS_DIR}/training_history_univariate.json')

results['train_uni'] = skip(3, 'Train univariate', uni_files) or run(
    'Train univariate (SPY)',
    f'{python} {REPO_DIR}/experiments/run_main.py'
    f' --config config/default.yaml --mode univariate --seed 42'
    f' --output-dir {OUTPUT_DIR}',
    check_files=uni_files)

# ======================================================================
# STAGE 4: Train multiasset
# ======================================================================
# Files the multiasset training run is expected to leave behind.
multi_files = [
    f'{RESULTS_DIR}/analysis_results_multiasset.json',
    f'{MODELS_DIR}/vampnet_multiasset.pt',
    f'{RESULTS_DIR}/training_history_multiasset.json',
]

results['train_multi'] = skip(4, 'Train multiasset', multi_files) or run(
    'Train multiasset (11 ETFs)',
    f'{python} {REPO_DIR}/experiments/run_main.py'
    f' --config config/default.yaml --mode multiasset --seed 42'
    f' --output-dir {OUTPUT_DIR}',
    check_files=multi_files)

# Stages 5-10 share one pattern: a stage counts as done when skip() accepts
# its output file(s); otherwise it is executed and verified by run().

# ======================================================================
# STAGE 5: Baselines
# ======================================================================
baseline_files = [f'{RESULTS_DIR}/baseline_comparison.csv']
results['baselines'] = skip(5, 'Baselines', baseline_files) or run(
    'Baselines',
    f'{python} {REPO_DIR}/experiments/run_baselines.py'
    f' --config config/default.yaml --output-dir {RESULTS_DIR}',
    check_files=baseline_files)

# ======================================================================
# STAGE 6: Rolling
# ======================================================================
rolling_files = [f'{RESULTS_DIR}/spectral_gap_timeseries.csv']
results['rolling'] = skip(6, 'Rolling', rolling_files) or run(
    'Rolling spectral analysis',
    f'{python} {REPO_DIR}/experiments/run_rolling.py'
    f' --config config/default.yaml --mode univariate'
    f' --checkpoint {MODELS_DIR}/vampnet_univariate.pt'
    f' --output-dir {RESULTS_DIR}',
    check_files=rolling_files)

# ======================================================================
# STAGE 7: Robustness univariate (IAAFT)
# ======================================================================
robust_uni_files = [f'{RESULTS_DIR}/statistical_tests.json']
results['robustness_uni'] = skip(7, 'Robustness univariate', robust_uni_files) or run(
    'Robustness (univariate, IAAFT)',
    f'{python} {REPO_DIR}/experiments/run_robustness.py'
    f' --config config/default.yaml --mode univariate'
    f' --checkpoint {MODELS_DIR}/vampnet_univariate.pt'
    f' --output-dir {RESULTS_DIR}',
    check_files=robust_uni_files)

# ======================================================================
# STAGE 8: Robustness multiasset (IAAFT)
# ======================================================================
robust_multi_files = [f'{RESULTS_DIR}/statistical_tests_multiasset.json']
results['robustness_multi'] = skip(8, 'Robustness multiasset', robust_multi_files) or run(
    'Robustness (multiasset, IAAFT)',
    f'{python} {REPO_DIR}/experiments/run_robustness.py'
    f' --config config/default.yaml --mode multiasset'
    f' --checkpoint {MODELS_DIR}/vampnet_multiasset.pt'
    f' --output-dir {RESULTS_DIR}',
    check_files=robust_multi_files)

# ======================================================================
# STAGE 9: Walk-forward cross-validation
# ======================================================================
# Both modes belong to stage 9; each gets its own entry in `results`.
for mode_tag in ["univariate", "multiasset"]:
    cv_file = f'{RESULTS_DIR}/cv_results_{mode_tag}.json'
    results[f'cv_{mode_tag}'] = skip(9, f'CV {mode_tag}', [cv_file]) or run(
        f'Walk-forward CV ({mode_tag})',
        f'{python} {REPO_DIR}/experiments/run_cv.py'
        f' --config config/default.yaml --mode {mode_tag} --n-folds 5'
        f' --output-dir {RESULTS_DIR}',
        check_files=[cv_file])

# ======================================================================
# STAGE 10: Entropy calibration
# ======================================================================
entropy_cal_files = [f'{RESULTS_DIR}/entropy_calibration.json']
results['entropy_cal'] = skip(10, 'Entropy calibration', entropy_cal_files) or run(
    'Entropy calibration (Brownian gyrator)',
    f'{python} {REPO_DIR}/experiments/run_entropy_calibration.py'
    f' --output-dir {RESULTS_DIR} --n-steps 50000',
    check_files=entropy_cal_files)

# ======================================================================
# STAGE 11: Figures
# ======================================================================
if skip(11, 'Figures'):
    results['figures'] = True
else:
    results['figures'] = run('Generate figures',
        f'{python} {REPO_DIR}/experiments/run_figures.py'
        f' --results-dir {RESULTS_DIR} --figures-dir {FIGURES_DIR}')

    # Inline fallback figures: drawn straight from the result files, whether or
    # not run_figures.py succeeded. A figure is skipped when its input file or
    # the column it plots is absent.
    import matplotlib
    matplotlib.use("Agg")  # file-only backend; figures are saved, not shown
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd

    n_figs = 0

    # --- Eigenvalue spectrum in the complex plane, one figure per mode -----
    for mode_tag, label in [("univariate", "Univariate (SPY)"), ("multiasset", "Multiasset")]:
        ap = f"{RESULTS_DIR}/analysis_results_{mode_tag}.json"
        if not os.path.exists(ap):
            continue
        with open(ap) as f:
            ar = json.load(f)
        er, ei = ar.get("eigenvalues_real"), ar.get("eigenvalues_imag")
        if er and ei:
            er, ei = np.array(er), np.array(ei)
            mags = np.sqrt(er**2 + ei**2)
            fig, ax = plt.subplots(figsize=(7,7))
            # Dashed unit circle as a visual reference.
            th = np.linspace(0, 2*np.pi, 300)
            ax.plot(np.cos(th), np.sin(th), "k--", lw=0.8, alpha=0.5)
            sc = ax.scatter(er, ei, c=mags, cmap="viridis", edgecolors="k", linewidths=0.4, s=80, zorder=3)
            plt.colorbar(sc, ax=ax, label="|$\\lambda$|")
            # Label the five eigenvalues of largest magnitude.
            for i, idx in enumerate(np.argsort(-mags)[:5]):
                ax.annotate(f"$\\lambda_{i}$", (er[idx], ei[idx]), textcoords="offset points", xytext=(8,8), fontsize=9)
            ax.set_xlabel("Re($\\lambda$)")
            ax.set_ylabel("Im($\\lambda$)")
            ax.set_title(f"Koopman Eigenvalue Spectrum -- {label}")
            ax.set_aspect("equal")
            ax.grid(True, alpha=0.3)
            fig.savefig(f"{FIGURES_DIR}/fig1_eigenvalue_spectrum_{mode_tag}.png", dpi=300, bbox_inches="tight")
            plt.close(fig)
            n_figs += 1

    # --- Per-mode bar charts from the CSV tables ---------------------------
    for csv_name, title, ycol in [
        ("eigenvalues.csv", "Eigenvalue Magnitudes", "magnitude"),
        ("entropy_decomposition.csv", "Entropy Decomposition", "entropy_production"),
    ]:
        p = f"{RESULTS_DIR}/{csv_name}"
        if os.path.exists(p):
            df = pd.read_csv(p)
            if ycol in df.columns:
                # Only `ycol` is checked above. Without a "mode" column the
                # lookup used to raise KeyError and abort the whole cell, so
                # fall back to the row position, as the rolling plot below
                # already does for "center_date".
                bar_x = df["mode"] if "mode" in df.columns else np.arange(len(df))
                fig, ax = plt.subplots(figsize=(8,5))
                ax.bar(bar_x, df[ycol], color="coral" if "entropy" in csv_name else "steelblue", edgecolor="black", lw=0.3)
                ax.set_xlabel("Mode")
                ax.set_ylabel(ycol)
                ax.set_title(title)
                ax.grid(True, alpha=0.3, axis="y")
                fig.savefig(f"{FIGURES_DIR}/fig_{csv_name.replace('.csv','')}.png", dpi=300, bbox_inches="tight")
                plt.close(fig)
                n_figs += 1

    # --- Irreversibility field ---------------------------------------------
    irp = f"{RESULTS_DIR}/irreversibility_field.npy"
    if os.path.exists(irp):
        # NOTE(review): allow_pickle=True can execute arbitrary code from a
        # tampered file. Acceptable here only because the file is written by
        # this pipeline; switch to False if the array is a plain numeric dtype.
        ir = np.load(irp, allow_pickle=True)
        fig, ax = plt.subplots(figsize=(14,4))
        ax.fill_between(range(len(ir)), ir, alpha=0.4, color="darkorange")
        ax.plot(ir, lw=0.5, color="darkorange")
        ax.set_xlabel("Time")
        ax.set_ylabel("$I(x)$")
        ax.set_title("Irreversibility Field")
        ax.grid(True, alpha=0.3)
        fig.savefig(f"{FIGURES_DIR}/fig_irreversibility_field.png", dpi=300, bbox_inches="tight")
        plt.close(fig)
        n_figs += 1

    # --- Rolling spectral gap ----------------------------------------------
    rcp = f"{RESULTS_DIR}/spectral_gap_timeseries.csv"
    if os.path.exists(rcp):
        rdf = pd.read_csv(rcp)
        if "spectral_gap" in rdf.columns:
            fig, ax = plt.subplots(figsize=(14,5))
            # Use dates on the x axis when available, otherwise the row index.
            x = pd.to_datetime(rdf["center_date"]) if "center_date" in rdf.columns else range(len(rdf))
            ax.plot(x, rdf["spectral_gap"], color="steelblue", lw=1.0)
            ax.set_xlabel("Date")
            ax.set_ylabel("Spectral Gap")
            ax.set_title("Rolling Spectral Gap")
            ax.grid(True, alpha=0.3)
            fig.savefig(f"{FIGURES_DIR}/fig_spectral_gap.png", dpi=300, bbox_inches="tight")
            plt.close(fig)
            n_figs += 1

    # --- Baseline comparison (grouped bars, one group per method) ----------
    bcp = f"{RESULTS_DIR}/baseline_comparison.csv"
    if os.path.exists(bcp):
        bdf = pd.read_csv(bcp)
        ms = [m for m in ["nber_accuracy","nber_f1","nber_precision","nber_recall"] if m in bdf.columns]
        if ms and "method" in bdf.columns:
            fig, ax = plt.subplots(figsize=(10,6))
            x = np.arange(len(bdf))
            w = 0.8/len(ms)  # bar width so each group spans 0.8 of a slot
            for i, m in enumerate(ms):
                ax.bar(x+i*w, bdf[m].astype(float), w, label=m.replace("nber_","").title(),
                       color=["steelblue","coral","seagreen","orchid"][i%4], edgecolor="black", lw=0.3)
            ax.set_xticks(x+w*(len(ms)-1)/2)
            ax.set_xticklabels(bdf["method"], rotation=15, ha="right")
            ax.set_ylabel("Score")
            ax.set_title("Baseline Comparison")
            ax.legend()
            ax.set_ylim(0,1.05)
            fig.savefig(f"{FIGURES_DIR}/fig_baseline_comparison.png", dpi=300, bbox_inches="tight")
            plt.close(fig)
            n_figs += 1

    print(f"  Generated {n_figs} inline figures")

# ======================================================================
# STAGE 12: MULTI-SEED ERROR BARS (5 seeds)
# ======================================================================
# Re-imported here because the stage-11 imports only run when that stage is
# not skipped.
import numpy as np
import pandas as pd

N_MAIN_SEEDS = 5
EXTRA_SEEDS = [0, 1, 2, 3]  # trained here; seed 42 comes from stages 3-4

# Metrics aggregated across seeds (read from each analysis_results JSON).
METRICS = [
    'vamp2_score', 'spectral_gap', 'entropy_empirical', 'entropy_total',
    'mean_irreversibility', 'detailed_balance_violation',
    'fluctuation_theorem_ratio', 'n_complex_modes', 'complex_fraction',
    'ktnd_nber_accuracy', 'ktnd_nber_f1',
]

multi_seed_results = {}  # mode -> {seed -> analysis results dict}

if skip(12, 'Multi-seed', [f'{RESULTS_DIR}/multi_seed_summary.json']):
    results['multi_seed'] = True
    if os.path.exists(f'{RESULTS_DIR}/multi_seed_summary.json'):
        with open(f'{RESULTS_DIR}/multi_seed_summary.json') as f:
            multi_seed_summary = json.load(f)
    else:
        multi_seed_summary = {}
else:
    print(f"\n{'#'*70}")
    print(f"#  MULTI-SEED ({N_MAIN_SEEDS} seeds: 42 + {EXTRA_SEEDS})")
    print(f"{'#'*70}")

    # Seed 42 results were produced by the main training stages.
    for mode_tag in ["univariate", "multiasset"]:
        ap = f"{RESULTS_DIR}/analysis_results_{mode_tag}.json"
        if os.path.exists(ap):
            with open(ap) as f:
                multi_seed_results.setdefault(mode_tag, {})[42] = json.load(f)

    for seed in EXTRA_SEEDS:
        seed_dir = f"{OUTPUT_DIR}/seed_{seed}"
        seed_results = f"{seed_dir}/results"
        seed_models = f"{seed_dir}/models"
        os.makedirs(seed_results, exist_ok=True)
        os.makedirs(seed_models, exist_ok=True)

        for mode_tag in ["univariate", "multiasset"]:
            seed_ap = f"{seed_results}/analysis_results_{mode_tag}.json"
            # Resume support: reuse a finished run instead of retraining.
            if os.path.exists(seed_ap):
                print(f"  Seed {seed} {mode_tag}: CACHED")
                with open(seed_ap) as f:
                    multi_seed_results.setdefault(mode_tag, {})[seed] = json.load(f)
                continue

            print(f"  Seed {seed} {mode_tag}: TRAINING...", flush=True)
            # The return value is not needed: success is judged by whether the
            # results file exists afterwards.
            run(f'Seed {seed} {mode_tag}',
                f'{python} {REPO_DIR}/experiments/run_main.py'
                f' --config config/default.yaml --mode {mode_tag} --seed {seed}'
                f' --output-dir {seed_dir}')

            if os.path.exists(seed_ap):
                with open(seed_ap) as f:
                    multi_seed_results.setdefault(mode_tag, {})[seed] = json.load(f)
            else:
                print(f"  WARNING: No results for seed {seed} {mode_tag}")

    # Compute summaries (mean and sample std per metric, over available seeds)
    multi_seed_summary = {}
    ms_summary_rows = []
    for mode_tag in ["univariate", "multiasset"]:
        if mode_tag not in multi_seed_results:
            continue
        seed_data = multi_seed_results[mode_tag]
        seeds_present = sorted(seed_data.keys())
        summary = {'n_seeds': len(seeds_present), 'seeds': seeds_present}
        row = {'mode': mode_tag, 'n_seeds': len(seeds_present)}
        for metric in METRICS:
            vals = [float(seed_data[s][metric]) for s in seeds_present
                    if seed_data[s].get(metric) is not None]
            if vals:
                mean_val = np.mean(vals)
                # A sample std needs at least two values; report 0.0 for one.
                std_val = np.std(vals, ddof=1) if len(vals) > 1 else 0.0
                summary[f'{metric}_mean'] = float(mean_val)
                summary[f'{metric}_std'] = float(std_val)
                row[f'{metric}_mean'] = float(mean_val)
                row[f'{metric}_std'] = float(std_val)
        multi_seed_summary[mode_tag] = summary
        ms_summary_rows.append(row)

    ms_json_path = f"{RESULTS_DIR}/multi_seed_summary.json"
    with open(ms_json_path, 'w') as f:
        json.dump(multi_seed_summary, f, indent=2, default=str)
    if ms_summary_rows:
        pd.DataFrame(ms_summary_rows).to_csv(f"{RESULTS_DIR}/multi_seed_summary.csv", index=False)

    # Per-seed detailed CSV
    ms_rows = []
    for mode_tag in ["univariate", "multiasset"]:
        if mode_tag not in multi_seed_results:
            continue
        for seed, data in sorted(multi_seed_results[mode_tag].items()):
            row = {'mode': mode_tag, 'seed': seed}
            for metric in METRICS:
                row[metric] = data.get(metric)
            ms_rows.append(row)
    if ms_rows:
        pd.DataFrame(ms_rows).to_csv(f"{RESULTS_DIR}/multi_seed_detailed.csv", index=False)

    # The summary file is written unconditionally above, so its existence alone
    # says nothing about success. The stage is OK only if every seed produced
    # results for both modes.
    expected_seeds = {42, *EXTRA_SEEDS}
    results['multi_seed'] = os.path.exists(ms_json_path) and all(
        expected_seeds <= set(multi_seed_results.get(m, {}))
        for m in ["univariate", "multiasset"])
    if not results['multi_seed']:
        print("  WARNING: multi-seed summary is incomplete (some seed/mode results are missing)")

# ======================================================================
# SAVE ALL RESULTS AS CSV
# ======================================================================
import numpy as np
import pandas as pd

print(f"\n{'#'*70}")
print(f"#  SAVING ALL RESULTS AS CSV")
print(f"{'#'*70}")

# Pipeline summary: one row per mode, taken from the seed-42 analysis JSON.
summary_keys = [
    'vamp2_score', 'spectral_gap', 'entropy_empirical', 'entropy_ci_lower',
    'entropy_ci_upper', 'entropy_total', 'mean_irreversibility',
    'detailed_balance_violation', 'fluctuation_theorem_ratio',
    'n_complex_modes', 'complex_fraction', 'n_modes',
    'ktnd_nber_accuracy', 'ktnd_nber_f1', 'ktnd_naive_accuracy',
    'ktnd_mean_regime_duration', 'irrev_method',
]
summary_rows = []
for mode_tag in ["univariate", "multiasset"]:
    ap = f"{RESULTS_DIR}/analysis_results_{mode_tag}.json"
    if os.path.exists(ap):
        with open(ap) as f:
            r = json.load(f)
        # Keys absent from the JSON become empty cells.
        summary_rows.append({'mode': mode_tag, 'seed': 42,
                             **{k: r.get(k) for k in summary_keys}})
if summary_rows:
    pd.DataFrame(summary_rows).to_csv(f"{RESULTS_DIR}/pipeline_summary.csv", index=False)
    print(f"  Saved: pipeline_summary.csv")

# Statistical tests as CSV: one row per test, keeping scalar fields only.
for mode_tag, suffix in [("univariate", ""), ("multiasset", "_multiasset")]:
    stp = f"{RESULTS_DIR}/statistical_tests{suffix}.json"
    if os.path.exists(stp):
        with open(stp) as f:
            st = json.load(f)
        stat_rows = [
            {'test': k, 'mode': mode_tag,
             **{kk: vv for kk, vv in v.items() if not isinstance(vv, (dict, list))}}
            for k, v in st.items() if isinstance(v, dict)
        ]
        if stat_rows:
            pd.DataFrame(stat_rows).to_csv(f"{RESULTS_DIR}/statistical_tests{suffix}.csv", index=False)
            print(f"  Saved: statistical_tests{suffix}.csv")

# CV as CSV: a summary row per mode, plus a per-fold table when folds exist.
cv_rows = []
for mode_tag in ["univariate", "multiasset"]:
    cvp = f"{RESULTS_DIR}/cv_results_{mode_tag}.json"
    if os.path.exists(cvp):
        with open(cvp) as f:
            cv = json.load(f)
        row = {'mode': mode_tag, 'n_folds': cv.get('n_folds')}
        for metric in ['vamp2', 'spectral_gap', 'db_violation', 'complex_fraction']:
            for stat in ('mean', 'std'):
                row[f'{metric}_{stat}'] = cv.get(f'{metric}_{stat}')
        cv_rows.append(row)
        # Fold records live under 'folds', or under 'fold_results' otherwise.
        folds = cv['folds'] if 'folds' in cv else cv.get('fold_results', [])
        if folds:
            pd.DataFrame(folds).to_csv(f"{RESULTS_DIR}/cv_folds_{mode_tag}.csv", index=False)
            print(f"  Saved: cv_folds_{mode_tag}.csv")
if cv_rows:
    pd.DataFrame(cv_rows).to_csv(f"{RESULTS_DIR}/cv_summary.csv", index=False)
    print(f"  Saved: cv_summary.csv")

# Entropy calibration as CSV: one row per calibration point.
ecal_path = f"{RESULTS_DIR}/entropy_calibration.json"
if os.path.exists(ecal_path):
    with open(ecal_path) as f:
        ecal = json.load(f)
    points = ecal.get("calibration_points", [])
    if points:
        pd.DataFrame(points).to_csv(f"{RESULTS_DIR}/entropy_calibration.csv", index=False)
        print(f"  Saved: entropy_calibration.csv")

# ======================================================================
# FINAL REPORT
# ======================================================================
rule = '=' * 70

# Every file under OUTPUT_DIR, with its size and path relative to OUTPUT_DIR.
print(f"\n{rule}")
print(f"  ALL OUTPUT FILES")
print(f"{rule}")
for dirpath, dirnames, filenames in os.walk(OUTPUT_DIR):
    for fname in sorted(filenames):
        full_path = os.path.join(dirpath, fname)
        print(f"  {os.path.getsize(full_path):>10,} bytes  {os.path.relpath(full_path, OUTPUT_DIR)}")

total_min = (time.time() - pipeline_start) / 60
n_total = len(results)
# Count only entries that are literally True.
n_ok = sum(1 for v in results.values() if v is True)

print(f"\n{rule}")
print(f"  COMPLETE: {n_ok}/{n_total} stages ({total_min:.1f} min)")
print(f"{rule}")
for name, ok in results.items():
    status = 'OK' if ok else 'FAIL'
    print(f"  {status:6s}  {name}")

# Headline metrics of the seed-42 run, one section per mode.
for mode_tag, label in [("univariate", "Univariate (SPY)"), ("multiasset", "Multiasset (11 ETFs)")]:
    ap = f"{RESULTS_DIR}/analysis_results_{mode_tag}.json"
    if not os.path.exists(ap):
        continue
    with open(ap) as f:
        r = json.load(f)
    print(f"\n  === {label} (seed 42) ===")
    print(f"    VAMP-2:       {r.get('vamp2_score', 'N/A')}")
    print(f"    Spectral gap: {r.get('spectral_gap', 'N/A')}")
    print(f"    Entropy emp:  {r.get('entropy_empirical', 'N/A')} [{r.get('entropy_ci_lower','?')}, {r.get('entropy_ci_upper','?')}]")
    print(f"    Entropy spec: {r.get('entropy_total', 'N/A')}")
    print(f"    DB violation: {r.get('detailed_balance_violation', 'N/A')}")
    print(f"    Complex:      {r.get('n_complex_modes', 'N/A')}/{r.get('n_modes', 'N/A')}")
    print(f"    FT ratio:     {r.get('fluctuation_theorem_ratio', 'N/A')}")
    ktnd_acc = r.get('ktnd_nber_accuracy')
    if ktnd_acc is not None:
        # The F1 value may be absent or null; formatting None with ':.3f'
        # raises TypeError, and a default of 0 would report a score that was
        # never computed. Show N/A like the lines above.
        ktnd_f1 = r.get('ktnd_nber_f1')
        f1_str = f"{ktnd_f1:.3f}" if ktnd_f1 is not None else "N/A"
        print(f"    NBER acc/F1:  {ktnd_acc:.3f} / {f1_str}")

# Walk-forward cross-validation: mean +/- std of each metric.
for mode_tag in ["univariate", "multiasset"]:
    cv_path = f"{RESULTS_DIR}/cv_results_{mode_tag}.json"
    if os.path.exists(cv_path):
        with open(cv_path) as f:
            cv = json.load(f)
        print(f"\n  === Walk-Forward CV ({mode_tag}, {cv.get('n_folds','?')} folds) ===")
        for metric in ['vamp2', 'spectral_gap', 'db_violation', 'complex_fraction']:
            # Read both values with .get(), as the CSV export does. A missing
            # std key used to raise KeyError, and a null mean or std raised
            # TypeError, either of which cut the report short.
            mean_v = cv.get(f'{metric}_mean')
            std_v = cv.get(f'{metric}_std')
            if mean_v is None:
                continue
            std_str = f"{std_v:.4f}" if std_v is not None else "N/A"
            print(f"    {metric:20s}  {mean_v:.4f} +/- {std_str}")

# Entropy calibration: only points whose analytical EP exceeds 1e-6 are listed.
ecal_path = f"{RESULTS_DIR}/entropy_calibration.json"
if os.path.exists(ecal_path):
    with open(ecal_path) as f:
        ecal = json.load(f)
    print(f"\n  === Entropy Calibration ===")
    for pt in ecal.get("calibration_points", []):
        if not (pt["ep_analytical"] > 1e-6):
            continue
        print(f"    T2={pt['T2']:.1f}: analytical={pt['ep_analytical']:.4f}, KDE={pt['ep_kde']:.4f}, spectral={pt['ep_spectral']:.6f}")

# Multi-seed mean +/- std for every metric that has a summary entry.
for mode_tag in ["univariate", "multiasset"]:
    if mode_tag not in multi_seed_summary:
        continue
    s = multi_seed_summary[mode_tag]
    print(f"\n  === Multi-Seed {mode_tag} ({s.get('n_seeds','?')} seeds) ===")
    for metric in METRICS:
        mk = f'{metric}_mean'
        if mk not in s:
            continue
        print(f"    {metric:35s}  {s[mk]:.4f} +/- {s.get(f'{metric}_std',0):.4f}")

# Statistical tests (univariate file): one verdict line per test entry.
stat_path = f"{RESULTS_DIR}/statistical_tests.json"
if os.path.exists(stat_path):
    with open(stat_path) as f:
        st = json.load(f)
    print(f"\n  === Statistical Tests (IAAFT) ===")
    for k, v in st.items():
        if not isinstance(v, dict):
            continue
        # The first matching field decides how the entry is reported;
        # entries with none of these fields are not printed.
        if v.get('skipped'):
            verdict = "SKIPPED"
        elif 'passed' in v:
            verdict = 'PASSED' if v['passed'] else 'FAILED'
        elif 'p_value' in v:
            effect = f", d={v['cohens_d']:.2f}" if 'cohens_d' in v else ""
            verdict = f"p={v['p_value']:.4f}{effect}"
        elif 'any_significant' in v:
            verdict = 'SIGNIFICANT' if v['any_significant'] else 'NOT SIGNIFICANT'
        else:
            continue
        print(f"    {k}: {verdict}")

print(f"\n{'='*70}")
print(f"  DONE ({total_min:.1f} min). All results in: {RESULTS_DIR}/")
print(f"  Next: Cell 3 (ablations) -> Cell 4 (figures) -> Cell 5 (download)")
print(f"{'='*70}")

In [None]:
#@title 3. Ablations + Brownian gyrator (~6 hours, resume-safe)

import subprocess, time, json, os, sys
import numpy as np
import pandas as pd

# Paths are repeated here so this cell can run without Cell 2's variables.
REPO_DIR = "/content/ktnd_finance"
OUTPUT_DIR = "/content/ktnd_finance/outputs"
RESULTS_DIR = "/content/ktnd_finance/outputs/results"
FIGURES_DIR = "/content/ktnd_finance/outputs/figures"
# Passed to run_ablations.py as --n-seeds.
N_ABLATION_SEEDS = 10

os.chdir(REPO_DIR)
# Interpreter used to launch the ablation and test subprocesses.
python = sys.executable

def run_streaming(name, cmd, check_files=None):
    """Run a stage through the shell, echoing its output line by line as it runs.

    stderr is merged into stdout, so both streams appear interleaved.

    Args:
        name: Stage label used in the banner and in the status line.
        cmd: Command string, run through the shell with REPO_DIR as cwd.
        check_files: Optional paths that must exist after the command ran.

    Returns:
        True if the command exited with status 0 and no checked file is
        missing, otherwise False.
    """
    print(f"\n{'='*70}")
    print(f"  STAGE: {name}")
    print(f"{'='*70}", flush=True)
    t0 = time.time()
    # The context manager closes the stdout pipe and waits for the child even
    # if echoing is interrupted, so no open pipe or unreaped process is left.
    with subprocess.Popen(cmd, shell=True, cwd=REPO_DIR,
                          stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          text=True, bufsize=1) as proc:
        for line in proc.stdout:
            print(f"  {line}", end="", flush=True)
    elapsed = time.time() - t0
    if proc.returncode != 0:
        print(f"  >> {name}: FAILED (exit {proc.returncode}, {elapsed/60:.1f} min)")
        return False
    if check_files:
        missing = [f for f in check_files if not os.path.exists(f)]
        if missing:
            for f in missing:
                print(f"    MISSING: {f}")
            # Give this path a closing status line like every other outcome.
            print(f"  >> {name}: INCOMPLETE ({elapsed/60:.1f} min)")
            return False
        for f in check_files:
            print(f"  OK: {os.path.basename(f)} ({os.path.getsize(f):,} bytes)")
    print(f"  >> {name}: OK ({elapsed/60:.1f} min)")
    return True

def run(name, cmd, check_files=None):
    """Run a stage to completion, then print the tail of its output.

    Args:
        name: Stage label used in the banner and in the status line.
        cmd: Command string, run through the shell with REPO_DIR as cwd.
        check_files: Optional paths that must exist after the command ran.

    Returns:
        True if the command exited with status 0 and no checked file is
        missing, otherwise False.
    """
    print(f"\n{'='*70}")
    print(f"  STAGE: {name}")
    print(f"{'='*70}")
    t0 = time.time()
    result = subprocess.run(cmd, shell=True, cwd=REPO_DIR, capture_output=True, text=True)
    elapsed = time.time() - t0
    if result.stdout:
        for line in result.stdout.strip().split('\n')[-30:]:
            print(f"  {line}")
    if result.returncode != 0:
        if result.stderr:
            for line in result.stderr.strip().split('\n')[-20:]:
                print(f"  ! {line}")
        # Include the exit code, matching the pipeline cell's run().
        print(f"  >> {name}: FAILED (exit {result.returncode}, {elapsed/60:.1f} min)")
        return False
    if check_files:
        missing = [f for f in check_files if not os.path.exists(f)]
        if missing:
            for f in missing:
                print(f"    MISSING: {f}")
            # Previously this path returned without any status line.
            print(f"  >> {name}: INCOMPLETE ({elapsed/60:.1f} min)")
            return False
    print(f"  >> {name}: OK ({elapsed/60:.1f} min)")
    return True

t_start = time.time()
results = {}

# ======================================================================
# PART 1: ABLATION STUDY (10 seeds x ~32 variants)
# ======================================================================
summary_path = f"{RESULTS_DIR}/ablation_summary.csv"

results['ablations'] = run_streaming(
    f'Ablations ({N_ABLATION_SEEDS} seeds)',
    f'{python} -u experiments/run_ablations.py --config config/default.yaml'
    f' --n-seeds {N_ABLATION_SEEDS} --n-jobs 1'
    f' --output-dir {RESULTS_DIR}',
    check_files=[summary_path])

# The summary is reported whenever the CSV exists, whatever the run status.
if os.path.exists(summary_path):
    abl_df = pd.read_csv(summary_path)
    print(f"\n  {len(abl_df)} ablation variants ({N_ABLATION_SEEDS} seeds each):")
    wanted_cols = ('name', 'n_valid', 'vamp2_mean', 'vamp2_std',
                   'spectral_gap_mean', 'spectral_gap_std',
                   'entropy_total_mean', 'entropy_total_std')
    cols = [c for c in wanted_cols if c in abl_df.columns]
    print(abl_df[cols].to_string(index=False))

    if 'vamp2_mean' in abl_df.columns:
        baseline = abl_df.loc[abl_df['name'] == 'baseline']
        if not baseline.empty:
            # Percent change in mean VAMP-2 relative to the 'baseline' row.
            bl_vamp2 = baseline['vamp2_mean'].iloc[0]
            diff = abl_df.assign(
                vamp2_delta_pct=(abl_df['vamp2_mean'] - bl_vamp2) / abs(bl_vamp2) * 100)
            notable = diff[diff['vamp2_delta_pct'].abs() > 5].sort_values('vamp2_delta_pct')
            if not notable.empty:
                print(f"\n  Variants with >5% VAMP-2 change:")
                for _, row in notable.iterrows():
                    print(f"    {row['name']:40s}  {row['vamp2_delta_pct']:+.1f}%")
            # Save delta table as CSV
            diff.to_csv(f"{RESULTS_DIR}/ablation_delta.csv", index=False)
            print(f"  Saved: ablation_delta.csv")

# ======================================================================
# PART 2: BROWNIAN GYRATOR BENCHMARK
# ======================================================================
from scipy.linalg import solve_continuous_lyapunov

def analytical_ep(T1, T2, k=1.0, kappa=0.5):
    """Return trace(Q Sigma Q^T D^-1) for a two-variable linear system.

    With A = [[k, -kappa], [-kappa, k]] and D = diag(T1, T2), Sigma solves the
    Lyapunov equation A Sigma + Sigma A^T = 2 D and Q = A - D Sigma^-1.
    The result is 0 when T1 == T2.

    NOTE(review): presumably the steady-state entropy production rate of the
    Brownian gyrator used as a benchmark here -- confirm against the paper.
    """
    drift = np.array([[k, -kappa], [-kappa, k]])
    diffusion = np.array([[T1, 0.0], [0.0, T2]])
    covariance = solve_continuous_lyapunov(drift, 2.0 * diffusion)
    # Q vanishes when the covariance satisfies D Sigma^-1 = A.
    residual = drift - diffusion @ np.linalg.inv(covariance)
    inv_diffusion = np.diag([1.0 / T1, 1.0 / T2])
    weighted = residual @ covariance @ residual.T @ inv_diffusion
    return np.trace(weighted)

# Analytical EP at T1 = 1.0 for a sweep of T2 values.
gyrator_rows = [
    {'T1': 1.0, 'T2': T2, 'ep_analytical': analytical_ep(1.0, T2)}
    for T2 in [1.0, 1.5, 2.0, 3.0, 5.0, 8.0]
]
for entry in gyrator_rows:
    # Mark the T1 == T2 row.
    note = '  (equilibrium)' if entry['T2'] == 1.0 else ''
    print(f"  T1=1.0, T2={entry['T2']:.1f}: EP={entry['ep_analytical']:.6f}{note}")

pd.DataFrame(gyrator_rows).to_csv(f"{RESULTS_DIR}/gyrator_analytical.csv", index=False)
print(f"  Saved: gyrator_analytical.csv")

# Run the synthetic gyrator test class from the repository's test suite.
results['gyrator'] = run(
    'Brownian gyrator tests',
    f'{python} -m pytest tests/test_synthetic.py::TestBrownianGyrator -v')

# ======================================================================
# SUMMARY
# ======================================================================
total_min = (time.time() - t_start) / 60
rule = '=' * 70
print(f"\n{rule}")
print(f"  ABLATIONS + GYRATOR COMPLETE ({total_min:.1f} min)")
print(f"{rule}")
for name, ok in results.items():
    status = 'OK' if ok else 'FAIL'
    print(f"  {status:6s}  {name}")
print(f"\n  Next: Cell 4 (figures) -> Cell 5 (download)")
print(f"{rule}")

In [None]:
#@title 4. View figures (run after Cell 2 finishes)

import glob, os
from IPython.display import Image, display

FIGURES_DIR = "/content/ktnd_finance/outputs/figures"

# Main figures first, then anything in the optional supplemental/ folder.
pngs = sorted(glob.glob(f"{FIGURES_DIR}/*.png"))
sup_dir = os.path.join(FIGURES_DIR, "supplemental")
if os.path.exists(sup_dir):
    pngs.extend(sorted(glob.glob(f"{sup_dir}/*.png")))

if not pngs:
    # Nothing to show: print hints about what exists on disk.
    print("No figures found. Make sure Cell 2 has finished running first.")
    print(f"Checked: {FIGURES_DIR}")
    results_dir = "/content/ktnd_finance/outputs/results"
    if not os.path.exists(results_dir):
        print("No results directory found - Cell 2 needs to run first.")
    else:
        files = os.listdir(results_dir)
        print(f"Result files available ({len(files)}): {files}")
else:
    print(f"Found {len(pngs)} figures:\n")
    for p in pngs:
        print(f"--- {os.path.basename(p)} ---")
        display(Image(filename=p, width=800))
        print()


In [None]:
#@title 5. Download all results as zip

# Archive the outputs/ directory (-r recursive, -q quiet) into /content/ktnd_results.zip.
!cd /content/ktnd_finance && zip -rq /content/ktnd_results.zip outputs/
# Hand the archive to the Colab file helper for download.
from google.colab import files
files.download('/content/ktnd_results.zip')
print("Download started.")