In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
MACD + Fibonacci Retracement Swing Strategy for Indian Equities
with Ratio EMA200 & 52w-high filter, RS-63 ranking, WFO, Parallel Grid & Caching
=================================================================================

Outputs (per fold directory under ./outputs_wfo/<timestamp>/fold_XX_*):
- grid_train.csv (scored)
- grid_train_raw.csv (full results incl. errors)
- grid_errors_summary.csv (histogram of failures)
- trades_train.csv, trades_test.csv (with hold_days, reason, params, tag)
- monthly_pnl_train.csv, monthly_pnl_test.csv
- loaded_tickers_train.csv/test.csv, missing_tickers_train.csv/test.csv
- funnel_train.csv, funnel_test.csv

Aggregate at root:
- summary_wfo_all_folds.csv (train+test rows across folds)
- summary_wfo_test_aggregate.csv (roll-up of TEST phases)

Dependencies:
    pip install pandas numpy yfinance joblib pytz
"""

import os
import math
import json
import time
import copy
import errno
import hashlib
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import pytz
import yfinance as yf
from joblib import Parallel, delayed

# =========================
# USER CONFIG
# =========================

# Portfolio-level settings read by simulate_portfolio().
TIMEZONE              = "Asia/Kolkata"
STARTING_CAPITAL      = 500000.0             # ₹5L
MAX_CONCURRENT        = 5                    # max simultaneous positions; each new one targets equity / MAX_CONCURRENT
SLIPPAGE_PCT          = 0.0005               # 5 bps
FEE_PCT_PER_SIDE      = 0.0003               # 3 bps
ALLOC_STOP_PCT        = 0.05                 # ~ -5% price stop

# MASTER window (fits your 10y train + 24m test)
MASTER_START = "2013-01-01"
MASTER_END   = "2025-09-30"

# Walk-Forward Folds: (TRAIN_YEARS, TEST_MONTHS)
WFO_FOLDS = [
    (10, 24),   # 10y train, 2y test -> should produce exactly 1 fold for the MASTER_* above
]

# Universe (replace with your NIFTY500 list)
UNIVERSE = [
    "RELIANCE.NS","HDFCBANK.NS","ICICIBANK.NS","INFY.NS","TCS.NS",
    "LT.NS","ASIANPAINT.NS","SBIN.NS","ITC.NS","BHARTIARTL.NS",
]

# Index candidates (try in order)
INDEX_CANDIDATES = ["^CRSLDX", "^CNX500", "^NSEI"]

# Caching & outputs
CACHE_PRICE_DIR   = "./cache"                # raw OHLCV per ticker/index
CACHE_INDIC_DIR   = "./cache_indicators"     # per-(ticker,param-hash) indicators
# Evaluated once at import time, so every fold of a run shares one timestamped root.
OUT_ROOT          = f'./outputs_wfo/{datetime.now().strftime("%Y-%m-%d_%H%M%S")}'

# Grid parameter space (edit breadth for speed vs. thoroughness)
# DEFAULT_PARAMS is the base dict; each grid combination is applied on top of a copy of it.
DEFAULT_PARAMS = dict(
    macd_fast=12,
    macd_slow=26,
    macd_sig=9,
    macd_req_above0=False,
    fib_lookback=60,
    fib_band_low=0.382,
    fib_band_high=0.618,
    min_52w_pct=0.80,
    time_stop_bars=20,
)
# The full cartesian product is evaluated: 2*2*1*2*3*3*2*4*3 = 1728 combinations.
GRID = dict(
    macd_fast=[10, 12],
    macd_slow=[20, 26],
    macd_sig=[9],
    macd_req_above0=[False, True],
    fib_lookback=[40, 60, 90],
    fib_band_low=[0.236, 0.382, 0.5],
    fib_band_high=[0.618, 0.786],
    min_52w_pct=[0.75, 0.80, 0.85, 0.90],
    time_stop_bars=[15, 20, 30],
)

# Parallelism: leave one core free, but never drop below one worker.
# os.cpu_count() may return None when the core count cannot be determined,
# so fall back to 1 instead of raising TypeError on `None - 1`.
N_JOBS = max(1, (os.cpu_count() or 1) - 1)

# Warmup requirement (for EMA200 & 52w-high stability)
# NOTE: eligible_universe_for_window() subtracts this value as *calendar* days
# (pd.Timedelta(days=...)), not as trading bars.
MIN_WARMUP_DAYS = 260

# =========================
# Utilities & Logging
# =========================

# Timezone object used for log timestamps and for converting price indexes.
IST = pytz.timezone(TIMEZONE)

def ensure_dir(path: str):
    """Create *path* (including missing parents); a no-op if it already exists."""
    os.makedirs(name=path, exist_ok=True)

def log(msg: str):
    """Print *msg* prefixed with the current wall-clock time in the IST timezone."""
    stamp = datetime.now(IST).strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{stamp}] {msg}")

def to_ist_index(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with its index expressed in IST.

    A timezone-naive index is first interpreted as UTC; an aware index is
    converted directly.
    """
    aware = df if df.index.tz is not None else df.tz_localize("UTC")
    return aware.tz_convert(IST)

# =========================
# Date Helpers & WFO Windows
# =========================

def _last_day_of_month(y: int, m: int) -> int:
    return (pd.Timestamp(year=y, month=m, day=1) + pd.offsets.MonthEnd(0)).day

def _safe_month_add(dt: pd.Timestamp, months: int) -> pd.Timestamp:
    """Shift *dt* by *months* calendar months, clamping the day to the target month's length."""
    # Work with a zero-based month so divmod yields (year carry, month index).
    year_carry, month_zero_based = divmod(dt.month - 1 + months, 12)
    target_year = dt.year + year_carry
    target_month = month_zero_based + 1
    target_day = min(dt.day, _last_day_of_month(target_year, target_month))
    return dt.replace(year=target_year, month=target_month, day=target_day)

def ym_add(start_dt: pd.Timestamp, years=0, months=0) -> pd.Timestamp:
    """Return *start_dt* shifted by *years* calendar years and *months* calendar months.

    The day of month is clamped to the length of the target month, so a
    Feb 29 start plus one year yields Feb 28 instead of raising ValueError
    (which ``Timestamp.replace(year=...)`` does for a non-leap target year).

    Args:
        start_dt: Anchor timestamp; its time of day and timezone are preserved.
        years: Whole years to add.
        months: Whole months to add.

    Returns:
        The shifted timestamp.
    """
    return start_dt + pd.DateOffset(years=years, months=months)

def generate_wfo_windows(master_start: str, master_end: str, train_years: int, test_months: int):
    """Yield walk-forward windows as (train_start, train_end, test_start, test_end) date strings.

    Each train window spans *train_years*; the test window starts the day
    after the train window ends and spans *test_months*, truncated at
    *master_end*. The next train window starts where the previous test
    window started. Nothing is yielded if the master range is inverted or
    too short for one train window plus at least one test day.
    """
    first = pd.to_datetime(master_start)
    last = pd.to_datetime(master_end)
    if first > last:
        return

    date_fmt = "%Y-%m-%d"
    one_day = pd.Timedelta(days=1)
    train_from = first
    while True:
        train_to = ym_add(train_from, years=train_years, months=0)
        test_from = train_to + one_day
        # Stop once the train window or the first test day overshoots the master range.
        if train_to > last or test_from > last:
            return
        test_to = min(_safe_month_add(test_from, test_months), last)
        if test_to < test_from:
            return
        yield tuple(ts.strftime(date_fmt) for ts in (train_from, train_to, test_from, test_to))
        train_from = test_from
        if train_from >= last:
            return

def sanity_check_wfo():
    """Log, for each configured scheme, how many windows it produces and what the first one is."""
    for train_years, test_months in WFO_FOLDS:
        wins = list(generate_wfo_windows(MASTER_START, MASTER_END, train_years, test_months))
        log(f"[WFO CHECK] {train_years}y/{test_months}m -> {len(wins)} window(s) within {MASTER_START}..{MASTER_END}")
        if not wins:
            continue
        tr_s, tr_e, te_s, te_e = wins[0]
        log(f"[WFO CHECK] First window: TRAIN {tr_s}→{tr_e} | TEST {te_s}→{te_e}")

# =========================
# Data Loading & Caches
# =========================

def download_or_load_price(ticker: str, start: str, end: str) -> pd.DataFrame:
    """Return daily OHLCV for *ticker* covering at least ``start..end``, using a CSV cache.

    The requested range of every download is recorded in a ``.meta.json``
    sidecar next to the CSV. The cache is used only when that recorded range
    covers the request. Checking only the last cached bar (as before) let a
    file holding a later window be returned for an earlier one, and treated
    every window ending on a non-trading day as stale. When the cache is too
    narrow, the union of the cached and requested ranges is downloaded, so
    alternating train/test requests widen the file instead of overwriting
    each other.

    Args:
        ticker: Yahoo Finance symbol (``^`` is replaced in the cache file name).
        start: Inclusive start date, ``YYYY-MM-DD``.
        end: Inclusive end date, ``YYYY-MM-DD``.

    Returns:
        A frame indexed by IST timestamps. It may span more than the requested
        range. It is empty if all three download attempts return no data.
    """
    ensure_dir(CACHE_PRICE_DIR)
    fname = os.path.join(CACHE_PRICE_DIR, f"{ticker.replace('^','_INDEX_')}.csv")
    meta_path = f"{fname}.meta.json"
    want_start = pd.to_datetime(start)
    want_end = pd.to_datetime(end)
    dl_start, dl_end = want_start, want_end

    if os.path.exists(fname) and os.path.exists(meta_path):
        try:
            with open(meta_path, encoding="utf-8") as fh:
                meta = json.load(fh)
            have_start = pd.to_datetime(meta["start"])
            have_end = pd.to_datetime(meta["end"])
            if have_start <= want_start and have_end >= want_end:
                cached = pd.read_csv(fname, parse_dates=["Datetime"]).set_index("Datetime")
                if not cached.empty:
                    return to_ist_index(cached)
            # Cache is too narrow: widen it rather than replace it.
            dl_start = min(want_start, have_start)
            dl_end = max(want_end, have_end)
        except Exception as exc:
            # Best effort: an unreadable cache just means we download again.
            log(f"[CACHE] Ignoring unreadable price cache for {ticker}: {type(exc).__name__}: {exc}")

    # retry (Yahoo hiccups)
    for _ in range(3):
        df = yf.download(
            ticker,
            start=dl_start.strftime("%Y-%m-%d"),
            # Yahoo's `end` is exclusive; pad so the requested end date is included.
            end=(dl_end + pd.Timedelta(days=2)).strftime("%Y-%m-%d"),
            auto_adjust=False,
            progress=False,
            threads=False,
        )
        if df is not None and not df.empty:
            # Newer yfinance returns (field, ticker) MultiIndex columns even for
            # a single symbol; flatten so df["Close"] is a Series again.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.get_level_values(0)
            df.index.name = "Datetime"
            df = to_ist_index(df)
            # Write via temp file + rename so parallel grid workers never read a
            # half-written CSV. Data goes first, so the sidecar never claims
            # more than the CSV holds.
            tmp_csv = f"{fname}.{os.getpid()}.tmp"
            df.to_csv(tmp_csv, index_label="Datetime")
            os.replace(tmp_csv, fname)
            tmp_meta = f"{meta_path}.{os.getpid()}.tmp"
            with open(tmp_meta, "w", encoding="utf-8") as fh:
                json.dump({"start": dl_start.strftime("%Y-%m-%d"),
                           "end": dl_end.strftime("%Y-%m-%d")}, fh)
            os.replace(tmp_meta, meta_path)
            return df
        time.sleep(1.0)
    return pd.DataFrame()

def load_index_series(start: str, end: str) -> pd.DataFrame:
    """Return OHLCV for the first index in INDEX_CANDIDATES that loads with data.

    Raises:
        RuntimeError: if no candidate yields a non-empty frame; the message
            carries the last exception seen, if any.
    """
    wanted_cols = ["Open","High","Low","Close","Volume"]
    failure = None
    for symbol in INDEX_CANDIDATES:
        try:
            frame = download_or_load_price(symbol, start, end)
            if frame.empty:
                continue
            return frame[wanted_cols].copy()
        except Exception as exc:
            # Remember the failure and fall through to the next candidate.
            failure = exc
    raise RuntimeError(f"Failed to load any index from {INDEX_CANDIDATES} for {start}..{end}. Last error: {failure}")

# =========================
# Indicators
# =========================

def ema(s: pd.Series, span: int) -> pd.Series:
    """Recursive (adjust=False) exponential moving average; NaN until *span* observations exist."""
    smoother = s.ewm(span=span, min_periods=span, adjust=False)
    return smoother.mean()

def macd(close: pd.Series, fast: int, slow: int, sig: int):
    """Return (MACD line, signal line, histogram) for *close*.

    The line is EMA(fast) - EMA(slow), the signal is EMA(sig) of the line,
    and the histogram is line minus signal.
    """
    fast_avg = ema(close, fast)
    slow_avg = ema(close, slow)
    line = fast_avg - slow_avg
    signal = ema(line, sig)
    return line, signal, line - signal

def detect_fib_bounce(close, high, low, lookback, band_low, band_high):
    """Flag bars that bounce up after the previous close sat in a Fibonacci retracement zone.

    The swing is the rolling ``lookback`` high/low. A retracement fraction
    ``f`` maps to the price ``swing_high - f * (swing_high - swing_low)``, so
    the *smaller* fraction is the *upper* edge of the zone. The earlier code
    compared against the edges the wrong way round (``close >= upper`` and
    ``close <= lower``), which can never hold, so the signal was always False.

    Args:
        close, high, low: Price series sharing one index.
        lookback: Rolling window (bars) used to find the swing high/low.
        band_low, band_high: Retracement fractions bounding the zone (order-insensitive).

    Returns:
        Boolean Series: True where (a) the close is nearer the swing high than
        the swing low (a simple up-leg heuristic), (b) the previous close was
        inside the previous bar's zone, and (c) the close is above the previous close.
    """
    roll_high = high.rolling(lookback).max()
    roll_low = low.rolling(lookback).min()
    up_leg = (roll_high - close).abs() < (close - roll_low).abs()  # simple heuristic

    # A flat window has no swing; NaN makes every comparison below False.
    swing = (roll_high - roll_low).replace(0, np.nan)

    shallow, deep = sorted((band_low, band_high))
    zone_top = roll_high - swing * shallow     # smaller retracement -> higher price
    zone_bottom = roll_high - swing * deep     # larger retracement -> lower price

    prev_close = close.shift(1)
    y_in_band = (prev_close >= zone_bottom.shift(1)) & (prev_close <= zone_top.shift(1))
    bounce_up = close > prev_close
    sig = up_leg & y_in_band & bounce_up
    return sig.fillna(False)

def pct_to_52w_high(close: pd.Series, window: int = 252) -> pd.Series:
    """Return close divided by its rolling *window*-bar maximum (1.0 means at the high).

    The first ``window - 1`` bars are NaN. They are deliberately not
    back-filled: that would copy a ratio computed from later bars into the
    warm-up period (look-ahead bias), and ``fillna(method=...)`` is
    deprecated in pandas. NaN fails any ``>=`` threshold test, so warm-up
    bars simply produce no signal.
    """
    return close / close.rolling(window).max()

def rs63(stock_close: pd.Series, index_close: pd.Series, lookback: int = 63) -> pd.Series:
    """Relative-strength score: fractional change of stock/index over *lookback* bars.

    Bars without a defined value become -inf so they rank last.
    """
    relative = stock_close.div(index_close)
    change = relative.div(relative.shift(lookback)).sub(1.0)
    return change.fillna(-np.inf)

def ratio_ema200_pass(stock_close: pd.Series, index_close: pd.Series) -> pd.Series:
    """True where the stock/index price ratio is above its own 200-span EMA (False where undefined)."""
    relative = stock_close / index_close
    baseline = ema(relative, 200)
    above = relative > baseline
    return above.fillna(False)

def param_hash(params: dict) -> str:
    """Return a 16-hex-character digest of the parameters that affect cached indicators.

    Only the MACD and Fibonacci settings take part; other keys in *params*
    are ignored.
    """
    indicator_keys = (
        "macd_fast", "macd_slow", "macd_sig",
        "fib_lookback", "fib_band_low", "fib_band_high",
    )
    payload = json.dumps({name: params[name] for name in indicator_keys}, sort_keys=True)
    digest = hashlib.sha256(payload.encode()).hexdigest()
    return digest[:16]

def build_indicators_cached(ticker: str, raw_df: pd.DataFrame, params: dict) -> pd.DataFrame:
    """Return OHLCV plus MACD, Fibonacci-bounce and 52-week-high columns, cached on disk.

    The cache file name combines ``param_hash(params)`` with a fingerprint of
    the raw data (first bar, last bar, row count). Keying on the parameters
    alone served indicators computed for one date window (e.g. TRAIN) to a
    request for another (e.g. TEST).

    Args:
        ticker: Symbol; used for the per-ticker cache sub-directory.
        raw_df: Price frame with Open/High/Low/Close/Volume columns.
        params: Strategy parameters; the MACD and Fibonacci keys are read.

    Returns:
        A frame with the OHLCV columns plus MACD_LINE, MACD_SIG, MACD_HIST,
        FIB_BOUNCE and PCT_52W_HIGH.
    """
    ensure_dir(CACHE_INDIC_DIR)
    tdir = os.path.join(CACHE_INDIC_DIR, ticker.replace('^','_INDEX_'))
    ensure_dir(tdir)

    span = f"{raw_df.index.min()}|{raw_df.index.max()}|{len(raw_df)}"
    span_tag = hashlib.sha256(span.encode()).hexdigest()[:8]
    fpath = os.path.join(tdir, f"{param_hash(params)}_{span_tag}.csv")

    if os.path.exists(fpath):
        try:
            cached = pd.read_csv(fpath, parse_dates=["Datetime"]).set_index("Datetime")
            # Give cache hits the same IST index that fresh results carry.
            return to_ist_index(cached)
        except Exception as exc:
            # Best effort: an unreadable cache entry is rebuilt below.
            log(f"[CACHE] Ignoring unreadable indicator cache {fpath}: {type(exc).__name__}: {exc}")

    df = raw_df[["Open","High","Low","Close","Volume"]].copy()
    m_line, m_sig, m_hist = macd(df["Close"], params["macd_fast"], params["macd_slow"], params["macd_sig"])
    df["MACD_LINE"] = m_line
    df["MACD_SIG"]  = m_sig
    df["MACD_HIST"] = m_hist
    df["FIB_BOUNCE"] = detect_fib_bounce(df["Close"], df["High"], df["Low"],
                                         params["fib_lookback"], params["fib_band_low"], params["fib_band_high"])
    df["PCT_52W_HIGH"] = pct_to_52w_high(df["Close"], 252)

    # Temp file + rename: parallel grid workers never see a half-written CSV.
    tmp_path = f"{fpath}.{os.getpid()}.tmp"
    df.to_csv(tmp_path, index_label="Datetime")
    os.replace(tmp_path, fpath)
    return df

# =========================
# Signals & Simulator
# =========================

def entry_signal(stock_df: pd.DataFrame, index_df: pd.DataFrame, params: dict) -> pd.Series:
    """Combine the four entry filters into one boolean Series.

    A bar qualifies when the MACD line is above its signal (and above zero
    if ``macd_req_above0`` is set), FIB_BOUNCE is set, the stock/index ratio
    is above its EMA200, and PCT_52W_HIGH is at least ``min_52w_pct``.
    """
    line = stock_df["MACD_LINE"]
    macd_bull = line > stock_df["MACD_SIG"]
    if params["macd_req_above0"]:
        macd_bull = macd_bull & (line > 0.0)
    fib_ok = stock_df["FIB_BOUNCE"]
    ratio_ok = ratio_ema200_pass(stock_df["Close"], index_df["Close"])
    near_high = stock_df["PCT_52W_HIGH"] >= params["min_52w_pct"]
    combined = macd_bull & fib_ok & ratio_ok & near_high
    return combined.fillna(False)

def _params_to_json(params) -> str:
    """Serialise *params* to JSON, converting NumPy scalars to native Python values."""
    def _native(obj):
        if isinstance(obj, np.generic):
            return obj.item()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
    return json.dumps(params, default=_native)


def simulate_portfolio(params, prices, index_df, start_date, end_date, tag, universe, verbose=True):
    """Run the daily long-only portfolio simulation over ``start_date..end_date``.

    Signals are evaluated on the close of day ``d`` and filled at the open of
    the next bar on the index calendar. Exits are processed before entries.
    Positions still open on the last simulated day are liquidated at that
    day's close.

    Differences from the previous implementation:
      * an empty date window returns empty frames instead of raising IndexError;
      * a ticker with no bar on a given index day no longer raises KeyError:
        its last known close is used for valuation and fallback fills, and
        entries are skipped when the next open is unavailable;
      * tickers already held are not re-bought (this used to overwrite the
        open position and lose its shares);
      * the time stop counts bars on the index calendar, which equals the
        ticker's own bar count whenever the ticker trades every index day;
      * the ratio-vs-EMA200 funnel flag is computed once per ticker instead
        of once per ticker per day;
      * trade P&L is net of both the entry and the exit fee, so it reconciles
        with the cash account;
      * params holding NumPy scalars no longer break JSON serialisation.

    Args:
        params: Strategy parameters (reads macd_req_above0, min_52w_pct, time_stop_bars).
        prices: Mapping ticker -> indicator frame from build_indicators_cached().
        index_df: Index OHLCV; its index is the trading calendar.
        start_date, end_date: Inclusive simulation window.
        tag: Label stored on every trade row (e.g. "TRAIN"/"TEST").
        universe: Tickers to trade; each must be a key of *prices*.
        verbose: Accepted for interface compatibility; not used.

    Returns:
        (trades_df, monthly_pnl_df, funnel_df)
    """
    cols = [
        "ticker","side","entry_date","entry_px","exit_date","exit_px",
        "shares","pnl","reason","hold_days","params","tag"
    ]
    market_index = index_df.index
    date_index = index_df.loc[start_date:end_date].index
    if len(date_index) == 0:
        # No trading days in the window: nothing to simulate.
        return pd.DataFrame(columns=cols), pd.DataFrame(columns=["month","pnl"]), pd.DataFrame()

    params_json = _params_to_json(params)
    cash = STARTING_CAPITAL
    positions = {}
    trades = []
    funnel_rows = []
    equity_curve = []

    # Precompute per-ticker series once, aligned to the index calendar.
    all_ent, all_exit, all_rs, all_ratio = {}, {}, {}, {}
    for t in universe:
        sdf = prices[t].reindex(market_index)
        all_ent[t]   = entry_signal(sdf, index_df, params)
        all_exit[t]  = (sdf["MACD_LINE"] < sdf["MACD_SIG"]).fillna(False)
        all_rs[t]    = rs63(sdf["Close"], index_df["Close"], 63)
        all_ratio[t] = ratio_ema200_pass(sdf["Close"], index_df["Close"])

    def bar_value(t, ts, field):
        """Return *field* for ticker *t* at *ts*, or NaN when that bar is absent."""
        sdf = prices[t]
        if ts is None or ts not in sdf.index:
            return np.nan
        return sdf.loc[ts, field]

    def funnel_template(d):
        return {"date": d.strftime("%Y-%m-%d"),
                "stage_all":0, "stage_macd":0, "stage_fib":0, "stage_ratio":0, "stage_52w":0, "stage_final":0}

    last_day = date_index[-1]
    need_above0 = params["macd_req_above0"]

    for d in date_index:
        loc_d = market_index.get_loc(d)
        nxt = market_index[loc_d + 1] if d != last_day else None

        # Mark-to-market (carry the last known close across missing bars)
        mtm = cash
        for t, pos in positions.items():
            px = bar_value(t, d, "Close")
            if pd.isna(px):
                px = pos["last_px"]
            else:
                pos["last_px"] = px
            mtm += pos["shares"] * px
        equity_curve.append((d, mtm))

        # Exits
        to_close = []
        for t, pos in positions.items():
            days_held = max(0, loc_d - pos["entry_loc"])
            time_stop_hit = days_held >= params["time_stop_bars"]
            sig_exit = bool(all_exit[t].get(d, False))
            alloc_stop_hit = bool(pos["last_px"] <= pos["entry_px"] * (1.0 - ALLOC_STOP_PCT))
            if nxt is not None and (sig_exit or time_stop_hit or alloc_stop_hit):
                reason = "exit_signal" if sig_exit else ("time_stop" if time_stop_hit else "alloc_stop")
                to_close.append((t, reason))

        for (t, reason) in to_close:
            pos = positions.pop(t)
            fill_px = bar_value(t, nxt, "Open")
            if pd.isna(fill_px):
                # No usable next open: fall back to the latest known close.
                fill_px = pos["last_px"]
            fill_px *= (1 - SLIPPAGE_PCT)
            notional = pos["shares"] * fill_px
            fees = notional * FEE_PCT_PER_SIDE
            proceeds = notional - fees
            cash += proceeds
            trades.append(dict(
                ticker=t, side="SELL",
                entry_date=pos["entry_date"].strftime("%Y-%m-%d"),
                entry_px=pos["entry_px"],
                exit_date=nxt.strftime("%Y-%m-%d"),
                exit_px=fill_px, shares=pos["shares"], pnl=proceeds - pos["cost"],
                reason=reason, hold_days=(nxt - pos["entry_date"]).days,
                params=params_json, tag=tag
            ))

        # Entries
        if nxt is None:
            continue
        free_slots = MAX_CONCURRENT - len(positions)
        if free_slots <= 0:
            continue

        funnel = funnel_template(d)
        cands = []
        for t in universe:
            sdf = prices[t]
            if d not in sdf.index:
                continue
            funnel["stage_all"] += 1
            macd_line = sdf.loc[d, "MACD_LINE"]
            macd_ok = (macd_line > sdf.loc[d, "MACD_SIG"]) and (macd_line > 0 if need_above0 else True)
            if macd_ok: funnel["stage_macd"] += 1
            fib_ok = bool(sdf.loc[d, "FIB_BOUNCE"])
            if macd_ok and fib_ok: funnel["stage_fib"] += 1
            ratio_ok = bool(all_ratio[t].get(d, False))
            if macd_ok and fib_ok and ratio_ok: funnel["stage_ratio"] += 1
            high_ok = sdf.loc[d, "PCT_52W_HIGH"] >= params["min_52w_pct"]
            if macd_ok and fib_ok and ratio_ok and high_ok: funnel["stage_52w"] += 1
            if all_ent[t].get(d, False):
                funnel["stage_final"] += 1
                # Never re-buy a ticker that is still held: that would
                # overwrite the open position and lose its shares.
                if t not in positions:
                    cands.append(t)

        funnel_rows.append(funnel)
        if not cands:
            continue

        ranked = sorted(cands, key=lambda x: all_rs[x].get(d, -np.inf), reverse=True)
        chosen = ranked[:free_slots]

        target_notional = mtm / MAX_CONCURRENT

        for t in chosen:
            open_px = bar_value(t, nxt, "Open")
            if pd.isna(open_px):
                continue
            fill_px = open_px * (1 + SLIPPAGE_PCT)
            shares = math.floor(target_notional / fill_px)
            if shares <= 0:
                continue
            notional = shares * fill_px
            fees = notional * FEE_PCT_PER_SIDE
            total_cost = notional + fees
            if total_cost > cash:
                # Not enough cash for the full slot: size down to what is affordable.
                shares = math.floor((cash * 0.99) / fill_px)
                if shares <= 0:
                    continue
                notional = shares * fill_px
                fees = notional * FEE_PCT_PER_SIDE
                total_cost = notional + fees
            cash -= total_cost
            positions[t] = dict(entry_date=nxt, entry_loc=loc_d + 1, entry_px=fill_px,
                                shares=shares, cost=total_cost, last_px=fill_px)
            trades.append(dict(
                ticker=t, side="BUY",
                entry_date=nxt.strftime("%Y-%m-%d"), entry_px=fill_px,
                exit_date="", exit_px=np.nan, shares=shares,
                pnl=0.0, reason="entry_macd_fib_ratio_52w", hold_days=0,
                params=params_json, tag=tag
            ))

    # Liquidate leftovers at final day close
    for t, pos in list(positions.items()):
        close_px = bar_value(t, last_day, "Close")
        if pd.isna(close_px):
            close_px = pos["last_px"]
        fill_px = close_px * (1 - SLIPPAGE_PCT)
        notional = pos["shares"] * fill_px
        fees = notional * FEE_PCT_PER_SIDE
        proceeds = notional - fees
        cash += proceeds
        trades.append(dict(
            ticker=t, side="SELL",
            entry_date=pos["entry_date"].strftime("%Y-%m-%d"),
            entry_px=pos["entry_px"],
            exit_date=last_day.strftime("%Y-%m-%d"),
            exit_px=fill_px, shares=pos["shares"], pnl=proceeds - pos["cost"],
            reason="liquidation_eod",
            hold_days=(last_day - pos["entry_date"]).days, params=params_json, tag=tag
        ))

    # Build trades df with stable columns even if empty
    trades_df = pd.DataFrame(trades, columns=cols)

    # Monthly P&L from equity changes
    eq_df = pd.DataFrame(equity_curve, columns=["date","equity"]).set_index("date")
    pnl_df = eq_df["equity"].diff().fillna(0.0)
    monthly = pnl_df.groupby(eq_df.index.to_series().dt.strftime("%Y-%m")).sum().reset_index()
    monthly.columns = ["month","pnl"]

    return trades_df, monthly, pd.DataFrame(funnel_rows)

# =========================
# Prep, Grid, and Scoring
# =========================

def eligible_universe_for_window(universe, train_start: str) -> list:
    """Keep tickers whose history starts at least MIN_WARMUP_DAYS calendar days before train_start.

    The price cache is consulted first. A cache file that starts too late is
    not treated as proof of ineligibility, because it may simply hold a
    narrower window written by an earlier download; in that case a long
    history is fetched from Yahoo to decide. Previously such a file made the
    ticker (and, on a second run, the whole fold) ineligible.

    Args:
        universe: Candidate ticker symbols.
        train_start: Start of the training window, ``YYYY-MM-DD``.

    Returns:
        The eligible tickers, in input order.
    """
    need_start = (pd.to_datetime(train_start) - pd.Timedelta(days=MIN_WARMUP_DAYS)).date()
    ok = []
    for t in universe:
        earliest = None
        fp = os.path.join(CACHE_PRICE_DIR, f"{t.replace('^','_INDEX_')}.csv")
        if os.path.exists(fp):
            try:
                cached = pd.read_csv(fp, parse_dates=["Datetime"]).set_index("Datetime")
                if not cached.empty:
                    earliest = pd.to_datetime(cached.index.min()).date()
            except Exception:
                # Unreadable cache: decide from a fresh download instead.
                earliest = None
        if earliest is None or earliest > need_start:
            # auto_adjust is passed explicitly, matching download_or_load_price().
            df = yf.download(t, start="2005-01-01", end=train_start,
                             auto_adjust=False, progress=False, threads=False)
            if df is None or df.empty:
                continue
            earliest = pd.to_datetime(df.index.min()).date()
        if earliest <= need_start:
            ok.append(t)
    return ok

def prepare_ticker_prices(universe, start, end, params):
    """Load prices and indicators for each ticker.

    Returns:
        (prices, loaded, missing): a ticker -> indicator frame mapping, the
        tickers that loaded, and the tickers whose price download came back empty.
    """
    prices = {}
    loaded = []
    missing = []
    for symbol in universe:
        raw = download_or_load_price(symbol, start, end)
        if not raw.empty:
            prices[symbol] = build_indicators_cached(symbol, raw, params)
            loaded.append(symbol)
        else:
            missing.append(symbol)
    return prices, loaded, missing

def cartesian_space(grid: dict):
    """Yield one dict per combination of the value lists in *grid* (last key varies fastest)."""
    import itertools
    names = tuple(grid)
    value_lists = (grid[name] for name in names)
    for values in itertools.product(*value_lists):
        yield {name: value for name, value in zip(names, values)}

def score_grid_combo(params_delta, base_params, universe, start, end):
    """Joblib worker: simulate one parameter combination on TRAIN and return its metrics.

    Args:
        params_delta: Grid overrides applied on top of a copy of *base_params*.
        base_params: Default parameter dict (not mutated).
        universe: Tickers to load.
        start, end: TRAIN window used for both loading and simulation.

    Returns:
        A flat dict of the effective parameters plus either the metrics
        (total_pnl, n_trades, win_rate_pct, avg_hold_days, loaded, missing)
        or an ``error`` string. Exceptions are never raised to the caller,
        so one failing combination cannot abort the grid.
    """
    params = copy.deepcopy(base_params)
    params.update(params_delta)
    try:
        idx_df = load_index_series(start, end)
        prices, loaded, missing = prepare_ticker_prices(universe, start, end, params)
        if not loaded:
            return dict(**params, error="no_tickers_loaded", loaded=0, missing=len(missing))

        trades_df, _monthly, _funnel = simulate_portfolio(
            params, prices, idx_df, start, end, tag="TRAIN", universe=loaded, verbose=False
        )

        # Robust scoring: an empty ledger scores as zero trades.
        total_pnl, n_trades, win_rate, avg_hold = 0.0, 0, 0.0, np.nan
        if trades_df is not None and not trades_df.empty:
            sells = trades_df[trades_df["side"] == "SELL"]
            n_trades = int(len(sells))
            total_pnl = float(sells["pnl"].sum())  # realized P&L only
            if n_trades > 0:
                win_rate = float(100.0 * int((sells["pnl"] > 0).sum()) / n_trades)
                avg_hold = float(sells["hold_days"].mean())

        return dict(
            **params,
            total_pnl=total_pnl,
            n_trades=n_trades,
            win_rate_pct=win_rate,
            avg_hold_days=avg_hold,
            loaded=len(loaded),
            missing=len(missing),
        )

    except Exception as e:
        # Include the exception type: str(KeyError(...)) alone is just the
        # missing key, which makes the error histogram unreadable.
        return dict(**params, error=f"{type(e).__name__}: {e}")

def run_grid_parallel(universe, start, end, grid, base_params) -> pd.DataFrame:
    """Score every combination in *grid* across N_JOBS loky workers; one result row per combination."""
    combos = list(cartesian_space(grid))
    log(f"[GRID] Combos: {len(combos)} | Parallel jobs: {N_JOBS}")
    worker = delayed(score_grid_combo)
    jobs = (worker(combo, base_params, universe, start, end) for combo in combos)
    pool = Parallel(n_jobs=N_JOBS, backend="loky", verbose=10)
    return pd.DataFrame(pool(jobs))

def _dump_grid_failures(grid_df: pd.DataFrame, fold_dir: str):
    if grid_df is None or grid_df.empty:
        return
    grid_df.to_csv(os.path.join(fold_dir, "grid_train_raw.csv"), index=False)
    if "error" in grid_df.columns:
        err = grid_df["error"].value_counts(dropna=False).reset_index()
        err.columns = ["error", "count"]
        err.to_csv(os.path.join(fold_dir, "grid_errors_summary.csv"), index=False)

def pick_best_params(grid_df: pd.DataFrame):
    """Select the best grid row by total_pnl, then win_rate_pct, then n_trades (all descending).

    Values taken from a DataFrame row are NumPy scalars (``np.int64``,
    ``np.bool_`` ...), which ``json.dumps`` rejects. Both returned dicts are
    therefore converted to native Python values.

    Args:
        grid_df: Result of run_grid_parallel().

    Returns:
        (chosen, best_row): *chosen* has one entry per DEFAULT_PARAMS key,
        taken from the best row where present; *best_row* is the full best
        row as a dict.

    Raises:
        RuntimeError: if the grid is empty or contains no row with a total_pnl.
    """
    if grid_df is None or grid_df.empty:
        raise RuntimeError("Grid returned 0 rows (index/universe likely empty).")
    ok = grid_df.dropna(subset=["total_pnl"]) if "total_pnl" in grid_df.columns else pd.DataFrame()
    if ok.empty:
        errs = grid_df["error"].value_counts().to_dict() if "error" in grid_df.columns else {}
        raise RuntimeError(f"No valid grid results. Error histogram: {errs}")
    best = ok.sort_values(["total_pnl","win_rate_pct","n_trades"], ascending=[False, False, False]).iloc[0]

    def _native(value):
        # np.generic covers all NumPy scalar types; .item() yields int/float/bool.
        return value.item() if isinstance(value, np.generic) else value

    best_row = {k: _native(v) for k, v in best.items()}
    chosen = {k: best_row.get(k, default) for k, default in DEFAULT_PARAMS.items()}
    return chosen, best_row

def write_csvs(out_dir: str, **dfs):
    """Write each keyword argument as ``<out_dir>/<name>.csv`` (no index); None values are skipped."""
    ensure_dir(out_dir)
    present = ((name, frame) for name, frame in dfs.items() if frame is not None)
    for name, frame in present:
        frame.to_csv(os.path.join(out_dir, f"{name}.csv"), index=False)

# =========================
# Walk-Forward Engine
# =========================

def _summarize_phase(trades_df, fold_id, phase, start, end, params_json):
    """Build one summary row for a fold phase from the SELL legs of its trade ledger."""
    sells = trades_df[trades_df["side"] == "SELL"] if not trades_df.empty else trades_df
    n_sells = int(len(sells))
    wins = int((sells["pnl"] > 0).sum()) if n_sells else 0
    return dict(
        fold=fold_id,
        phase=phase,
        start=start,
        end=end,
        total_pnl=float(sells["pnl"].sum()) if n_sells else 0.0,
        n_trades=n_sells,
        win_rate_pct=float(100.0 * wins / n_sells) if n_sells else 0.0,
        avg_hold_days=float(sells["hold_days"].mean()) if n_sells else np.nan,
        params=params_json,
    )


def run_fold(universe, train_start, train_end, test_start, test_end, fold_id, out_root):
    """Run one walk-forward fold: grid-search on TRAIN, then detailed TRAIN and TEST runs.

    Differences from the previous implementation:
      * ``n_trades`` counts SELL rows only (``.count()`` counted BUY rows too);
      * TEST data is loaded from ``train_start`` so indicators are warmed up
        when the test window opens; the simulation still covers only
        ``test_start..test_end``;
      * ``grid_train.csv`` holds only the successfully scored rows, best
        first, as the module docstring describes; the unfiltered dump stays
        in ``grid_train_raw.csv``;
      * best parameters are converted to native Python values before JSON
        serialisation.

    Args:
        universe: Candidate tickers.
        train_start, train_end, test_start, test_end: Window bounds, ``YYYY-MM-DD``.
        fold_id: 1-based fold number used in output names.
        out_root: Root output directory.

    Returns:
        A two-row summary frame (TRAIN, TEST), or an empty frame with the
        summary columns if no ticker has enough history.

    Raises:
        RuntimeError: propagated from pick_best_params() when the grid has no valid row.
    """
    fold_dir = os.path.join(out_root, f"fold_{fold_id:02d}_{train_start}_to_{test_end}")
    ensure_dir(fold_dir)
    log(f"[FOLD {fold_id}] Train {train_start}→{train_end} | Test {test_start}→{test_end}")

    # Prefilter universe by warmup availability
    fold_universe = eligible_universe_for_window(universe, train_start)
    if not fold_universe:
        log(f"[FOLD {fold_id}] No eligible tickers with ≥{MIN_WARMUP_DAYS}d history before {train_start}. Skipping.")
        return pd.DataFrame(columns=["fold","phase","start","end","total_pnl","n_trades","win_rate_pct","avg_hold_days","params"])

    def _native(value):
        return value.item() if isinstance(value, np.generic) else value

    # 1) GRID (parallel) on TRAIN
    grid_df = run_grid_parallel(fold_universe, train_start, train_end, GRID, DEFAULT_PARAMS)
    _dump_grid_failures(grid_df, fold_dir)
    scored = grid_df
    if "total_pnl" in grid_df.columns:
        scored = grid_df.dropna(subset=["total_pnl"]).sort_values("total_pnl", ascending=False)
    scored.to_csv(os.path.join(fold_dir, "grid_train.csv"), index=False)
    best_params, best_row = pick_best_params(grid_df)
    best_params = {k: _native(v) for k, v in best_params.items()}
    best_row = {k: _native(v) for k, v in best_row.items()}
    params_json = json.dumps(best_params)
    log(f"[FOLD {fold_id}] Best params: {best_params}")
    log(f"[FOLD {fold_id}] Best TRAIN metrics: {json.dumps({k: best_row.get(k,None) for k in ['total_pnl','n_trades','win_rate_pct','avg_hold_days','loaded','missing']}, indent=2)}")

    # 2) TRAIN detailed run with best (same data window as the grid, so it reproduces the best row)
    idx_df_tr = load_index_series(train_start, train_end)
    prices_tr, loaded_tr, missing_tr = prepare_ticker_prices(fold_universe, train_start, train_end, best_params)
    trades_tr, monthly_tr, funnel_tr = simulate_portfolio(best_params, prices_tr, idx_df_tr, train_start, train_end, tag="TRAIN", universe=loaded_tr, verbose=False)
    write_csvs(fold_dir,
               trades_train=trades_tr,
               monthly_pnl_train=monthly_tr,
               loaded_tickers_train=pd.DataFrame({"ticker":loaded_tr}),
               missing_tickers_train=pd.DataFrame({"ticker":missing_tr}),
               funnel_train=funnel_tr)

    # 3) TEST with best: load history from train_start for indicator warm-up,
    #    but simulate (and therefore trade) only inside the test window.
    idx_df_te = load_index_series(train_start, test_end)
    prices_te, loaded_te, missing_te = prepare_ticker_prices(fold_universe, train_start, test_end, best_params)
    trades_te, monthly_te, funnel_te = simulate_portfolio(best_params, prices_te, idx_df_te, test_start, test_end, tag="TEST", universe=loaded_te, verbose=False)
    write_csvs(fold_dir,
               trades_test=trades_te,
               monthly_pnl_test=monthly_te,
               loaded_tickers_test=pd.DataFrame({"ticker":loaded_te}),
               missing_tickers_test=pd.DataFrame({"ticker":missing_te}),
               funnel_test=funnel_te)

    # Summaries
    tr_sum = _summarize_phase(trades_tr, fold_id, "TRAIN", train_start, train_end, params_json)
    te_sum = _summarize_phase(trades_te, fold_id, "TEST", test_start, test_end, params_json)

    summary_df = pd.DataFrame([tr_sum, te_sum])
    summary_df.to_csv(os.path.join(fold_dir, "summary_fold.csv"), index=False)
    return summary_df

def run_wfo(universe, out_root):
    """Run every fold of every scheme in WFO_FOLDS and write the aggregate summaries.

    Writes ``summary_wfo_all_folds.csv`` and, when TEST rows exist,
    ``summary_wfo_test_aggregate.csv`` under *out_root*. Logs an error and
    returns early if no window could be generated.
    """
    ensure_dir(out_root)
    fold_summaries = []
    next_fold = 1

    for train_years, test_months in WFO_FOLDS:
        windows = list(generate_wfo_windows(MASTER_START, MASTER_END, train_years, test_months))
        log(f"[INFO] Scheme {train_years}y/{test_months}m -> {len(windows)} window(s)")
        for tr_s, tr_e, te_s, te_e in windows:
            fold_summaries.append(run_fold(universe, tr_s, tr_e, te_s, te_e, next_fold, out_root))
            next_fold += 1

    # One summary is appended per window, so an empty list means no windows at all.
    if not fold_summaries:
        log("[ERROR] No WFO windows were generated. Check MASTER_* and WFO_FOLDS.")
        return

    agg = pd.concat(fold_summaries, axis=0, ignore_index=True)
    agg.to_csv(os.path.join(out_root, "summary_wfo_all_folds.csv"), index=False)

    test_only = agg[agg["phase"]=="TEST"].copy()
    if not test_only.empty:
        pnl = test_only["total_pnl"]
        win_rate = test_only["win_rate_pct"]
        stats = dict(
            folds=len(test_only["fold"].unique()),
            total_pnl=float(pnl.sum()),
            mean_pnl=float(pnl.mean()),
            median_pnl=float(pnl.median()),
            total_trades=int(test_only["n_trades"].sum()),
            mean_win_rate_pct=float(win_rate.mean()),
            median_win_rate_pct=float(win_rate.median()),
            mean_avg_hold_days=float(test_only["avg_hold_days"].mean()),
        )
        pd.DataFrame([stats]).to_csv(os.path.join(out_root, "summary_wfo_test_aggregate.csv"), index=False)
    log("WFO complete.")

# =========================
# Main
# =========================

def main():
    """Script entry point: log the configuration, sanity-check the windows, run the walk-forward, report elapsed time."""
    started = time.time()
    ensure_dir(OUT_ROOT)
    for line in (
        f"Outputs -> {OUT_ROOT}",
        f"Universe size: {len(UNIVERSE)} | CPU parallel jobs: {N_JOBS}",
    ):
        log(line)

    sanity_check_wfo()
    run_wfo(UNIVERSE, OUT_ROOT)

    log(f"Done in {time.time() - started:.1f}s")

if __name__ == "__main__":
    main()


[2025-10-03 23:34:36] Outputs -> ./outputs_wfo/2025-10-03_233436
[2025-10-03 23:34:36] Universe size: 10 | CPU parallel jobs: 9
[2025-10-03 23:34:36] [WFO CHECK] 10y/24m -> 1 window(s) within 2013-01-01..2025-09-30
[2025-10-03 23:34:36] [WFO CHECK] First window: TRAIN 2013-01-01→2023-01-01 | TEST 2023-01-02→2025-01-02
[2025-10-03 23:34:36] [INFO] Scheme 10y/24m -> 1 window(s)
[2025-10-03 23:34:36] [FOLD 1] Train 2013-01-01→2023-01-01 | Test 2023-01-02→2025-01-02
[2025-10-03 23:34:36] [GRID] Combos: 1728 | Parallel jobs: 9


  df = yf.download(t, start="2005-01-01", end=train_start, progress=False, threads=False)
  df = yf.download(t, start="2005-01-01", end=train_start, progress=False, threads=False)
  df = yf.download(t, start="2005-01-01", end=train_start, progress=False, threads=False)
  df = yf.download(t, start="2005-01-01", end=train_start, progress=False, threads=False)
  df = yf.download(t, start="2005-01-01", end=train_start, progress=False, threads=False)
  df = yf.download(t, start="2005-01-01", end=train_start, progress=False, threads=False)
  df = yf.download(t, start="2005-01-01", end=train_start, progress=False, threads=False)
  df = yf.download(t, start="2005-01-01", end=train_start, progress=False, threads=False)
  df = yf.download(t, start="2005-01-01", end=train_start, progress=False, threads=False)
  df = yf.download(t, start="2005-01-01", end=train_start, progress=False, threads=False)
[Parallel(n_jobs=9)]: Using backend LokyBackend with 9 concurrent workers.
[Parallel(n_jobs=9)]: Don