# Resumable Quantitative Research Pipeline

A fully automated, restart-safe research pipeline for systematic strategy discovery.

**Pipeline Steps:**
1. Data Layer — Load & clean OHLCV data
2. Feature Engine — Momentum, volatility, regime features + cross-sectional deciles
3. Candidate Generator — Decile conditions, decision trees, logistic rank model
4. Edge Evaluation — Win rate, Sharpe, lift, expectancy per candidate
5. Walk-Forward Validation — Rolling 3Y train / 1Y test
6. Overfitting Control — Stability filtering, bootstrap CI
7. Strategy Scoring — Composite rank and final selection
8. Output — Ranked table, equity curves, portfolio combination

**Key Feature:** Every step saves progress to Google Drive. If the runtime disconnects, re-run the notebook and it resumes from the last completed step.

| Runtime | Est. Time (100 tickers, 10Y) |
|---------|------------------------------|
| T4 GPU  | ~60-90 min                   |
| CPU     | ~120-180 min                 |

In [None]:
!pip install -q yfinance pandas numpy scikit-learn scipy matplotlib pyarrow joblib

## 0. Configuration

In [None]:
import os
import json
import time
import gc
import logging
import warnings
from dataclasses import dataclass, field, asdict
from typing import List, Optional, Dict, Tuple

warnings.filterwarnings('ignore')


@dataclass
class PipelineConfig:
    """Single source of truth for every tunable setting in the pipeline.

    Fields are grouped by pipeline stage. The ``*_dir`` / ``state_path``
    properties are derived from ``drive_root`` on each access, so assigning a
    new ``drive_root`` after construction relocates all of them.
    """

    # --- Data ---
    tickers: List[str] = field(default_factory=lambda: [
        # US Large Cap
        "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "TSLA", "BRK-B",
        "JPM", "JNJ", "V", "PG", "UNH", "HD", "MA", "DIS", "BAC", "NFLX",
        "ADBE", "CRM", "XOM", "VZ", "KO", "INTC", "PEP", "ABT", "CSCO",
        "COST", "MRK", "WMT", "AVGO", "ACN", "CVX", "NKE", "LLY", "MCD",
        "QCOM", "UPS", "BMY", "LIN", "NEE", "ORCL", "RTX", "HON", "TXN",
        "AMD", "PYPL", "CMCSA", "TMO", "DHR",
    ])
    market_ticker: str = "SPY"
    data_period: str = "10y"

    # --- Features ---
    momentum_windows: List[int] = field(default_factory=lambda: [5, 20, 60, 120])
    volatility_windows: List[int] = field(default_factory=lambda: [20, 60])
    regime_window: int = 60
    n_decile_bins: int = 10

    # --- Candidates ---
    max_combo_features: int = 2
    min_sample_size: int = 300
    candidate_batch_size: int = 500
    # Decision tree
    tree_max_depth: int = 2
    tree_min_samples_leaf: int = 500
    n_trees: int = 20
    # Logistic rank
    logistic_top_pct: float = 0.20

    # --- Forward return ---
    forward_days: int = 21  # 1-month forward return

    # --- Walk-Forward ---
    wf_train_years: int = 3
    wf_test_months: int = 12
    wf_step_months: int = 6
    wf_embargo_days: int = 5

    # --- Overfitting ---
    min_stability: float = 0.5
    min_sharpe: float = 0.3
    min_win_rate: float = 0.50
    bootstrap_n: int = 1000
    bootstrap_ci: float = 0.95

    # --- Scoring ---
    w_stability: float = 0.30
    w_sharpe: float = 0.30
    w_lift: float = 0.20
    w_sample: float = 0.20

    # --- Paths ---
    drive_root: str = "/content/drive/MyDrive/quant_pipeline"

    # --- Random seed ---
    seed: int = 42

    def _under_root(self, leaf):
        """Join ``leaf`` onto the current ``drive_root``."""
        return os.path.join(self.drive_root, leaf)

    @property
    def data_dir(self):
        """Directory for downloaded/cleaned price data."""
        return self._under_root("data")

    @property
    def features_dir(self):
        """Directory for the feature panel."""
        return self._under_root("features")

    @property
    def candidates_dir(self):
        """Directory for candidate tables and pickled models."""
        return self._under_root("candidates")

    @property
    def evaluation_dir(self):
        """Directory for edge-evaluation results."""
        return self._under_root("evaluation")

    @property
    def walkforward_dir(self):
        """Directory for walk-forward results."""
        return self._under_root("walkforward")

    @property
    def logs_dir(self):
        """Directory for log files."""
        return self._under_root("logs")

    @property
    def state_path(self):
        """Path of the JSON checkpoint file."""
        return self._under_root("state.json")


# Instantiate the shared configuration used by every later cell and echo
# the settings that most affect runtime.
CFG = PipelineConfig()
print("Config created. Drive root: %s" % CFG.drive_root)
print(
    "Tickers: %d | Period: %s | Forward: %dD"
    % (len(CFG.tickers), CFG.data_period, CFG.forward_days)
)

## 1. Persistence & Resume System

In [None]:
# Try to mount Google Drive so checkpoints outlive the runtime.
DRIVE_MOUNTED = False
try:
    from google.colab import drive
    drive.mount('/content/drive', timeout_ms=60000)
except Exception as e:
    # Import or mount failed: redirect every pipeline path to local disk.
    print("Drive mount failed: %s" % str(e)[:80])
    print("Using local storage (data lost on disconnect).")
    CFG.drive_root = "/content/quant_pipeline"
else:
    DRIVE_MOUNTED = True
    print("Google Drive mounted.")

# Make sure every output directory exists (paths follow CFG.drive_root,
# so this runs after the fallback above has been applied).
for d in (
    CFG.data_dir,
    CFG.features_dir,
    CFG.candidates_dir,
    CFG.evaluation_dir,
    CFG.walkforward_dir,
    CFG.logs_dir,
):
    os.makedirs(d, exist_ok=True)
print("Directories ready.")

In [None]:
class ProgressTracker:
    """JSON-based checkpoint system for resumable execution.

    The state is a dict with two keys: ``"completed_steps"`` (step name ->
    bool) and ``"metadata"`` (step name -> dict). Every mutation is written
    to ``state_path`` immediately.

    Writes go to a sibling ``.tmp`` file that is then renamed over the real
    file, so an interruption mid-write cannot leave a truncated
    ``state.json``. If the file is nevertheless unreadable, the tracker warns
    and starts from an empty state instead of raising.
    """

    def __init__(self, state_path: str):
        """Load existing state from ``state_path`` or start empty."""
        self.state_path = state_path
        self.state = self._load()

    @staticmethod
    def _empty_state() -> dict:
        """Return a fresh state with no completed steps."""
        return {"completed_steps": {}, "metadata": {}}

    def _load(self) -> dict:
        """Read the state file; fall back to an empty state if missing or corrupt."""
        try:
            with open(self.state_path, 'r') as f:
                state = json.load(f)
        except FileNotFoundError:
            return self._empty_state()
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # A half-written or damaged file must not block a resume.
            print("  [WARN] Could not parse %s (%s); starting with empty state."
                  % (self.state_path, str(e)[:80]))
            return self._empty_state()

        if not isinstance(state, dict):
            print("  [WARN] Unexpected content in %s; starting with empty state."
                  % self.state_path)
            return self._empty_state()

        # Tolerate files that lack one of the two sections.
        for key in ("completed_steps", "metadata"):
            if not isinstance(state.get(key), dict):
                state[key] = {}
        return state

    def _save(self):
        """Persist the state atomically (write temp file, then rename)."""
        tmp_path = self.state_path + ".tmp"
        with open(tmp_path, 'w') as f:
            # default=str stringifies values json cannot encode natively.
            json.dump(self.state, f, indent=2, default=str)
        os.replace(tmp_path, self.state_path)

    def is_completed(self, step_name: str) -> bool:
        """Return True if ``step_name`` has been marked completed."""
        return bool(self.state["completed_steps"].get(step_name, False))

    def mark_completed(self, step_name: str, metadata: Optional[dict] = None):
        """Mark ``step_name`` done, optionally storing ``metadata``, and save."""
        self.state["completed_steps"][step_name] = True
        if metadata:
            self.state["metadata"][step_name] = metadata
        self._save()
        print("  [CHECKPOINT] %s completed." % step_name)

    def get_metadata(self, step_name: str) -> dict:
        """Return the metadata stored for ``step_name`` (empty dict if none)."""
        return self.state["metadata"].get(step_name, {})

    def reset(self, step_name: Optional[str] = None):
        """Forget one step, or every step when ``step_name`` is omitted, and save."""
        if step_name:
            self.state["completed_steps"].pop(step_name, None)
            self.state["metadata"].pop(step_name, None)
        else:
            self.state = self._empty_state()
        self._save()

    def summary(self):
        """Print the list of completed steps."""
        completed = [k for k, v in self.state["completed_steps"].items() if v]
        print("=== Progress Summary ===")
        if not completed:
            print("  No steps completed yet.")
            return
        for s in completed:
            print("  [DONE] %s" % s)


# Shared checkpoint tracker, backed by the JSON file at CFG.state_path.
# Each step cell below asks it whether that step is already done.
tracker = ProgressTracker(CFG.state_path)
# Print which steps are already recorded as completed.
tracker.summary()

# Uncomment to force re-run from scratch:
# tracker.reset()

In [None]:
# Setup logging: INFO and above go to both the notebook output and an
# append-mode log file under CFG.logs_dir.
import logging

log_file = os.path.join(CFG.logs_dir, "pipeline.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(log_file, mode='a'),
    ],
    # basicConfig() does nothing if the root logger already has handlers
    # (e.g. installed by the hosting environment or by a previous run of this
    # cell). force=True (Python 3.8+) removes and closes them first, so the
    # file handler and INFO level are always applied and never duplicated.
    force=True,
)
logger = logging.getLogger("pipeline")
logger.info("Pipeline started. Log file: %s", log_file)

import numpy as np
import pandas as pd
# Seed the global NumPy RNG used by the np.random.* calls in later cells.
np.random.seed(CFG.seed)

## 2. Data Layer

Downloads OHLCV data via yfinance. Saves cleaned data to `data/processed.parquet`.

**Safeguards:**
- No lookahead bias (forward returns computed separately)
- Proper datetime index
- Survivorship bias warning

In [None]:
import yfinance as yf

# Step "data_load": download per-ticker OHLCV plus the market index, build a
# (date, ticker)-indexed panel, and cache both as parquet. On resume the
# cached files are loaded instead.
STEP = "data_load"
processed_path = os.path.join(CFG.data_dir, "processed.parquet")
market_path = os.path.join(CFG.data_dir, "market.parquet")

if tracker.is_completed(STEP):
    logger.info("[SKIP] %s already completed. Loading from cache." % STEP)
    ohlcv_panel = pd.read_parquet(processed_path)
    market_df = pd.read_parquet(market_path)
    print("Loaded: %s" % str(ohlcv_panel.shape))
else:
    logger.info("[RUN] %s" % STEP)
    t0 = time.time()

    # WARNING: Survivorship bias -- this universe only includes currently listed tickers.
    # For production research, use a point-in-time universe with delisted stocks.

    all_dfs = []
    failed = []
    for i, ticker in enumerate(CFG.tickers):
        if (i + 1) % 10 == 0 or i == 0:
            print("  [%d/%d] %s" % (i + 1, len(CFG.tickers), ticker))
        try:
            df = yf.download(ticker, period=CFG.data_period, progress=False, auto_adjust=True)
            # Skip tickers with no data or fewer than 252 rows.
            if df.empty or len(df) < 252:
                failed.append(ticker)
                continue
            # Flatten MultiIndex columns if present
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.get_level_values(0)
            df = df[["Open", "High", "Low", "Close", "Volume"]].copy()
            df.columns = ["open", "high", "low", "close", "volume"]
            df.index = pd.to_datetime(df.index)
            df.index = df.index.tz_localize(None)
            df["ticker"] = ticker
            all_dfs.append(df)
        except Exception as e:
            # Best-effort: one bad ticker must not abort the whole download.
            failed.append(ticker)
            print("    FAIL: %s -- %s" % (ticker, str(e)[:60]))

    # Fail loudly with a clear message rather than letting pd.concat raise
    # on an empty list.
    if not all_dfs:
        raise RuntimeError(
            "No ticker data could be downloaded (failed: %s). "
            "Step '%s' was not checkpointed." % (failed, STEP))

    ohlcv_panel = pd.concat(all_dfs)
    ohlcv_panel = ohlcv_panel.set_index([ohlcv_panel.index, "ticker"])
    ohlcv_panel.index.names = ["date", "ticker"]
    ohlcv_panel = ohlcv_panel.sort_index()

    # Market index
    market_df = yf.download(CFG.market_ticker, period=CFG.data_period, progress=False, auto_adjust=True)
    # An empty market frame must never be cached and checkpointed: every
    # later resume would load it and the regime features would be all-NaN.
    if market_df is None or market_df.empty:
        raise RuntimeError(
            "Market data download for %s returned no rows. "
            "Step '%s' was not checkpointed." % (CFG.market_ticker, STEP))
    if isinstance(market_df.columns, pd.MultiIndex):
        market_df.columns = market_df.columns.get_level_values(0)
    market_df = market_df[["Close"]].copy()
    market_df.columns = ["close"]
    market_df.index = pd.to_datetime(market_df.index)
    market_df.index = market_df.index.tz_localize(None)

    # Save both files first; only then record the step as done.
    ohlcv_panel.to_parquet(processed_path)
    market_df.to_parquet(market_path)

    elapsed = time.time() - t0
    meta = {"n_tickers": len(all_dfs), "failed": failed, "rows": len(ohlcv_panel), "time_sec": elapsed}
    tracker.mark_completed(STEP, meta)

    print("\nDownloaded: %d/%d tickers in %.0fs" % (len(all_dfs), len(CFG.tickers), elapsed))
    if failed:
        print("Failed: %s" % failed)

# Summary (index level 0 is the date, level 1 the ticker)
valid_tickers = ohlcv_panel.index.get_level_values(1).unique().tolist()
dates = ohlcv_panel.index.get_level_values(0).unique()
print("\nPanel: %s | Tickers: %d" % (str(ohlcv_panel.shape), len(valid_tickers)))
print("Date range: %s to %s" % (dates.min().date(), dates.max().date()))

## 3. Feature Engine

Generates momentum, volatility, and regime features, then converts to cross-sectional deciles.

All features are saved together in `features/all_features.parquet`, and completion of the step is tracked in `state.json`.

In [None]:
def compute_momentum_features(close_df: pd.DataFrame, windows: list) -> pd.DataFrame:
    """Build one trailing-return column per look-back window.

    For each ``w`` in ``windows`` the column ``mom_<w>d`` holds
    ``close_df.pct_change(w)``. The result shares ``close_df``'s index.
    """
    columns = {"mom_%dd" % span: close_df.pct_change(span) for span in windows}
    return pd.DataFrame(columns, index=close_df.index)


def compute_volatility_features(close_df: pd.DataFrame, windows: list) -> pd.DataFrame:
    """Build rolling-volatility columns from one-period returns.

    For each ``w`` in ``windows`` the column ``vol_<w>d`` is the rolling
    standard deviation of ``close_df.pct_change()`` over ``w`` rows. When at
    least two windows are given, ``vol_change`` is added as the ratio of the
    first window's volatility to the last window's, minus 1; a zero
    denominator is treated as missing.
    """
    one_period_ret = close_df.pct_change()
    columns = {
        "vol_%dd" % span: one_period_ret.rolling(span).std() for span in windows
    }

    if len(windows) >= 2:
        # Reuse the columns computed above: first window = "short",
        # last window = "long".
        numerator = columns["vol_%dd" % windows[0]]
        denominator = columns["vol_%dd" % windows[-1]].replace(0, np.nan)
        columns["vol_change"] = numerator / denominator - 1.0

    return pd.DataFrame(columns, index=close_df.index)


def compute_regime_features(market_close: pd.Series, regime_window: int) -> pd.DataFrame:
    """Derive market-level momentum, volatility and a bull/bear flag.

    Columns returned (indexed like ``market_close``):

    * ``market_mom_<w>d`` -- ``pct_change`` over ``regime_window`` rows.
    * ``market_vol_<w>d`` -- rolling std of one-period returns over
      ``regime_window`` rows.
    * ``regime_bull`` -- 1.0 where momentum is positive AND volatility is
      below its trailing median (252-row window, at least 60 observations),
      else 0.0. Rows where either comparison involves NaN come out as 0.0.
    """
    mom_name = "market_mom_%dd" % regime_window
    vol_name = "market_vol_%dd" % regime_window

    momentum = market_close.pct_change(regime_window)
    volatility = market_close.pct_change().rolling(regime_window).std()
    trailing_median_vol = volatility.rolling(252, min_periods=60).median()

    calm_uptrend = (momentum > 0) & (volatility < trailing_median_vol)

    columns = {
        mom_name: momentum,
        vol_name: volatility,
        "regime_bull": calm_uptrend.astype(float),
    }
    return pd.DataFrame(columns, index=market_close.index)


def compute_market_relative_return(stock_ret: pd.Series, market_ret: pd.Series) -> pd.Series:
    """Return the stock's return in excess of the market's.

    The subtraction is index-aligned by pandas; labels present in only one
    of the two series yield NaN.
    """
    excess_ret = stock_ret - market_ret
    return excess_ret


def to_cross_sectional_deciles(feature_series: pd.Series, date_index, n_bins: int = 10) -> pd.Series:
    """Bucket a feature into per-date rank bins labelled 0 .. n_bins-1.

    ``feature_series`` is grouped by the index level ``date_index``. Within
    each group the non-missing values are ranked (ties broken by order of
    appearance) and the ranks are cut into ``n_bins`` equal-width bins.
    Missing inputs stay missing, and a group with fewer than ``n_bins``
    non-missing values is returned entirely as NaN.
    """
    def _bin_one_date(cross_section):
        observed = cross_section.dropna()
        if len(observed) >= n_bins:
            order = observed.rank(method='first')
            binned = pd.cut(order, bins=n_bins, labels=False)
            # Put NaN back at the positions that were dropped.
            return binned.reindex(cross_section.index)
        return pd.Series(np.nan, index=cross_section.index)

    per_date = feature_series.groupby(level=date_index)
    return per_date.transform(_bin_one_date)


print("Feature functions defined.")

In [None]:
# Step "features": build per-ticker momentum / volatility features, attach
# the market regime features and the forward-return target, then add
# cross-sectional decile columns. The whole panel is cached in one parquet.
STEP = "features"
features_path = os.path.join(CFG.features_dir, "all_features.parquet")

# Names of the market-level columns produced by compute_regime_features().
# They carry the same value for every ticker on a given date, so a
# cross-sectional rank of them only reflects row order (ties are broken by
# position) -- decile columns built from them are spurious and are excluded.
REGIME_COLS = [
    "market_mom_%dd" % CFG.regime_window,
    "market_vol_%dd" % CFG.regime_window,
    "regime_bull",
]

if tracker.is_completed(STEP):
    logger.info("[SKIP] %s already completed. Loading." % STEP)
    feature_panel = pd.read_parquet(features_path)
    print("Loaded features: %s" % str(feature_panel.shape))
else:
    logger.info("[RUN] %s" % STEP)
    t0 = time.time()
    all_features = []

    # Market returns for relative features & regime
    market_close = market_df["close"]
    market_ret_20d = market_close.pct_change(20)

    # Regime features (date-level, will broadcast to all tickers)
    regime_feats = compute_regime_features(market_close, CFG.regime_window)

    for ticker in valid_tickers:
        try:
            tdata = ohlcv_panel.loc[(slice(None), ticker), :].droplevel(1)
            close = tdata["close"]

            # Momentum
            mom = compute_momentum_features(close, CFG.momentum_windows)

            # Market-relative return
            stock_20d_ret = close.pct_change(20)
            mkt_20d_aligned = market_ret_20d.reindex(close.index, method='ffill')
            mom["market_relative_20d"] = stock_20d_ret - mkt_20d_aligned

            # Volatility
            vol = compute_volatility_features(close, CFG.volatility_windows)

            # Regime (broadcast market-level to stock dates)
            reg = regime_feats.reindex(close.index, method='ffill')

            # Forward return (target): value at row t is
            # close[t + forward_days] / close[t] - 1. It is only ever used
            # as the target, never as a feature.
            fwd_ret = close.pct_change(CFG.forward_days).shift(-CFG.forward_days)

            # Combine
            combined = pd.concat([mom, vol, reg], axis=1)
            combined["fwd_return"] = fwd_ret
            combined["ticker"] = ticker
            combined.index.name = "date"
            all_features.append(combined)
        except Exception as e:
            # Best-effort: skip a ticker whose features cannot be built.
            logger.warning("Feature error for %s: %s" % (ticker, str(e)[:60]))

    # Give a clear error instead of pd.concat's "No objects to concatenate".
    if not all_features:
        raise RuntimeError(
            "Feature generation failed for every ticker. "
            "Step '%s' was not checkpointed." % STEP)

    feature_panel = pd.concat(all_features)
    feature_panel = feature_panel.reset_index().set_index(["date", "ticker"]).sort_index()

    # Drop rows where all features are NaN
    feat_cols = [c for c in feature_panel.columns if c != "fwd_return"]
    feature_panel = feature_panel.dropna(subset=feat_cols, how='all')

    print("Raw features: %s" % str(feature_panel.shape))

    # Cross-sectional deciles for each stock-level feature (market-level
    # regime columns are skipped, see REGIME_COLS above).
    logger.info("Computing cross-sectional deciles...")
    for col in feat_cols:
        if col in REGIME_COLS:
            continue
        decile_col = col + "_decile"
        feature_panel[decile_col] = to_cross_sectional_deciles(
            feature_panel[col], "date", CFG.n_decile_bins
        )

    # Save, then checkpoint
    feature_panel.to_parquet(features_path)
    elapsed = time.time() - t0
    tracker.mark_completed(STEP, {
        "n_features": len(feat_cols),
        "n_rows": len(feature_panel),
        "time_sec": elapsed,
    })
    print("Features saved (%.0fs): %s" % (elapsed, str(feature_panel.shape)))
    gc.collect()

# List feature columns. Regime-based decile columns are filtered out here as
# well, so a panel cached before they were excluded does not leak them into
# candidate generation.
feat_cols = [c for c in feature_panel.columns if c != "fwd_return" and not c.endswith("_decile")]
decile_cols = [
    c for c in feature_panel.columns
    if c.endswith("_decile") and c[:-len("_decile")] not in REGIME_COLS
]
print("\nRaw features (%d): %s" % (len(feat_cols), feat_cols))
print("Decile features (%d): %s" % (len(decile_cols), decile_cols))
print("Forward return NaN: %.1f%%" % (feature_panel["fwd_return"].isna().mean() * 100))

## 4. Candidate Generator

Three methods:
- **A. Decile Conditions** — Single and 2-feature decile combinations
- **B. Decision Tree** — Shallow trees (depth=2) generate leaf-based strategies
- **C. Logistic Rank** — One candidate per quintile of the logistic regression's predicted probability

In [None]:
# === 4A. Decile Condition Generator ===
# Enumerates rules of the form "feature decile == k" (all k) and pairwise
# conjunctions restricted to the two lowest / two highest deciles, keeping
# those that select at least CFG.min_sample_size rows.

from itertools import combinations

STEP = "candidates_decile"
decile_candidates_path = os.path.join(CFG.candidates_dir, "decile_candidates.parquet")

if not tracker.is_completed(STEP):
    logger.info("[RUN] %s" % STEP)
    t0 = time.time()

    # Work with rows that have forward returns
    valid_data = feature_panel.dropna(subset=["fwd_return"]).copy()
    n_bins = CFG.n_decile_bins

    def _decile_rule_record(mask, strategy_id, rule_type, features, condition):
        """Summarise the rows picked by ``mask``; None if too few rows."""
        n_trades = mask.sum()
        if n_trades < CFG.min_sample_size:
            return None
        ret = valid_data.loc[mask, "fwd_return"]
        return {
            "strategy_id": strategy_id,
            "type": rule_type,
            "features": features,
            "condition": condition,
            "n_trades": int(n_trades),
            "mean_return": float(ret.mean()),
            "win_rate": float((ret > 0).mean()),
        }

    candidates = []

    # --- Single feature conditions ---
    logger.info("Generating 1-feature decile conditions...")
    for col in decile_cols:
        for decile_val in range(n_bins):
            record = _decile_rule_record(
                valid_data[col] == decile_val,
                "%s_d%d" % (col, decile_val),
                "single_decile",
                col,
                "== %d" % decile_val,
            )
            if record is not None:
                candidates.append(record)

    logger.info("  Single conditions: %d" % len(candidates))

    # --- 2-feature combinations (top & bottom deciles only for efficiency) ---
    logger.info("Generating 2-feature decile combinations...")
    extreme_deciles = [0, 1, n_bins - 2, n_bins - 1]  # top/bottom 2 deciles
    n_before = len(candidates)

    for batch_count, (col_a, col_b) in enumerate(combinations(decile_cols, 2), start=1):
        for da in extreme_deciles:
            for db in extreme_deciles:
                record = _decile_rule_record(
                    (valid_data[col_a] == da) & (valid_data[col_b] == db),
                    "%s_d%d_AND_%s_d%d" % (col_a, da, col_b, db),
                    "combo_decile",
                    "%s, %s" % (col_a, col_b),
                    "%s==%d AND %s==%d" % (col_a, da, col_b, db),
                )
                if record is not None:
                    candidates.append(record)

        # Progress line every 50 feature pairs.
        if batch_count % 50 == 0:
            print("    Processed %d feature pairs, %d candidates so far" % (
                batch_count, len(candidates)))

    logger.info("  Combo conditions: %d" % (len(candidates) - n_before))
    logger.info("  Total decile candidates: %d" % len(candidates))

    decile_candidates = pd.DataFrame(candidates)
    decile_candidates.to_parquet(decile_candidates_path)

    elapsed = time.time() - t0
    tracker.mark_completed(STEP, {"n_candidates": len(candidates), "time_sec": elapsed})
    print("Decile candidates: %d (%.0fs)" % (len(candidates), elapsed))
    gc.collect()
else:
    # Already done in an earlier session: reuse the saved table.
    logger.info("[SKIP] %s" % STEP)
    decile_candidates = pd.read_parquet(decile_candidates_path)
    print("Loaded %d decile candidates." % len(decile_candidates))

print("\nDecile candidates: %d" % len(decile_candidates))
if len(decile_candidates) > 0:
    print(decile_candidates[["type", "n_trades", "mean_return", "win_rate"]].describe())

In [None]:
# === 4B. Decision Tree Generator ===
# Fits CFG.n_trees shallow classifiers, each on a random feature subset and a
# random row sample, and turns every leaf with enough rows into a candidate.
# Each fitted tree is pickled together with the feature names it was fitted on.

from sklearn.tree import DecisionTreeClassifier
import pickle

STEP = "candidates_tree"
tree_candidates_path = os.path.join(CFG.candidates_dir, "tree_candidates.parquet")

if tracker.is_completed(STEP):
    logger.info("[SKIP] %s" % STEP)
    tree_candidates = pd.read_parquet(tree_candidates_path)
    print("Loaded %d tree candidates." % len(tree_candidates))
else:
    logger.info("[RUN] %s" % STEP)
    t0 = time.time()

    valid_data = feature_panel.dropna(subset=["fwd_return"] + feat_cols).copy()
    X = valid_data[feat_cols].values.astype(np.float32)
    fwd_returns = valid_data["fwd_return"].values
    # Binary target: positive return = 1
    y = (fwd_returns > 0).astype(int)

    tree_strats = []

    # Half of the features, at least 3, but never more than exist (sampling
    # without replacement would otherwise raise).
    n_feat_subset = min(len(feat_cols), max(3, len(feat_cols) // 2))

    for tree_idx in range(CFG.n_trees):
        # Random feature subset for diversity
        feat_subset_idx = np.random.choice(len(feat_cols), n_feat_subset, replace=False)
        feat_subset_names = [feat_cols[i] for i in feat_subset_idx]
        X_sub = X[:, feat_subset_idx]

        # Sample rows for diversity
        sample_idx = np.random.choice(len(X_sub), min(len(X_sub), 50000), replace=False)

        tree = DecisionTreeClassifier(
            max_depth=CFG.tree_max_depth,
            min_samples_leaf=CFG.tree_min_samples_leaf,
            random_state=CFG.seed + tree_idx,
        )
        tree.fit(X_sub[sample_idx], y[sample_idx])

        # Leaf id of every row, computed once. The tree was fitted on the
        # subset columns, so it must be applied to X_sub -- applying it to
        # the full matrix X fails on the feature-count mismatch.
        leaf_ids = tree.apply(X_sub)

        # Each leaf = candidate strategy
        for leaf in np.unique(leaf_ids):
            leaf_mask_full = leaf_ids == leaf
            n_trades = leaf_mask_full.sum()
            if n_trades < CFG.min_sample_size:
                continue
            ret = fwd_returns[leaf_mask_full]
            tree_strats.append({
                "strategy_id": "tree_%d_leaf_%d" % (tree_idx, leaf),
                "type": "decision_tree",
                "features": ", ".join(feat_subset_names[:5]),
                "condition": "tree_%d/leaf_%d" % (tree_idx, leaf),
                "n_trades": int(n_trades),
                "mean_return": float(np.nanmean(ret)),
                "win_rate": float((ret > 0).mean()),
            })

        # Save tree model
        tree_path = os.path.join(CFG.candidates_dir, "tree_%d.pkl" % tree_idx)
        with open(tree_path, 'wb') as f:
            pickle.dump({"tree": tree, "features": feat_subset_names}, f)

        if (tree_idx + 1) % 5 == 0:
            print("  Tree %d/%d done, %d candidates so far" % (
                tree_idx + 1, CFG.n_trees, len(tree_strats)))

    tree_candidates = pd.DataFrame(tree_strats)
    tree_candidates.to_parquet(tree_candidates_path)

    elapsed = time.time() - t0
    tracker.mark_completed(STEP, {"n_candidates": len(tree_strats), "time_sec": elapsed})
    print("Tree candidates: %d (%.0fs)" % (len(tree_strats), elapsed))
    gc.collect()

print("\nTree candidates: %d" % len(tree_candidates))

In [None]:
# === 4C. Logistic Rank Model ===
# Fits a logistic regression on the earliest ~80% of dates, scores every row,
# and creates one candidate per quintile of predicted probability.

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pickle  # imported here too so this cell does not rely on cell 4B

STEP = "candidates_logistic"
logistic_candidates_path = os.path.join(CFG.candidates_dir, "logistic_candidates.parquet")

if tracker.is_completed(STEP):
    logger.info("[SKIP] %s" % STEP)
    logistic_candidates = pd.read_parquet(logistic_candidates_path)
    print("Loaded %d logistic candidates." % len(logistic_candidates))
else:
    logger.info("[RUN] %s" % STEP)
    t0 = time.time()

    valid_data = feature_panel.dropna(subset=["fwd_return"] + feat_cols).copy()
    X = valid_data[feat_cols].values.astype(np.float32)
    y = (valid_data["fwd_return"].values > 0).astype(int)

    # Train split by time: dates up to the 80th-percentile date are training.
    dates_sorted = valid_data.index.get_level_values(0)
    unique_dates = dates_sorted.unique()
    split_date = unique_dates[int(len(unique_dates) * 0.8)]
    train_mask = dates_sorted <= split_date

    # Standardize using training rows only, so that means/variances from the
    # later (held-out) period do not leak into the fitted model.
    scaler = StandardScaler()
    scaler.fit(X[train_mask])
    X_scaled = scaler.transform(X)

    lr = LogisticRegression(
        max_iter=1000, C=0.1, penalty='l2',
        random_state=CFG.seed, solver='lbfgs',
    )
    lr.fit(X_scaled[train_mask], y[train_mask])

    # Predict probability for every row (train and later dates alike)
    proba = lr.predict_proba(X_scaled)[:, 1]

    # Quintile-based strategies: bucket q covers [edge[q], edge[q+1]);
    # the top bucket is closed on the right so the maximum is included.
    logistic_strats = []
    quintile_edges = np.percentile(proba, [0, 20, 40, 60, 80, 100])

    for q in range(5):
        if q == 4:
            q_mask = proba >= quintile_edges[q]
        else:
            q_mask = (proba >= quintile_edges[q]) & (proba < quintile_edges[q + 1])
        n_trades = q_mask.sum()
        if n_trades < CFG.min_sample_size:
            continue
        ret = valid_data["fwd_return"].values[q_mask]
        logistic_strats.append({
            "strategy_id": "logistic_q%d" % (q + 1),
            "type": "logistic_rank",
            "features": "all_features",
            "condition": "logistic_quintile_%d" % (q + 1),
            "n_trades": int(n_trades),
            "mean_return": float(np.nanmean(ret)),
            "win_rate": float((ret > 0).mean()),
        })

    # Save model + quintile edges for consistent reconstruction across resume
    model_path = os.path.join(CFG.candidates_dir, "logistic_model.pkl")
    with open(model_path, 'wb') as f:
        pickle.dump({"model": lr, "scaler": scaler, "features": feat_cols,
                     "quintile_edges": quintile_edges.tolist()}, f)

    logistic_candidates = pd.DataFrame(logistic_strats)
    logistic_candidates.to_parquet(logistic_candidates_path)

    elapsed = time.time() - t0
    tracker.mark_completed(STEP, {"n_candidates": len(logistic_strats), "time_sec": elapsed})
    print("Logistic candidates: %d (%.0fs)" % (len(logistic_strats), elapsed))
    gc.collect()

print("\nLogistic candidates: %d" % len(logistic_candidates))

In [None]:
# === Merge all candidates ===
# Stack the three candidate tables into one frame with a fresh 0..N-1 index,
# then print a short overview.

candidate_frames = [decile_candidates, tree_candidates, logistic_candidates]
all_candidates = pd.concat(candidate_frames, ignore_index=True)

print("=== All Candidates ===")
print("Total: %d" % len(all_candidates))

print("\nBy type:")
print(all_candidates["type"].value_counts())

print("\nTop 10 by mean return:")
summary_cols = ["strategy_id", "type", "n_trades", "mean_return", "win_rate"]
best_by_mean = all_candidates.nlargest(10, "mean_return")
print(best_by_mean[summary_cols].to_string(index=False))

## 5. Edge Evaluation Engine

For each candidate, compute:
- Win rate, mean return, Sharpe ratio, max drawdown
- Lift (vs unconditional mean), expectancy, sample size

Results saved incrementally — already-evaluated strategies are skipped.

In [None]:
def evaluate_strategy_edge(returns: np.ndarray, forward_days: int = 21) -> Optional[dict]:
    """Compute edge metrics for a strategy's per-trade return series.

    Args:
        returns: Per-trade returns (array-like of floats). NaN entries are
            dropped before any metric is computed.
        forward_days: Holding period, in trading days, that each return
            spans. Used only to annualise the Sharpe ratio
            (``252 / forward_days`` periods per year). Defaults to 21, the
            value previously hard-coded; pass ``CFG.forward_days`` if the
            configured horizon differs.

    Returns:
        ``None`` when fewer than 30 non-NaN returns remain, otherwise a dict
        with keys ``n_trades``, ``mean_return``, ``std_return`` (sample std,
        ddof=1), ``win_rate``, ``avg_win``, ``avg_loss`` (mean of returns
        <= 0), ``sharpe``, ``max_drawdown`` and ``expectancy``.
    """
    # Accept lists / Series as well as arrays.
    returns = np.asarray(returns, dtype=float)
    returns = returns[~np.isnan(returns)]
    n = len(returns)
    if n < 30:
        return None

    wins = returns > 0
    mean_ret = float(np.mean(returns))
    std_ret = float(np.std(returns, ddof=1))
    win_rate = float(wins.mean())
    avg_win = float(np.mean(returns[wins])) if wins.any() else 0.0
    avg_loss = float(np.mean(returns[~wins])) if (~wins).any() else 0.0

    # Sharpe, annualised by the number of holding periods per year.
    # A (near-)zero std yields 0.0 rather than a division blow-up.
    periods_per_year = 252 / max(1, forward_days)
    sharpe = (mean_ret / std_ret * np.sqrt(periods_per_year)) if std_ret > 1e-8 else 0.0

    # Max drawdown of the additive (non-compounded) cumulative return path,
    # taken in the order the returns were supplied.
    cum = np.cumsum(returns)
    running_max = np.maximum.accumulate(cum)
    drawdowns = cum - running_max
    max_dd = float(np.min(drawdowns)) if len(drawdowns) > 0 else 0.0

    # Expectancy: avg_win * win_rate + avg_loss * (1 - win_rate)
    expectancy = avg_win * win_rate + avg_loss * (1 - win_rate)

    return {
        "n_trades": n,
        "mean_return": mean_ret,
        "std_return": std_ret,
        "win_rate": win_rate,
        "avg_win": avg_win,
        "avg_loss": avg_loss,
        "sharpe": float(sharpe),
        "max_drawdown": max_dd,
        "expectancy": float(expectancy),
    }


print("Edge evaluation function defined.")

In [None]:
# Step "edge_evaluation": rebuild each candidate's row selection from its
# stored description, compute edge metrics on the selected forward returns,
# and save results incrementally so an interrupted run can continue.
import pickle  # imported here too so this cell does not rely on cell 4B

STEP = "edge_evaluation"
eval_path = os.path.join(CFG.evaluation_dir, "edge_results.parquet")

# Columns expected in edge_results (for empty DataFrame guard)
_EDGE_COLS = ["strategy_id", "type", "n_trades", "mean_return", "std_return",
              "win_rate", "avg_win", "avg_loss", "sharpe", "max_drawdown",
              "expectancy", "lift"]


def _strategy_mask(row, data, feature_names, cache):
    """Rebuild the boolean row mask of one candidate over ``data``.

    Args:
        row: Candidate record with ``strategy_id``, ``type``, ``features``
            and ``condition``.
        data: Feature panel the mask is evaluated on.
        feature_names: Raw feature column names, in model input order.
        cache: Dict reused across calls for the SAME ``data``; holds per-tree
            leaf assignments and the logistic probabilities so each pickle is
            read and each model applied only once.

    Returns:
        A boolean ``pd.Series`` indexed like ``data``, or ``None`` when the
        candidate type is unknown or its saved tree needs features that are
        not in ``feature_names``.
    """
    sid = row["strategy_id"]
    stype = row["type"]

    if stype == "single_decile":
        decile_val = int(row["condition"].split("== ")[1])
        return data[row["features"]] == decile_val

    if stype == "combo_decile":
        parts = row["condition"].split(" AND ")
        col_a, val_a = parts[0].split("==")
        col_b, val_b = parts[1].split("==")
        return (data[col_a.strip()] == int(val_a)) & (data[col_b.strip()] == int(val_b))

    if stype == "decision_tree":
        # strategy_id has the form "tree_<idx>_leaf_<leaf>".
        tree_idx_str = sid.split("_")[1]
        leaf_id = int(sid.split("_")[3])
        key = ("tree", tree_idx_str)
        if key not in cache:
            tree_path = os.path.join(CFG.candidates_dir, "tree_%s.pkl" % tree_idx_str)
            # NOTE: pickle is only safe because these files are written by
            # this notebook; never point this at untrusted files.
            with open(tree_path, 'rb') as f:
                tree_data = pickle.load(f)
            # Validate all tree features exist
            missing = [fn for fn in tree_data["features"] if fn not in feature_names]
            if missing:
                logger.warning("Tree %s missing features: %s" % (sid, missing))
                cache[key] = None
            else:
                tree_feat_idx = [feature_names.index(fn) for fn in tree_data["features"]]
                X_full = data[feature_names].values[:, tree_feat_idx].astype(np.float32)
                np.nan_to_num(X_full, copy=False)
                cache[key] = tree_data["tree"].apply(X_full)
        leaf_ids = cache[key]
        if leaf_ids is None:
            return None
        # Wrap numpy bool array in pd.Series for safe .loc indexing
        return pd.Series(leaf_ids == leaf_id, index=data.index)

    if stype == "logistic_rank":
        if "logistic" not in cache:
            model_path = os.path.join(CFG.candidates_dir, "logistic_model.pkl")
            # NOTE: same pickle caveat as above.
            with open(model_path, 'rb') as f:
                lr_data = pickle.load(f)
            X_full = data[feature_names].values.astype(np.float32)
            np.nan_to_num(X_full, copy=False)
            X_scaled = lr_data["scaler"].transform(X_full)
            proba = lr_data["model"].predict_proba(X_scaled)[:, 1]
            # Use saved quintile edges; recompute only if the pickle lacks them
            edges = lr_data.get("quintile_edges")
            if edges is None:
                edges = np.percentile(proba, [0, 20, 40, 60, 80, 100]).tolist()
            cache["logistic"] = (proba, edges)
        proba, edges = cache["logistic"]
        q_num = int(sid.split("_q")[1])
        if q_num == 5:
            in_bucket = proba >= edges[q_num - 1]
        else:
            in_bucket = (proba >= edges[q_num - 1]) & (proba < edges[q_num])
        return pd.Series(in_bucket, index=data.index)

    return None


if tracker.is_completed(STEP):
    logger.info("[SKIP] %s" % STEP)
    edge_results = pd.read_parquet(eval_path)
    print("Loaded %d edge results." % len(edge_results))
else:
    logger.info("[RUN] %s" % STEP)
    t0 = time.time()

    # Evaluate on rows with a forward return and no missing raw feature.
    valid_data = feature_panel.dropna(subset=["fwd_return"] + feat_cols).copy()
    unconditional_mean = valid_data["fwd_return"].mean()
    logger.info("Unconditional mean return: %.6f" % unconditional_mean)

    # Check for already-evaluated (incremental resume)
    existing_ids = set()
    eval_rows = []
    if os.path.exists(eval_path):
        existing_df = pd.read_parquet(eval_path)
        # A checkpoint without the id column carries nothing to resume from.
        if "strategy_id" in existing_df.columns:
            existing_ids = set(existing_df["strategy_id"].values)
            eval_rows = existing_df.to_dict('records')
            logger.info("Resuming: %d already evaluated." % len(existing_ids))

    to_evaluate = all_candidates[~all_candidates["strategy_id"].isin(existing_ids)]
    logger.info("Evaluating %d new candidates..." % len(to_evaluate))

    model_cache = {}
    for idx, (_, row) in enumerate(to_evaluate.iterrows()):
        sid = row["strategy_id"]
        stype = row["type"]

        try:
            mask = _strategy_mask(row, valid_data, feat_cols, model_cache)
            if mask is not None:
                returns = valid_data.loc[mask, "fwd_return"].values
                edge = evaluate_strategy_edge(returns)
                if edge is not None:
                    # Compute lift
                    edge["lift"] = edge["mean_return"] - unconditional_mean
                    edge["strategy_id"] = sid
                    edge["type"] = stype
                    eval_rows.append(edge)
        except Exception as e:
            # Best-effort: one broken candidate must not stop the batch.
            logger.warning("Eval error for %s: %s" % (sid, str(e)[:60]))

        # Incremental save every 200 strategies. This sits outside the try
        # so that skipped candidates cannot bypass the checkpoint.
        if (idx + 1) % 200 == 0 and eval_rows:
            pd.DataFrame(eval_rows).to_parquet(eval_path)
            print("  Evaluated %d/%d, saved checkpoint" % (idx + 1, len(to_evaluate)))

    # Ensure DataFrame always has expected columns
    if eval_rows:
        edge_results = pd.DataFrame(eval_rows)
    else:
        edge_results = pd.DataFrame(columns=_EDGE_COLS)
    edge_results.to_parquet(eval_path)

    elapsed = time.time() - t0
    tracker.mark_completed(STEP, {"n_evaluated": len(edge_results), "time_sec": elapsed})
    print("Edge evaluation: %d strategies (%.0fs)" % (len(edge_results), elapsed))
    del model_cache
    gc.collect()

print("\n=== Edge Results Summary ===")
print("Total evaluated: %d" % len(edge_results))
if len(edge_results) > 0:
    print(edge_results[["mean_return", "win_rate", "sharpe", "lift", "expectancy"]].describe())

## 6. Walk-Forward Validation

Rolling walk-forward: 3-year train, 1-year test, slide by 6 months.

Pre-filters to top candidates by in-sample edge, then validates out-of-sample.
Resumes from last unfinished fold.

In [None]:
from dateutil.relativedelta import relativedelta

STEP = "walk_forward"
wf_results_path = os.path.join(CFG.walkforward_dir, "wf_results.parquet")

# Columns expected in wf_results (for empty DataFrame guard)
_WF_COLS = ["strategy_id", "fold_idx", "n_trades", "mean_return", "std_return",
            "win_rate", "avg_win", "avg_loss", "sharpe", "max_drawdown",
            "expectancy", "test_start", "test_end"]

if tracker.is_completed(STEP):
    # Step finished in an earlier session: reload its saved output and move on.
    logger.info("[SKIP] %s" % STEP)
    wf_results = pd.read_parquet(wf_results_path)
    print("Loaded %d walk-forward results." % len(wf_results))
else:
    logger.info("[RUN] %s" % STEP)
    t0 = time.time()

    # Pre-filter candidates by in-sample Sharpe:
    #   - more than 200 evaluated: keep the 200 with the highest Sharpe
    #   - otherwise: keep those with Sharpe > 0.2, falling back to the top 50
    #     (or all of them, if fewer) when fewer than 20 clear that bar
    if len(edge_results) == 0:
        # Nothing was evaluated. Guard explicitly: an empty frame built with
        # pd.DataFrame(columns=...) has object-dtype columns, and
        # DataFrame.nlargest raises TypeError on those.
        top_candidates = []
    elif len(edge_results) > 200:
        top_candidates = edge_results.nlargest(200, "sharpe")["strategy_id"].tolist()
    else:
        top_candidates = edge_results[edge_results["sharpe"] > 0.2]["strategy_id"].tolist()
        if len(top_candidates) < 20:
            top_candidates = edge_results.nlargest(min(50, len(edge_results)), "sharpe")["strategy_id"].tolist()

    logger.info("Walk-forward on %d candidates", len(top_candidates))

    # Rows with a known forward return, and the date span they cover
    valid_data = feature_panel.dropna(subset=["fwd_return"]).copy()
    all_dates = valid_data.index.get_level_values(0).unique().sort_values()
    min_date, max_date = all_dates.min(), all_dates.max()

    def _fold_window(start):
        """Return (train_start, train_end, test_start, test_end) for a fold beginning at *start*."""
        tr_end = start + relativedelta(years=CFG.wf_train_years)
        te_start = tr_end + pd.Timedelta(days=CFG.wf_embargo_days)
        te_end = te_start + relativedelta(months=CFG.wf_test_months)
        return start, tr_end, te_start, te_end

    # Generate fold boundaries: slide the window forward until the test
    # period would run past the last available date.
    folds = []
    fold_step = relativedelta(months=CFG.wf_step_months)
    train_start = min_date
    while True:
        window = _fold_window(train_start)
        train_start, train_end, test_start, test_end = window
        if test_end > max_date:
            break
        folds.append(window)
        train_start += fold_step

    logger.info("Folds: %d", len(folds))
    for i, bounds in enumerate(folds):
        print("  Fold %d: train [%s, %s] test [%s, %s]" % (
            (i,) + tuple(b.date() for b in bounds)))

    # Resume support: pick up rows checkpointed by an interrupted run, and
    # remember which (strategy_id, fold_idx) pairs they already cover.
    partial_path = os.path.join(CFG.walkforward_dir, "wf_partial.parquet")
    wf_rows, completed_keys = [], set()
    if os.path.exists(partial_path):
        partial_df = pd.read_parquet(partial_path)
        wf_rows = partial_df.to_dict(orient='records')
        done_folds = partial_df["fold_idx"].astype(int)
        completed_keys = {(s, f) for s, f in zip(partial_df["strategy_id"], done_folds)}
        logger.info("Resuming: %d fold results loaded.", len(completed_keys))

    # Unpickled model artifacts keyed by file path. build_mask is called once
    # per (strategy, fold), so caching avoids re-reading the same pickle from
    # disk on every call.
    _mask_artifacts = {}

    def _load_artifact(path):
        """Return the object pickled at *path*, reading the file at most once."""
        if path not in _mask_artifacts:
            # NOTE(security): pickle.load can execute arbitrary code. This is
            # only acceptable if these files were written by this pipeline --
            # TODO confirm nothing untrusted is placed in CFG.candidates_dir.
            with open(path, 'rb') as f:
                _mask_artifacts[path] = pickle.load(f)
        return _mask_artifacts[path]

    # BUG-E FIX: build_mask as a standalone function with explicit sid parameter
    def build_mask(data, stype, cand_row, sid):
        """Build strategy boolean mask on given data slice.

        Args:
            data: DataFrame slice to evaluate. It must hold the columns the
                strategy refers to (the decile columns, or every column in
                ``feat_cols`` for the model-based types).
            stype: Strategy type: "single_decile", "combo_decile",
                "decision_tree" or "logistic_rank".
            cand_row: Candidate record; "features" and "condition" are read
                for the decile types.
            sid: Strategy id. For "decision_tree" the tree number and leaf id
                are the 2nd and 4th "_"-separated tokens; for "logistic_rank"
                the quintile number follows "_q".

        Returns:
            Boolean Series aligned to ``data.index``; all False when ``stype``
            is not one of the recognised types.

        Raises:
            ValueError: If a saved tree uses features missing from ``feat_cols``.
        """
        if stype == "single_decile":
            col = cand_row["features"]
            # Condition has the form "<col> == <decile>"; int() tolerates the
            # surrounding whitespace, so "==" with or without spaces works.
            dv = int(cand_row["condition"].split("==")[1])
            return data[col] == dv
        elif stype == "combo_decile":
            parts = cand_row["condition"].split(" AND ")
            ca, va = parts[0].split("==")
            cb, vb = parts[1].split("==")
            return (data[ca.strip()] == int(va)) & (data[cb.strip()] == int(vb))
        elif stype == "decision_tree":
            tokens = sid.split("_")
            tree_num = tokens[1]
            leaf_id = int(tokens[3])
            td = _load_artifact(os.path.join(CFG.candidates_dir, "tree_%s.pkl" % tree_num))
            # BUG-D FIX: Validate tree features
            missing = [fn for fn in td["features"] if fn not in feat_cols]
            if missing:
                raise ValueError("Tree features missing: %s" % missing)
            fi = [feat_cols.index(fn) for fn in td["features"]]
            X = data[feat_cols].values[:, fi].astype(np.float32)
            np.nan_to_num(X, copy=False)
            # A row belongs to the strategy when the tree routes it to leaf_id
            return pd.Series(td["tree"].apply(X) == leaf_id, index=data.index)
        elif stype == "logistic_rank":
            ld = _load_artifact(os.path.join(CFG.candidates_dir, "logistic_model.pkl"))
            X = data[feat_cols].values.astype(np.float32)
            np.nan_to_num(X, copy=False)
            proba = ld["model"].predict_proba(ld["scaler"].transform(X))[:, 1]
            q_num = int(sid.split("_q")[1])
            # BUG-B FIX: Use saved quintile edges for consistency; percentiles
            # of this slice are computed only when no edges were saved.
            edges = ld.get("quintile_edges")
            if edges is None:
                edges = np.percentile(proba, [0, 20, 40, 60, 80, 100]).tolist()
            if q_num == 5:
                # Top quintile is closed on the right
                return pd.Series(proba >= edges[q_num - 1], index=data.index)
            return pd.Series((proba >= edges[q_num - 1]) & (proba < edges[q_num]), index=data.index)
        return pd.Series(False, index=data.index)

    total_evals = len(top_candidates) * len(folds)
    done_count = len(completed_keys)

    # Loop invariants, hoisted out of the fold/candidate loops.
    dates_idx = valid_data.index.get_level_values(0)
    # First row per strategy_id, keyed by id. drop=False keeps the
    # strategy_id column so each row carries the same fields as before.
    cand_lookup = all_candidates.drop_duplicates("strategy_id").set_index(
        "strategy_id", drop=False)

    for fold_idx, (train_start, train_end, test_start, test_end) in enumerate(folds):
        # Only the test window is scored here, so no train slice is built.
        test_mask = (dates_idx >= test_start) & (dates_idx < test_end)
        test_data = valid_data[test_mask]

        for sid in top_candidates:
            if (sid, fold_idx) in completed_keys:
                continue

            # Count every attempted pair, including ones skipped below, so
            # the progress line reflects work actually done.
            done_count += 1

            try:
                cand_row = cand_lookup.loc[sid]
                stype = cand_row["type"]

                test_mask_strat = build_mask(test_data, stype, cand_row, sid)
                test_returns = test_data.loc[test_mask_strat, "fwd_return"].values

                # Too few out-of-sample trades to say anything about this fold
                if len(test_returns) < 20:
                    continue

                edge = evaluate_strategy_edge(test_returns)
                if edge is None:
                    continue

                edge["strategy_id"] = sid
                edge["fold_idx"] = fold_idx
                edge["test_start"] = str(test_start.date())
                edge["test_end"] = str(test_end.date())
                wf_rows.append(edge)

            except Exception as e:
                # Best-effort: one failing strategy must not abort the run,
                # but the failure is logged instead of silently dropped.
                logger.warning("Walk-forward error for %s (fold %d): %s",
                               sid, fold_idx, str(e)[:60])

        # Checkpoint after each fold. Write to a temp file and rename so a
        # disconnect mid-write cannot leave a corrupt resume file behind.
        if wf_rows:
            tmp_partial_path = partial_path + ".tmp"
            pd.DataFrame(wf_rows).to_parquet(tmp_partial_path)
            os.replace(tmp_partial_path, partial_path)
        print("  Fold %d complete. Progress: %d/%d" % (fold_idx, done_count, total_evals))

    # BUG-F FIX: the saved frame always carries the expected columns, even
    # when no fold produced a result.
    wf_results = pd.DataFrame(wf_rows) if wf_rows else pd.DataFrame(columns=_WF_COLS)
    wf_results.to_parquet(wf_results_path)

    # The final file is written, so the resume checkpoint is no longer needed
    if os.path.exists(partial_path):
        os.remove(partial_path)

    elapsed = time.time() - t0
    wf_step_info = {
        "n_results": len(wf_results),
        "n_folds": len(folds),
        "time_sec": elapsed,
    }
    tracker.mark_completed(STEP, wf_step_info)
    print("Walk-forward: %d results across %d folds (%.0fs)" % (
        wf_step_info["n_results"], wf_step_info["n_folds"], elapsed))
    gc.collect()

# Short recap of the walk-forward output (runs whether the step ran or was skipped)
wf_row_count = len(wf_results)
print("\nWalk-forward results: %d" % wf_row_count)
if wf_row_count > 0:
    n_strategies_wf = wf_results["strategy_id"].nunique()
    n_folds_wf = wf_results["fold_idx"].nunique()
    print("Strategies: %d | Folds: %d" % (n_strategies_wf, n_folds_wf))

## 7. Overfitting Control

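Filters strategies using:
- Stability score across folds (the fraction of folds with a positive mean return must reach `min_stability`)
- Mean Sharpe across folds (must reach `min_sharpe`)
- Mean win rate across folds (must reach `min_win_rate`)
- Bootstrap confidence interval for win rate (the lower bound must be at least 0.48)

Sign flips between consecutive folds, the standard deviation of the fold Sharpes, and the minimum fold win rate (win rate collapse) are also computed and stored as diagnostics, but they are not used as rejection criteria.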

In [None]:
STEP = "overfitting_control"
filtered_path = os.path.join(CFG.evaluation_dir, "filtered_strategies.parquet")

# Columns of the per-strategy stability table (also the empty-DataFrame guard)
_FILTER_COLS = ["strategy_id", "n_folds", "stability", "mean_sharpe", "sharpe_std",
                "mean_win_rate", "min_win_rate", "sign_flips", "total_trades",
                "wr_ci_low", "wr_ci_high", "mean_return", "mean_lift"]

# Lowest acceptable lower bound of the bootstrap win-rate confidence interval
# (CI lower bound must be near 50%)
_MIN_WR_CI_LOW = 0.48

if tracker.is_completed(STEP):
    logger.info("[SKIP] %s" % STEP)
    filtered_strategies = pd.read_parquet(filtered_path)
    print("Loaded %d filtered strategies." % len(filtered_strategies))
else:
    logger.info("[RUN] %s" % STEP)
    t0 = time.time()

    if len(wf_results) == 0:
        print("No walk-forward results. Skipping.")
        filtered_strategies = pd.DataFrame(columns=_FILTER_COLS)
    else:
        # Aggregate per strategy across folds
        strategy_stats = []
        for sid, group in wf_results.groupby("strategy_id"):
            n_folds = len(group)
            if n_folds < 2:
                continue

            # Chronological order, so sign flips compare consecutive folds
            group = group.sort_values("fold_idx")

            fold_returns = group["mean_return"].values
            fold_sharpes = group["sharpe"].values
            fold_win_rates = group["win_rate"].values

            # Stability: fraction of folds with positive return
            stability = float((fold_returns > 0).mean())

            # Sharpe stats
            mean_sharpe = float(np.mean(fold_sharpes))
            sharpe_std = float(np.std(fold_sharpes, ddof=1))

            # Win rate stats
            mean_win_rate = float(np.mean(fold_win_rates))
            min_win_rate = float(np.min(fold_win_rates))

            # Sign flip detection
            sign_flips = int(np.sum(np.diff(np.sign(fold_returns)) != 0))

            # Bootstrap CI for win rate (using all fold trade counts).
            # Wins are rebuilt from win_rate * n_trades; round rather than
            # truncate so float error (e.g. 10.999999) does not drop a win.
            all_n_trades = group["n_trades"].values
            total_trades = int(all_n_trades.sum())
            total_wins = int(round(float((fold_win_rates * all_n_trades).sum())))
            safe_trades = max(1, total_trades)  # guards both divisions below

            bootstrap_wins = np.random.binomial(total_trades, total_wins / safe_trades, CFG.bootstrap_n)
            bootstrap_wr = bootstrap_wins / safe_trades
            ci_low = float(np.percentile(bootstrap_wr, (1 - CFG.bootstrap_ci) / 2 * 100))
            ci_high = float(np.percentile(bootstrap_wr, (1 + CFG.bootstrap_ci) / 2 * 100))

            strategy_stats.append({
                "strategy_id": sid,
                "n_folds": n_folds,
                "stability": stability,
                "mean_sharpe": mean_sharpe,
                "sharpe_std": sharpe_std,
                "mean_win_rate": mean_win_rate,
                "min_win_rate": min_win_rate,
                "sign_flips": sign_flips,
                "total_trades": total_trades,
                "wr_ci_low": ci_low,
                "wr_ci_high": ci_high,
                "mean_return": float(np.mean(fold_returns)),
                "mean_lift": float(np.mean(fold_returns)),  # approx
            })

        # Explicit columns keep the frame well-formed even when every
        # strategy was dropped for having fewer than 2 folds.
        stats_df = pd.DataFrame(strategy_stats, columns=_FILTER_COLS)
        n_before = len(stats_df)

        if n_before == 0:
            print("\nOverfitting filters: no strategy has results in at least 2 folds.")
            filtered_strategies = stats_df
        else:
            # Apply filters
            pass_stability = stats_df["stability"] >= CFG.min_stability
            pass_sharpe = stats_df["mean_sharpe"] >= CFG.min_sharpe
            pass_win_rate = stats_df["mean_win_rate"] >= CFG.min_win_rate
            pass_ci = stats_df["wr_ci_low"] >= _MIN_WR_CI_LOW
            mask = pass_stability & pass_sharpe & pass_win_rate & pass_ci

            filtered_strategies = stats_df[mask].copy()
            filtered_strategies = filtered_strategies.sort_values("mean_sharpe", ascending=False)

            print("\nOverfitting filters:")
            print("  Before: %d strategies" % n_before)
            print("  Stability >= %.1f: %d pass" % (CFG.min_stability, pass_stability.sum()))
            print("  Sharpe >= %.1f: %d pass" % (CFG.min_sharpe, pass_sharpe.sum()))
            print("  Win rate >= %.2f: %d pass" % (CFG.min_win_rate, pass_win_rate.sum()))
            print("  CI lower >= %.2f: %d pass" % (_MIN_WR_CI_LOW, pass_ci.sum()))
            print("  After ALL filters: %d strategies" % len(filtered_strategies))

    # Always persist the result, even when empty: the step is marked
    # completed below, and a resumed run reads this file back.
    filtered_strategies.to_parquet(filtered_path)

    elapsed = time.time() - t0
    tracker.mark_completed(STEP, {"n_filtered": len(filtered_strategies), "time_sec": elapsed})
    gc.collect()

print("\nFiltered strategies: %d" % len(filtered_strategies))
if len(filtered_strategies) > 0:
    print(filtered_strategies[
        ["strategy_id", "stability", "mean_sharpe", "mean_win_rate", "wr_ci_low", "total_trades"]
    ].head(20).to_string(index=False))

## 8. Strategy Scoring Engine

Final composite score:
```
Score = 0.3 * Stability + 0.3 * Sharpe_norm + 0.2 * Lift_norm + 0.2 * SampleScore
```

Includes optional strategy clustering to remove redundancy.

In [None]:
STEP = "scoring"
scored_path = os.path.join(CFG.evaluation_dir, "scored_strategies.parquet")

# Columns this step adds on top of the filtered-strategy table
_SCORE_COLS = ["stability_norm", "sharpe_norm", "lift", "lift_norm",
               "sample_score", "composite_score", "rank"]

if tracker.is_completed(STEP):
    logger.info("[SKIP] %s" % STEP)
    scored_strategies = pd.read_parquet(scored_path)
    print("Loaded %d scored strategies." % len(scored_strategies))
else:
    logger.info("[RUN] %s" % STEP)

    if len(filtered_strategies) == 0:
        print("No strategies passed filters. Check parameters or data.")
        # Empty frame with the usual columns (de-duplicated, order kept)
        empty_cols = list(dict.fromkeys(list(filtered_strategies.columns) + _SCORE_COLS))
        scored_strategies = pd.DataFrame(columns=empty_cols)
    else:
        df = filtered_strategies.copy()

        # Normalize each component to [0, 1]
        def normalize_col(s):
            """Min-max scale *s* to [0, 1]; a (near-)constant column maps to 0.5."""
            r = s.max() - s.min()
            return (s - s.min()) / r if r > 1e-8 else pd.Series(0.5, index=s.index)

        df["stability_norm"] = normalize_col(df["stability"])
        df["sharpe_norm"] = normalize_col(df["mean_sharpe"])

        # Lift: merge from edge_results (strategies without a lift get 0)
        lift_map = edge_results.set_index("strategy_id")["lift"].to_dict()
        df["lift"] = df["strategy_id"].map(lift_map).fillna(0)
        df["lift_norm"] = normalize_col(df["lift"])

        # Sample size score: log-scaled
        df["sample_score"] = normalize_col(np.log1p(df["total_trades"]))

        # Composite score: weighted sum of the normalized components
        df["composite_score"] = (
            CFG.w_stability * df["stability_norm"]
            + CFG.w_sharpe * df["sharpe_norm"]
            + CFG.w_lift * df["lift_norm"]
            + CFG.w_sample * df["sample_score"]
        )

        scored_strategies = df.sort_values("composite_score", ascending=False).reset_index(drop=True)
        scored_strategies["rank"] = range(1, len(scored_strategies) + 1)

    # Always persist the result, even when empty: the step is marked
    # completed below, and a resumed run reads this file back.
    scored_strategies.to_parquet(scored_path)

    tracker.mark_completed(STEP, {"n_scored": len(scored_strategies)})

print("\n=== RANKED STRATEGIES ===")
if len(scored_strategies) > 0:
    display_cols = ["rank", "strategy_id", "composite_score", "stability",
                    "mean_sharpe", "mean_win_rate", "lift", "total_trades"]
    print(scored_strategies[display_cols].head(30).to_string(index=False))
else:
    print("No strategies to rank.")

## 8b. Strategy Clustering (Redundancy Removal)

Cluster similar strategies and keep only the best per cluster.

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Clustering is attempted only with at least this many strategies. The same
# threshold is used for both checks below and matches the "Fewer than 5"
# message.
_MIN_STRATEGIES_TO_CLUSTER = 5

if len(scored_strategies) >= _MIN_STRATEGIES_TO_CLUSTER:
    logger.info("Clustering strategies for redundancy removal...")

    # Build return correlation matrix using walk-forward fold returns
    pivot = wf_results.pivot_table(
        values="mean_return", index="fold_idx", columns="strategy_id", aggfunc="first"
    )
    # Only include scored strategies
    scored_ids = scored_strategies["strategy_id"].tolist()
    scored_id_set = set(scored_ids)
    pivot = pivot[[c for c in pivot.columns if c in scored_id_set]].dropna(axis=1, how='all')
    pivot = pivot.fillna(0)

    if pivot.shape[1] >= _MIN_STRATEGIES_TO_CLUSTER:
        # A constant column has an undefined (NaN) correlation. Treat it as
        # uncorrelated (distance 1) so the distance matrix holds no NaN,
        # which the clustering estimator would reject.
        corr = np.nan_to_num(pivot.corr().values, nan=0.0)
        distance = 1 - np.abs(corr)
        np.fill_diagonal(distance, 0)
        distance = np.maximum(distance, 0)  # ensure non-negative

        n_clusters = max(3, min(10, len(pivot.columns) // 3))
        clustering = AgglomerativeClustering(
            n_clusters=n_clusters, metric='precomputed', linkage='average'
        )
        labels = clustering.fit_predict(distance)

        cluster_map = dict(zip(pivot.columns, labels))
        scored_strategies["cluster"] = scored_strategies["strategy_id"].map(cluster_map)

        # Best per cluster: the first row of each cluster in the current
        # row order of scored_strategies
        best_per_cluster = scored_strategies.dropna(subset=["cluster"]).groupby("cluster").first()
        deduped_ids = best_per_cluster["strategy_id"].tolist()

        print("\n=== Clustered Strategies ===")
        print("Clusters: %d" % n_clusters)
        for c in range(n_clusters):
            members = scored_strategies[scored_strategies["cluster"] == c]
            best = members.iloc[0]["strategy_id"] if len(members) > 0 else "N/A"
            print("  Cluster %d: %d members, best=%s" % (c, len(members), best))

        print("\nDeduplicated strategies: %d (from %d)" % (len(deduped_ids), len(scored_strategies)))
    else:
        print("Not enough strategies for clustering. Keeping all.")
        deduped_ids = scored_strategies["strategy_id"].tolist()
else:
    print("Fewer than 5 strategies. Skipping clustering.")
    deduped_ids = scored_strategies["strategy_id"].tolist() if len(scored_strategies) > 0 else []

## 9. Final Output

- Ranked strategy table
- Stability diagnostics
- Equity curves of top 5 strategies
- Portfolio combination performance

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

if len(scored_strategies) == 0:
    print("No strategies to visualize. Pipeline found no viable candidates.")
    print("Consider relaxing filter thresholds or adding more data.")
else:
    top5 = scored_strategies.head(5)
    top5_cols = ["rank", "strategy_id", "composite_score", "stability",
                 "mean_sharpe", "mean_win_rate", "total_trades"]
    print("=== TOP 5 STRATEGIES ===")
    print(top5[top5_cols].to_string(index=False))

    # === Figure: 4-panel dashboard (2 x 2 grid) ===
    fig = plt.figure(figsize=(18, 14))
    gs = gridspec.GridSpec(2, 2, hspace=0.35, wspace=0.3)

    # --- Panel 1: horizontal bars of the composite score, best at the top ---
    def _score_color(score):
        """Green for scores >= 0.7, amber for >= 0.4, red otherwise."""
        if score >= 0.7:
            return '#4CAF50'
        if score >= 0.4:
            return '#FFC107'
        return '#F44336'

    ax1 = fig.add_subplot(gs[0, 0])
    top20 = scored_strategies.head(20)
    colors = [_score_color(s) for s in top20['composite_score']]
    bar_positions = range(len(top20))
    ax1.barh(bar_positions, top20['composite_score'], color=colors, edgecolor='white')
    ax1.set_yticks(bar_positions)
    ax1.set_yticklabels(top20['strategy_id'].str[:30], fontsize=7)
    ax1.set_xlabel('Composite Score')
    ax1.set_title('Strategy Rankings (Top 20)', fontweight='bold')
    ax1.invert_yaxis()

    # --- Panel 2: per-fold Sharpe of each top-5 strategy ---
    ax2 = fig.add_subplot(gs[0, 1])
    for sid in top5['strategy_id']:
        fold_data = wf_results[wf_results['strategy_id'] == sid]
        if len(fold_data) == 0:
            continue
        ax2.plot(fold_data['fold_idx'], fold_data['sharpe'],
                 'o-', label=sid[:25], linewidth=2, markersize=6)
    ax2.axhline(y=0, color='gray', linewidth=0.5, linestyle='--')
    ax2.set_xlabel('Fold')
    ax2.set_ylabel('Sharpe Ratio')
    ax2.set_title('Stability: Sharpe Across Folds (Top 5)', fontweight='bold')
    ax2.legend(fontsize=7)
    ax2.grid(alpha=0.3)

    # --- Panel 3: Equity curves ---
    ax3 = fig.add_subplot(gs[1, 0])
    # NOTE(review): valid_data is not read by the plotting code in this cell;
    # kept in case later cells rely on it -- confirm and drop if unused.
    valid_data = feature_panel.dropna(subset=['fwd_return']).copy()

    for _, row in top5.iterrows():
        sid = row['strategy_id']
        # Build cumulative PnL from walk-forward test periods
        fold_data = wf_results[wf_results['strategy_id'] == sid].sort_values('fold_idx')
        if len(fold_data) > 0:
            cum_ret = np.cumsum(fold_data['mean_return'].values)
            # Plot against the real fold index, not 0..n-1: a strategy that
            # has no result for some fold would otherwise be drawn shifted
            # relative to the others.
            ax3.plot(fold_data['fold_idx'].values, cum_ret, 'o-', label=sid[:25], linewidth=2)

    ax3.axhline(y=0, color='gray', linewidth=0.5, linestyle='--')
    ax3.set_xlabel('Walk-Forward Fold')
    ax3.set_ylabel('Cumulative Return')
    ax3.set_title('Equity Curves (Top 5)', fontweight='bold')
    ax3.legend(fontsize=7)
    ax3.grid(alpha=0.3)

    # --- Panel 4: Portfolio combination ---
    ax4 = fig.add_subplot(gs[1, 1])

    # Equal-weight portfolio of top 5: per fold, average the mean returns of
    # whichever top-5 strategies have a result in that fold.
    top5_ids = top5['strategy_id'].tolist()
    portfolio_fold_data = wf_results[wf_results['strategy_id'].isin(top5_ids)]

    if len(portfolio_fold_data) > 0:
        portfolio_returns = portfolio_fold_data.groupby('fold_idx')['mean_return'].mean()
        portfolio_folds = portfolio_returns.index.values
        cum_portfolio = np.cumsum(portfolio_returns.values)

        # Compare with best single strategy
        best_sid = top5.iloc[0]['strategy_id']
        best_fold = wf_results[wf_results['strategy_id'] == best_sid].sort_values('fold_idx')
        cum_best = np.cumsum(best_fold['mean_return'].values)

        # Both curves use the real fold index as x, so they stay aligned
        # even when the best strategy has no result for some fold.
        ax4.plot(portfolio_folds, cum_portfolio, 'b-', linewidth=2.5,
                 label='Equal-Weight Portfolio (Top 5)')
        ax4.plot(best_fold['fold_idx'].values, cum_best, 'r--', linewidth=1.5,
                 label='Best Single: %s' % best_sid[:20])
        ax4.fill_between(portfolio_folds, 0, cum_portfolio,
                         where=cum_portfolio > 0, alpha=0.1, color='blue')

        # Portfolio stats. std() is NaN for a single fold; the comparison is
        # then False and the Sharpe falls back to 0.
        port_std = portfolio_returns.std()
        port_sharpe = (
            portfolio_returns.mean() / port_std * np.sqrt(252 / 21)
            if port_std > 1e-8 else 0
        )
        ax4.text(0.05, 0.95,
                 'Portfolio Sharpe: %.2f\nTotal Return: %.2f%%' % (
                     port_sharpe, cum_portfolio[-1] * 100),
                 transform=ax4.transAxes, fontsize=9, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))

    ax4.axhline(y=0, color='gray', linewidth=0.5, linestyle='--')
    ax4.set_xlabel('Walk-Forward Fold')
    ax4.set_ylabel('Cumulative Return')
    ax4.set_title('Portfolio Combination', fontweight='bold')
    ax4.legend(fontsize=8)
    ax4.grid(alpha=0.3)

    # Title the figure, write it under the drive root, then display it
    fig.suptitle('Quantitative Research Pipeline — Final Results',
                 fontsize=16, fontweight='bold', y=1.01)
    dashboard_path = os.path.join(CFG.drive_root, 'pipeline_results.png')
    plt.savefig(dashboard_path, dpi=150, bbox_inches='tight')
    plt.show()
    print("Dashboard saved.")

In [None]:
# === Final Summary ===
# Prints the step tracker summary, the top-ranked strategy and an
# equal-weight top-5 portfolio recap, then writes report.json.

banner = "=" * 60
print(banner)
print("PIPELINE COMPLETE")
print(banner)

tracker.summary()

if len(scored_strategies) == 0:
    print("\nNo viable strategies found. Consider:")
    print("  - Relaxing filter thresholds (min_stability, min_sharpe, min_win_rate)")
    print("  - Adding more tickers or longer data period")
    print("  - Reducing min_sample_size")
else:
    print("\n=== Top Strategy ===")
    best = scored_strategies.iloc[0]
    print("  ID:           %s" % best['strategy_id'])
    print("  Composite:    %.4f" % best['composite_score'])
    print("  Stability:    %.2f" % best['stability'])
    print("  Sharpe:       %.2f" % best['mean_sharpe'])
    print("  Win Rate:     %.2f%%" % (best['mean_win_rate'] * 100))
    print("  Total Trades: %d" % best['total_trades'])

    print("\n=== Portfolio (EW Top 5) ===")
    top5_ids = scored_strategies.head(5)['strategy_id'].tolist()
    port_folds = wf_results[wf_results['strategy_id'].isin(top5_ids)]
    if len(port_folds) > 0:
        # Per-fold average of the top-5 strategies' mean returns
        port_ret = port_folds.groupby('fold_idx')['mean_return'].mean()
        print("  Mean Return:  %.4f" % port_ret.mean())
        if port_ret.std() > 1e-8:
            port_s = port_ret.mean() / port_ret.std() * np.sqrt(252 / 21)
        else:
            port_s = 0
        print("  Sharpe:       %.2f" % port_s)
        print("  Win Folds:    %d/%d" % ((port_ret > 0).sum(), len(port_ret)))

    # Save final report
    report = {
        "pipeline_complete": True,
        "n_tickers": len(valid_tickers),
        "n_candidates_total": len(all_candidates),
        "n_evaluated": len(edge_results),
        "n_walk_forward": len(wf_results),
        "n_filtered": len(filtered_strategies),
        "n_scored": len(scored_strategies),
        "top_strategy": best['strategy_id'],
        "top_composite": float(best['composite_score']),
    }
    report_path = os.path.join(CFG.drive_root, 'report.json')
    with open(report_path, 'w') as f:
        json.dump(report, f, indent=2)
    print("\nReport saved to: %s" % report_path)

print("\n" + banner)