# In [None]:   (notebook cell marker left over from export)
#!/usr/bin/env python
# ===================================================================
# STAGE-25 · Universal Ratio-vs-Resilience Ranker  (2025-06-17 + blend)
# ===================================================================
"""
Adds “blend” family (ScoreBlend_*, FlagBlend_*) and stronger guard-rails:
    • Fisher / Stouffer remain finite when any p==0/1
    • Cross-event summary skipped gracefully when nothing converged
Every artefact now carries the Flavour token (Speed / Depth / Blend).
"""
from __future__ import annotations
import contextlib, io, logging, math, os, sys, warnings
from pathlib import Path
from typing  import List

import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import chi2, norm, spearmanr
from sklearn.metrics import roc_auc_score
from numpy.linalg   import LinAlgError
from statsmodels.tools.sm_exceptions import (
    MissingDataError, PerfectSeparationError)

from pipeline_utils import load_cfg, resolve_run_dir

# ═════════════════════ GLOBALS ══════════════════════
# Silence every RuntimeWarning process-wide (the filter is global, not scoped
# to this module) — presumably to quiet numeric noise from numpy/scipy.
warnings.filterwarnings("ignore", category=RuntimeWarning)
# Root logging config: INFO level, "timestamp | LEVEL | message" lines.
logging.basicConfig(
    level   = logging.INFO,
    format  = "%(asctime)s | %(levelname)-7s | %(message)s")
log = logging.getLogger("stage25")

# Pipeline configuration, loaded once at import time via pipeline_utils.
CFG        = load_cfg()
# Event identifiers are the keys of CFG["events"] (empty list if the key is absent).
EVENT_LIST = list(CFG.get("events", {}).keys())            # e.g. ['1998', '2008', …]

# Thresholds, overridable through environment variables.
MIN_COVER = float(os.getenv("MIN_COVERAGE", 40))           # min. non-null share of a ratio column, in percent
MIN_OBS   = int  (os.getenv("MIN_OBS",      30))           # min. usable observations for Spearman / logit
EPS_VAR   = 1e-6                                           # predictor variance below this counts as "zero variance"

# Column names expected in the Stage-3 CSV (ID_COL is not used in the code visible here).
ID_COL, YEAR_COL, DATE_COL = "Symbol", "Year", "ReportDate"
# True when an IPython kernel module is loaded; main() uses it to decide on rich display.
_IN_NOTEBOOK = "ipykernel" in sys.modules

# ═════════════════════ STATS HELPERS ═════════════════
def _safe_spearman(x: pd.Series, y: pd.Series):
    """Return ``(|rho|, p_value)`` of the Spearman correlation of *x* and *y*.

    Only rows where both series are non-null are used.  When fewer than
    ``MIN_OBS`` such rows exist, ``(nan, nan)`` is returned instead.
    """
    both_present = x.notna() & y.notna()
    if both_present.sum() >= MIN_OBS:
        result = spearmanr(x[both_present], y[both_present])
        # Only the strength of the association is reported, not its sign.
        return abs(result[0]), result[1]
    return np.nan, np.nan

def _logit_one(x: pd.Series, flag: pd.Series):
    """Fit a one-predictor logit of *flag* on *x* and return a diagnostics dict.

    Rows are used when *x* is non-null and *flag* is 0 or 1.  The returned
    dict always has the keys ``SampleSize``, ``PositivePct``, ``PseudoR2``,
    ``AUROC``, ``CoefP``, ``ModelConv`` and ``Failure``.  ``Failure`` is an
    empty string on success, otherwise one of ``too_few_obs``,
    ``single_class``, ``zero_variance``, ``no_convergence`` or
    ``model_error``; in every failure case the fit metrics stay NaN and
    ``ModelConv`` stays False.
    """
    usable = x.notna() & flag.isin([0, 1])
    n_obs = int(usable.sum())

    result = {
        "SampleSize": n_obs,
        "PositivePct": float(flag[usable].mean() * 100) if n_obs else np.nan,
        "PseudoR2": np.nan,
        "AUROC": np.nan,
        "CoefP": np.nan,
        "ModelConv": False,
        "Failure": "",
    }

    def _fail(reason):
        # Record why no model is reported and hand back the partial record.
        result["Failure"] = reason
        return result

    # Cheap pre-checks, in the same order as the failure labels above.
    if n_obs < MIN_OBS:
        return _fail("too_few_obs")
    if flag[usable].nunique() < 2:
        return _fail("single_class")
    if x[usable].var() < EPS_VAR:
        return _fail("zero_variance")

    try:
        fitted = sm.Logit(flag[usable], sm.add_constant(x[usable])).fit(disp=False)
        if not fitted.mle_retvals.get("converged", True):
            return _fail("no_convergence")
        # Compute every metric before touching `result`, so an exception in
        # any of them leaves all fit fields at their NaN/False defaults.
        pseudo_r2 = float(fitted.prsquared)
        auroc = float(roc_auc_score(flag[usable], fitted.predict()))
        # pvalues[0] is the constant added by add_constant; [1] is the predictor.
        coef_p = float(fitted.pvalues.iloc[1])
    except (ValueError, LinAlgError,
            MissingDataError, PerfectSeparationError):
        return _fail("model_error")

    result.update(ModelConv=True, PseudoR2=pseudo_r2, AUROC=auroc, CoefP=coef_p)
    return result

def _fisher(p_list: List[float]) -> float:
    good = [p for p in p_list if 0 < p < 1]
    if not good:
        return np.nan
    stat = -2 * sum(math.log(max(p, 1e-300)) for p in good)  # clamp at 1e-300
    return 1 - chi2.cdf(stat, 2*len(good))

def _stouffer(p_list: List[float]) -> float:
    good = [p for p in p_list if 0 < p < 1]
    if not good:
        return np.nan
    z = [norm.isf(p/2) * math.copysign(1, .5-p) for p in good]  # safe
    return sum(z) / math.sqrt(len(z))

# ═════════════════════ RATIO DETECTOR ════════════════
def _ratio_columns(df: pd.DataFrame):
    """
    Return (raw_cols, wins_cols) with best-guess naming heuristics.
    """
    if any(c.endswith("_raw") for c in df.columns):
        raw  = [c for c in df if c.endswith("_raw") and c[:-4] in df.columns]
        wins = [c[:-4] for c in raw]
    elif any(c.endswith("_orig") for c in df.columns):
        raw  = [c for c in df if c.endswith("_orig") and c[:-5] in df.columns]
        wins = [c[:-5] for c in raw]
    else:
        raw  = []
        wins = [c for c in df if "_" in c and not c.endswith(("_raw", "_orig"))]
    return raw, wins

# ═════════════════════ ONE-EVENT CORE ════════════════
def _strip_series_suffix(col: str) -> str:
    """Return *col* without one trailing ``_raw`` / ``_orig`` marker, if present."""
    for suffix in ("_raw", "_orig"):
        if col.endswith(suffix):
            return col[:-len(suffix)]
    return col


def run_event(ev: str) -> pd.DataFrame:
    """Rank every ratio column against every outcome metric for one event.

    Reads ``stage03/Stage3_Data_WithRatios_{ev}.csv`` from the run directory
    resolved for *ev*, keeps only rows whose year is before ``int(ev)``, and
    for each (ratio column, flavour, metric) combination records the absolute
    Spearman rho against the score column and the single-predictor logit
    diagnostics against the flag column.  Ratio columns whose non-null
    coverage is below ``MIN_COVER`` percent are skipped.

    One CSV per (series, flavour) with at least one row is written to
    ``<run_dir>/stage25``.

    Parameters
    ----------
    ev : event identifier; must be convertible with ``int()``.

    Returns
    -------
    All ranking rows for the event concatenated (winsor series first, then
    raw); an empty DataFrame when nothing qualified.

    Raises
    ------
    KeyError
        If the input has neither a ``Year`` nor a ``ReportDate`` column.
    """
    run_dir = resolve_run_dir(ev,
                              must_have=f"stage03/Stage3_Data_WithRatios_{ev}.csv",
                              run_tag=os.getenv("RUN_TAG"))
    out_dir = run_dir / "stage25"
    out_dir.mkdir(exist_ok=True)

    df = pd.read_csv(run_dir / "stage03" / f"Stage3_Data_WithRatios_{ev}.csv")
    if YEAR_COL not in df:
        if DATE_COL not in df:
            # Fail with an explicit message instead of a bare KeyError('Year').
            raise KeyError(
                f"neither '{YEAR_COL}' nor '{DATE_COL}' column found for event {ev}")
        df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
        df[YEAR_COL] = df[DATE_COL].dt.year
    df = df[df[YEAR_COL] < int(ev)].copy()   # pre-event history only

    raw_cols, win_cols = _ratio_columns(df)

    # Column-name prefixes of the outcome families.
    OUT = {
        "Speed": dict(score="Score_",       flag="Flag_"),
        "Depth": dict(score="ScoreDepth_",  flag="FlagDepth_"),
        "Blend": dict(score="ScoreBlend_",  flag="FlagBlend_"),
    }

    # A metric is usable for a flavour only if BOTH its score and flag column exist.
    metric_map = {}
    for flav, pre in OUT.items():
        score_metrics = {c[len(pre["score"]):] for c in df if c.startswith(pre["score"])}
        flag_metrics = {c[len(pre["flag"]):] for c in df if c.startswith(pre["flag"])}
        metric_map[flav] = sorted(score_metrics & flag_metrics)

    def _rank(cols: List[str], flavour: str, series: str) -> pd.DataFrame:
        """Build (and persist, if non-empty) the ranking for one series/flavour."""
        pre = OUT[flavour]
        rows = []
        for rc in cols:
            cov = df[rc].notna().mean() * 100
            if cov < MIN_COVER:
                continue
            # Strip only a trailing marker; a blanket str.replace would also
            # eat "_raw"/"_orig" occurring in the middle of a ratio name.
            ratio_name = _strip_series_suffix(rc)
            for m in metric_map[flavour]:
                rho, p_rho = _safe_spearman(df[rc], df[f"{pre['score']}{m}"])
                rows.append(dict(
                    SwanYear    = int(ev),
                    Ratio       = ratio_name,
                    Metric      = m,
                    CoveragePct = round(cov, 1),
                    AbsRho      = rho,
                    RhoP        = p_rho,
                    **_logit_one(df[rc], df[f"{pre['flag']}{m}"]),
                    Flavour     = flavour,
                    Series      = series,
                ))
        out = pd.DataFrame(rows)
        if not out.empty:
            out.to_csv(out_dir / f"Stage25_{series}_{flavour}_RatioRanking_{ev}.csv",
                       index=False)
        return out

    # The series label is passed explicitly so the file tag and the per-row
    # "Series" value can never disagree.
    frames = []
    for series, cols in (("winsor", win_cols), ("raw", raw_cols)):
        if cols:
            frames.extend(_rank(cols, flav, series) for flav in OUT)
    frames = [f for f in frames if not f.empty]

    out_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    log.info("Stage-25 %s → %d rows", ev, len(out_df))
    return out_df

# ═════════════════════ DRIVER ════════════════════════
def main():
    """Run every configured event and write the cross-event meta summary.

    Each event in ``EVENT_LIST`` is processed with ``run_event``; an event
    that raises is logged (with traceback) and skipped.  Rows from the
    "winsor" series whose logit converged are then grouped by
    (Ratio, Metric, Flavour) and summarised with mean |rho|, mean AUROC,
    Fisher's combined p, the Stouffer Z, the count of events with
    ``CoefP < 0.05`` and the list of contributing event years.

    The summary CSV is written into the ``stage25`` folder of the last event
    in ``EVENT_LIST`` whose run directory can be resolved.  The function
    returns early, with a warning, when there is nothing to summarise or no
    run folder is found.
    """
    all_frames = []
    for ev in EVENT_LIST:
        try:
            frame = run_event(ev)
        except Exception as exc:
            # Top-level boundary: one broken event must not abort the rest,
            # but keep the traceback so the cause is diagnosable.
            log.error("Event %s skipped: %s", ev, exc, exc_info=True)
            continue
        # An event with no ranking rows returns a column-less frame; if only
        # such frames were concatenated, the column lookups below would fail.
        if not frame.empty:
            all_frames.append(frame)

    if not all_frames:
        log.warning("No events processed – nothing to summarise")
        return

    big = pd.concat(all_frames, ignore_index=True)

    # only converged winsor models
    sel = big[(big["Series"] == "winsor") & (big["ModelConv"])]
    if sel.empty:
        log.warning("No converged winsor models – meta summary skipped")
        return

    def _summarise(g: pd.DataFrame) -> pd.Series:
        # One summary row per (Ratio, Metric, Flavour) group.
        coef_p = g["CoefP"].dropna().tolist()
        return pd.Series(dict(
            meanAbsRho = g["AbsRho"].mean(),
            meanAUROC  = g["AUROC"].mean(),
            FisherP    = _fisher(coef_p),
            StoufferZ  = _stouffer(coef_p),
            nEventsSig = int((g["CoefP"] < 0.05).sum()),
            Events     = ",".join(map(str, sorted(set(g["SwanYear"])))),
        ))

    meta = (sel.groupby(["Ratio", "Metric", "Flavour"])
               .apply(_summarise)
               .reset_index()
               .sort_values(["Flavour", "FisherP", "meanAUROC"]))

    # write to the **latest** run folder that exists
    meta_dir: Path | None = None
    for ev in reversed(EVENT_LIST):
        with contextlib.suppress(FileNotFoundError):
            meta_dir = resolve_run_dir(ev, run_tag=os.getenv("RUN_TAG"))
            break
    if meta_dir is None:
        log.warning("No run folders found – meta summary NOT written")
        return

    (meta_dir / "stage25").mkdir(exist_ok=True)
    meta_path = meta_dir / "stage25" / "Stage25_CrossEvent_Summary.csv"
    meta.to_csv(meta_path, index=False)
    log.info("Cross-event summary saved → %s", meta_path)

    if _IN_NOTEBOOK:
        # Imported lazily so the script also runs where IPython is absent.
        from IPython.display import display, HTML
        display(HTML("<h3>Cross-event summary (top 50)</h3>"))
        display(meta.head(50).style.background_gradient(cmap="Purples"))

if __name__ == "__main__":
    # Anything printed to stdout during the run is captured in a throwaway
    # buffer and discarded; stderr is not redirected.
    _discarded_stdout = io.StringIO()
    with contextlib.redirect_stdout(_discarded_stdout):
        main()