# In [None]:  (notebook cell marker left over from the export)
#!/usr/bin/env python
# ===================================================================
# STAGE-25 · Universal Ratio-vs-Resilience Ranker  (2025-06-17)
# ===================================================================
"""
Changes vs the 2025-06-15 release
─────────────────────────────────
• Guard‐rails: no math-domain error when p-value ∈ {0,1}; no TypeError when
  the cross-event CSV is empty.
• NEW: understands draw-down outcomes
      ─ ScoreDepth_<metric>, FlagDepth_<metric>, DD_<metric> ─
  and produces *parallel* rankings for “speed” and “depth”.
• All output filenames now include the flavour (‘Speed’ / ‘Depth’) so nothing
  is overwritten.
Everything else is unchanged.
"""
from __future__ import annotations
import contextlib, io, logging, math, os, sys, warnings
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm
from numpy.linalg import LinAlgError
from scipy.stats import chi2, norm, spearmanr
from sklearn.metrics import roc_auc_score
from statsmodels.tools.sm_exceptions import (
    MissingDataError, PerfectSeparationError)

from pipeline_utils import load_cfg, resolve_run_dir

# ──────────────── GLOBAL CONFIG ────────────────────────────────────
# Silence every RuntimeWarning for the whole process.
warnings.filterwarnings("ignore", category=RuntimeWarning)
# Root logger at INFO with a "timestamp | LEVEL | message" line format.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s | %(levelname)-7s | %(message)s")
log = logging.getLogger("stage25")

# Pipeline configuration, loaded once at import time by the project helper.
CFG           = load_cfg()
# Event identifiers = keys of CFG["events"]; empty list when that section is absent.
EVENT_LIST    = list(CFG.get("events", {}).keys())         # ['1998','2008', …]
# Minimum share of non-missing values a ratio column needs to be ranked;
# overridable through the MIN_COVERAGE environment variable.
MIN_COVER     = float(os.getenv("MIN_COVERAGE", 40))       # %
# Minimum number of usable observations for the Spearman / logit helpers;
# overridable through the MIN_OBS environment variable.
MIN_OBS       = int  (os.getenv("MIN_OBS",      30))
# Predictors whose sample variance is below this are treated as constant.
EPS_VAR       = 1e-6
# Column names expected in the stage-03 CSV (ID_COL is not referenced in the
# code shown in this module).
ID_COL, YEAR_COL, DATE_COL = "Symbol", "Year", "ReportDate"
# True when the ipykernel module is already loaded, i.e. running under Jupyter.
_IN_NOTEBOOK  = "ipykernel" in sys.modules

# ──────────────── STATS HELPERS ───────────────────────────────────
def _safe_spearman(x: pd.Series, y: pd.Series):
    """Return ``(|rho|, p_value)`` of the Spearman correlation between x and y.

    Only rows where both series are non-missing are used.  When fewer than
    ``MIN_OBS`` such rows exist, ``(nan, nan)`` is returned instead of
    running the test.
    """
    both_present = x.notna() & y.notna()
    if both_present.sum() >= MIN_OBS:
        rho, p_value = spearmanr(x[both_present], y[both_present])
        # Direction is irrelevant for the ranking – keep the magnitude only.
        return abs(rho), p_value
    return np.nan, np.nan

def _logit_one(x: pd.Series, flag: pd.Series):
    """Fit a one-predictor logistic regression of *flag* on *x*.

    Rows are used when *x* is non-missing and *flag* is 0 or 1.  The return
    value is always a dict with the keys ``SampleSize``, ``PositivePct``,
    ``PseudoR2``, ``AUROC``, ``CoefP``, ``ModelConv`` and ``Failure``.
    ``Failure`` is an empty string on success, otherwise one of
    ``too_few_obs``, ``single_class``, ``zero_variance``, ``no_convergence``
    or ``model_error``; the fit statistics stay NaN in every failure case.
    """
    usable = x.notna() & flag.isin([0, 1])
    n_obs = int(usable.sum())
    if n_obs:
        positive_pct = float(flag[usable].mean() * 100)
    else:
        positive_pct = np.nan

    result = {
        "SampleSize": n_obs,
        "PositivePct": positive_pct,
        "PseudoR2": np.nan,
        "AUROC": np.nan,
        "CoefP": np.nan,
        "ModelConv": False,
        "Failure": "",
    }

    # Cheap pre-flight checks before the optimiser is invoked.
    if n_obs < MIN_OBS:
        result["Failure"] = "too_few_obs"
        return result
    target, predictor = flag[usable], x[usable]
    if target.nunique() < 2:
        result["Failure"] = "single_class"
        return result
    if predictor.var() < EPS_VAR:
        result["Failure"] = "zero_variance"
        return result

    try:
        fitted = sm.Logit(target, sm.add_constant(predictor)).fit(disp=False)
        if not fitted.mle_retvals.get("converged", True):
            result["Failure"] = "no_convergence"
            return result
        # Compute every statistic first so a failure leaves `result` untouched.
        pseudo_r2 = float(fitted.prsquared)
        auroc = float(roc_auc_score(target, fitted.predict()))
        # Index 1 = slope of the predictor (index 0 is the added constant).
        coef_p = float(fitted.pvalues.iloc[1])
    except (ValueError, LinAlgError,
            MissingDataError, PerfectSeparationError):
        result["Failure"] = "model_error"
        return result

    result["ModelConv"] = True
    result["PseudoR2"] = pseudo_r2
    result["AUROC"] = auroc
    result["CoefP"] = coef_p
    return result

def _fisher(p_list):
    """Combine p-values with Fisher's method and return the combined p-value.

    Values outside the open interval (0, 1) – including NaN – are ignored so
    that ``log`` never sees 0.  Returns NaN when no usable p-value remains.
    """
    usable = [p for p in p_list if 0 < p < 1]
    if not usable:
        return np.nan
    statistic = -2 * sum(math.log(p) for p in usable)
    # Use the survival function rather than ``1 - cdf``: the subtraction
    # rounds to exactly 0.0 once the combined p-value drops below ~1e-16,
    # which would make all strongly significant ratios tie when sorted.
    return float(chi2.sf(statistic, 2 * len(usable)))

def _stouffer(p_list):
    """Combine p-values into a single Stouffer-style Z score.

    Values outside the open interval (0, 1) – including NaN – are ignored;
    NaN is returned when nothing usable remains.  Each remaining p is mapped
    to ``norm.isf(p / 2)`` and given a positive sign when ``p <= 0.5`` and a
    negative sign otherwise; the signed values are summed and divided by the
    square root of their count.

    NOTE(review): the per-value score jumps from about +0.67 to about -0.67
    as p crosses 0.5 – confirm this sign convention is intended.
    """
    usable = [p for p in p_list if 0 < p < 1]
    if len(usable) == 0:
        return np.nan
    total = 0
    for p in usable:
        direction = math.copysign(1, .5 - p)
        total += norm.isf(p / 2) * direction
    return total / math.sqrt(len(usable))

# ──────────────── RATIO COLUMN DETECTOR ───────────────────────────
def _ratio_columns(df: pd.DataFrame):
    """Return ``(raw_cols, winsorised_cols)`` detected from the column names.

    Detection order:

    1. If any column ends in ``_raw``, the raw columns are those ``X_raw``
       whose counterpart ``X`` also exists; the winsorised columns are the
       matching ``X`` names (same order).
    2. Otherwise the same rule is applied with the ``_orig`` suffix.
    3. Otherwise there are no raw columns and every column containing an
       underscore is taken as a winsorised ratio – except outcome columns
       (``Score_``, ``ScoreDepth_``, ``Flag_``, ``FlagDepth_``, ``DD_``),
       which must not be ranked against themselves.

    Non-string column labels are ignored.
    """
    outcome_prefixes = ("Score_", "ScoreDepth_", "Flag_", "FlagDepth_", "DD_")
    names = [c for c in df.columns if isinstance(c, str)]
    known = set(names)

    for suffix in ("_raw", "_orig"):
        if any(c.endswith(suffix) for c in names):
            cut = len(suffix)
            raw = [c for c in names if c.endswith(suffix) and c[:-cut] in known]
            wins = [c[:-cut] for c in raw]
            return raw, wins

    # Fallback: no suffixed columns at all, so rely on the naming heuristic.
    wins = [c for c in names
            if "_" in c and not c.startswith(outcome_prefixes)]
    return [], wins

# ──────────────── ONE-EVENT ROUTINE ───────────────────────────────
def _ratio_label(col: str) -> str:
    """Return *col* without one trailing ``_raw`` / ``_orig`` suffix."""
    for suffix in ("_raw", "_orig"):
        if col.endswith(suffix):
            return col[:-len(suffix)]
    return col


def run_event(ev: str) -> pd.DataFrame:
    """Rank every ratio column against the outcome columns of one event.

    Reads ``stage03/Stage3_Data_WithRatios_<ev>.csv`` from the run folder
    resolved for *ev*, keeps only rows whose year is strictly before
    ``int(ev)``, and for each ratio column with at least ``MIN_COVER`` percent
    non-missing values computes a Spearman correlation against the score
    column and a one-predictor logit against the flag column of every metric.
    This is done separately for the "Speed" (``Score_`` / ``Flag_``) and
    "Depth" (``ScoreDepth_`` / ``FlagDepth_``) outcomes and for winsorised and
    raw ratio columns.  Each non-empty ranking is also written to
    ``stage25/Stage25_<winsor|raw>_<flavour>_RatioRanking_<ev>.csv``.

    Parameters
    ----------
    ev : str
        Event identifier; must be convertible with ``int``.

    Returns
    -------
    pd.DataFrame
        All ranking rows concatenated; an empty, column-less frame when
        nothing could be ranked.

    Raises
    ------
    KeyError
        If the data has neither a year column nor a date column to derive it.
    """
    run_dir = resolve_run_dir(ev,
                              must_have=f"stage03/Stage3_Data_WithRatios_{ev}.csv",
                              run_tag  =os.getenv("RUN_TAG"))
    out_dir = run_dir / "stage25"
    out_dir.mkdir(exist_ok=True)

    df = pd.read_csv(run_dir / "stage03" / f"Stage3_Data_WithRatios_{ev}.csv")
    if YEAR_COL not in df and DATE_COL in df:
        df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
        df[YEAR_COL] = df[DATE_COL].dt.year
    if YEAR_COL not in df:
        raise KeyError(
            f"column {YEAR_COL!r} missing and no {DATE_COL!r} column to derive it from")
    # Only pre-event observations may be used as predictors.
    df = df[df[YEAR_COL] < int(ev)].copy()

    raw_cols, win_cols = _ratio_columns(df)
    win_set = set(win_cols)          # O(1) membership test inside the loops

    # outcome columns ------------------------------------------------
    speed_score_cols = [c for c in df if c.startswith("Score_")]
    depth_score_cols = [c for c in df if c.startswith("ScoreDepth_")]
    speed_flag_cols  = [c for c in df if c.startswith("Flag_")]
    depth_flag_cols  = [c for c in df if c.startswith("FlagDepth_")]

    def _metric_name(col: str, prefix: str) -> str:
        return col[len(prefix):]

    # A metric is usable only when both its score and its flag column exist.
    speed_metrics = sorted({_metric_name(c, "Score_") for c in speed_score_cols}
                           & {_metric_name(c, "Flag_") for c in speed_flag_cols})
    depth_metrics = sorted({_metric_name(c, "ScoreDepth_") for c in depth_score_cols}
                           & {_metric_name(c, "FlagDepth_") for c in depth_flag_cols})

    # ---------- ranking helper -------------------------------------
    def _rank(cols: list[str], flavour: str):
        """
        flavour ∈ {"Speed", "Depth"}
        """
        score_prefix = "Score_"      if flavour == "Speed" else "ScoreDepth_"
        flag_prefix  = "Flag_"       if flavour == "Speed" else "FlagDepth_"
        metrics      = speed_metrics if flavour == "Speed" else depth_metrics

        rows = []
        for rc in cols:
            cov = df[rc].notna().mean() * 100
            if cov < MIN_COVER:
                continue
            # Strip only the trailing suffix: a blanket str.replace would also
            # mangle names that merely contain "_raw" / "_orig" in the middle.
            label = _ratio_label(rc)
            series_tag = "winsor" if rc in win_set else "raw"
            for m in metrics:
                rho, p_rho = _safe_spearman(df[rc], df[f"{score_prefix}{m}"])
                rows.append(dict(
                    SwanYear   = int(ev),
                    Ratio      = label,
                    Metric     = m,
                    CoveragePct= round(cov, 1),
                    AbsRho     = rho,
                    RhoP       = p_rho,
                    **_logit_one(df[rc], df[f"{flag_prefix}{m}"]),
                    Flavour    = flavour,
                    Series     = series_tag
                ))
        out = pd.DataFrame(rows)
        # Identity check on purpose: the caller passes the very same list object.
        tag = "winsor" if cols is win_cols else "raw"
        if not out.empty:
            out.to_csv(out_dir / f"Stage25_{tag}_{flavour}_RatioRanking_{ev}.csv",
                       index=False)
        return out

    frames = [_rank(cols, flavour)
              for cols in (win_cols, raw_cols) if cols
              for flavour in ("Speed", "Depth")]
    # Drop empty rankings so pd.concat never sees column-less frames.
    frames = [frame for frame in frames if not frame.empty]

    out_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    log.info("Stage-25 %s → %d rows", ev, len(out_df))
    return out_df

# ──────────────── DRIVER ──────────────────────────────────────────
def main():
    """Run stage 25 for every configured event and write the cross-event summary.

    Each event in ``EVENT_LIST`` is processed with ``run_event``; an event that
    raises is logged and skipped.  The converged winsorised models of all
    events are then grouped by (Ratio, Metric, Flavour) and summarised (mean
    |rho|, mean AUROC, Fisher combined p, Stouffer Z, number of events with
    coefficient p < 0.05, list of events).  The summary is written to
    ``stage25/Stage25_CrossEvent_Summary.csv`` inside the run folder of the
    last event in ``EVENT_LIST`` that can be resolved, and shown as a styled
    table when running inside a notebook.  Returns ``None``.
    """
    all_frames = []
    for ev in EVENT_LIST:
        try:
            all_frames.append(run_event(ev))
        except Exception as exc:      # boundary: one bad event must not stop the rest
            log.error("Event %s skipped: %s", ev, exc)

    if not all_frames:
        log.warning("No events processed – nothing to summarise.")
        return

    # Events that produced no rows return a column-less frame; concatenating
    # only those would make the column look-ups below fail.
    populated = [frame for frame in all_frames if not frame.empty]
    if not populated:
        log.warning("No ranking rows produced – meta summary skipped.")
        return

    big = pd.concat(populated, ignore_index=True)

    # focus on converged winsorised models only
    # (.eq(True) keeps the mask boolean even if the column holds NaN)
    sel = big[(big["Series"] == "winsor") & big["ModelConv"].eq(True)]
    if sel.empty:
        log.warning("No converged winsor models – meta summary skipped.")
        return

    def _summarise(g: pd.DataFrame) -> pd.Series:
        """Collapse the per-event rows of one (Ratio, Metric, Flavour) group."""
        coef_p = g.CoefP.dropna().tolist()
        return pd.Series(dict(
            meanAbsRho = g.AbsRho.mean(),
            meanAUROC = g.AUROC.mean(),
            FisherP   = _fisher(coef_p),
            StoufferZ = _stouffer(coef_p),
            nEventsSig= int((g.CoefP < 0.05).sum()),
            Events    = ",".join(map(str, sorted(set(g.SwanYear))))
        ))

    # NOTE(review): ties on FisherP are broken by *ascending* meanAUROC –
    # confirm that lower AUROC first is really the intended order.
    meta = (sel.groupby(["Ratio", "Metric", "Flavour"])
                .apply(_summarise)
                .reset_index()
                .sort_values(["Flavour", "FisherP", "meanAUROC"]))

    # pick the newest run folder that exists — any event is fine
    meta_dir: Path | None = None
    for ev in reversed(EVENT_LIST):
        try:
            meta_dir = resolve_run_dir(ev, run_tag=os.getenv("RUN_TAG"))
            break
        except FileNotFoundError:
            continue
    if meta_dir is None:
        log.warning("No run folders found – meta summary NOT written.")
        return

    (meta_dir / "stage25").mkdir(exist_ok=True)
    meta_path = meta_dir / "stage25" / "Stage25_CrossEvent_Summary.csv"
    meta.to_csv(meta_path, index=False)
    log.info("Cross-event summary saved → %s", meta_path)

    if _IN_NOTEBOOK:
        # Imported lazily so the script does not require IPython outside notebooks.
        from IPython.display import display, HTML
        display(HTML("<h3>Cross-event summary (top 50)</h3>"))
        display(meta.head(50).style.background_gradient(cmap="Purples"))

# Script entry point.  Everything written to sys.stdout while main() runs is
# redirected into an in-memory buffer that is never read, i.e. discarded.
# Only stdout is redirected here; other streams are left alone.
if __name__ == "__main__":
    with contextlib.redirect_stdout(io.StringIO()):
        main()
