# In [None]:
#!/usr/bin/env python
# ===================================================================
#  STAGE-25 · UNIVERSAL RATIO-vs-RESILIENCE RANKER   v4.3 · 2025-06-20
# ===================================================================
"""
 * Works with the upgraded pipeline (Stages 02-07).
 * Accepts any subset of the four outcome families; families whose
   score/flag columns are missing are skipped automatically.
 * Uses keyword args with `resolve_run_dir` (v2.1+) – no TypeErrors.
 * Gracefully handles events that lack Dynamic flags.
 * Logging now honours `stage25/` folder just like other stages.
"""
from __future__ import annotations

import contextlib, io, logging, math, os, sys, warnings
from pathlib import Path
from typing  import Dict, List

import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats     import chi2, norm, spearmanr
from sklearn.metrics import roc_auc_score
from numpy.linalg    import LinAlgError
from statsmodels.tools.sm_exceptions import (
    MissingDataError, PerfectSeparationError,
)

from pipeline_utils import load_cfg, resolve_run_dir

# ═════════════════════ GLOBALS ══════════════════════
# Suppress every RuntimeWarning for the whole process (presumably numeric
# noise from the stats libraries — TODO confirm nothing important is hidden).
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Project configuration; the keys of its "events" mapping are the events to
# run (an absent "events" section yields an empty list).
CFG         = load_cfg()
EVENT_LIST  = list(CFG.get("events", {}).keys())            # run every event folder

# Thresholds; the first two can be overridden through environment variables.
MIN_COVER   = float(os.getenv("MIN_COVERAGE", 40))          # % non-NA a ratio column needs to be ranked
MIN_OBS     = int  (os.getenv("MIN_OBS", 30))               # minimum rows for rho / logit
EPS_VAR     = 1e-6                                          # flat-series guard: variance below this counts as constant

# Column names looked up in the Stage-3 CSV (ID_COL is not referenced in the
# code visible here).
ID_COL, YEAR_COL, DATE_COL = "Symbol", "Year", "ReportDate"
# True when the ipykernel module is loaded, i.e. running inside a Jupyter kernel.
_IN_NOTEBOOK = "ipykernel" in sys.modules

# ── logger (one file per run root) ─────────────────────────────────
# The log file lives in <run root>/stage25/stage25.log, where the run root is
# the parent of the first event run folder that can be resolved.  Resolution
# is best-effort: when no event resolves, no file handler is installed.
ROOT_RUN = None
_ROOT_RUN_ERR = None                   # last resolution failure, reported once the logger exists
for _ev in EVENT_LIST:
    try:
        # choose the first event that actually resolves so the log ends up in a run folder
        ROOT_RUN = resolve_run_dir(
            swan_year = _ev, run_tag = os.getenv("RUN_TAG"), create = False
        ).parent
        break
    except Exception as exc:           # helper's failure modes are not visible here; keep best-effort
        _ROOT_RUN_ERR = exc

LOG_PATH = (ROOT_RUN / "stage25" / "stage25.log") if ROOT_RUN else None
if LOG_PATH:
    LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level   = logging.INFO,
    format  = "%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[logging.FileHandler(LOG_PATH, "w", "utf-8")] if LOG_PATH else [],
)
log = logging.getLogger("stage25")

if ROOT_RUN is None:
    # previously swallowed silently; say why there is no stage25.log
    log.warning("Run root not resolved (%r) – file logging disabled", _ROOT_RUN_ERR)


# ═════════════════════ STATS HELPERS ════════════════
def _safe_spearman(x: pd.Series, y: pd.Series):
    """
    Spearman rho and its p-value over the rows where both series are non-null.

    Returns ``(nan, nan)`` when fewer than ``MIN_OBS`` paired rows remain, or
    when the variance of either series on those rows is below ``EPS_VAR``.
    """
    paired = x.notna() & y.notna()
    if paired.sum() < MIN_OBS:
        return np.nan, np.nan

    xs, ys = x[paired], y[paired]
    if xs.var() < EPS_VAR or ys.var() < EPS_VAR:      # flat series → rho undefined
        return np.nan, np.nan

    return spearmanr(xs, ys)[:2]

def _logit_one(x: pd.Series, flag: pd.Series) -> Dict:
    """
    Fit a one-predictor logit of ``flag`` on ``x`` (plus a constant).

    Only rows where ``x`` is non-null and ``flag`` is 0 or 1 are used.
    The returned dict always carries ``SampleSize``, ``PositivePct``,
    ``PseudoR2``, ``AUROC``, ``CoefP``, ``ModelConv`` and ``Failure``;
    the fit statistics stay NaN and ``Failure`` names the reason
    (``too_few_obs``, ``single_class``, ``zero_variance``,
    ``no_convergence`` or ``model_error``) whenever no usable model results.
    """
    usable = x.notna() & flag.isin([0, 1])
    n_rows = int(usable.sum())
    positive_pct = float(flag[usable].mean() * 100) if n_rows else np.nan

    out = {
        "SampleSize": n_rows,
        "PositivePct": positive_pct,
        "PseudoR2": np.nan,
        "AUROC": np.nan,
        "CoefP": np.nan,
        "ModelConv": False,
        "Failure": "",
    }

    def _fail(reason):
        out["Failure"] = reason
        return out

    # cheap pre-checks, in the same order the failure labels are reported
    if n_rows < MIN_OBS:
        return _fail("too_few_obs")
    if flag[usable].nunique() < 2:
        return _fail("single_class")
    if x[usable].var() < EPS_VAR:
        return _fail("zero_variance")

    try:
        fitted = sm.Logit(flag[usable], sm.add_constant(x[usable])).fit(disp=False)
        if not fitted.mle_retvals.get("converged", True):
            return _fail("no_convergence")
        pseudo_r2 = float(fitted.prsquared)
        auroc = float(roc_auc_score(flag[usable], fitted.predict()))
        coef_p = float(fitted.pvalues.iloc[1])        # position 0 is the constant
        out.update(ModelConv=True, PseudoR2=pseudo_r2, AUROC=auroc, CoefP=coef_p)
    except (ValueError, LinAlgError, MissingDataError, PerfectSeparationError):
        out["Failure"] = "model_error"
    return out

def _fisher(p: List[float])  -> float:      # meta-p
    g = [q for q in p if 0 < q < 1]
    if not g: return np.nan
    stat = -2 * sum(math.log(max(q, 1e-300)) for q in g)
    return 1 - chi2.cdf(stat, 2*len(g))

def _stouffer(p: List[float]) -> float:     # meta-z
    g = [q for q in p if 0 < q < 1]
    if not g: return np.nan
    z = [norm.isf(q/2) * math.copysign(1, .5-q) for q in g]
    return sum(z)/math.sqrt(len(z))

# ═════════════════════ RATIO DETECTOR ═══════════════
def _ratio_columns(df: pd.DataFrame):
    """
    Returns (raw_like, winsor_like) lists.
    *_raw*  or *_orig* = un-winsorised; the partner column (same stem) is winsor.
    """
    if any(c.endswith("_raw") for c in df.columns):
        raw  = [c for c in df if c.endswith("_raw")  and c[:-4] in df.columns]
        wins = [c[:-4] for c in raw]
    elif any(c.endswith("_orig") for c in df.columns):
        raw  = [c for c in df if c.endswith("_orig") and c[:-5] in df.columns]
        wins = [c[:-5] for c in raw]
    else:                                     # already only one copy (winsor)
        raw, wins = [], [c for c in df if "_" in c]
    return raw, wins


# ═════════════════════ ONE-EVENT CORE ═══════════════
def run_event(ev: str) -> pd.DataFrame:
    """
    Rank every ratio column of one event against each available outcome family.

    Loads ``stage03/Stage3_Data_WithRatios_{ev}.csv`` from the event's run
    folder, keeps the rows whose year is strictly before the event year, and
    for each (ratio, metric) pair records the Spearman correlation with the
    family's score column plus a one-predictor logit against its flag column.
    One CSV per (series, family) with at least one row is written to
    ``<run_dir>/stage25``.

    Parameters
    ----------
    ev : event key; must be convertible with ``int()``.

    Returns
    -------
    All ranking rows of the event, or an empty DataFrame when nothing ranked.

    Raises
    ------
    KeyError   when the data has no year column and no date column to derive it.
    ValueError when ``ev`` is not an integer-like key.
    """
    run_dir = resolve_run_dir(
        swan_year = ev,
        run_tag   = os.getenv("RUN_TAG"),
        must_have = f"stage03/Stage3_Data_WithRatios_{ev}.csv",
    )
    out_dir = run_dir / "stage25"
    out_dir.mkdir(exist_ok=True)

    df = pd.read_csv(run_dir/"stage03"/f"Stage3_Data_WithRatios_{ev}.csv")
    if YEAR_COL not in df and DATE_COL in df:
        df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
        df[YEAR_COL] = df[DATE_COL].dt.year
    if YEAR_COL not in df:
        raise KeyError(
            f"{YEAR_COL!r} column missing and no {DATE_COL!r} column to derive it from "
            f"(event {ev})"
        )
    swan_year = int(ev)
    df = df[df[YEAR_COL] < swan_year].copy()           # pre-event only

    # NOTE(review): when _ratio_columns falls back to "any column containing an
    # underscore", outcome columns (Score*/Flag*) become ratio candidates too —
    # confirm whether that is intended.
    raw_cols, win_cols = _ratio_columns(df)

    # outcome families -----------------------------------------------------
    OUT = {
        "Temporal": dict(score="ScoreTemporal_", flag="FlagTemporal_"),
        "Impact"  : dict(score="ScoreImpact_",   flag="FlagImpact_"),
        "Dynamic" : dict(score="ScoreDynamic_",  flag="FlagDynamic_"),
        "Blend"   : dict(score="ScoreBlend_",    flag="FlagBlend_"),
    }

    # available metrics per family: those having BOTH a score and a flag column
    metric_map: Dict[str, List[str]] = {}
    for flav, pre in OUT.items():
        score_mets = {c[len(pre["score"]):] for c in df if c.startswith(pre["score"])}
        flag_mets  = {c[len(pre["flag"] ):] for c in df if c.startswith(pre["flag"] )}
        metric_map[flav] = sorted(score_mets & flag_mets)

    def _stem(col: str) -> str:
        """Drop one trailing ``_raw`` / ``_orig`` suffix (never an inner match)."""
        for suffix in ("_raw", "_orig"):
            if col.endswith(suffix):
                return col[:-len(suffix)]
        return col

    # ranking routine ------------------------------------------------------
    def _rank(cols: list[str], flavour: str, series: str) -> pd.DataFrame:
        """Rank ``cols`` for one family; ``series`` is ``"winsor"`` or ``"raw"``."""
        metrics = metric_map[flavour]
        if not metrics:                                   # family absent → skip
            return pd.DataFrame()

        pre = OUT[flavour]
        recs = []
        for ratio in cols:
            values = df[ratio]
            # a text/date column would raise inside .var() and abort the event
            if not pd.api.types.is_numeric_dtype(values):
                continue
            cov = values.notna().mean()*100
            if cov < MIN_COVER:
                continue
            # raw columns are reported under their stem so they line up with
            # the winsor rows; winsor names are kept verbatim
            label = _stem(ratio) if series == "raw" else ratio
            for m in metrics:
                rho, p_rho = _safe_spearman(values, df[f"{pre['score']}{m}"])
                recs.append(dict(
                    SwanYear    = swan_year,
                    Ratio       = label,
                    Metric      = m,
                    CoveragePct = round(cov,1),
                    AbsRho      = abs(rho) if pd.notna(rho) else np.nan,
                    RhoP        = p_rho,
                    **_logit_one(values, df[f"{pre['flag']}{m}"]),
                    Flavour     = flavour,
                    Series      = series,
                ))
        res = pd.DataFrame(recs)
        if not res.empty:
            res.to_csv(out_dir / f"Stage25_{series}_{flavour}_RatioRanking_{ev}.csv",
                       index=False)
        return res

    frames = []
    for series, cols in (("winsor", win_cols), ("raw", raw_cols)):
        if cols:
            frames += [_rank(cols, flav, series) for flav in OUT]
    frames = [f for f in frames if not f.empty]         # skipped families add nothing

    out_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    log.info("Stage-25 %s → %d rows", ev, len(out_df))
    return out_df


# ═════════════════════ DRIVER ═══════════════════════
def main():
    """
    Run every configured event and write the cross-event meta summary.

    Events that raise are logged and skipped.  From the combined rows only
    converged winsor models are kept; per (Ratio, Metric, Flavour) the mean
    |rho|, mean AUROC, Fisher meta-p, Stouffer meta-z, number of events with
    ``CoefP < .05`` and the list of contributing events are computed.  The
    table is saved as ``stage25/Stage25_CrossEvent_Summary.csv`` in the run
    folder of the last configured event whose folder exists, and shown as a
    styled table when running in a notebook.  Returns ``None``.
    """
    all_frames = []
    for ev in EVENT_LIST:
        try:
            all_frames.append(run_event(ev))
        except Exception as exc:        # one bad event must not stop the others
            log.error("Event %s skipped: %s", ev, exc, exc_info=True)

    if not all_frames:
        log.warning("No events processed – nothing to summarise")
        return

    # events can succeed yet rank nothing; a column-less frame has no
    # "Series"/"ModelConv" to filter on
    ranked = [f for f in all_frames if not f.empty]
    if not ranked:
        log.warning("No ranking rows produced – nothing to summarise")
        return

    big = pd.concat(ranked, ignore_index=True)
    sel = big[(big["Series"] == "winsor") & big["ModelConv"].astype(bool)]

    if sel.empty:
        log.warning("No converged winsor models – meta summary skipped")
        return

    # explicit loop instead of groupby.apply, whose handling of the grouping
    # columns is deprecated in recent pandas
    keys = ["Ratio", "Metric", "Flavour"]
    rows = []
    for (ratio, metric, flavour), g in sel.groupby(keys):
        coef_p = g["CoefP"].dropna().tolist()
        rows.append(dict(
            Ratio      = ratio,
            Metric     = metric,
            Flavour    = flavour,
            meanAbsRho = g["AbsRho"].mean(),
            meanAUROC  = g["AUROC"].mean(),
            FisherP    = _fisher(coef_p),
            StoufferZ  = _stouffer(coef_p),
            nEventsSig = int((g["CoefP"] < .05).sum()),
            Events     = ",".join(map(str, sorted(g["SwanYear"].unique()))),
        ))
    # NOTE(review): meanAUROC is an ascending tie-breaker, so among equal
    # FisherP the weaker AUROC sorts first — confirm this is intended.
    meta = (pd.DataFrame(rows, columns=keys + [
                "meanAbsRho", "meanAUROC", "FisherP", "StoufferZ",
                "nEventsSig", "Events"])
              .sort_values(["Flavour", "FisherP", "meanAUROC"]))

    run_tag = os.getenv("RUN_TAG")

    def _has_run_dir(event) -> bool:
        """True when the event's run folder resolves and exists on disk."""
        try:
            return resolve_run_dir(swan_year=event, run_tag=run_tag,
                                   create=False, must_have="").exists()
        except Exception as exc:        # a failed lookup means "no folder", not a crash
            log.debug("Run folder lookup failed for %s: %s", event, exc)
            return False

    # write inside *latest* run folder
    last_ev = next((e for e in reversed(EVENT_LIST) if _has_run_dir(e)), None)
    if last_ev is None:
        log.warning("Run folder not found – meta summary NOT written")
        return

    meta_dir = resolve_run_dir(swan_year=last_ev, run_tag=run_tag)
    summary_dir = meta_dir / "stage25"
    summary_dir.mkdir(parents=True, exist_ok=True)
    out_fp = summary_dir / "Stage25_CrossEvent_Summary.csv"
    meta.to_csv(out_fp, index=False)
    log.info("Cross-event summary saved → %s", out_fp)

    if _IN_NOTEBOOK:
        from IPython.display import display, HTML
        display(HTML("<h3>Stage-25 cross-event summary (top 50)</h3>"))
        display(meta.head(50).style.background_gradient(cmap="Purples"))


if __name__ == "__main__":
    # statsmodels is noisy on stdout; discard that output for CLI runs
    _discard = io.StringIO()
    with contextlib.redirect_stdout(_discard):
        main()
