In [1]:
# Parameters
INPUT_CSV = "C:/Users/Jason Pohl/OneDrive - Bond University/PhD/rff/NEW_DATA.csv"
OUTPUT_ROOT = "C:/Users/Jason Pohl/OneDrive - Bond University/PhD/rff/outputs_rff"
SWAN_YEAR = 2020
WIN_START = 2015
WIN_END = 2024


In [2]:
#!/usr/bin/env python
"""
Stage-25 · Universal Ratio-vs-Resilience Ranker      (resilience-pipeline v2)
──────────────────────────────────────────────────────────────────────────────
• Consumes only Stage-03 artefacts.
• Produces per-event ranking CSVs (raw / winsor) plus a cross-event meta
  summary (Fisher χ² + Stouffer Z) identical to the old notebook outputs.
• Adds full guard-rails: minimum sample size, class-balance checks, variance
  checks, and convergence flags – eliminating NaNs and spurious warnings.
"""

from __future__ import annotations

import contextlib, io, logging, math, os, sys, warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from numpy.linalg import LinAlgError
from scipy.stats import chi2, norm, spearmanr
from sklearn.metrics import roc_auc_score
from statsmodels.tools.sm_exceptions import MissingDataError, PerfectSeparationError

from pipeline_utils import load_cfg, resolve_run_dir

# ───────────────────────── env & config ─────────────────────────
warnings.filterwarnings("ignore", category=RuntimeWarning)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
)

CFG          = load_cfg()
OUTPUT_ROOT  = Path(CFG["defaults"]["OUTPUT_ROOT"]).expanduser()
RAW_COV_TH   = float(os.getenv("MIN_COVERAGE", 40))   # %
MIN_OBS      = int(os.getenv("MIN_OBS", 30))          # min valid rows to fit a model
EPS_VAR      = 1e-6                                  # variance floor
SWAN_YEARS   = list(CFG["events"].keys())            # ['2000','2008','2020', …]
ID_COL, YEAR_COL = "Symbol", "Year"
_IN_NOTEBOOK = "ipykernel" in sys.modules            # crude check

# ───────────────────────── helper stats ─────────────────────────
def _spearman(x: pd.Series, y: pd.Series):
    ok = x.notna() & y.notna()
    if ok.sum() < MIN_OBS:
        return np.nan, np.nan
    rho, p = spearmanr(x[ok], y[ok])
    return abs(rho), p


def _logit_stats(x: pd.Series, y_flag: pd.Series):
    ok = x.notna() & y_flag.isin([0, 1])
    n   = ok.sum()
    if n < MIN_OBS:
        return dict(PseudoR2=np.nan, AUROC=np.nan, CoefP=np.nan,
                    SampleSize=n, PositivePct=np.nan,
                    ModelConverged=False, FailureReason="too_few_obs")

    x_ok, y_ok = x[ok], y_flag[ok]
    pos_pct    = y_ok.mean() * 100
    if y_ok.nunique() < 2:
        return dict(PseudoR2=np.nan, AUROC=np.nan, CoefP=np.nan,
                    SampleSize=n, PositivePct=pos_pct,
                    ModelConverged=False, FailureReason="single_class")

    if x_ok.var() < EPS_VAR:
        return dict(PseudoR2=np.nan, AUROC=np.nan, CoefP=np.nan,
                    SampleSize=n, PositivePct=pos_pct,
                    ModelConverged=False, FailureReason="zero_variance")

    try:
        mdl   = sm.Logit(y_ok, sm.add_constant(x_ok)).fit(disp=False)
        conv  = bool(mdl.mle_retvals.get("converged", True))
        prs2  = mdl.prsquared if conv else np.nan
        auc   = roc_auc_score(y_ok, mdl.predict()) if conv else np.nan
        pval  = float(mdl.pvalues.iloc[1]) if conv else np.nan
        fail  = None if conv else "no_convergence"
        return dict(PseudoR2=prs2, AUROC=auc, CoefP=pval,
                    SampleSize=n, PositivePct=pos_pct,
                    ModelConverged=conv, FailureReason=fail)
    except (ValueError, LinAlgError, MissingDataError, PerfectSeparationError):
        return dict(PseudoR2=np.nan, AUROC=np.nan, CoefP=np.nan,
                    SampleSize=n, PositivePct=pos_pct,
                    ModelConverged=False, FailureReason="model_error")


def fisher_p(pvals):
    pvals = [p for p in pvals if 0 < p < 1]
    if not pvals:
        return np.nan
    chi2_stat = -2 * sum(math.log(p) for p in pvals)
    return 1 - chi2.cdf(chi2_stat, 2 * len(pvals))


def stouffer_z(pvals):
    pvals = [p for p in pvals if 0 < p < 1]
    if not pvals:
        return np.nan
    z_vals = [norm.isf(p / 2) * np.sign(0.5 - p) for p in pvals]  # two-sided
    return sum(z_vals) / math.sqrt(len(z_vals))

# ───────────────────────── column sniffing ──────────────────────
def _detect_ratio_columns(df: pd.DataFrame) -> tuple[list[str], list[str]]:
    """Return (raw_cols, winsor_cols) detecting *_raw / *_orig paired columns."""
    if any(c.endswith("_raw") for c in df.columns):
        raw = [c for c in df if c.endswith("_raw") and c[:-4] in df]
        win = [c[:-4] for c in raw]
    elif any(c.endswith("_orig") for c in df.columns):
        raw = [c for c in df if c.endswith("_orig") and c[:-5] in df]
        win = [c[:-5] for c in raw]
    else:
        raw, win = [], []
    return raw, win

# ───────────────────────── per-event routine ────────────────────
def rank_one_event(swan: str) -> pd.DataFrame:
    run_dir = resolve_run_dir(
        swan_year=swan,
        must_have="stage03/Stage3_Data_WithRatios.csv",
        run_tag=os.getenv("RUN_TAG"),   # optional pin
    )
    out_dir = run_dir / "stage25"
    out_dir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(run_dir / "stage03" / "Stage3_Data_WithRatios.csv")
    df = df[df[YEAR_COL] < int(swan)].copy()          # pre-event only

    raw_cols, win_cols = _detect_ratio_columns(df)
    if not raw_cols:
        raise RuntimeError("No *_raw / *_orig columns found – is Stage 3 output complete?")

    score_cols = [c for c in df if c.startswith("Score_")]
    flag_cols  = [c for c in df if c.startswith("Flag_")]
    metrics    = sorted({c[6:] for c in score_cols} & {c[5:] for c in flag_cols})

    def _compute(cols, tag):
        recs = []
        for ratio in cols:
            cov = df[ratio].notna().mean() * 100
            if cov < RAW_COV_TH:
                continue
            x = df[ratio]
            for m in metrics:
                s, f = f"Score_{m}", f"Flag_{m}"
                rho, p_rho = _spearman(x, df[s])

                stats = _logit_stats(x, df[f])
                recs.append({
                    "SwanYear":     int(swan),
                    "Ratio":        ratio.replace("_raw", "").replace("_orig", ""),
                    "Metric":       m,
                    "CoveragePct":  round(cov, 1),
                    "AbsRho":       rho,
                    "RhoP":         p_rho,
                    "PseudoR2":     stats["PseudoR2"],
                    "AUROC":        stats["AUROC"],
                    "CoefP":        stats["CoefP"],
                    "SampleSize":   stats["SampleSize"],
                    "PositivePct":  stats["PositivePct"],
                    "ModelConv":    stats["ModelConverged"],
                    "FailReason":   stats["FailureReason"],
                    "Series":       tag,
                })
        out = pd.DataFrame(recs)
        out.to_csv(out_dir / f"Stage25_{tag}_RatioRanking.csv", index=False)
        return out

    raw_df = _compute(raw_cols, "raw")
    win_df = _compute(win_cols, "winsor")
    full   = pd.concat([raw_df, win_df], ignore_index=True)
    logging.info("✓ Stage-25 done for %s → %s", swan, out_dir)
    return full

# ────────────────────────── master driver ───────────────────────
def main():
    collected = []
    for sw in SWAN_YEARS:
        try:
            collected.append(rank_one_event(sw))
        except Exception as e:
            logging.error("Stage-25 failed for event %s: %s", sw, e)

    if not collected:
        logging.warning("No per-event data collected – nothing to summarise")
        return

    big = pd.concat(collected, ignore_index=True)

    # cross-event meta summary (winsor only, converged models only)
    win_only = big[(big["Series"] == "winsor") & (big["ModelConv"])]
    grp      = win_only.groupby(["Ratio", "Metric"])

    meta = grp.apply(lambda g: pd.Series({
        "meanAbsRho": g["AbsRho"].mean(),
        "meanAUROC" : g["AUROC"].mean(),
        "FisherP"   : fisher_p(g["CoefP"]),
        "StoufferZ" : stouffer_z(g["CoefP"]),
        "nEventsSig": (g["CoefP"] < 0.05).sum(),
        "Events"    : ",".join(sorted({str(int(y)) for y in g["SwanYear"]})),
    })).reset_index().sort_values("FisherP")

    meta_path = OUTPUT_ROOT / "stage25_cross_event_summary.csv"
    meta.to_csv(meta_path, index=False)
    logging.info("✓ Cross-event summary saved → %s", meta_path)

    # Optional notebook visual output (unchanged)
    if _IN_NOTEBOOK:
        from IPython.display import display, HTML
        display(HTML("<h2>Cross-event summary (top 50)</h2>"))
        display(meta.head(50).style.background_gradient(cmap="Purples"))

# ───────────────────────── entry point ──────────────────────────
if __name__ == "__main__":
    with contextlib.redirect_stdout(io.StringIO()):   # silence statsmodels
        main()




































2025-06-14 14:52:45,951 | INFO    | ✓ Stage-25 done for 2000 → C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2000\2025-06-13\stage25










2025-06-14 14:54:04,558 | INFO    | ✓ Stage-25 done for 2008 → C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\2025-06-13\stage25










2025-06-14 14:56:12,233 | INFO    | ✓ Stage-25 done for 2020 → C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2020\2025-06-13\stage25


2025-06-14 14:56:13,337 | INFO    | ✓ Cross-event summary saved → C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\stage25_cross_event_summary.csv


Unnamed: 0,Ratio,Metric,meanAbsRho,meanAUROC,FisherP,StoufferZ,nEventsSig,Events
1788,WorkingCapital_to_Sales,TotalRevenue,0.145811,0.628932,0.0,15.222135,3,200020082020
863,FreeCashFlow_to_InvestedCapital,OperatingIncome,0.193459,0.633093,0.0,19.527457,3,200020082020
864,FreeCashFlow_to_InvestedCapital,TotalRevenue,0.187202,0.641152,0.0,25.731718,3,200020082020
865,GrossMargin_3yrAvg,Cash,0.140343,0.6054,0.0,26.520207,3,200020082020
866,GrossMargin_3yrAvg,CashAndCashEquivalents,0.170337,0.61434,0.0,27.137431,3,200020082020
867,GrossMargin_3yrAvg,EBITDA,0.325256,0.813564,0.0,42.270501,3,200020082020
868,GrossMargin_3yrAvg,EarningBeforeInterestAndTax,0.370914,0.818121,0.0,28.459139,3,200020082020
869,GrossMargin_3yrAvg,FreeCashFlow,0.153966,0.545769,0.0,16.552372,1,200020082020
870,GrossMargin_3yrAvg,GrossProfit,0.28782,0.892376,0.0,26.747311,2,20002008
871,GrossMargin_3yrAvg,NetIncome,0.234137,0.639595,0.0,24.632329,3,200020082020
