In [None]:
"""
Stage 04 · Ratio Diagnostics & Leaderboards
===========================================

– Correlation (|ρ|) and Logit pseudo-R² / AUROC ranking for every ratio created
  in Stage 03, plus multicollinearity filtering and descriptive stats.

Outputs
  <OUTPUT_ROOT>/event=<SWAN_YEAR>/<RUN_DATE>/stage04/
      ├─ Stage4_raw_RatioDiagnostics_<SWAN_YEAR>.csv
      ├─ Stage4_winsor_RatioDiagnostics_<SWAN_YEAR>.csv
      ├─ Stage4_DroppedCollinearWinsor_<SWAN_YEAR>.csv
      ├─ Stage4_<raw|winsor>_RatioRanking_<SWAN_YEAR>.csv
      ├─ Stage4_<raw|winsor>_{Bucket|Stage|Domain|Overall}Top3_<SWAN_YEAR>.csv
      └─ stage04.log
"""
from __future__ import annotations

# ── core imports ─────────────────────────────────────────────────────
from pathlib import Path
import os, sys, logging, warnings, io
from typing import Dict, List, Tuple, Set

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score
from numpy.linalg import LinAlgError
from statsmodels.tools.sm_exceptions import MissingDataError

# ── shared helper -----------------------------------------------------
from pipeline_utils import load_cfg, resolve_run_dir

# ═════════════════════════════════════════════════════════════════════
# 0 · CONFIG & PARAMS
# ═════════════════════════════════════════════════════════════════════
# `or {}` (rather than a .get default) also covers YAML keys that are present
# but empty, which parse to None and would break the .items()/.get() calls.
CFG: Dict        = load_cfg()
DEFAULTS: Dict   = CFG.get("defaults") or {}
EVENTS: Dict     = {str(k): v for k, v in (CFG.get("events") or {}).items()}
ST4_OVR: Dict    = CFG.get("stage4") or {}         # per-stage overrides

# The environment variable wins; otherwise fall back to the first configured
# event.  The "" default routes an empty events section to the KeyError below
# instead of letting next() raise a bare StopIteration.
SWAN_YEAR: str   = os.getenv("SWAN_YEAR") or next(iter(EVENTS), "")
if SWAN_YEAR not in EVENTS:
    raise KeyError(
        f"SWAN_YEAR={SWAN_YEAR!r} not present in YAML events: {sorted(EVENTS)}"
    )
SWAN_YEAR_INT: int = int(SWAN_YEAR)               # numeric version

DATE_COL      = ST4_OVR.get("date_col", "ReportDate")
ID_COL        = ST4_OVR.get("id_col",   "Symbol")
# `getenv(...) or <yaml value>` treats an empty-string override as unset, the
# same way SWAN_YEAR is resolved above, so float("") is never attempted.
MC_THRESH     = float(os.getenv("MC_THRESH")    or ST4_OVR.get("mc_thresh",    0.95))
# MIN_COVERAGE is a percentage (0-100): it is compared against notna().mean() * 100.
MIN_COVERAGE  = float(os.getenv("MIN_COVERAGE") or ST4_OVR.get("min_coverage", 40))

# ═════════════════════════════════════════════════════════════════════
# 1 · RUN-FOLDER RESOLUTION
# ═════════════════════════════════════════════════════════════════════
# Ask the shared helper for the run folder that already contains this event's
# Stage 03 output; RUN_TAG (optional, from the environment) is passed through.
RUN_DIR = resolve_run_dir(
    swan_year=SWAN_YEAR,
    must_have=f"stage03/Stage3_Data_WithRatios_{SWAN_YEAR}.csv",
    run_tag=os.environ.get("RUN_TAG"),
)
RUN_DATE = RUN_DIR.name      # the folder name doubles as the run date in the log
STAGE3_FILE = RUN_DIR.joinpath("stage03", f"Stage3_Data_WithRatios_{SWAN_YEAR}.csv")

# All Stage 04 artefacts (CSVs and the log file) are written below this folder.
OUT_DIR = RUN_DIR.joinpath("stage04")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ═════════════════════════════════════════════════════════════════════
# 2 · LOGGER
# ═════════════════════════════════════════════════════════════════════
# force=True (Python 3.8+): basicConfig silently does nothing when the root
# logger already has handlers -- e.g. an earlier stage or a re-run of this
# cell in the same kernel -- which would leave stage04.log unwritten.  Forcing
# removes and closes the old root handlers before installing these.
logging.basicConfig(
    level   = logging.INFO,
    format  = "%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(OUT_DIR / "stage04.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
    force   = True,
)
logger = logging.getLogger(__name__)
logger.info("==========  STAGE 04: DIAGNOSTICS & LEADERBOARDS ==========")
logger.info("RUN_DIR        : %s", RUN_DIR)
logger.info("SWAN_YEAR=%s  RUN_DATE=%s", SWAN_YEAR, RUN_DATE)
logger.info("MC_THRESH=%.2f  MIN_COVERAGE=%.1f%%", MC_THRESH, MIN_COVERAGE)

# Process-wide: silences every RuntimeWarning raised for the rest of the run.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ═════════════════════════════════════════════════════════════════════
# 3 · LOAD DATA  (memory → disk fallback)
# ═════════════════════════════════════════════════════════════════════
# Prefer a Stage 03 frame left in this session's globals; otherwise read the
# Stage 03 CSV from the resolved run folder.
if "data_stage_3" in globals():
    df_full = globals()["data_stage_3"].copy()
    logger.info("Stage 03 data reused from memory: %d rows", len(df_full))
else:
    df_full = pd.read_csv(STAGE3_FILE, parse_dates=[DATE_COL], low_memory=False)
    logger.info("Stage 03 CSV loaded: %d rows", len(df_full))

# Only the CSV branch asks for date parsing; the in-memory branch copies the
# frame as-is.  Normalise here so the .dt accessor below works on both paths
# (and an unparseable column fails with pandas' own parsing error).
if not pd.api.types.is_datetime64_any_dtype(df_full[DATE_COL]):
    df_full[DATE_COL] = pd.to_datetime(df_full[DATE_COL])

# keep observations *before* the crisis year
pre_df = df_full[df_full[DATE_COL].dt.year < SWAN_YEAR_INT].copy()
if pre_df.empty:
    raise ValueError(f"No observations before SWAN_YEAR={SWAN_YEAR}")
logger.info("Pre-SWAN sample size: %d rows", len(pre_df))

# ░░░░░░░░░░░░░░░░░░  RATIO COLUMNS  ░░░░░░░░░░░░░░░░░░
# A ratio is any name present twice: once as "<name>_raw" and once as "<name>".
ratio_names: List[str] = sorted({c[:-4] for c in pre_df.columns
                                 if c.endswith("_raw") and c[:-4] in pre_df.columns})
raw_cols  = [f"{r}_raw" for r in ratio_names]
win_cols  = ratio_names     # winsorised copies already exist
logger.info("Detected %d ratios (%d raw + %d winsor)",
            len(ratio_names), len(raw_cols), len(win_cols))

# ═════════════════════════════════════════════════════════════════════
# 4 · WINSORISATION & MULTICOLLINEARITY
# ═════════════════════════════════════════════════════════════════════
def winsorise(s: pd.Series, pct: float = 0.01) -> pd.Series:
    """Clip *s* to the band between its ``pct`` and ``1 - pct`` percentiles.

    Percentiles are taken over the non-null values only.  The input series is
    returned as-is (not copied) when it holds fewer than three non-null values
    or when the two percentile bounds are equal.
    """
    observed = s.dropna()
    if len(observed) < 3:
        return s
    lower, upper = np.nanpercentile(observed, [pct*100, (1-pct)*100])
    if lower == upper:
        return s
    return s.clip(lower, upper)

# Winsorise every candidate column on a private copy of the pre-event sample,
# so pre_df keeps its original values for the raw diagnostics and ranking.
wins_df = pre_df.copy()
for _col in win_cols:
    wins_df[_col] = winsorise(wins_df[_col])

# Spearman |ρ| ≥ MC_THRESH → drop lower-coverage column
corr = wins_df[win_cols].corr(method="spearman").abs()
# strict upper triangle, so each unordered pair appears once and never (c, c)
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
high_pairs = corr.where(mask).stack()
high_pairs = high_pairs[high_pairs >= MC_THRESH]

# share of non-null rows per column, computed once for the whole loop
_coverage = wins_df[win_cols].notna().mean()

# Greedy pass from the most correlated pair down; a pair is skipped as soon as
# one of its members has already been dropped.
drop_mc: Set[str] = set()
for _left, _right in high_pairs.sort_values(ascending=False).index:
    if _left in drop_mc or _right in drop_mc:
        continue
    # on equal coverage the first member of the pair is the one kept
    drop_mc.add(_right if _coverage[_left] >= _coverage[_right] else _left)

if not drop_mc:
    logger.info("MC filter: no columns dropped")
else:
    logger.info("MC filter: dropping %d winsor columns", len(drop_mc))
    (
        pd.Series(sorted(drop_mc), name="Dropped_Winsor_Columns")
        .to_csv(OUT_DIR / f"Stage4_DroppedCollinearWinsor_{SWAN_YEAR}.csv", index=False)
    )
    win_cols = [c for c in win_cols if c not in drop_mc]
    wins_df.drop(columns=list(drop_mc), inplace=True)

# ═════════════════════════════════════════════════════════════════════
# 5 · DIAGNOSTIC STATS
# ═════════════════════════════════════════════════════════════════════
def _write_diag(base: pd.DataFrame, cols: List[str], tag: str):
    """Write per-year descriptive statistics of *cols* to a Stage 4 CSV.

    Rows of *base* are grouped by the calendar year of ``DATE_COL``; mean, std,
    median, min, max and count are computed for every column in *cols*.  The
    result is saved as ``Stage4_<tag>_RatioDiagnostics_<SWAN_YEAR>.csv`` in
    ``OUT_DIR``.
    """
    stat_names = ['mean', 'std', 'median', 'min', 'max', 'count']
    with_year = base.assign(Year=base[DATE_COL].dt.year)
    per_year = with_year.groupby("Year")[cols].agg(stat_names)
    # columns are (ratio, statistic) pairs: move the statistic level into the
    # row index so the CSV has one row per year and statistic
    long_form = per_year.stack(level=1).reset_index()
    target = OUT_DIR / f"Stage4_{tag}_RatioDiagnostics_{SWAN_YEAR}.csv"
    long_form.to_csv(target, index=False)

# Raw columns are summarised from the untouched pre-event sample; the
# winsorised columns that survived the collinearity filter come from wins_df.
for _frame, _columns, _label in (
    (pre_df, raw_cols, "raw"),
    (wins_df, win_cols, "winsor"),
):
    _write_diag(_frame, _columns, _label)
logger.info("Diagnostic CSVs written")

# ═════════════════════════════════════════════════════════════════════
# 6 · RANKING LOGIC
# ═════════════════════════════════════════════════════════════════════
# Outcome metrics the ratios are ranked against.  Each one is paired with a
# "Score_<metric>" column (used for Spearman) and a "Flag_<metric>" column
# (used for the Logit) in the frame passed to the ranking.
METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]
score_cols = [f"Score_{metric}" for metric in METRICS]
flag_cols = [f"Flag_{metric}" for metric in METRICS]

def _abs_spearman(x: pd.Series, y: pd.Series, min_obs=100):
    ok = x.notna() & y.notna()
    return np.nan if ok.sum() < min_obs else abs(spearmanr(x[ok], y[ok]).correlation)

def _logit_stats(x: pd.Series, y_flag: pd.Series, min_obs=100) -> Tuple[float,float]:
    ok = x.notna() & y_flag.isin([0,1])
    if ok.sum() < min_obs or y_flag[ok].nunique() < 2 or x[ok].nunique() < 2:
        return np.nan, np.nan
    try:
        mdl = sm.Logit(y_flag[ok], sm.add_constant(x[ok])).fit(disp=False)
        return mdl.prsquared, roc_auc_score(y_flag[ok], mdl.predict())
    except (ValueError, LinAlgError, MissingDataError):
        return np.nan, np.nan

# bucket / stage / domain maps (populated if Stage 03 ran in this session)
# Keyed by ratio name; each value is normalised to a sequence of bucket labels
# (a bare label is wrapped in a one-element list).  Stays empty when no
# `ratio_domain_stage_map` exists in this module's globals.
ratio_to_buckets: Dict[str, List[str]] = {}
for _ratio, _labels in globals().get("ratio_domain_stage_map", {}).items():
    if isinstance(_labels, (list, tuple)):
        ratio_to_buckets[_ratio] = _labels
    else:
        ratio_to_buckets[_ratio] = [_labels]

# Bucket labels are matched against these by prefix (domain) and by
# "-<stage>" suffix (stage) when the leaderboards are built.
domains = {
    "Phys",
    "Info",
    "Cog",
    "Soc",
}
stages = {
    "Prepare",
    "Absorb",
    "Recover",
    "Adapt",
}

# ═════════════════════════════════════════════════════════════════════
# 7 · RANK LOOP
# ═════════════════════════════════════════════════════════════════════
def _rank(tag: str, cols: List[str], base: pd.DataFrame):
    """Score every column in *cols* against each metric and write the CSVs.

    For each column whose non-null coverage is at least ``MIN_COVERAGE``
    percent, and for each entry of ``METRICS``, the function records
    |Spearman rho| against the ``Score_<metric>`` column and the Logit
    pseudo-R² / AUROC against the ``Flag_<metric>`` column.

    Files written to ``OUT_DIR`` (all suffixed ``_<SWAN_YEAR>.csv``):
      * ``Stage4_<tag>_RatioRanking``  – one row per (ratio, metric)
      * ``Stage4_<tag>_BucketTop3`` / ``StageTop3`` / ``DomainTop3`` /
        ``OverallTop3`` – top three rows by |rho| per group and metric

    Parameters
    ----------
    tag  : label embedded in the file names ("raw" or "winsor" at the call sites).
    cols : columns of *base* to rank; a trailing "_raw" is stripped to obtain
           the ratio name.
    base : frame holding *cols* plus the Score_/Flag_ columns.
    """
    logger.info("Ranking %s (%d columns)…", tag, len(cols))

    def _ratio_name(col: str) -> str:
        # Strip only a trailing "_raw"; str.replace would also remove the
        # substring from the middle of a name.
        return col[:-4] if col.endswith("_raw") else col

    def _r3(value):
        # round to 3 decimals, passing missing values through as NaN
        return round(value, 3) if pd.notna(value) else np.nan

    rank_columns = ["Ratio", "Metric", "Coverage%", "|rho|", "PseudoR2", "AUROC"]
    rows = []
    for col in cols:
        x = base[col]
        cov_pct = x.notna().mean() * 100
        if cov_pct < MIN_COVERAGE:
            continue
        for m, sc, fc in zip(METRICS, score_cols, flag_cols):
            rho = _abs_spearman(x, base[sc])
            pr2, auc = _logit_stats(x, base[fc])
            rows.append({"Ratio": _ratio_name(col),
                         "Metric": m,
                         "Coverage%": round(cov_pct, 1),
                         "|rho|":     _r3(rho),
                         "PseudoR2":  _r3(pr2),
                         "AUROC":     _r3(auc)})

    # Explicit columns keep the frame well-formed (and the CSV header present)
    # even when no column met the coverage threshold.
    rk = pd.DataFrame(rows, columns=rank_columns)
    rk.to_csv(OUT_DIR / f"Stage4_{tag}_RatioRanking_{SWAN_YEAR}.csv", index=False)
    if rk.empty:
        logger.warning("Ranking %s: no column reached MIN_COVERAGE=%.1f%%; "
                       "leaderboards will be empty", tag, MIN_COVERAGE)

    # ---- selector maps ------------------------------------------------
    # ratio_to_buckets is keyed by ratio; the bucket leaderboard needs the
    # inverse (bucket -> member ratios).  Without any mapping every ranked
    # ratio goes into a single "All" bucket.
    if ratio_to_buckets:
        bucket_members: Dict = {}
        for ratio, labels in ratio_to_buckets.items():
            for label in labels:
                bucket_members.setdefault(label, []).append(ratio)
    else:
        bucket_members = {"All": [_ratio_name(c) for c in cols]}

    # sorted(): `stages` and `domains` are sets, whose iteration order can
    # differ between interpreter runs; sorting keeps the CSV row order stable.
    stage_map  = {stg: [r for r, bs in ratio_to_buckets.items()
                        if any(str(b).endswith(f"-{stg}") for b in bs)]
                  for stg in sorted(stages)}
    domain_map = {dom: [r for r, bs in ratio_to_buckets.items()
                        if any(str(b).startswith(dom) for b in bs)]
                  for dom in sorted(domains)}

    def _top3(df: pd.DataFrame) -> pd.DataFrame:
        # pick top-3 by |rho|; an empty frame is returned untouched because
        # its "|rho|" column may have object dtype, which nlargest rejects
        return df if df.empty else df.nlargest(3, "|rho|")

    def _write_board(label: str, groups=None) -> None:
        """Write Stage4_<tag>_<label>Top3; *groups* maps group key -> ratios.

        With ``groups=None`` all ranked ratios form one unlabeled group.
        """
        if groups is None:
            selections = [(None, rk)]
        else:
            selections = [(key, rk[rk["Ratio"].isin(members)])
                          for key, members in groups.items()]
        board = []
        for key, sub in selections:
            for metric in METRICS:
                for _, r in _top3(sub[sub["Metric"] == metric]).iterrows():
                    # NOTE(review): "Ratio" is dropped, so leaderboard rows do
                    # not name the winning ratio -- confirm this is intended.
                    entry = r.drop("Ratio").to_dict()
                    board.append(entry if groups is None else {label: key, **entry})
        pd.DataFrame(board).to_csv(
            OUT_DIR / f"Stage4_{tag}_{label}Top3_{SWAN_YEAR}.csv", index=False)

    _write_board("Bucket", bucket_members)   # (a) bucket leaderboard
    _write_board("Stage",  stage_map)        # (b) stage leaderboard
    _write_board("Domain", domain_map)       # (c) domain leaderboard
    _write_board("Overall")                  # (d) overall leaderboard

# ── run rankings ─────────────────────────────────────────────────────
# Raw ratios are ranked on the pre-event sample; winsorised ratios on the
# winsorised frame with collinear columns already removed.
for _label, _columns, _frame in (
    ("raw", raw_cols, pre_df),
    ("winsor", win_cols, wins_df),
):
    _rank(_label, _columns, _frame)

logger.info("✅  STAGE 04 complete — artefacts in %s", OUT_DIR)

2025-06-11 10:11:33,028 | INFO    | RUN_DIR        : C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250610
2025-06-11 10:11:33,029 | INFO    | SWAN_YEAR=2008  RUN_DATE=20250610
2025-06-11 10:11:33,030 | INFO    | MC_THRESH=0.95  MIN_COVERAGE=40.0%
2025-06-11 10:11:38,044 | INFO    | Stage 03 CSV loaded: 34862 rows
2025-06-11 10:11:38,095 | INFO    | Pre-SWAN sample size: 11204 rows
2025-06-11 10:11:38,097 | INFO    | Detected 229 ratios (229 raw + 229 winsor)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return s if lo == hi else s.clip(lo, hi)


2025-06-11 10:11:51,635 | INFO    | MC filter: dropping 65 winsor columns
2025-06-11 10:11:52,734 | INFO    | Diagnostic CSVs written
2025-06-11 10:11:52,736 | INFO    | Ranking raw (229 columns)…




2025-06-11 10:12:35,366 | INFO    | Ranking winsor (164 columns)…




2025-06-11 10:13:09,969 | INFO    | ✅  STAGE 04 complete — artefacts in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250610\stage04
