In [2]:
#!/usr/bin/env python
"""
STAGE 04B · DEPTH-DIAGNOSTICS & LEADERBOARDS
────────────────────────────────────────────
Identical to Stage 04 but focused on the **draw-down depth outcomes**
created in Stage 02:

    • ScoreDepth_<metric>   – 0 (best) → 1 (worst)   (continuous)
    • FlagDepth_<metric>    – 1 if shallower than median (binary)

Outputs (all suffixed with 4B):
  <OUTPUT_ROOT>/event=<SWAN_YEAR>/<RUN_TAG>/stage04b/
      ├─ Stage4B_raw_RatioDiagnostics_<YEAR>.csv
      ├─ Stage4B_winsor_RatioDiagnostics_<YEAR>.csv
      ├─ Stage4B_DroppedCollinearWinsor_<YEAR>.csv
      ├─ Stage4B_<raw|winsor>_RatioRanking_<YEAR>.csv
      ├─ Stage4B_<raw|winsor>_{Bucket|Stage|Domain|Overall}Top3_<YEAR>.csv
      └─ stage04b.log
"""
from __future__ import annotations
# ── std / 3-rd-party ───────────────────────────────────────────────
import io
import logging
import os
import sys
import warnings
from pathlib import Path
from typing import Dict, List, Set, Tuple

import numpy as np
import pandas as pd
import statsmodels.api as sm
from numpy.linalg import LinAlgError
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score
from statsmodels.tools.sm_exceptions import MissingDataError, PerfectSeparationError
# ── helpers ─────────────────────────────────────────────────────────
from pipeline_utils import load_cfg, resolve_run_dir

# ═══════════════════════════════════════════════════════════════════
# 0 · CONFIG
# ═══════════════════════════════════════════════════════════════════
# load_cfg() (pipeline_utils) is used here as a mapping with optional
# "events" and "stage4b" sections.  `or {}` also covers a section that is
# present but empty (None), which would otherwise crash on .items()/.get().
CFG: Dict      = load_cfg()
EVENTS: Dict   = {str(k): v for k, v in (CFG.get("events") or {}).items()}
ST_OVR: Dict   = CFG.get("stage4b") or {}       # optional YAML overrides

# The environment variable wins; otherwise fall back to the first configured
# event.  With no env var and no events, fail with an explicit message instead
# of the bare StopIteration that next(iter({})) would raise.
SWAN_YEAR = os.getenv("SWAN_YEAR") or next(iter(EVENTS), None)
if SWAN_YEAR is None:
    raise KeyError("SWAN_YEAR is not set and pipeline_config defines no events")
if SWAN_YEAR not in EVENTS:
    raise KeyError(f"SWAN_YEAR={SWAN_YEAR} not in pipeline_config")
YEAR_INT  = int(SWAN_YEAR)

# Column names and thresholds: environment variable > stage4b override > default.
DATE_COL  = ST_OVR.get("date_col", "ReportDate")
ID_COL    = ST_OVR.get("id_col",   "Symbol")
# MC_THRESH is compared against |Spearman rho| when pruning collinear ratios.
MC_THRESH = float(os.getenv("MC_THRESH",  ST_OVR.get("mc_thresh",    0.95)))
# MIN_COV is compared against a ratio's non-null share expressed in percent.
MIN_COV   = float(os.getenv("MIN_COVERAGE", ST_OVR.get("min_coverage", 40)))

# resolve_run_dir() is a project helper; it is asked for the run directory of
# this event that contains the Stage-03 ratio file (RUN_TAG may be unset).
RUN_DIR = resolve_run_dir(
    swan_year=SWAN_YEAR,
    must_have=f"stage03/Stage3_Data_WithRatios_{SWAN_YEAR}.csv",
    run_tag=os.getenv("RUN_TAG"),
)
OUT_DIR  = RUN_DIR / "stage04b"
OUT_DIR.mkdir(parents=True, exist_ok=True)
STAGE3_CSV = RUN_DIR / "stage03" / f"Stage3_Data_WithRatios_{SWAN_YEAR}.csv"

# 🔈 logger
# logging.basicConfig() silently does nothing when the root logger already has
# handlers (this cell was re-run, or an earlier stage configured logging in the
# same interpreter).  Clear them first so stage04b.log is always (re)created in
# the current OUT_DIR and messages are not routed to a stale file.
_root_logger = logging.getLogger()
for _old_handler in list(_root_logger.handlers):
    _root_logger.removeHandler(_old_handler)
    _old_handler.close()

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        # mode="w": the log file is truncated on every run of this stage
        logging.FileHandler(OUT_DIR / "stage04b.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
logger.info("==========  STAGE 04B · DEPTH DIAGNOSTICS  ==========")
logger.info("RUN_DIR=%s  SWAN=%s", RUN_DIR, SWAN_YEAR)

# NOTE: this suppresses every RuntimeWarning process-wide, not only the
# numerical ones raised by the statistics below.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ═══════════════════════════════════════════════════════════════════
# 1 · LOAD STAGE-03 DATA
# ═══════════════════════════════════════════════════════════════════
# Read the Stage-03 table and keep only rows dated strictly before the event year.
df3 = pd.read_csv(STAGE3_CSV, parse_dates=[DATE_COL], low_memory=False)
pre_df = df3.loc[df3[DATE_COL].dt.year < YEAR_INT].copy()
if pre_df.empty:
    raise RuntimeError("No pre-SWAN observations")

# ratio columns -----------------------------------------------------
# A ratio is any name X for which both "X_raw" and "X" exist as columns:
# "X_raw" is used as the raw series, "X" as the winsorised one.
ratio_names = sorted(
    {
        name[:-4]
        for name in pre_df.columns
        if name.endswith("_raw") and name[:-4] in pre_df.columns
    }
)
raw_cols  = [ratio + "_raw" for ratio in ratio_names]
win_cols  = ratio_names                                         # winsorised copies
logger.info(
    "Ratios detected: %d (%d raw + %d winsor)",
    len(ratio_names),
    len(raw_cols),
    len(win_cols),
)

# ═══════════════════════════════════════════════════════════════════
# 2 · WINSOR & MULTICOLLINEARITY  (unchanged)
# ═══════════════════════════════════════════════════════════════════
def winsor(s: pd.Series, pct: float = .01):
    """Clip ``s`` to its ``pct`` and ``1 - pct`` percentiles.

    The percentiles are computed on the non-null values only; nulls stay null
    in the result.  A series with fewer than three non-null values is returned
    unchanged (the very same object, not a copy).
    """
    observed = s.dropna()
    if len(observed) < 3:
        return s
    lower, upper = np.nanpercentile(observed, [pct * 100, (1 - pct) * 100])
    return s.clip(lower, upper)

# Winsorise every ratio column on a private copy so pre_df keeps its values.
wins_df = pre_df.copy()
for ratio in win_cols:
    wins_df[ratio] = winsor(wins_df[ratio])

# Absolute Spearman correlations.  Only the strict upper triangle is kept so
# that each pair appears once and the diagonal is ignored.
corr = wins_df[win_cols].corr("spearman").abs()
mask = np.triu(np.ones(corr.shape), 1).astype(bool)
pairwise = corr.where(mask).stack()
high = pairwise.loc[pairwise >= MC_THRESH]

# Greedy pruning, strongest pair first: from each pair that is still intact,
# drop the member with the lower non-null share (on a tie the second one goes).
drop: Set[str] = set()
for (first, second), _ in high.sort_values(ascending=False).items():
    if first in drop or second in drop:
        continue
    if wins_df[first].notna().mean() >= wins_df[second].notna().mean():
        drop.add(second)
    else:
        drop.add(first)

if drop:
    wins_df.drop(columns=list(drop), inplace=True)
    win_cols = [name for name in win_cols if name not in drop]
    dropped_out = OUT_DIR/f"Stage4B_DroppedCollinearWinsor_{SWAN_YEAR}.csv"
    pd.Series(sorted(drop), name="Dropped").to_csv(dropped_out, index=False)

# ═══════════════════════════════════════════════════════════════════
# 3 · RANKING LOGIC  (depth versions)
# ═══════════════════════════════════════════════════════════════════
# Metric names; each one is looked up as ScoreDepth_<metric> / FlagDepth_<metric>.
METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]

def _abs_rho(x, y, min_n=100):
    """Return |Spearman rho| between ``x`` and ``y`` over rows where both are non-null.

    Yields NaN when fewer than ``min_n`` such rows exist.
    """
    both_present = x.notna() & y.notna()
    if both_present.sum() < min_n:
        return np.nan
    result = spearmanr(x[both_present], y[both_present])
    return abs(result.correlation)

def _logit(x, flag, min_n=100):
    """Fit a logit of ``flag`` on ``x`` (plus a constant) and score the fit.

    Only rows where ``x`` is non-null and ``flag`` is 0 or 1 are used.

    Returns
    -------
    tuple
        ``(pseudo_r2, auroc)`` – the model's ``prsquared`` and the in-sample
        ROC AUC of its predictions.  ``(nan, nan)`` is returned when fewer than
        ``min_n`` rows are usable, when ``flag`` or ``x`` has a single distinct
        value on those rows, or when the fit fails.
    """
    ok = x.notna() & flag.isin([0, 1])
    y_ok, x_ok = flag[ok], x[ok]
    if ok.sum() < min_n or y_ok.nunique() < 2 or x_ok.nunique() < 2:
        return np.nan, np.nan
    try:
        mdl = sm.Logit(y_ok, sm.add_constant(x_ok)).fit(disp=False)
        return mdl.prsquared, roc_auc_score(y_ok, mdl.predict())
    # PerfectSeparationError derives from Exception, not ValueError: without
    # catching it explicitly, one perfectly separating ratio would abort the
    # whole ranking on statsmodels versions that raise it.
    except (ValueError, LinAlgError, MissingDataError, PerfectSeparationError):
        return np.nan, np.nan

def _rank(tag:str, cols:List[str], base:pd.DataFrame):
    """Score every ratio in ``cols`` against each depth outcome and save the table.

    For each ratio whose non-null share (in percent) is at least ``MIN_COV``
    and each metric in ``METRICS`` that has both ``ScoreDepth_<metric>`` and
    ``FlagDepth_<metric>`` columns in ``base``, one row is produced with the
    absolute Spearman rho against the score and the logit pseudo-R² / AUROC
    against the flag (all rounded to 3 decimals, NaN when not computable).

    Parameters
    ----------
    tag : label used in the output file name (``Stage4B_<tag>_RatioRanking_…``).
    cols : ratio column names to evaluate; a trailing ``_raw`` is stripped
        when the ratio is reported.
    base : frame holding the ratio columns and the depth outcome columns.

    Returns
    -------
    pandas.DataFrame
        The ranking table that was written to ``OUT_DIR``.  It always carries
        the six result columns, even when no row could be produced.
    """
    out_columns = ["Ratio", "Metric", "CoveragePct", "AbsRho", "PseudoR2", "AUROC"]

    def _r3(value):
        # round to 3 decimals, keeping missing results as NaN
        return round(value, 3) if pd.notna(value) else np.nan

    # The outcome columns do not depend on the ratio: resolve them once, and
    # say so when metrics are skipped instead of silently producing nothing.
    outcomes = []
    for m in METRICS:
        sc, fc = f"ScoreDepth_{m}", f"FlagDepth_{m}"
        if sc in base.columns and fc in base.columns:
            outcomes.append((m, sc, fc))
    if len(outcomes) < len(METRICS):
        logger.warning("[%s] %d of %d metrics lack ScoreDepth_/FlagDepth_ columns and are skipped",
                       tag, len(METRICS) - len(outcomes), len(METRICS))

    rows = []
    low_cov = 0
    for col in cols:
        x = base[col]
        cov = x.notna().mean()*100
        if cov < MIN_COV:
            low_cov += 1
            continue
        # strip only a trailing "_raw"; str.replace would also mangle names
        # that merely contain "_raw" somewhere in the middle
        ratio = col[:-4] if col.endswith("_raw") else col
        for m, sc, fc in outcomes:
            rho = _abs_rho(x, base[sc])
            pr2, auc = _logit(x, base[fc])
            rows.append(dict(Ratio=ratio,
                             Metric=m,
                             CoveragePct=round(cov,1),
                             AbsRho=_r3(rho),
                             PseudoR2=_r3(pr2),
                             AUROC=_r3(auc)))

    # Explicit columns keep the CSV header when `rows` is empty, so the file
    # stays readable with pd.read_csv downstream.
    rk = pd.DataFrame(rows, columns=out_columns)
    rk.to_csv(OUT_DIR/f"Stage4B_{tag}_RatioRanking_{SWAN_YEAR}.csv",index=False)
    logger.info("[%s] %d ranking rows written (%d of %d ratios below %.1f%% coverage)",
                tag, len(rk), low_cov, len(cols), MIN_COV)
    return rk

# Rank the raw ratios on the untouched pre-event frame, then the winsorised
# ratios that survived collinearity pruning on the winsorised frame.
for _message, _tag, _columns, _frame in (
    ("Ranking raw ratios …", "raw", raw_cols, pre_df),
    ("Ranking winsor ratios …", "winsor", win_cols, wins_df),
):
    logger.info(_message)
    _rank(_tag, _columns, _frame)

logger.info("✅  STAGE 04B complete — artefacts in %s", OUT_DIR)


2025-06-15 11:43:24,709 | INFO    | RUN_DIR=C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=1998\2025-06-13  SWAN=1998


2025-06-15 11:43:29,263 | INFO    | Ratios detected: 247 (247 raw + 247 winsor)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return s.clip(lo, hi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return s.clip(lo, hi)


2025-06-15 11:43:33,759 | INFO    | Ranking raw ratios …
2025-06-15 11:43:33,792 | INFO    | Ranking winsor ratios …
2025-06-15 11:43:33,814 | INFO    | ✅  STAGE 04B complete — artefacts in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=1998\2025-06-13\stage04b
