In [2]:
#!/usr/bin/env python
"""
Stage-18 · Calibration by Size Quintile (Assets & Revenue)
new-format · 2025-06-10 · filename-robust

Fixes
• looks for BOTH “stage11/Stage11_RISE_Probabilities_All.csv” and
  “stage11/11_RISE_Probabilities_All.csv” (older run)  
• latest_run_with() now accepts a list of alternative relative paths.  
Everything else unchanged from the 2025-06-10 port.
"""

from __future__ import annotations

import os, logging, warnings, yaml
from pathlib import Path
from typing import Dict, List, Sequence

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

# Silence pandas/sklearn FutureWarning noise and route all stage messages
# through a timestamped INFO-level logger.
warnings.filterwarnings(action="ignore", category=FutureWarning)
logging.basicConfig(
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)
log.info("==========  STAGE 18 ==========")

# ═══════════════════════════════════════════════════════════════════
# 0 · YAML CONFIG & RUN-DIR
# ═══════════════════════════════════════════════════════════════════
# Location of the pipeline YAML; PIPELINE_CFG overrides the default file name.
CFG_FILE = Path(os.getenv("PIPELINE_CFG", "pipeline_config.yaml")).expanduser()
if not CFG_FILE.is_file():
    raise FileNotFoundError(f"pipeline_config.yaml missing: {CFG_FILE}")

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale codec.
CFG       = yaml.safe_load(CFG_FILE.read_text(encoding="utf-8")) or {}
# `or {}` also covers sections that are present but empty (YAML null).
DEFAULTS  = CFG.get("defaults") or {}
EVENTS    = {str(k): v for k, v in (CFG.get("events") or {}).items()}

# Without this guard an empty events section surfaces as a bare StopIteration
# from next(iter(EVENTS)), even when SWAN_YEAR is set in the environment
# (the default argument is evaluated eagerly).
if not EVENTS:
    raise KeyError(f"No events defined in config: {CFG_FILE}")

# SWAN_YEAR: environment override, else the first event key in the config.
SWAN_YEAR = int(os.getenv("SWAN_YEAR", next(iter(EVENTS))))
if str(SWAN_YEAR) not in EVENTS:
    raise KeyError(f"SWAN_YEAR {SWAN_YEAR} missing in config")
# The snapshot is taken from the calendar year preceding the event year.
SNAP_YEAR = SWAN_YEAR - 1

if "OUTPUT_ROOT" not in DEFAULTS:
    raise KeyError(f"defaults.OUTPUT_ROOT missing in config: {CFG_FILE}")
OUTPUT_ROOT = Path(DEFAULTS["OUTPUT_ROOT"]).expanduser()
EVENT_DIR   = OUTPUT_ROOT / f"event={SWAN_YEAR}"

def latest_run_with(rel_paths: Sequence[str]) -> Path:
    """Return the run directory of the newest run containing any of *rel_paths*.

    Every immediate sub-directory of ``EVENT_DIR`` is treated as a run.  A run
    qualifies when at least one of the relative paths exists in it as a file;
    among all matching files the one with the most recent modification time
    decides which run is returned.

    Args:
        rel_paths: Candidate paths relative to a run directory, e.g.
            ``"stage11/Stage11_RISE_Probabilities_All.csv"``.  A single string
            is accepted and treated as a one-element list.

    Returns:
        ``EVENT_DIR / <run>`` for the run owning the newest matching file.

    Raises:
        FileNotFoundError: If no run contains any of the candidates.
    """
    if isinstance(rel_paths, str):
        # Iterating a bare string would glob one character at a time.
        rel_paths = [rel_paths]

    hits: List[Path] = [
        hit
        for rel in rel_paths
        for hit in EVENT_DIR.glob(f"*/{rel}")
        if hit.is_file()  # a directory with a matching name is not a usable input
    ]
    if not hits:
        raise FileNotFoundError(f"No run contains any of {rel_paths}")

    newest = max(hits, key=lambda p: p.stat().st_mtime)
    # The run directory is the first component below EVENT_DIR.  Deriving it
    # this way stays correct for relative paths of any depth, unlike a fixed
    # ``parents[1]``, which is only right for two-component paths.
    return EVENT_DIR / newest.relative_to(EVENT_DIR).parts[0]

# Resolve the run directory.  Precedence: explicit RUN_DIR, then RUN_DATE
# (a sub-directory of EVENT_DIR), then the newest run holding Stage-11 output.
_env_run_dir  = os.getenv("RUN_DIR")
_env_run_date = os.getenv("RUN_DATE")

if _env_run_dir:
    RUN_DIR = Path(_env_run_dir).expanduser()
elif _env_run_date:
    RUN_DIR = EVENT_DIR / _env_run_date
else:
    RUN_DIR = latest_run_with([
        "stage11/Stage11_RISE_Probabilities_All.csv",
        "stage11/11_RISE_Probabilities_All.csv"
    ])

# All Stage-18 artefacts go to <RUN_DIR>/stage18/snapshot<SNAP_YEAR>.
STAGE18_DIR = RUN_DIR.joinpath("stage18", f"snapshot{SNAP_YEAR}")
STAGE18_DIR.mkdir(parents=True, exist_ok=True)
log.info("RUN_DIR ➜ %s", RUN_DIR)

# ═══════════════════════════════════════════════════════════════════
# 1 · ENV OVERRIDES
# ═══════════════════════════════════════════════════════════════════
# Column names may be overridden from the environment; the lower-cased
# variants are the ones used for look-ups once the frames' headers have been
# lower-cased.
DATE_COL = os.getenv("DATE_COL", "ReportDate")
ID_COL   = os.getenv("ID_COL",   "Symbol")

date_col, id_col = DATE_COL.lower(), ID_COL.lower()

# ═══════════════════════════════════════════════════════════════════
# 2 · LOCATE INPUT FILES
# ═══════════════════════════════════════════════════════════════════
def find_first(rel_paths: Sequence[str]) -> Path:
    """Return the first existing file among *rel_paths*.

    The candidates are tried, in order, inside ``RUN_DIR``.  If none exists
    there, the run selected by ``latest_run_with(rel_paths)`` is searched in
    the same order.

    Raises:
        FileNotFoundError: If no candidate exists in either location (also
            propagated from ``latest_run_with`` when no run matches).
    """
    # Preferred location: the current run directory.
    in_current_run = next(
        (RUN_DIR / rel for rel in rel_paths if (RUN_DIR / rel).is_file()),
        None,
    )
    if in_current_run is not None:
        return in_current_run

    # Fallback: newest run under the event directory that has a candidate.
    fallback_run = latest_run_with(rel_paths)
    in_fallback_run = next(
        (fallback_run / rel for rel in rel_paths if (fallback_run / rel).is_file()),
        None,
    )
    if in_fallback_run is None:
        raise FileNotFoundError(f"No file found for any of {rel_paths}")
    return in_fallback_run

# Stage-11 probabilities: accept both the current and the older file name.
_prob_candidates = [
    "stage11/Stage11_RISE_Probabilities_All.csv",
    "stage11/11_RISE_Probabilities_All.csv"
]
PROB_CSV = find_first(_prob_candidates)

# Stage-3 ratios: source of the size variables and the flag columns.
_stage3_candidates = ["stage03/Stage3_Data_WithRatios.csv"]
STAGE3_CSV = find_first(_stage3_candidates)

log.info("Probabilities  → %s", PROB_CSV)
log.info("Stage-3 ratios → %s", STAGE3_CSV)

# ═══════════════════════════════════════════════════════════════════
# 3 · LOAD DATA
# ═══════════════════════════════════════════════════════════════════
def _read_stage_csv(csv_path: Path) -> pd.DataFrame:
    """Read a stage CSV, normalise its headers and parse the date column."""
    frame = pd.read_csv(csv_path, low_memory=False)
    # Headers are lower-cased and stripped so look-ups are case-insensitive.
    frame.columns = frame.columns.str.lower().str.strip()
    # Unparseable dates become NaT instead of raising.
    frame[date_col] = pd.to_datetime(frame[date_col], errors="coerce")
    return frame


prob = _read_stage_csv(PROB_CSV)
stage3 = _read_stage_csv(STAGE3_CSV)

# ═══════════════════════════════════════════════════════════════════
# 4 · DETECT SIZE VARIABLES
# ═══════════════════════════════════════════════════════════════════
# Recognised spellings of the size variables → canonical short name.
aliases = {
    "totalassets":         "assets",
    "total_assets":        "assets",
    "totalassetsreported": "assets",
    "totalrevenue":        "revenue",
    "total_revenue":       "revenue",
    "revenue":             "revenue"
}

# canonical name → actual Stage-3 column; when several columns map to the
# same canonical name, the last one in column order is kept.
size_cols: Dict[str,str] = {}
for column in stage3.columns:
    canonical = aliases.get(column.replace(" ", "").lower())
    if canonical is not None:
        size_cols[canonical] = column

if len(size_cols) == 0:
    raise RuntimeError("No TotalAssets / TotalRevenue columns detected")
log.info("Size variables detected: %s", size_cols)

# ═══════════════════════════════════════════════════════════════════
# 5 · BUILD SNAPSHOT MERGED FRAME
# ═══════════════════════════════════════════════════════════════════
# Stage-11 rows whose report date falls in the snapshot year.
_prob_in_snapshot = prob[date_col].dt.year == SNAP_YEAR
snap_prob = prob[_prob_in_snapshot].copy()

# Probability columns are recognised by suffix or by the rscoreprob_ prefix.
prob_cols = [
    name for name in snap_prob.columns
    if name.endswith(("_rise_prob", "_stagerise_prob"))
    or name.startswith("rscoreprob_")
]
if len(prob_cols) == 0:
    raise RuntimeError("No *_rise_prob columns in Stage-11 file")

# One Stage-3 row per ID (first occurrence in the snapshot year), keeping
# only the ID and the detected size columns.
_size_columns = list(size_cols.values())
_stage3_in_snapshot = stage3[stage3[date_col].dt.year == SNAP_YEAR]
snap_size = (_stage3_in_snapshot
             .drop_duplicates(subset=id_col)
             [[id_col] + _size_columns])

# Inner-join on ID, then drop rows lacking any probability or size value.
_merged = snap_prob[[id_col] + prob_cols].merge(snap_size, on=id_col, how="inner")
df = _merged.dropna(subset=prob_cols + _size_columns)
log.info("Snapshot merged rows: %s", f"{len(df):,}")

# ═══════════════════════════════════════════════════════════════════
# 6 · CREATE SIZE QUINTILES
# ═══════════════════════════════════════════════════════════════════
# Add a 1-based quantile-bin label per size variable; duplicate bin edges are
# dropped, so fewer than five distinct labels may result.
for var, raw in size_cols.items():
    zero_based_bin = pd.qcut(df[raw], q=5, labels=False, duplicates="drop")
    df[f"{var}_quintile"] = zero_based_bin + 1

# ═══════════════════════════════════════════════════════════════════
# 7 · CALIBRATION UTILITIES
# ═══════════════════════════════════════════════════════════════════
def calibration_table(data: pd.DataFrame, score_col: str,
                      flag_col: str="flag") -> pd.DataFrame:
    """Summarise predicted vs. observed rates per score-quantile bin.

    Rows are cut into up to ten quantile bins of ``score_col`` (duplicate bin
    edges are dropped).  For every bin that actually contains rows, the mean
    of ``score_col`` and the mean of ``flag_col`` are reported.

    Args:
        data: Frame holding at least ``score_col`` and ``flag_col``.
        score_col: Column with the predicted score / probability.
        flag_col: Column with the observed outcome (default ``"flag"``).

    Returns:
        DataFrame indexed 1..k (k = number of non-empty bins, in ascending
        score order) with columns ``bin_mean_pred`` and ``bin_mean_obs``.
    """
    cuts = pd.qcut(data[score_col], 10, duplicates="drop")
    # observed=True: with few rows, interpolated quantile edges can leave
    # bins empty; the categorical default would emit them as all-NaN rows,
    # which breaks any regression fitted on the table.
    binned = data.groupby(cuts, observed=True)
    pred = binned[score_col].mean()
    obs  = binned[flag_col].mean()
    return pd.DataFrame({"bin_mean_pred": pred.values,
                         "bin_mean_obs":  obs.values},
                        index=range(1, len(pred)+1))

# ═══════════════════════════════════════════════════════════════════
# 8 · INITIALISE OVERALL PLOT
# ═══════════════════════════════════════════════════════════════════
# Open the shared figure and draw the 45° reference line; one curve per
# metric is added to it later.
plt.figure(figsize=(6, 5))
_diagonal = [0, 1]
plt.plot(_diagonal, _diagonal, "--", color="gray", label="perfect")

# Accumulates one summary record per written calibration table.
srows: List[Dict[str,object]] = []

# ═══════════════════════════════════════════════════════════════════
# 9 · LOOP OVER PROBABILITY METRICS
# ═══════════════════════════════════════════════════════════════════
# Snapshot-year Stage-3 rows, one per ID.  This is identical for every
# probability column, so it is built once instead of once per iteration.
snap_stage3 = (stage3[stage3[date_col].dt.year == SNAP_YEAR]
               .drop_duplicates(subset=id_col))

# File stems already written in this run.  Several probability columns can
# share one metric prefix (e.g. "<metric>_rise_prob" and
# "<metric>_stagerise_prob"); without this bookkeeping the later column would
# overwrite the earlier column's per-quintile tables.
written_stems: set = set()

for pcol in prob_cols:
    # The metric name is the part of the column name before the first "_";
    # its outcome flag is expected in Stage-3 as "flag_<metric>".
    metric   = pcol.split("_")[0]
    flag_col = f"flag_{metric}"
    if flag_col not in stage3.columns:
        log.info("Flag column %s missing – skip metric %s", flag_col, metric)
        continue

    # Rows with a missing flag cannot be scored and would make
    # calibration_curve reject the input, so they are dropped here.
    flags = (snap_stage3[[id_col, flag_col]]
             .rename(columns={flag_col: "flag"})
             .dropna(subset=["flag"]))

    base = df.merge(flags, on=id_col, how="inner")
    if base["flag"].nunique() < 2:
        log.info("Metric %s: only one class present – skipped", metric)
        continue

    # overall curve
    x_all, y_all = calibration_curve(
        base["flag"], base[pcol], n_bins=10, strategy="quantile")
    plt.plot(x_all, y_all, marker="o", alpha=0.85, label=metric)

    # by size quintile
    for var in size_cols:
        tag = f"{var}_quintile"
        for q in range(1, 6):
            seg = base[base[tag] == q]
            if len(seg) < 120:
                continue
            # A constant score cannot be cut into quantile bins at all.
            if seg[pcol].nunique() < 2:
                log.info("%s %s Q%s: constant score – skipped", pcol, var, q)
                continue

            tbl = calibration_table(seg, pcol)
            # Fit only on fully populated bins; a line needs at least two.
            fit = tbl.dropna()
            if len(fit) < 2:
                log.info("%s %s Q%s: fewer than 2 calibration bins – skipped",
                         pcol, var, q)
                continue
            slope = np.polyfit(fit["bin_mean_pred"], fit["bin_mean_obs"], 1)[0]

            # Keep the historical "<metric>_<var>_Q<q>" name for the first
            # column of a metric; later columns sharing that metric fall back
            # to the full column name so nothing is overwritten.
            stem = f"{metric}_{var}_Q{q}"
            if stem in written_stems:
                stem = f"{pcol}_{var}_Q{q}"
            written_stems.add(stem)
            tbl.to_csv(STAGE18_DIR / f"{stem}.csv", index=True)

            srows.append({
                "Metric":     metric,
                "SizeVar":    var,
                "Quintile":   q,
                "n":          len(seg),
                "CalibSlope": round(slope, 3),
                # Appended last so existing column positions are unchanged;
                # tells apart rows that share a metric name.
                "ProbCol":    pcol
            })

# ═══════════════════════════════════════════════════════════════════
# 10 · FINALISE PLOT & SUMMARY
# ═══════════════════════════════════════════════════════════════════
# Decorate and save the overall calibration figure, then release it.
plt.title(f"Calibration Curves (snapshot {SNAP_YEAR})")
plt.xlabel("Predicted probability")
plt.ylabel("Observed frequency")
plt.legend(fontsize="small", ncol=2)
plt.tight_layout()
plt.savefig(STAGE18_DIR / "overall_calibration.png", dpi=110)
plt.close()

# With no rows, a column-less frame would be written as a header-less CSV
# that readers cannot parse; fall back to the standard header in that case.
# With rows, the columns are taken from the records themselves.
summary_cols = (list(srows[0]) if srows
                else ["Metric", "SizeVar", "Quintile", "n", "CalibSlope"])
slopes = pd.DataFrame(srows, columns=summary_cols)
slopes.to_csv(STAGE18_DIR / "calibration_slope_summary.csv", index=False)

log.info("✓ Stage 18 outputs in %s", STAGE18_DIR)
if not slopes.empty:
    # Rank segments by absolute distance of the slope from the ideal 1.0.
    worst = (slopes.assign(dev=lambda d: (d.CalibSlope - 1).abs())
                   .sort_values("dev", ascending=False)
                   .head(10)[["Metric","SizeVar","Quintile","CalibSlope","n"]])
    print("\n10 worst calibration slopes (ideal = 1.00):")
    print(worst.to_string(index=False))
else:
    print("No segment with ≥120 rows — nothing to report.")

print(f"\n✓ Stage 18 complete — outputs in {STAGE18_DIR}\n")

2025-06-10 22:24:35,887 | INFO    | RUN_DIR ➜ C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609
2025-06-10 22:24:35,892 | INFO    | Probabilities  → C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609\stage11\11_RISE_Probabilities_All.csv
2025-06-10 22:24:35,893 | INFO    | Stage-3 ratios → C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609\stage03\Stage3_Data_WithRatios.csv
2025-06-10 22:24:41,485 | INFO    | Size variables detected: {'assets': 'totalassets', 'revenue': 'totalrevenue'}
2025-06-10 22:24:41,557 | INFO    | Snapshot merged rows: 903
2025-06-10 22:24:43,602 | INFO    | Flag column flag_rscoreprob missing – skip metric rscoreprob
2025-06-10 22:24:43,603 | INFO    | Flag column flag_rscoreprob missing – skip metric rscoreprob
2025-06-10 22:24:43,605 | INFO    | Flag column flag_rscoreprob missing – skip metric rscoreprob
2025-06-10 22:24:43,606 | INFO    | Flag column


10 worst calibration slopes (ideal = 1.00):
                Metric SizeVar  Quintile  CalibSlope   n
     operatingcashflow revenue         1      -1.400 181
       operatingincome revenue         3      -0.999 181
     operatingcashflow  assets         1      -0.844 181
     operatingcashflow  assets         1      -0.828 181
                  cash revenue         2      -0.769 180
       operatingincome revenue         1      -0.725 181
       operatingincome revenue         3      -0.667 181
     operatingcashflow revenue         1      -0.534 181
                  cash revenue         1      -0.492 181
cashandcashequivalents  assets         1      -0.459 181

✓ Stage 18 complete — outputs in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609\stage18\snapshot2007

