In [None]:
 #!/usr/bin/env python
"""
STAGE-11 · END-TO-END REVIEW DASHBOARD
──────────────────────────────────────
*Understands both legacy “speed-only” outputs and the new speed / depth / blend
 prediction files introduced on 2025-06-16.*

Outputs (unchanged) land in …/stage11/.  The AUROC table now also carries the
`AUROC_speed`, `AUROC_depth`, `AUROC_blend` and `AUROC_lassoDepth` columns;
a column is NaN when the corresponding prediction flavour is not available.
"""
from __future__ import annotations
from pathlib import Path
import os, logging, warnings
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

from pipeline_utils import load_cfg, resolve_run_dir
warnings.filterwarnings("ignore", category=FutureWarning)

# ═════════════════════ 0 · BOOTSTRAP ══════════════════════════════
# Configuration comes from load_cfg(); event keys are coerced to strings so
# they compare cleanly with the (string) SWAN_YEAR below.
CFG: Dict = load_cfg()
EVENTS: Dict = {str(year): spec for year, spec in CFG["events"].items()}

# The SWAN_YEAR environment variable wins; otherwise the first configured
# event is used.  PRE_YEAR is the calendar year immediately before it.
_env_year = os.getenv("SWAN_YEAR")
if _env_year:
    SWAN_YEAR = str(_env_year)
else:
    SWAN_YEAR = str(next(iter(EVENTS)))
SWAN_INT = int(SWAN_YEAR)
PRE_YEAR = SWAN_INT - 1

# The run directory is resolved with the Stage-10 master table named as the
# file it must contain; Stage-11 writes into its own sub-folder of that run.
RUN_DIR = resolve_run_dir(
    swan_year=SWAN_YEAR,
    must_have=f"stage10/Stage10_BestSubset_MasterTable_{SWAN_YEAR}.csv",
    run_tag=os.getenv("RUN_TAG"),
)
STAGE_DIR = RUN_DIR / "stage11"
STAGE_DIR.mkdir(exist_ok=True)

# Log to a fresh (mode "w") UTF-8 file inside the stage folder and to the console.
_log_handlers = [
    logging.FileHandler(STAGE_DIR / "stage11.log", mode="w", encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=_log_handlers,
)
log = logging.getLogger(__name__)
log.info("Stage-11 — RUN=%s  SWAN=%s", RUN_DIR.name, SWAN_YEAR)

# Key columns shared by every stage artefact that gets merged below.
DATE_COL = "ReportDate"
ID_COL = "Symbol"

# ═════════════════════ 1 · SMART CSV LOADER ═══════════════════════
def _first_csv(stage:str, names:List[str]) -> Optional[pd.DataFrame]:
    """Load the first CSV from *names* that exists inside RUN_DIR/<stage>/.

    Candidates are tried in the order given, so callers list the preferred
    (new) file name first and legacy names after it.

    Column handling:
      * names containing an underscore are lower-cased and stripped; names
        without one (e.g. the ``Symbol`` / ``ReportDate`` keys) keep their case;
      * the date column is parsed to datetime, unparseable values become NaT.

    Returns the loaded frame, or None (after logging a warning) when none of
    the candidate files exists.
    """
    for nm in names:
        fp = RUN_DIR / stage / nm
        if not fp.is_file():
            continue
        df = pd.read_csv(fp, low_memory=False)

        # Single rename pass instead of one in-place rename per column.
        df = df.rename(columns={c: c.lower().strip()
                                for c in df.columns if "_" in c})

        # DATE_COL has no underscore, so it is NOT lower-cased above; the
        # previous check for DATE_COL.lower() alone therefore never matched
        # the canonical column.  Both spellings are accepted here.
        for date_col in dict.fromkeys((DATE_COL, DATE_COL.lower())):
            if date_col in df.columns:
                df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

        log.info("Loaded %s  (%d rows)", fp.relative_to(RUN_DIR), len(df))
        return df

    log.warning("None of %s found in %s", names, stage)
    return None

# Stage-level artefacts -------------------------------------------------------
# Every frame below is None when its file is absent; only Stage-03 is mandatory.
df3 = _first_csv(
    "stage03",
    [f"Stage3_Data_WithRatios_{SWAN_YEAR}.csv"],
)
df5 = _first_csv(
    "stage05a",
    [f"Stage5A_QuintilesAndScores_{SWAN_YEAR}.csv"],
)

# Stage-06 predictions: the speed flavour falls back to the legacy file name,
# depth and blend exist only under their new names.
_speed_candidates = [
    f"Stage6Speed_RISE_Predictions_{SWAN_YEAR}.csv",
    f"Stage6_RISE_Predictions_{SWAN_YEAR}.csv",      # legacy
]
df6_speed = _first_csv("stage06", _speed_candidates)
df6_depth = _first_csv(
    "stage06",
    [f"Stage6Depth_RISE_Predictions_{SWAN_YEAR}.csv"],
)
df6_blend = _first_csv(
    "stage06",
    [f"Stage6Blend_RISE_Predictions_{SWAN_YEAR}.csv"],
)
# Legacy stage-weighted predictions.
df6_stage = _first_csv(
    "stage06",
    [f"Stage6B_Stage_RISE_Predictions_{SWAN_YEAR}.csv"],
)

# Stage-08 lasso scores (speed + depth).
df8_speed = _first_csv(
    "stage08",
    [f"08_pre{SWAN_YEAR}_AllMetrics_RScores.csv"],
)
df8_depth = _first_csv(
    "stage08",
    [f"08_pre{SWAN_YEAR}_Depth_AllMetrics_RScores.csv"],
)

# Stage-10 best-subset master table.
df10 = _first_csv(
    "stage10",
    [f"Stage10_BestSubset_MasterTable_{SWAN_YEAR}.csv"],
)

# Stage-03 supplies the backbone rows, so nothing can be built without it.
if df3 is None:
    raise RuntimeError("Stage-03 artefacts missing — cannot proceed.")

# ═════════════════════ 2 · BACKBONE MASTER TABLE ══════════════════
# Metrics for which a `flag_<metric>` column and probability columns are looked up.
METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]

# Keep only the flag columns that Stage-03 actually provides, in METRICS order.
_candidate_flags = [f"flag_{metric.lower()}" for metric in METRICS]
flag_cols = [col for col in _candidate_flags if col in df3.columns]

# The backbone starts as an independent copy of the key + flag columns.
backbone = df3.loc[:, [ID_COL, DATE_COL, *flag_cols]].copy()

def _merge_prob(src:pd.DataFrame|None, suffix:str, base:pd.DataFrame)->pd.DataFrame:
    """Left-join the columns of *src* whose names end with *suffix* onto *base*.

    src    : frame holding candidate columns, or None when the artefact is missing.
    suffix : column-name ending that selects the columns to bring across.
    base   : table to extend; the join key is (ID_COL, DATE_COL).

    Returns *base* unchanged when *src* is None or contributes no new column;
    otherwise a new frame with the selected columns appended (NaN where *src*
    has no row for a key).  Raises KeyError if *src* lacks a key column.
    """
    if src is None:
        return base

    keys = [ID_COL, DATE_COL]
    existing = set(base.columns)
    wanted = [c for c in src.columns if c.endswith(suffix) and c not in keys]

    # A column already present in *base* would be split by merge() into
    # `<col>_x` / `<col>_y`, after which every later lookup by the plain name
    # silently misses it.  Keep the first-merged version and say so.
    clashes = [c for c in wanted if c in existing]
    if clashes:
        log.warning("Skipping %d column(s) already in backbone: %s",
                    len(clashes), clashes)

    cols = [c for c in wanted if c not in existing]
    if not cols:
        return base

    # `copy=False` is omitted on purpose: the keyword is deprecated in recent
    # pandas and does not affect the merged result.
    return base.merge(src[keys + cols], on=keys, how="left")

# (source frame, column suffix) pairs, merged onto the backbone in this order.
# NOTE(review): the AUROC section looks up `rscoredepthprob_<metric>_pre<year>`,
# which ends in `_pre<year>` rather than `_depthpre<year>`; if that is the real
# Stage-08 depth column name, the last entry selects nothing — confirm against
# the Stage-08 depth file.
_prob_sources = (
    (df6_speed, "_rise_prob"),
    (df6_stage, "_stagerise_prob"),          # legacy
    (df6_depth, "_depthrise_prob"),
    (df6_blend, "_blendrise_prob"),
    (df8_speed, f"_pre{SWAN_YEAR}"),
    (df8_depth, f"_depthpre{SWAN_YEAR}"),
)
for _src, _suffix in _prob_sources:
    backbone = _merge_prob(_src, _suffix, backbone)

# Dates must be real datetimes for the year filter that follows.
backbone[DATE_COL] = pd.to_datetime(backbone[DATE_COL], errors="coerce")
log.info("Backbone shape: %d rows × %d cols", *backbone.shape)

# ═════════════════════ 3 · AUROC QUALITY TABLE ════════════════════
# Snapshot = backbone rows whose report date falls in the calendar year
# before the event year (rows with an unparseable date are excluded).
_in_pre_year = backbone[DATE_COL].dt.year.eq(PRE_YEAR)
snap = backbone.loc[_in_pre_year]

def _safe_auc(y:pd.Series, pcol:str, min_obs:int=6)->float:
    """AUROC of snapshot column *pcol* against labels *y*, or NaN if not computable.

    y       : label series taken from ``snap`` (same index as ``snap``).
    pcol    : name of the score/probability column to evaluate.
    min_obs : minimum number of rows with both label and score present.

    NaN is returned when the column is absent, fewer than *min_obs* complete
    rows exist, the labels contain a single class, or roc_auc_score rejects
    the input with ValueError.
    """
    # The return annotation used to be `float|np.nan`; np.nan is a value, not
    # a type, so the plain `float` (which NaN is) is the correct annotation.
    if pcol not in snap.columns:
        return np.nan

    scores = snap[pcol]
    usable = y.notna() & scores.notna()
    if usable.sum() < min_obs or y[usable].nunique() < 2:
        return np.nan

    try:
        return roc_auc_score(y[usable], scores[usable])
    except ValueError:
        return np.nan

def _auc_targets(metric_key):
    """Map each output column to the probability column it scores for one metric."""
    return {
        "AUROC_speed":      f"{metric_key}_rise_prob",
        "AUROC_stage":      f"{metric_key}_stagerise_prob",
        "AUROC_depth":      f"{metric_key}_depthrise_prob",
        "AUROC_blend":      f"{metric_key}_blendrise_prob",
        "AUROC_lasso":      f"rscoreprob_{metric_key}_pre{SWAN_YEAR}",
        "AUROC_lassoDepth": f"rscoredepthprob_{metric_key}_pre{SWAN_YEAR}",
    }


# One row per metric that has a flag column in the snapshot; a flavour whose
# probability column is missing yields NaN via _safe_auc.
# NOTE(review): the captured run output at the end of this file shows
# AUROC_lasso == 1.0 for every metric — worth checking the Stage-08 scores
# for target leakage.
rows = []
for metric in METRICS:
    metric_key = metric.lower()
    flag = f"flag_{metric_key}"
    if flag not in snap.columns:
        continue
    y = snap[flag]
    record = {"Metric": metric}
    for out_col, prob_col in _auc_targets(metric_key).items():
        record[out_col] = _safe_auc(y, prob_col)
    rows.append(record)

quality = pd.DataFrame(rows).round(3)
quality.to_csv(STAGE_DIR / f"11_ModelQuality_{SWAN_YEAR}.csv", index=False)

print(f"\n=== AUROC snapshot FY-{PRE_YEAR} ===")
print(quality.to_string(index=False))

# ═════════════════════ 4 · RATIO FREQUENCY (Stage-10) ═════════════
# Locate the ratio column case-insensitively; the section is skipped when the
# Stage-10 table is missing or has no such column.
rcol = None
if df10 is not None:
    rcol = next((c for c in df10.columns if c.lower() == "ratio"), None)

if rcol is not None:
    # Count case-insensitive occurrences of each ratio and keep those seen
    # at least three times.
    counts = df10[rcol].str.lower().value_counts()
    counts = counts.rename("AppearsIn")
    freq = counts[counts >= 3]
    if not freq.empty:
        freq.to_csv(STAGE_DIR / f"11_BestSubset_RatioFrequency_{SWAN_YEAR}.csv")
        print("\nRatios appearing in ≥3 best-subset models:")
        print(freq.to_string())

# ═════════════════════ 5 · DUMP PROBABILITY MATRIX ════════════════
# Same suffixes that were used when merging predictions onto the backbone.
_prob_suffixes = (
    "_rise_prob",
    "_stagerise_prob",
    "_depthrise_prob",
    "_blendrise_prob",
    f"_pre{SWAN_YEAR}",
    f"_depthpre{SWAN_YEAR}",
)
prob_cols = [col for col in backbone.columns if col.endswith(_prob_suffixes)]

# Nothing is written when no probability column made it into the backbone.
if prob_cols:
    _matrix = backbone[[ID_COL, DATE_COL] + prob_cols]
    _matrix.to_csv(
        STAGE_DIR / f"11_RISE_Probabilities_All_{SWAN_YEAR}.csv",
        index=False,
    )
    log.info("Probability matrix saved with %d columns", len(prob_cols))

# ═════════════════════ 6 · RUN-METADATA ═══════════════════════════
def _rows_or_nan(frame):
    """Row count of *frame*, or NaN when the artefact was not loaded."""
    return np.nan if frame is None else len(frame)


meta = {
    "Unique firms (Stage3)": df3[ID_COL].nunique(),
    "Records in Stage3": len(df3),
    "Records in Stage5A": _rows_or_nan(df5),
    "Rows with speed prob": _rows_or_nan(df6_speed),
    "Rows with depth prob": _rows_or_nan(df6_depth),
    "Rows with blend prob": _rows_or_nan(df6_blend),
    f"FY-{PRE_YEAR} snapshot rows": len(snap),
}
pd.Series(meta).to_frame("Value").to_csv(
    STAGE_DIR / f"11_RunMetadata_{SWAN_YEAR}.csv"
)

# Console summary: counts are right-aligned with thousands separators,
# missing artefacts are shown as a dash.
print("\nRun-metadata:")
for label, value in meta.items():
    if pd.notna(value):
        print(f"{label:<32}{value:>10,.0f}")
    else:
        print(f"{label:<32} —")

log.info("🎉 Stage-11 complete — artefacts in %s", STAGE_DIR)
print(f"\n✅ Stage-11 complete — outputs in {STAGE_DIR}\n")


2025-06-11 10:37:49,360 | INFO    | RUN_DIR=C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609  SWAN_YEAR=2008  RUN_DATE=20250609
2025-06-11 10:37:55,495 | INFO    | Loaded stage03\Stage3_Data_WithRatios.csv  (34862 rows)
2025-06-11 10:37:55,654 | INFO    | Loaded stage06\Stage6_RISE_Predictions.csv  (974 rows)
2025-06-11 10:37:55,794 | INFO    | Loaded stage06\Stage6B_Stage_RISE_Predictions.csv  (974 rows)
2025-06-11 10:37:55,884 | INFO    | Loaded stage08\08_pre2008_AllMetrics_RScores.csv  (34862 rows)
2025-06-11 10:37:55,900 | INFO    | Loaded stage08\08_pre2008_CoefficientSummary.csv  (800 rows)
2025-06-11 10:37:55,918 | INFO    | Loaded stage10\10B_BestSubset_MasterTable.csv  (35 rows)
2025-06-11 10:37:56,001 | INFO    | Backbone built: 34862 rows × 42 columns



===== AUROC snapshot FY-2007 =====
                     Metric  AUROC_domain  AUROC_stage  AUROC_lasso
                  NetIncome         0.656        0.729          1.0
EarningBeforeInterestAndTax         0.657        0.681          1.0
            OperatingIncome         0.678        0.678          1.0
                     EBITDA         0.678        0.704          1.0
          OperatingCashFlow         0.904        0.823          1.0
               FreeCashFlow         0.585        0.601          1.0
                       Cash         0.608        0.634          1.0
     CashAndCashEquivalents         0.682        0.692          1.0
               TotalRevenue         0.660        0.650          1.0
                GrossProfit         0.729        0.682          1.0

Ratios appearing in ≥3 best-subset models
Ratio
netdebt_to_ocf_q           9
capex_to_depreciation_q    3


2025-06-11 10:37:56,864 | INFO    | Probability matrix written (30 columns)
2025-06-11 10:37:56,871 | INFO    | ✅ STAGE 11 complete – artefacts saved in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609\stage11



Run metadata
Unique firms (Stage3)                   2,426
Records in Stage3                      34,862
Records in Stage5                         nan
Rows with domain prob                     974
Rows with stage prob                      974
FY-2007 snapshot rows                     974

✅ Stage 11 complete – outputs in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609\stage11

