In [1]:
#!/usr/bin/env python
"""
Stage 11 · End-to-End Review Dashboard
======================================

Reads all earlier-stage artefacts, merges probabilities, evaluates
snapshot AUROC, and saves run-wide metadata.

Outputs  ( <run>/stage11/ )
    11_ModelQuality.csv
    11_BestSubset_RatioFrequency.csv
    11_RISE_Probabilities_All.csv
    11_RunMetadata.csv
"""
from __future__ import annotations

# ── stdlib / 3-rd-party ────────────────────────────────────────────
from pathlib import Path
import os, logging, warnings
from typing import Dict

import numpy as np
import pandas as pd
import seaborn as sns, matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

# ── shared helpers ────────────────────────────────────────────────
from pipeline_utils import load_cfg, resolve_run_dir         # NEW

# Global presentation settings for the notebook session:
#   * silence FutureWarning chatter,
#   * render figures at 110 dpi,
#   * use seaborn's "whitegrid" theme.
warnings.simplefilter("ignore", category=FutureWarning)
plt.rcParams.update({"figure.dpi": 110})
sns.set_style(style="whitegrid")

# ╔══════════════════════════════════════════════════════════════════╗
# 0 · BOOTSTRAP  (cfg + run-folder + logger)                          #
# ╚══════════════════════════════════════════════════════════════════╝
# Pipeline configuration; the "events" block maps event years to their settings.
# Keys are normalised to strings so they compare cleanly with the env variable.
CFG: Dict      = load_cfg()
# `or {}` also covers an "events:" key that is present but empty (None).
EVENTS: Dict   = {str(k): v for k, v in (CFG.get("events") or {}).items()}

# SWAN_YEAR comes from the environment when set (whitespace-only counts as
# unset); otherwise the first event listed in the config is used.
SWAN_YEAR_STR  = (os.getenv("SWAN_YEAR") or "").strip() or next(iter(EVENTS), None)
if SWAN_YEAR_STR is None:
    # Previously surfaced as a bare StopIteration from next(iter(...)).
    raise KeyError("SWAN_YEAR is not set and the config 'events' block is empty")
if SWAN_YEAR_STR not in EVENTS:
    raise KeyError(f"SWAN_YEAR={SWAN_YEAR_STR} not listed in events block")
SWAN_YEAR      = int(SWAN_YEAR_STR)
PRE_YEAR       = SWAN_YEAR - 1      # fiscal year used for the snapshot AUROC

# pick the latest run that already contains Stage-10 outputs
RUN_DIR  = resolve_run_dir(must_have="stage10/10B_BestSubset_MasterTable.csv")
RUN_DATE = RUN_DIR.name

# All Stage-11 artefacts (CSVs + log file) are written below this folder.
STAGE_DIR = RUN_DIR / "stage11"
STAGE_DIR.mkdir(parents=True, exist_ok=True)

# Log to both <STAGE_DIR>/stage11.log (overwritten each run) and the console.
# NOTE: logging.basicConfig does nothing when the root logger already has
# handlers, e.g. when this cell is re-run in the same interpreter session.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(STAGE_DIR / "stage11.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
logger.info("==========  STAGE 11: END-TO-END DASHBOARD ==========")
logger.info("RUN_DIR=%s  SWAN_YEAR=%s  RUN_DATE=%s", RUN_DIR, SWAN_YEAR, RUN_DATE)

# ╔══════════════════════════════════════════════════════════════════╗
# 1 · LOAD ARTEFACTS                                                 #
# ╚══════════════════════════════════════════════════════════════════╝
# Join keys used when merging stage artefacts: report date and firm identifier.
DATE_COL = "ReportDate"
ID_COL = "Symbol"

def load_csv(stage_sub: str, names: list[str]) -> pd.DataFrame | None:
    """
    Load the first existing CSV among *names* from <RUN_DIR>/<stage_sub>/.

    Post-processing applied to the loaded frame:
      * every column name containing an underscore is lower-cased and stripped;
        names without an underscore keep their original spelling;
      * the report-date column (``DATE_COL``, in its original or lower-cased
        spelling) is parsed to datetime, unparseable values becoming NaT.

    Parameters
    ----------
    stage_sub : sub-folder of RUN_DIR to look in (e.g. "stage06").
    names     : candidate file names, tried in order.

    Returns
    -------
    The loaded DataFrame, or None (after logging a warning) when none of the
    candidate files exists.
    """
    for nm in names:
        fp = RUN_DIR / stage_sub / nm
        if not fp.is_file():
            continue

        df = pd.read_csv(fp, low_memory=False)
        df = df.rename(columns={c: c.lower().strip()
                                for c in df.columns if "_" in c})

        # DATE_COL ("ReportDate") has no underscore, so the rename above never
        # lower-cases it; checking only the lower-cased spelling (as before)
        # meant the date was never parsed.  Accept either spelling.
        for date_col in {DATE_COL, DATE_COL.lower()}:
            if date_col in df.columns:
                df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

        logger.info("Loaded %s  (%d rows)", fp.relative_to(RUN_DIR), len(df))
        return df

    logger.warning("None of %s found in %s", names, stage_sub)
    return None

# Stage 3: ratios + flag columns (mandatory, checked below)
df3 = load_csv(
    stage_sub="stage03",
    names=["Stage3_Data_WithRatios.csv"],
)
# Stage 5: quintiles / scores (new-style name first, legacy name second)
df5 = load_csv(
    stage_sub="stage05",
    names=["05B_QuintilesAndScores.csv", "Stage5B_QuintilesAndScores.csv"],
)
# Stage 6 / 6B: domain-level and stage-level RISE predictions
df6 = load_csv(
    stage_sub="stage06",
    names=["06_RISE_Predictions.csv", "Stage6_RISE_Predictions.csv"],
)
df6b = load_csv(
    stage_sub="stage06",
    names=["06B_Stage_RISE_Predictions.csv", "Stage6B_Stage_RISE_Predictions.csv"],
)
# Stage 8: pre-event scores and their coefficient summary
df8 = load_csv(
    stage_sub="stage08",
    names=[f"08_pre{SWAN_YEAR}_AllMetrics_RScores.csv"],
)
coef8 = load_csv(
    stage_sub="stage08",
    names=[f"08_pre{SWAN_YEAR}_CoefficientSummary.csv"],
)
# Stage 10: best-subset master table
df10 = load_csv(
    stage_sub="stage10",
    names=["10B_BestSubset_MasterTable.csv"],
)

# Everything downstream is keyed on the Stage-3 table, so it is the only
# artefact whose absence is fatal.
if df3 is None:
    raise RuntimeError("Stage-03 artefacts missing – cannot proceed.")

# ╔══════════════════════════════════════════════════════════════════╗
# 2 · MERGE INTO BACKBONE                                            #
# ╚══════════════════════════════════════════════════════════════════╝
# Financial metrics for which flag columns / probabilities are looked up.
METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]

# Keep only the flag_<metric> columns that Stage 3 actually provides.
flag_cols = [
    name
    for name in (f"flag_{metric.lower()}" for metric in METRICS)
    if name in df3.columns
]

# Backbone = key columns + available flags; probabilities get joined onto it.
backbone = df3[[ID_COL, DATE_COL, *flag_cols]].copy()

def merge_prob(src: pd.DataFrame | None, suffix: str) -> None:
    """
    Left-join onto the module-level *backbone* every column of *src* whose
    name ends with *suffix*, matching rows on (ID_COL, DATE_COL).

    Parameters
    ----------
    src    : source frame, or None (in which case nothing happens).
    suffix : column-name ending that selects the columns to bring across.

    Notes
    -----
    * Rebinds the global ``backbone``; returns nothing.
    * Columns already present in the backbone are skipped.  Merging them
      again would make pandas rename both copies with ``_x``/``_y``, so they
      would no longer match suffix-based column selection.
    * Rows of *src* that repeat an (ID_COL, DATE_COL) key are dropped (first
      occurrence kept, with a warning) so the left join cannot multiply
      backbone rows.
    """
    global backbone
    if src is None:
        return

    keys = [ID_COL, DATE_COL]
    wanted = [c for c in src.columns if c.endswith(suffix)]
    cols = [c for c in wanted if c not in backbone.columns]
    if len(cols) < len(wanted):
        logger.warning("Skipping %d '%s' column(s) already in backbone",
                       len(wanted) - len(cols), suffix)
    if not cols:
        return

    right = src[keys + cols]
    dup = right.duplicated(subset=keys)
    if dup.any():
        logger.warning("Dropping %d duplicate key row(s) before merging '%s' columns",
                       int(dup.sum()), suffix)
        right = right.loc[~dup]

    backbone = backbone.merge(right, on=keys, how="left")

# Attach, in order: domain-level probabilities, stage-level probabilities,
# and the Stage-8 pre-event score columns.
for source_frame, col_suffix in (
    (df6, "_rise_prob"),
    (df6b, "_stagerise_prob"),
    (df8, f"_pre{SWAN_YEAR}"),
):
    merge_prob(source_frame, col_suffix)

n_rows, n_cols = backbone.shape
logger.info("Backbone built: %d rows × %d columns", n_rows, n_cols)

# The snapshot filter relies on the .dt accessor, so force a datetime dtype
# if the date column is still something else.
date_is_datetime = pd.api.types.is_datetime64_any_dtype(backbone[DATE_COL])
if not date_is_datetime:
    backbone[DATE_COL] = pd.to_datetime(backbone[DATE_COL], errors="coerce")

# ╔══════════════════════════════════════════════════════════════════╗
# 3 · MODEL-QUALITY TABLE (snapshot AUROC)                           #
# ╚══════════════════════════════════════════════════════════════════╝
# Snapshot = backbone rows whose report date falls in the pre-event year.
in_pre_year = backbone[DATE_COL].dt.year == PRE_YEAR
snap = backbone[in_pre_year]

def safe_auc(y_series: pd.Series, col_name: str,
             frame: pd.DataFrame | None = None) -> float:
    """
    AUROC of the scores in column *col_name* against the labels *y_series*.

    Parameters
    ----------
    y_series : label series, expected to share its index with *frame*.
    col_name : name of the score/probability column to evaluate.
    frame    : frame holding *col_name*; defaults to the module-level
               snapshot ``snap`` when omitted.

    Returns
    -------
    The AUROC as a float, or NaN when it cannot be computed:
      * the column is absent from *frame*;
      * fewer than two rows have both label and score present;
      * labels or scores are constant on those rows;
      * ``roc_auc_score`` rejects the input with a ValueError.
    """
    if frame is None:
        frame = snap
    if col_name not in frame.columns:
        return np.nan

    y, p = y_series, frame[col_name]
    msk = y.notna() & p.notna()          # rows where both values are present
    if msk.sum() < 2 or y[msk].nunique() < 2 or p[msk].nunique() < 2:
        return np.nan
    try:
        return float(roc_auc_score(y[msk], p[msk]))
    except ValueError:
        return np.nan

# One row per metric whose flag column is present in the snapshot, holding
# the AUROC of the domain-, stage- and pre-event-score columns.
# NOTE(review): the captured run reports AUROC_lasso = 1.0 for every metric;
# a perfect score across the board often points to label leakage – confirm
# against the Stage-8 inputs.
quality_rows = []
for metric in METRICS:
    stem = metric.lower()
    label_col = f"flag_{stem}"
    if label_col in snap.columns:
        labels = snap[label_col]
        quality_rows.append({
            "Metric": metric,
            "AUROC_domain": safe_auc(labels, f"{stem}_rise_prob"),
            "AUROC_stage": safe_auc(labels, f"{stem}_stagerise_prob"),
            "AUROC_lasso": safe_auc(labels, f"rscoreprob_{stem}_pre{SWAN_YEAR}"),
        })

quality_df = pd.DataFrame(quality_rows).round(3)
quality_df.to_csv(STAGE_DIR / "11_ModelQuality.csv", index=False)

print(f"\n===== AUROC snapshot FY-{PRE_YEAR} =====")
print(quality_df.to_string(index=False))

# ╔══════════════════════════════════════════════════════════════════╗
# 4 · BEST-SUBSET RATIO FREQUENCY                                    #
# ╚══════════════════════════════════════════════════════════════════╝
# Locate the "ratio" column of the Stage-10 table (case-insensitive), if any.
ratio_col = None
if df10 is not None:
    ratio_col = next((c for c in df10.columns if c.lower() == "ratio"), None)

if ratio_col is None:
    print("\nStage10 coefficients missing – ratio frequency skipped")
else:
    # Count how often each normalised ratio name occurs, keep those seen ≥3×.
    ratio_names = df10[ratio_col].str.lower().str.strip()
    ratio_counts = ratio_names.value_counts().rename("AppearsIn")
    freq = ratio_counts[ratio_counts >= 3]
    if not freq.empty:
        freq.to_csv(STAGE_DIR / "11_BestSubset_RatioFrequency.csv")
        print("\nRatios appearing in ≥3 best-subset models")
        print(freq.to_string())

# ╔══════════════════════════════════════════════════════════════════╗
# 5 · FULL PROBABILITY MATRIX                                        #
# ╚══════════════════════════════════════════════════════════════════╝
# Every backbone column carrying one of the three probability/score suffixes.
prob_suffixes = ("_rise_prob", "_stagerise_prob", f"_pre{SWAN_YEAR}")
prob_cols = [col for col in backbone.columns if col.endswith(prob_suffixes)]

if prob_cols:
    prob_matrix = backbone[[ID_COL, DATE_COL] + prob_cols]
    prob_matrix.to_csv(STAGE_DIR / "11_RISE_Probabilities_All.csv", index=False)
    logger.info("Probability matrix written (%d columns)", len(prob_cols))

# ╔══════════════════════════════════════════════════════════════════╗
# 6 · RUN METADATA                                                   #
# ╚══════════════════════════════════════════════════════════════════╝
def _row_count(frame):
    """Row count of *frame*, or NaN when the artefact was not loaded."""
    return np.nan if frame is None else len(frame)


# Headline counts describing this run; written to CSV and echoed to stdout.
meta = {
    "Unique firms (Stage3)": df3[ID_COL].nunique(),
    "Records in Stage3": len(df3),
    "Records in Stage5": _row_count(df5),
    "Rows with domain prob": _row_count(df6),
    "Rows with stage prob": _row_count(df6b),
    f"FY-{PRE_YEAR} snapshot rows": len(snap),
}
meta_frame = pd.Series(meta).to_frame("Value")
meta_frame.to_csv(STAGE_DIR / "11_RunMetadata.csv")

print("\nRun metadata")
for label, value in meta.items():
    # left-aligned label (35 wide), right-aligned thousands-separated value
    print(f"{label:<35s}{value:>10,.0f}")

logger.info("✅ STAGE 11 complete – artefacts saved in %s", STAGE_DIR)
print(f"\n✅ Stage 11 complete – outputs in {STAGE_DIR}\n")

2025-06-11 10:37:49,360 | INFO    | RUN_DIR=C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609  SWAN_YEAR=2008  RUN_DATE=20250609
2025-06-11 10:37:55,495 | INFO    | Loaded stage03\Stage3_Data_WithRatios.csv  (34862 rows)
2025-06-11 10:37:55,654 | INFO    | Loaded stage06\Stage6_RISE_Predictions.csv  (974 rows)
2025-06-11 10:37:55,794 | INFO    | Loaded stage06\Stage6B_Stage_RISE_Predictions.csv  (974 rows)
2025-06-11 10:37:55,884 | INFO    | Loaded stage08\08_pre2008_AllMetrics_RScores.csv  (34862 rows)
2025-06-11 10:37:55,900 | INFO    | Loaded stage08\08_pre2008_CoefficientSummary.csv  (800 rows)
2025-06-11 10:37:55,918 | INFO    | Loaded stage10\10B_BestSubset_MasterTable.csv  (35 rows)
2025-06-11 10:37:56,001 | INFO    | Backbone built: 34862 rows × 42 columns



===== AUROC snapshot FY-2007 =====
                     Metric  AUROC_domain  AUROC_stage  AUROC_lasso
                  NetIncome         0.656        0.729          1.0
EarningBeforeInterestAndTax         0.657        0.681          1.0
            OperatingIncome         0.678        0.678          1.0
                     EBITDA         0.678        0.704          1.0
          OperatingCashFlow         0.904        0.823          1.0
               FreeCashFlow         0.585        0.601          1.0
                       Cash         0.608        0.634          1.0
     CashAndCashEquivalents         0.682        0.692          1.0
               TotalRevenue         0.660        0.650          1.0
                GrossProfit         0.729        0.682          1.0

Ratios appearing in ≥3 best-subset models
Ratio
netdebt_to_ocf_q           9
capex_to_depreciation_q    3


2025-06-11 10:37:56,864 | INFO    | Probability matrix written (30 columns)
2025-06-11 10:37:56,871 | INFO    | ✅ STAGE 11 complete – artefacts saved in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609\stage11



Run metadata
Unique firms (Stage3)                   2,426
Records in Stage3                      34,862
Records in Stage5                         nan
Rows with domain prob                     974
Rows with stage prob                      974
FY-2007 snapshot rows                     974

✅ Stage 11 complete – outputs in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609\stage11

