In [None]:
#!/usr/bin/env python
"""
STAGE-05B · DOMAIN / STAGE / RISE SCORES  — flavour-separated
v5.2 2025-07-04
──────────────────────────────────────────────────────────────
Reads :  Stage5A_QuintilesAndScores_<YR>.csv  (48 “*_Q” cols)
        Tmp_/Imp_/Dyn_Stage5A_BestRatioPerBucket_<YR>.csv
Writes:  (one set per flavour, into <run_dir>/stage05b/)
        Tmp_05B_AllScores_<YR>.csv
        Imp_05B_AllScores_<YR>.csv
        Dyn_05B_AllScores_<YR>.csv
        + 3× coverage PNGs  + 3× column-count PNGs

Scores (computed independently for each flavour)
─────────────────────────────────────────────────
 bucket R-score  = quintile value itself  (5 good · 1 bad)
 domain R-score  = mean of 4 bucket scores within the domain
 stage  R-score  = mean of 4 bucket scores within the stage
 RISE  Score     = mean of that flavour’s 16 bucket scores
"""
from __future__ import annotations
import logging, os, warnings
from pathlib import Path
from typing  import Dict, List

import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix        # optional diagnostics
from  pipeline_utils import load_cfg, resolve_run_dir

warnings.filterwarnings("ignore", category=RuntimeWarning)
plt.rcParams["figure.dpi"] = 110
sns.set_style("whitegrid")

# ═════════════ 0 · CONFIG & PATHS ═══════════════════════════════
# Pipeline configuration.  The keys of its "events" mapping double as the
# default SWAN year (see below), so they are normalised to strings.
# `or {}` guards against an "events" key that is present but null.
CFG     = load_cfg()
EVENTS  = {str(k): v for k, v in (CFG.get("events") or {}).items()}

# SWAN year: the SWAN_YEAR env var wins; otherwise fall back to the first
# configured event.  Fail with a readable message instead of a bare
# StopIteration when neither source provides a value.
SWAN    = os.getenv("SWAN_YEAR") or next(iter(EVENTS), None)
if SWAN is None:
    raise RuntimeError(
        "Cannot determine SWAN year: SWAN_YEAR is not set and the config "
        "contains no 'events' entries."
    )

# Run directory that already holds the Stage-05A quintile matrix for this
# SWAN year (the `must_have` file is the one loaded in section 1).
RUN_DIR = resolve_run_dir(
    swan_year=SWAN,
    run_tag=os.getenv("RUN_TAG"),
    must_have=f"stage05a/Stage5A_QuintilesAndScores_{SWAN}.csv",
)
ST05A   = RUN_DIR / "stage05a"      # inputs written by Stage-05A
ST05B   = RUN_DIR / "stage05b"      # outputs of this stage
ST05B.mkdir(parents=True, exist_ok=True)

# Log to both a per-stage file (mode "w": overwritten on every run) and the
# console.
# NOTE(review): logging.basicConfig() does nothing if the root logger already
# has handlers (e.g. an earlier stage ran in the same kernel) — confirm that
# is acceptable for notebook use.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(ST05B / "stage05b.log", "w", "utf-8"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger(__name__)
log.info("==========  STAGE-05B  (SWAN %s) ==========", SWAN)

# ═════════════ 1 · LOAD QUINTILE MATRIX ═════════════════════════
# Stage-05A quintile matrix for the selected SWAN year.
_quintile_csv = ST05A / f"Stage5A_QuintilesAndScores_{SWAN}.csv"
df = pd.read_csv(_quintile_csv)

# flavour tag → path of the "best ratio per bucket" winner CSV that
# Stage-05A wrote for that flavour (insertion order: Tmp, Imp, Dyn)
FLAVOURS = {
    flavour: ST05A / f"{flavour}_Stage5A_BestRatioPerBucket_{SWAN}.csv"
    for flavour in ("Tmp", "Imp", "Dyn")
}

# Short domain code (matched as a bucket-name prefix in section 2)
# → full domain name used in the output column names.
domain_map: Dict[str, str] = dict(
    Phys="Physical",
    Info="Information",
    Cog="Cognitive",
    Soc="Social",
)

# Stage names (matched as the "-<stage>" bucket-name suffix in section 2).
stages: List[str] = [
    "Prepare",
    "Absorb",
    "Recover",
    "Adapt",
]

# Identifier columns carried through to every output CSV;
# override here if they are named differently in your data.
ID_COL = "Symbol"
YEAR_COL = "Year"

# ═════════════ 2 · LOOP OVER FLAVOURS ═══════════════════════════
# For every flavour: pick its winner buckets, average their quintile columns
# per domain / per stage / overall, write one CSV and two diagnostic PNGs.

# One bucket per (domain, stage) pair is expected for every flavour.
EXPECTED_BUCKETS = len(domain_map) * len(stages)
written: List[str] = []            # flavours that actually produced output

for tag, meta_csv in FLAVOURS.items():
    if not meta_csv.is_file():
        log.warning("⏭️  %s winner file missing – flavour skipped", tag)
        continue

    # Winner buckets for this flavour.  Only buckets whose quintile column
    # really exists in the Stage-05A matrix are used from here on, so every
    # later column lookup is guaranteed to succeed (the domain/stage lists
    # below are built from `present`, never from the raw `buckets`).
    buckets = pd.read_csv(meta_csv)["Bucket"].tolist()
    present = [b for b in buckets if f"{tag}_{b}_Q" in df.columns]
    bqcols  = [f"{tag}_{b}_Q" for b in present]

    if len(bqcols) != EXPECTED_BUCKETS:
        absent = [b for b in buckets if f"{tag}_{b}_Q" not in df.columns]
        if absent:
            log.error("[%s] winner buckets without a *_Q column: %s", tag, absent)
        raise RuntimeError(f"[{tag}] Expected {EXPECTED_BUCKETS} bucket columns, "
                           f"found {len(bqcols)}. "
                           "Check Stage-05A output.")

    # ── 2A · copy bucket R-scores (already 1-5) ─────────────────
    sub = df[[ID_COL, YEAR_COL] + bqcols].copy()

    # ── 2B · domain averages ────────────────────────────────────
    # Buckets belong to a domain when their name starts with the domain code.
    # DataFrame.mean skips NaN, so rows with partial coverage still get a score.
    for code, full in domain_map.items():
        cols = [f"{tag}_{b}_Q" for b in present if b.startswith(code)]
        if not cols:
            log.warning("[%s] no bucket starts with %r – %s_%s_Score will be all-NaN",
                        tag, code, tag, full)
        sub[f"{tag}_{full}_Score"] = sub[cols].mean(axis=1)

    # ── 2C · stage averages ─────────────────────────────────────
    # Buckets belong to a stage when their name ends with "-<stage>".
    for st in stages:
        cols = [f"{tag}_{b}_Q" for b in present if b.endswith(f"-{st}")]
        if not cols:
            log.warning("[%s] no bucket ends with %r – %s_%s_Score will be all-NaN",
                        tag, f"-{st}", tag, st)
        sub[f"{tag}_{st}_Score"] = sub[cols].mean(axis=1)

    # ── 2D · flavour-specific RISE ──────────────────────────────
    sub[f"{tag}_RISE_Score"] = sub[bqcols].mean(axis=1)

    # ── 3 · WRITE CSV ───────────────────────────────────────────
    out_csv = ST05B / f"{tag}_05B_AllScores_{SWAN}.csv"
    sub.to_csv(out_csv, index=False)
    written.append(tag)
    log.info("✓ [%s] CSV saved → %s  (rows=%d, cols=%d)",
             tag, out_csv.name, *sub.shape)

    # ═════════ 4 · DIAGNOSTICS (optional) ═══════════════════════
    # 4A coverage: share of rows with a non-null score, per bucket column
    cov = sub[bqcols].notna().mean().sort_index() * 100
    # Strip exactly the "<tag>_" prefix and "_Q" suffix.  Slicing avoids
    # str.replace, whose regex default differs between pandas versions and
    # which would also rewrite matches in the middle of a bucket name.
    cov_labels = [c[len(tag) + 1:-2] for c in cov.index]
    plt.figure(figsize=(min(22, 0.45 * len(cov)), 4))
    sns.barplot(x=cov_labels, y=cov.values, color="steelblue")
    plt.xticks(rotation=90, ha="center", fontsize=8)
    plt.ylabel("% with score")
    plt.title(f"{tag} – bucket coverage")
    plt.tight_layout()
    plt.savefig(ST05B / f"{tag}_05B_RScoreCoverage_{SWAN}.png")
    plt.close()

    # 4B column-count summary
    counts = {"Bucket R-scores": len(bqcols),
              "Domain scores" :  len(domain_map),
              "Stage scores"  :  len(stages),
              f"{tag}_RISE":    1}
    count_labels = list(counts)
    plt.figure(figsize=(6, 4))
    # seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x
    # labels to `hue` (undodged) gives the same one-colour-per-bar look on
    # old and new versions.  Any legend this creates is redundant – drop it.
    ax = sns.barplot(x=count_labels, y=list(counts.values()),
                     hue=count_labels, palette="mako", dodge=False)
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    plt.ylabel("Column count")
    plt.title(f"{tag} – score columns written")
    plt.tight_layout()
    plt.savefig(ST05B / f"{tag}_05B_RScoreColumns_{SWAN}.png")
    plt.close()

    # 4C quick exploratory summaries (uncomment if running interactively)
    #
    # print(f"\n[{tag}]  Top 5 by {tag}_RISE_Score")
    # print(sub.nlargest(5,f"{tag}_RISE_Score")
    #           [[ID_COL,YEAR_COL,f"{tag}_RISE_Score"]]
    #           .to_string(index=False))
    #
    # plt.figure(figsize=(8,4))
    # plt.hist(sub[f"{tag}_RISE_Score"].dropna(), bins=20, edgecolor="k")
    # plt.title(f"{tag} – distribution of RISE scores"); plt.tight_layout(); plt.show()
    #
    # # boxplots per domain
    # plt.figure(figsize=(8,6))
    # dcols=[f"{tag}_{v}_Score" for v in domain_map.values()]
    # sub[dcols].boxplot()
    # plt.title(f"{tag} – domain score boxplots"); plt.tight_layout(); plt.show()
    #
    # # scatter matrix of stage scores + RISE
    # ncols=[f"{tag}_{s}_Score" for s in stages]+[f"{tag}_RISE_Score"]
    # scatter_matrix(sub[ncols], figsize=(10,10))
    # plt.suptitle(f"{tag} – stage vs RISE"); plt.show()

if not written:
    # Every flavour was skipped: make the empty result visible in the log.
    log.warning("No flavour produced output – no winner files found in %s", ST05A)

log.info("✅  STAGE-05B complete — artefacts in %s", ST05B)


2025-06-16 16:11:57,065 | INFO    | Snapshot FY-1999 → 639 rows × 810 cols
2025-06-16 16:11:57,090 | INFO    | ✓ Domain speed  NetIncome                 (n=598)
2025-06-16 16:11:57,112 | INFO    | ✓ Stage  speed  NetIncome                 (n=624)
2025-06-16 16:11:57,142 | INFO    | ✓ Domain speed  EarningBeforeInterestAndTax  (n=598)
2025-06-16 16:11:57,169 | INFO    | ✓ Stage  speed  EarningBeforeInterestAndTax  (n=624)
2025-06-16 16:11:57,193 | INFO    | ✓ Domain speed  OperatingIncome           (n=598)
2025-06-16 16:11:57,214 | INFO    | ✓ Stage  speed  OperatingIncome           (n=624)
2025-06-16 16:11:57,236 | INFO    | ✓ Domain speed  EBITDA                    (n=598)
2025-06-16 16:11:57,259 | INFO    | ✓ Stage  speed  EBITDA                    (n=624)
2025-06-16 16:11:57,282 | INFO    | ✓ Domain speed  OperatingCashFlow         (n=598)
2025-06-16 16:11:57,305 | INFO    | ✓ Stage  speed  OperatingCashFlow         (n=624)
2025-06-16 16:11:57,325 | INFO    | ✓ Domain speed  FreeCas



2025-06-16 16:11:58,066 | INFO    | ✓ Domain blend  OperatingIncome           (n=599)
2025-06-16 16:11:58,090 | INFO    | ✓ Stage  blend  OperatingIncome           (n=624)
2025-06-16 16:11:58,117 | INFO    | ✓ Domain blend  EBITDA                    (n=599)
2025-06-16 16:11:58,144 | INFO    | ✓ Stage  blend  EBITDA                    (n=624)
2025-06-16 16:11:58,170 | INFO    | ✓ Domain blend  OperatingCashFlow         (n=599)
2025-06-16 16:11:58,203 | INFO    | ✓ Stage  blend  OperatingCashFlow         (n=624)
2025-06-16 16:11:58,236 | INFO    | ✓ Domain blend  FreeCashFlow              (n=599)
2025-06-16 16:11:58,262 | INFO    | ✓ Stage  blend  FreeCashFlow              (n=624)
2025-06-16 16:11:58,286 | INFO    | ✓ Domain blend  Cash                      (n=599)
2025-06-16 16:11:58,307 | INFO    | ✓ Stage  blend  Cash                      (n=624)
2025-06-16 16:11:58,331 | INFO    | ✓ Domain blend  CashAndCashEquivalents    (n=599)
2025-06-16 16:11:58,354 | INFO    | ✓ Stage  blend  Ca