# In [None]:
#!/usr/bin/env python
"""
STAGE-06A · DOMAIN-WEIGHTED RISE PREDICTIONS  (flavour-separated)
v5.2 — 2025-07-04
───────────────────────────────────────────────────────────────
For each resilience flavour (Temporal · Impact · Dynamic) it

1. loads the FY-(SWAN-1) “AllScores” snapshot from Stage-05B
2. pulls the four-domain logit coefficients produced in 05C
3. reproduces the z-score transformation used in training
4. writes predicted lin-pred & probability for every metric
5. dumps a quick numeric / graphic diagnostics set.

Outputs land in  stage06a/ :

    Tmp_Stage6A_Temporal_RISE_Predictions_<SWAN>.csv
    Imp_Stage6A_Impact_RISE_Predictions_<SWAN>.csv
    Dyn_Stage6A_Dynamic_RISE_Predictions_<SWAN>.csv
    … plus PNG / CSV diagnostics for each flavour
"""

from __future__ import annotations
import logging, os, warnings
from pathlib import Path
from typing  import Dict, List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pipeline_utils import load_cfg, resolve_run_dir

warnings.filterwarnings("ignore", category=RuntimeWarning)
plt.rcParams["figure.dpi"] = 110

# ═════════════ 0 · PATHS / LOGGING ═════════════════════════════
# Pipeline configuration; the event table is re-keyed by the string form
# of each key so it can be compared against the SWAN_YEAR environment value.
CFG = load_cfg()
EVENT = {str(key): val for key, val in CFG["events"].items()}

# SWAN_YEAR wins when it is set and non-empty; otherwise the first key of
# the configured event table is used.
SWAN = os.getenv("SWAN_YEAR") or next(iter(EVENT))
PRE_YEAR = int(SWAN) - 1

# NOTE(review): resolve_run_dir is a project helper — presumably it picks the
# run folder that contains `must_have`; confirm against pipeline_utils.
RUN_DIR = resolve_run_dir(
    swan_year=SWAN,
    run_tag=os.getenv("RUN_TAG"),
    must_have=f"stage05b/Tmp_05B_AllScores_{SWAN}.csv",  # any one file
)
ST05B = RUN_DIR / "stage05b"
ST05C = RUN_DIR / "stage05c"
OUT = RUN_DIR / "stage06a"
OUT.mkdir(exist_ok=True)

# Log to a per-run file (truncated on every run) and to the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(OUT / "stage06a.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger(__name__)
log.info("==========  STAGE-06A  (SWAN %s) ==========", SWAN)

# ═════════════ 1 · CONSTANTS ═══════════════════════════════════
# Metric names the main loop iterates over; each one is looked up in
# Stage-05C by coefficient file name.
METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]

# flavour-tag → meta (display title and path of the Stage-05B score snapshot)
FLAV: Dict[str, Dict] = {
    "Tmp": {
        "title": "Temporal",
        "score": ST05B / f"Tmp_05B_AllScores_{SWAN}.csv",
    },
    "Imp": {
        "title": "Impact",
        "score": ST05B / f"Imp_05B_AllScores_{SWAN}.csv",
    },
    "Dyn": {
        "title": "Dynamic",
        "score": ST05B / f"Dyn_05B_AllScores_{SWAN}.csv",
    },
}

# Domain score names; the main loop prefixes each with the flavour tag
# to obtain the predictor column name.
DOMAINS = [
    "Physical_Score",
    "Information_Score",
    "Cognitive_Score",
    "Social_Score",
]

# Identifier and date columns written to the Top10 / Bottom10 files.
ID_COL = "Symbol"
DATE_COL = "ReportDate"

# ═════════════ 2 · HELPERS ═════════════════════════════════════
def coef_file(tag: str, metric: str) -> Path | None:
    """Locate the Stage-05C coefficient CSV for one flavour/metric pair.

    Parameters
    ----------
    tag : flavour tag used as the file-name prefix (e.g. ``"Tmp"``).
    metric : metric name embedded in the file name.

    Returns
    -------
    The path ``ST05C/<tag>_Domain_<metric>_<SWAN>_Coefficients.csv`` when it
    is an existing file, otherwise ``None``.
    """
    candidate = ST05C.joinpath(f"{tag}_Domain_{metric}_{SWAN}_Coefficients.csv")
    if not candidate.is_file():
        return None
    return candidate

def zscore(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """Return a standardised copy of ``df[cols]``.

    Each listed column is centred on its own mean and divided by its
    population standard deviation (``ddof=0``).  A tiny constant is added to
    the divisor so a constant column maps to zeros instead of dividing by
    zero.  ``df`` itself is left untouched.

    Parameters
    ----------
    df : source frame.
    cols : names of the columns to standardise.

    Returns
    -------
    A new DataFrame with the same index as ``df`` and only the ``cols``
    columns, holding the z-scores.
    """
    def _standardise(series: pd.Series) -> pd.Series:
        centre = series.mean()
        spread = series.std(ddof=0) + 1e-9   # epsilon guards constant columns
        return (series - centre) / spread

    return df[cols].apply(_standardise)

def diag(out_df: pd.DataFrame, prob_cols: List[str],
         tag: str, nice: str) -> None:
    """Write the diagnostics set for one flavour into ``OUT``.

    Files produced (each suffixed with ``SWAN``):

    * ``<tag>_Stage6A_Summary``   – CSV of ``describe()`` statistics per
      probability column, rounded to 3 decimals
    * ``<tag>_Stage6A_MeanBar``   – PNG bar chart of the mean probability
      per column, sorted descending
    * ``<tag>_Stage6A_HistGrid``  – PNG grid of per-column histograms
    * ``<tag>_Stage6A_Top10`` / ``_Bottom10`` – CSVs of the ten rows with
      the highest / lowest row-wise mean probability

    Parameters
    ----------
    out_df : frame holding the probability columns plus ``ID_COL`` and
        ``DATE_COL``.  It is only read, never modified.
    prob_cols : names of the probability columns to summarise; when empty
        the function returns without writing anything.
    tag : flavour tag used as the file-name prefix.
    nice : human-readable flavour title used in the bar-chart title.
    """
    if not prob_cols:
        return

    probs = out_df[prob_cols]

    # summary CSV
    summ = probs.describe(percentiles=[.25, .5, .75]).T.round(3)
    summ.to_csv(OUT / f"{tag}_Stage6A_Summary_{SWAN}.csv")

    # bar of means
    bar_ax = summ["mean"].sort_values(ascending=False).plot(kind="bar",
                                                            figsize=(10, 4))
    bar_ax.set_ylim(0, 1)
    bar_ax.set_ylabel("Probability")
    bar_ax.set_title(f"{nice} — mean predicted probability by metric")
    bar_fig = bar_ax.get_figure()
    bar_fig.tight_layout()
    bar_fig.savefig(OUT / f"{tag}_Stage6A_MeanBar_{SWAN}.png")
    plt.close(bar_fig)          # close this figure explicitly, not "current"

    # histogram grid: 4 panels per row, rows rounded up
    n, cols = len(prob_cols), 4
    rows = -(-n // cols)        # ceiling division
    # squeeze=False keeps axarr 2-D whatever the grid shape is
    fig, axarr = plt.subplots(rows, cols, figsize=(4 * cols, 3 * rows),
                              squeeze=False)
    panels = axarr.ravel()
    for ax, col in zip(panels, prob_cols):
        ax.hist(out_df[col].dropna(), bins=20, edgecolor="k")
        ax.set_title(col.replace("_prob", ""))
        ax.set_xlim(0, 1)
    for ax in panels[n:]:       # hide the unused trailing panels
        ax.axis("off")
    fig.tight_layout()
    fig.savefig(OUT / f"{tag}_Stage6A_HistGrid_{SWAN}.png")
    plt.close(fig)

    # top / bottom 10 — ranked on a separate frame so the caller's
    # DataFrame does not silently gain a "MeanProb" column
    ranked = out_df[[ID_COL, DATE_COL]].assign(MeanProb=probs.mean(axis=1))
    ranked.nlargest(10, "MeanProb") \
          .to_csv(OUT / f"{tag}_Stage6A_Top10_{SWAN}.csv", index=False)
    ranked.nsmallest(10, "MeanProb") \
          .to_csv(OUT / f"{tag}_Stage6A_Bottom10_{SWAN}.csv", index=False)

# ═════════════ 3 · MAIN LOOP ═══════════════════════════════════
# For every flavour: load the Stage-05B snapshot, keep the FY-(SWAN-1) rows,
# standardise the four domain predictors, apply each metric's Stage-05C
# coefficients and write predictions plus diagnostics.  Any missing input
# is logged and skipped so the remaining flavours / metrics still run.
for tag, meta in FLAV.items():
    f_csv = meta["score"]
    if not f_csv.is_file():
        log.warning("⏭️  %s snapshot missing — flavour skipped", tag)
        continue

    df = pd.read_csv(f_csv)
    df = df.query("Year == @PRE_YEAR").copy()
    if df.empty:
        log.warning("⏭️  %s no FY-%d rows", tag, PRE_YEAR)
        continue

    # flip 1-5 so higher = better
    for c in df.filter(regex=r"(_Score$|_Q$)").columns:
        df[c] = 6 - df[c]

    # predictor names have flavour prefix, e.g. Tmp_Physical_Score
    dom_pred = [f"{tag}_{d}" for d in DOMAINS]

    # A snapshot lacking a predictor column used to raise KeyError inside
    # zscore() and abort every remaining flavour; treat it like the other
    # missing-input cases instead.
    absent = [p for p in dom_pred if p not in df.columns]
    if absent:
        log.warning("⏭️  %s snapshot lacks predictor columns %s — flavour skipped",
                    tag, absent)
        continue

    # Standardised with the mean / std of this snapshot's rows.
    # NOTE(review): confirm this matches the scaling used when the 05C
    # coefficients were fitted.
    preds_z = zscore(df, dom_pred)
    prob_cols: List[str] = []

    for met in METRICS:
        cf = coef_file(tag, met)
        if cf is None:
            continue
        beta = pd.read_csv(cf).set_index("Term")["Coefficient"]

        # all four domain coefficients are required; report the first gap
        missing = next((p for p in dom_pred if p not in beta), None)
        if missing is not None:
            log.warning("%s coef missing %s (%s)", tag, missing, met)
            continue

        # linear predictor: intercept (0 when absent) + Σ beta · z.
        # dtype=float keeps the in-place add valid even if the intercept
        # was parsed as an integer.
        lin = np.full(len(df), beta.get("const", 0.0), dtype=float)
        for p in dom_pred:
            lin += beta[p] * preds_z[p].to_numpy()

        lp_col   = f"{tag}_{met}_RISE_linpred"
        prob_col = f"{tag}_{met}_RISE_prob"
        df[lp_col]   = lin
        df[prob_col] = 1 / (1 + np.exp(-lin))    # logistic link
        prob_cols.append(prob_col)

    if not prob_cols:
        log.warning("⚠️  %s produced no probabilities", tag)
        continue

    out_csv = OUT / f"{tag}_Stage6A_{meta['title']}_RISE_Predictions_{SWAN}.csv"
    df.to_csv(out_csv, index=False)
    log.info("✓ %-5s predictions written (%d metrics)", tag, len(prob_cols))

    diag(df, prob_cols, tag, meta['title'])

log.info("🎉  STAGE-06A complete — artefacts in %s", OUT)



# 2025-06-15 15:10:03,019 | INFO    | Snapshot FY-2007 rows: 974
# 2025-06-15 15:10:03,069 | INFO    | 🎉 Stage 06 complete — artefacts in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\2025-06-15\stage06
