In [None]:
#!/usr/bin/env python
"""
STAGE 06 · COMBINED WEIGHTED RISE PREDICTIONS
──────────────────────────────────────────────
Writes six prediction tables plus diagnostics.  Filenames are aligned with
Stage-11 expectations (speed / depth, domain / stage, and two blends).
"""
from __future__ import annotations
from pathlib import Path
import logging, os, warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pipeline_utils import load_cfg, resolve_run_dir
# Silence every RuntimeWarning for the remainder of the process.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ─────────────────── bootstrap / paths ────────────────────────────
CFG = load_cfg()
# Re-key the configured events by string so they compare cleanly with the
# environment variable below.
EVENTS = {str(year): spec for year, spec in CFG["events"].items()}

# $SWAN_YEAR wins when it is set and non-empty; otherwise the first
# configured event key is used.
SWAN_YEAR = str(os.environ.get("SWAN_YEAR") or next(iter(EVENTS)))
SWAN_INT = int(SWAN_YEAR)

# NOTE(review): resolve_run_dir presumably returns the run directory that
# already contains the Stage-5A output named in `must_have` — confirm in
# pipeline_utils.
RUN_DIR = resolve_run_dir(
    swan_year=SWAN_YEAR,
    must_have=f"stage05a/Stage5A_QuintilesAndScores_{SWAN_YEAR}.csv",
    run_tag=os.environ.get("RUN_TAG"),
)
ST05A = RUN_DIR / "stage05a"
ST05B = RUN_DIR / "stage05b"
ST06 = RUN_DIR / "stage06"
ST06.mkdir(exist_ok=True)

# Log to both a per-run file (truncated on every run) and the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(ST06 / "stage06.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger(__name__)

# Column names shared by the snapshot table and the placeholder outputs.
DATE_COL = "ReportDate"
ID_COL = "Symbol"

# ─────────────────── snapshot data ────────────────────────────────
# The snapshot is the fiscal year immediately before the event year.
SNAP_YR = SWAN_INT - 1

snap = pd.read_csv(ST05A / f"Stage5A_QuintilesAndScores_{SWAN_YEAR}.csv")
if "Year" not in snap.columns:
    # Derive the year from the report date; unparseable dates become NaN
    # and therefore never match SNAP_YR below.
    snap["Year"] = pd.to_datetime(snap[DATE_COL], errors="coerce").dt.year
snap = snap.loc[snap["Year"].eq(SNAP_YR)].copy()
log.info("Snapshot FY-%d rows: %s", SNAP_YR, f"{len(snap):,}")

# ─────────────────── constants ────────────────────────────────────
# Financial metrics for which a coefficient file is looked up.
METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]

# Predictor (score) columns for the domain-based and stage-based models.
DOMAIN_COLS = [
    "Physical_Score",
    "Information_Score",
    "Cognitive_Score",
    "Social_Score",
]
STAGE_COLS = [
    "Prepare_Score",
    "Absorb_Score",
    "Recover_Score",
    "Adapt_Score",
]

# flavour key → (coefficient-file prefixes, output CSV stem, probability-column suffix)
SETTING = {
    "A": (
        ["Stage05B_Domain"],
        "Stage6_RISE_Predictions_",
        "_RISE_prob",
    ),
    "B": (
        ["Stage05B_Stage"],
        "Stage6B_Stage_RISE_Predictions_",
        "_StageRISE_prob",
    ),
    "C": (
        ["Stage05B_DepthDomain"],
        "Stage6C_Depth_RISE_Predictions_",
        "_DepthRISE_prob",
    ),
    "D": (
        ["Stage05B_DepthStage"],
        "Stage6D_DepthStage_RISE_Predictions_",
        "_DepthStageRISE_prob",
    ),
}

# ─────────────────── helpers ──────────────────────────────────────
def _coef_file(prefixes:list[str], metric:str)->Path|None:
    """Return the first existing coefficient CSV for *metric*, or None.

    For each prefix, in order, the year-tagged file name is tried first and
    the un-tagged (legacy) name second; both live in the Stage-5B folder.
    """
    for prefix in prefixes:
        candidates = (
            ST05B / f"{prefix}_{metric}_Coefficients_{SWAN_YEAR}.csv",
            ST05B / f"{prefix}_{metric}_Coefficients.csv",
        )
        for candidate in candidates:
            if candidate.exists():
                return candidate
    return None

def _diagnostics(df:pd.DataFrame, pcols:list[str], stem:str):
    """Write summary tables and a histogram grid for the columns in *pcols*.

    Does nothing when *pcols* is empty.  Otherwise a row-wise mean column is
    added to *df* in place, and four artefacts are written to the Stage-6
    folder: a describe() summary CSV, Top-10 and Bottom-10 CSVs ranked by
    that mean, and a PNG with one histogram per probability column.

    Args:
        df: Frame holding the probability columns plus the id and "Year"
            columns; it is mutated (the mean column is added).
        pcols: Names of the probability columns to summarise.
        stem: Tag inserted into the output file names; an empty value
            falls back to "A".
    """
    if not pcols:
        return

    first = pcols[0]
    # Everything after the first underscore of the first column names the mean.
    mean_col = "Mean" + first.split("_", 1)[1]
    df[mean_col] = df[pcols].mean(axis=1)

    tag = stem or "A"

    summary = df[pcols].describe(percentiles=[.25, .5, .75]).T.round(3)
    summary.to_csv(ST06 / f"Stage6{tag}_Summary_Probs_{SWAN_YEAR}.csv")

    ranked_cols = [ID_COL, "Year", mean_col]
    top = df.nlargest(10, mean_col)[ranked_cols]
    top.to_csv(ST06 / f"Stage6{tag}_Top10_{SWAN_YEAR}.csv", index=False)
    bottom = df.nsmallest(10, mean_col)[ranked_cols]
    bottom.to_csv(ST06 / f"Stage6{tag}_Bottom10_{SWAN_YEAR}.csv", index=False)

    # Four panels per row; the row count is the ceiling of len(pcols) / 4.
    n_cols = 4
    n_rows = -(-len(pcols) // n_cols)
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows))
    panels = axs.flatten()
    # The last underscore-separated token of the first column is stripped
    # from every panel title.
    strip_token = first.split("_")[-1]
    for panel, col in zip(panels, pcols):
        df[col].dropna().hist(bins=20, ax=panel, edgecolor="k")
        panel.set_xlim(0, 1)
        panel.set_title(col.replace(strip_token, ""))
    # Hide the unused panels of the last row.
    for panel in panels[len(pcols):]:
        panel.axis("off")
    fig.tight_layout()
    fig.savefig(ST06 / f"Stage6{tag}_HistGrid_{SWAN_YEAR}.png", dpi=110)
    plt.close(fig)

def build(stem:str)->pd.DataFrame|None:
    """Score the snapshot with one flavour's logistic coefficients.

    For every metric with a coefficient file, a linear predictor
    (intercept "const" plus coefficient × score column) and its logistic
    transform are appended to a copy of the snapshot.  The table is written
    to the Stage-6 folder and diagnostics are produced.  When no coefficient
    file exists for any metric, a header-only placeholder CSV is written.

    Args:
        stem: Key into SETTING ("A".."D") selecting the coefficient-file
            prefixes, the output CSV stem and the column suffix.

    Returns:
        The scored frame (diagnostics also add a mean column to it after the
        CSV is written), or None when only a placeholder was written.

    Raises:
        KeyError: if *stem* is not a key of SETTING, or a coefficient file
            lacks the "Term" / "Coefficient" columns.
    """
    prefixes, csv_stem, suffix = SETTING[stem]
    lin_suffix = suffix.replace('prob','linpred')
    df = snap.copy()
    made = False
    warned = set()          # score columns already reported as missing
    for m in METRICS:
        fp = _coef_file(prefixes, m)
        if not fp:
            continue
        beta = pd.read_csv(fp).set_index("Term")["Coefficient"].to_dict()
        cols = DOMAIN_COLS if "Domain" in fp.name else STAGE_COLS
        # Force float: np.full would otherwise infer an integer dtype from an
        # integer-typed intercept and the in-place float add below would fail.
        lin = np.full(len(df), beta.get("const", 0.0), dtype=float)
        for c in cols:
            if c not in df.columns:
                # A missing score column contributes 0, as before, but is
                # now reported once instead of being silently ignored.
                if c not in warned:
                    warned.add(c)
                    log.warning("score column %s missing from snapshot — "
                                "treated as 0 for %s", c, csv_stem.rstrip("_"))
                continue
            lin += beta.get(c, 0.0) * df[c].to_numpy(dtype=float)
        df[f"{m}{lin_suffix}"] = lin
        df[f"{m}{suffix}"] = 1/(1+np.exp(-lin))
        made = True

    out = ST06 / f"{csv_stem}{SWAN_YEAR}.csv"
    if made:
        # Written before the diagnostics so the CSV does not carry the
        # mean column that _diagnostics adds to df.
        df.to_csv(out, index=False)
        _diagnostics(df, [c for c in df if c.endswith(suffix)], stem)
        log.info("✓ %s", out.name)
        return df

    pd.DataFrame(columns=[ID_COL,DATE_COL]).to_csv(out, index=False)
    log.warning("placeholder for %s … no coefficients", csv_stem.rstrip("_"))
    return None

# ─────────────────── main flavours ────────────────────────────────
# Build the four flavours in SETTING order; each result is either the scored
# frame or None when only a placeholder file could be written.
speed_dom, speed_stg, depth_dom, depth_stg = [
    build(key) for key in ("A", "B", "C", "D")
]

# ─────────────────── blends ───────────────────────────────────────
def make_blend(df1, df2, suff1, suff2, csvname, diag_stem):
    """Average two flavours' probabilities metric by metric and write them.

    Args:
        df1, df2: Frames returned by build(), or None when that flavour only
            produced a placeholder.
        suff1, suff2: Probability-column suffixes of df1 and df2.  The blend
            column is named from suff1 ("<metric>_blend<suff1 without its
            leading underscore>").
        csvname: Output file name inside the Stage-6 folder.
        diag_stem: Tag passed to _diagnostics for the diagnostic file names.

    Returns:
        None.  A header-only placeholder CSV is written when either input is
        None; otherwise the snapshot plus the blend columns is written and
        diagnostics are produced for the blend columns.
    """
    out = ST06/csvname
    if df1 is None or df2 is None:
        pd.DataFrame(columns=[ID_COL,DATE_COL]).to_csv(out,index=False)
        # Mirror build(): a placeholder is never written silently.
        log.warning("placeholder for %s … missing input predictions", out.name)
        return

    blend_suffix = f"_blend{suff1.split('_',1)[1]}"
    blend = snap.copy()
    for m in METRICS:
        c1, c2 = f"{m}{suff1}", f"{m}{suff2}"
        # Only metrics scored in both flavours can be blended.
        if c1 in df1 and c2 in df2:
            blend[f"{m}{blend_suffix}"] = (df1[c1]+df2[c2])/2
    blend.to_csv(out,index=False)

    pcols = [c for c in blend if "blend" in c.lower() and c.endswith('prob')]
    if not pcols:
        log.warning("%s has no blend columns … no metric scored in both inputs",
                    out.name)
    _diagnostics(blend, pcols, diag_stem)
    log.info("✓ %s", out.name)

# Blend E: the two domain-based flavours (A and C).
make_blend(
    df1=speed_dom,
    df2=depth_dom,
    suff1=SETTING["A"][2],
    suff2=SETTING["C"][2],
    csvname=f"Stage6E_Blend_RISE_Predictions_{SWAN_YEAR}.csv",
    diag_stem="E",
)

# Blend F: the two stage-based flavours (B and D).
make_blend(
    df1=speed_stg,
    df2=depth_stg,
    suff1=SETTING["B"][2],
    suff2=SETTING["D"][2],
    csvname=f"Stage6F_BlendStage_RISE_Predictions_{SWAN_YEAR}.csv",
    diag_stem="F",
)

log.info("🎉 Stage 06 complete — artefacts in %s", ST06)




2025-06-15 15:10:03,019 | INFO    | Snapshot FY-2007 rows: 974
2025-06-15 15:10:03,069 | INFO    | 🎉 Stage 06 complete — artefacts in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\2025-06-15\stage06
