In [2]:
#!/usr/bin/env python
"""
STAGE 06 · COMBINED WEIGHTED RISE PREDICTIONS
──────────────────────────────────────────────
Creates six prediction tables:

   A  speed-Domain      (_RISE_prob)
   B  speed-Stage       (_StageRISE_prob)
   C  depth-Domain      (_DepthRISE_prob)
   D  depth-Stage       (_DepthStageRISE_prob)
   E  blend of A & C    (_blendRISE_prob)
   F  blend of B & D    (_blendStageRISE_prob)

For every family it
   • saves a CSV with all snapshot rows
   • saves summary/top/bottom/hist-grid PNG + CSV
   • PRINTS a text summary & TOP/BOTTOM-10 tables
   • DISPLAYS a bar-chart of mean probabilities and the histogram grid
     (exactly what the legacy Stage-06 showed).

All filenames remain unchanged so Stage-11+ continue to work.
"""
from __future__ import annotations
from pathlib import Path
import os, logging, warnings, textwrap

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pipeline_utils import load_cfg, resolve_run_dir
# Suppress every RuntimeWarning for the remainder of the process.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ══════════════════════ 0 · BOOTSTRAP ════════════════════════════
# Event table from the pipeline config; keys are normalised to strings so they
# compare equal to the value read from the SWAN_YEAR environment variable.
CFG        = load_cfg()
EVENTS     = {str(k): v for k, v in CFG["events"].items()}

# Event year: $SWAN_YEAR when set and non-empty, otherwise the first event in
# the config.  The snapshot year is the year immediately before the event.
SWAN_YEAR  = str(os.getenv("SWAN_YEAR") or next(iter(EVENTS)))
SWAN_INT   = int(SWAN_YEAR)
SNAP_YR    = SWAN_INT - 1

# NOTE(review): `must_have` presumably makes resolve_run_dir pick a run folder
# that already contains the Stage-05A output – confirm in pipeline_utils.
RUN_DIR = resolve_run_dir(
    swan_year = SWAN_YEAR,
    run_tag   = os.getenv("RUN_TAG"),
    must_have = f"stage05a/Stage5A_QuintilesAndScores_{SWAN_YEAR}.csv",
)
ST05A = RUN_DIR / "stage05a"
ST05B = RUN_DIR / "stage05b"
ST06  = RUN_DIR / "stage06"
ST06.mkdir(exist_ok=True)

# force=True: basicConfig() does nothing when the root logger already has
# handlers (cell re-run, or an earlier stage configured logging in the same
# kernel).  Without it stage06.log would never be (re)opened and records would
# keep flowing to whatever handlers were installed first.
logging.basicConfig(
    level   = logging.INFO,
    format  = "%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[logging.FileHandler(ST06/"stage06.log", "w", "utf-8"),
              logging.StreamHandler()],
    force   = True,
)
log = logging.getLogger(__name__)
log.info("==========  STAGE 06 – RISE PREDICTIONS  (SWAN=%s) ==========", SWAN_YEAR)

# Column names used to identify a snapshot row in the diagnostic tables.
DATE_COL, ID_COL = "ReportDate", "Symbol"

# ══════════════════════ 1 · SNAPSHOT DATA ════════════════════════
# Read the Stage-05A table and keep only rows of the pre-event fiscal year.
snap = pd.read_csv(ST05A / f"Stage5A_QuintilesAndScores_{SWAN_YEAR}.csv")
if "Year" not in snap:
    # No explicit year column: derive it from the report date
    # (dates that fail to parse are coerced to NaT and yield a missing year).
    snap = snap.assign(
        Year=pd.to_datetime(snap[DATE_COL], errors="coerce").dt.year
    )
snap = snap.loc[snap["Year"] == SNAP_YR].copy()
log.info("Snapshot FY-%d rows: %s", SNAP_YR, format(len(snap), ","))

# ══════════════════════ 2 · CONSTANTS ════════════════════════════
# Metrics for which a Stage-05B coefficient file is looked up, in this order.
METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]

# Family tag → (05B stem start, flavour, output CSV stem, probability-column suffix)
FAMILIES = dict(
    A=("Domain_speed", "speed",
       "Stage6_RISE_Predictions_", "_RISE_prob"),
    B=("Stage_speed", "speed",
       "Stage6B_Stage_RISE_Predictions_", "_StageRISE_prob"),
    C=("Domain_depth", "depth",
       "Stage6C_Depth_RISE_Predictions_", "_DepthRISE_prob"),
    D=("Stage_depth", "depth",
       "Stage6D_DepthStage_RISE_Predictions_", "_DepthStageRISE_prob"),
)

# ══════════════════════ 3 · HELPER FUNCTIONS ═════════════════════
def _coef_file(stem_start:str, metric:str) -> Path|None:
    """
    Locate the Stage-05B coefficients CSV for one family stem and metric.

    The modern name ``Stage05B_{stem_start}_{metric}_{SWAN_YEAR}_Coefficients.csv``
    is tried first.  The legacy name
    ``Stage05B_{scope}_{metric}_Coefficients_{SWAN_YEAR}.csv`` carries no
    flavour and, per the original note, predates the depth flavour.  It is
    therefore accepted only when the requested stem is a speed stem (or has no
    flavour at all); otherwise a depth family would silently be scored with
    speed coefficients.

    Parameters
    ----------
    stem_start : ``"<scope>_<flavour>"`` such as ``"Domain_speed"``.
    metric     : metric name embedded in the file name.

    Returns
    -------
    Path of the first matching file inside ``ST05B``, or ``None`` if there is none.
    """
    modern = ST05B / f"Stage05B_{stem_start}_{metric}_{SWAN_YEAR}_Coefficients.csv"
    if modern.exists():
        return modern

    scope, _, flavour = stem_start.partition("_")
    if flavour not in ("", "speed"):
        # Legacy files cannot represent this flavour – do not mislabel them.
        return None

    # legacy pre-depth naming (no flavour in stem, year at end)
    legacy = ST05B / f"Stage05B_{scope}_{metric}_Coefficients_{SWAN_YEAR}.csv"
    return legacy if legacy.exists() else None

def _print_header(title:str):
    """Print a blank line, then *title* framed above and below by '=' rules of equal length."""
    rule = "=" * len(title)
    print("\n".join(["", rule, title, rule]))

def _diagnostics(df:pd.DataFrame, pcols:list[str], tag:str):
    """
    Produce the diagnostic artefacts for one family and echo them on screen.

    Side effects, in order:
      1. adds a row-mean column (``"Mean" + <suffix of pcols[0]>``) to *df* in place;
      2. writes the summary / top-10 / bottom-10 CSVs and the histogram-grid PNG
         into ``ST06``;
      3. prints the summary table, shows the bar chart of means and the
         histogram grid, then prints the top-10 and bottom-10 tables.

    Nothing is produced (only a warning is logged) when *pcols* is empty.
    """
    if not pcols:
        log.warning("No probability columns for family %s – diagnostics skipped", tag)
        return

    # Row-wise mean over the probability columns, named after the first column's suffix.
    mean_col = "Mean" + pcols[0].split("_", 1)[1]
    df[mean_col] = df[pcols].mean(axis=1)

    # Tables shared by the CSV artefacts and the on-screen output.
    shown = [ID_COL, DATE_COL, mean_col]
    stats = df[pcols].describe(percentiles=[.25, .5, .75]).T.round(3)
    best = df.nlargest(10, mean_col)[shown]
    worst = df.nsmallest(10, mean_col)[shown]

    # --- CSV artefacts ------------------------------------------------------
    stats.to_csv(ST06 / f"Stage6{tag}_Summary_Probs_{SWAN_YEAR}.csv")
    best.to_csv(ST06 / f"Stage6{tag}_Top10_{SWAN_YEAR}.csv", index=False)
    worst.to_csv(ST06 / f"Stage6{tag}_Bottom10_{SWAN_YEAR}.csv", index=False)

    # --- PNG histogram grid (four panels per row, unused panels blanked) ----
    cols = 4
    rows = (len(pcols) + cols - 1) // cols
    fig, axs = plt.subplots(rows, cols, figsize=(4 * cols, 3 * rows))
    for pos, ax in enumerate(axs.flatten()):
        if pos >= len(pcols):
            ax.axis("off")
            continue
        name = pcols[pos]
        df[name].dropna().hist(bins=20, ax=ax, edgecolor="k")
        ax.set_xlim(0, 1)
        ax.set_title(name.replace("_prob", ""))
    fig.tight_layout()
    fig.savefig(ST06 / f"Stage6{tag}_HistGrid_{SWAN_YEAR}.png", dpi=110)
    plt.close(fig)

    # --- NOTEBOOK DISPLAY ---------------------------------------------------
    _print_header(f"SUMMARY – Family {tag}")
    print(stats.to_string())

    # Bar chart of the per-column means, largest first.
    stats["mean"].sort_values(ascending=False).plot(kind="bar", figsize=(10, 4))
    plt.title(f"Mean Predicted Resilience Probability – Family {tag}")
    plt.ylim(0, 1)
    plt.ylabel("Probability")
    plt.tight_layout()
    plt.show()

    # Histogram grid redrawn on a fresh figure for display.
    plt.figure(figsize=(4 * cols, 3 * rows))
    for slot in range(1, rows * cols + 1):
        plt.subplot(rows, cols, slot)
        if slot > len(pcols):
            plt.axis("off")
            continue
        name = pcols[slot - 1]
        plt.hist(df[name].dropna(), bins=20, edgecolor="k")
        plt.xlim(0, 1)
        plt.title(name.replace("_prob", ""), fontsize=8)
    plt.tight_layout()
    plt.show()

    # Top / bottom tables.
    print("\nTOP-10 firms:")
    print(best.to_string(index=False))
    print("\nBOTTOM-10 firms:")
    print(worst.to_string(index=False))

def _linear_predictor(frame:pd.DataFrame, beta:dict) -> tuple[np.ndarray, list]:
    """
    Evaluate ``const + Σ b·frame[term]`` row by row.

    Returns ``(lin, missing)``: the float64 linear predictor and the list of
    non-constant terms in *beta* that have no matching column in *frame*
    (those terms contribute nothing to the sum).
    """
    # dtype=float: without it an integer-typed intercept would yield an integer
    # accumulator that cannot hold the fractional contributions added below.
    lin = np.full(len(frame), beta.get("const", 0.0), dtype=float)
    missing = []
    for term, b in beta.items():
        if term == "const":
            continue
        if term not in frame.columns:
            missing.append(term)
            continue
        lin += b * frame[term].astype(float).to_numpy()
    return lin, missing


def _build_family(tag:str) -> pd.DataFrame|None:
    """
    Build predictions for one family; save CSV; run diagnostics.

    For every metric in ``METRICS`` that has a coefficients file, two columns
    are appended to a copy of the snapshot: ``<metric><suffix with 'linpred'>``
    (the linear predictor) and ``<metric><prob suffix>`` (its logistic
    transform).  Coefficient terms with no matching snapshot column are
    reported with a warning instead of being dropped silently.

    Returns the dataframe, or ``None`` when no coefficients were found for any
    metric (in which case a header-only placeholder CSV is written).
    """
    stem_start, _flav, csv_stem, prob_suf = FAMILIES[tag]
    lin_suf  = prob_suf.replace('prob','linpred')
    out_df   = snap.copy()
    got_any  = False

    for m in METRICS:
        fp = _coef_file(stem_start, m)
        if fp is None:
            log.debug("No coefficients for %s / %s", tag, m)
            continue

        beta = pd.read_csv(fp).set_index("Term")["Coefficient"].to_dict()
        lin, missing = _linear_predictor(out_df, beta)
        if missing:
            log.warning("Family %s / %s: %d coefficient term(s) absent from snapshot "
                        "and left out of the prediction: %s",
                        tag, m, len(missing), ", ".join(map(str, missing)))
        out_df[f"{m}{lin_suf}"]  = lin
        out_df[f"{m}{prob_suf}"] = 1 / (1 + np.exp(-lin))   # logistic link
        got_any = True

    out_fp = ST06/f"{csv_stem}{SWAN_YEAR}.csv"
    if got_any:
        # The CSV is written before diagnostics, which adds a mean column to out_df.
        out_df.to_csv(out_fp, index=False)
        _diagnostics(out_df,
                     [c for c in out_df.columns if c.endswith(prob_suf)],
                     tag)
        log.info("✓ %s", out_fp.name)
        return out_df

    # Header-only placeholder so the expected file name always exists.
    pd.DataFrame(columns=[ID_COL, DATE_COL]).to_csv(out_fp, index=False)
    log.warning("Placeholder %s (no coefficients)", out_fp.name)
    return None

# ══════════════════════ 4 · RUN FAMILIES A-D ═════════════════════
# Build the four base families in order; a value of None marks a family
# for which no coefficients were found.
results = {}
for fam in "ABCD":
    results.update({fam: _build_family(fam)})

# ══════════════════════ 5 · BLEND BUILDERS ═══════════════════════
def _blend(df1:pd.DataFrame|None, df2:pd.DataFrame|None,
           suf1:str, suf2:str,
           tag:str, csv_name:str):
    """
    Average two families metric by metric, save the result and run diagnostics.

    For each metric present in both inputs (column ``<metric><suf1>`` in *df1*
    and ``<metric><suf2>`` in *df2*) a column ``<metric>_blend<suf1 without its
    leading underscore>`` holding the mean of the two is added to a copy of the
    snapshot.  The table is written to ``ST06/csv_name``.

    Returns the blended dataframe, or ``None`` (after writing a header-only
    placeholder CSV) when either input is ``None``.
    """
    target = ST06 / csv_name

    # Guard: both parent families are required.
    if df1 is None or df2 is None:
        placeholder = pd.DataFrame(columns=[ID_COL, DATE_COL])
        placeholder.to_csv(target, index=False)
        log.warning("Blend %s skipped – missing inputs", tag)
        return None

    blended = snap.copy()
    for metric in METRICS:
        left = f"{metric}{suf1}"
        right = f"{metric}{suf2}"
        if left not in df1.columns or right not in df2.columns:
            continue
        out_name = f"{metric}_blend{suf1.split('_', 1)[1]}"
        blended[out_name] = (df1[left] + df2[right]) / 2

    blended.to_csv(target, index=False)

    blend_cols = [name for name in blended.columns
                  if "blend" in name and name.endswith("prob")]
    _diagnostics(blended, blend_cols, tag)
    log.info("✓ %s", csv_name)
    return blended

# Family E: average of the speed-Domain (A) and depth-Domain (C) probabilities.
results["E"] = _blend(
    df1=results["A"],
    df2=results["C"],
    suf1=FAMILIES["A"][-1],
    suf2=FAMILIES["C"][-1],
    tag="E",
    csv_name=f"Stage6E_Blend_RISE_Predictions_{SWAN_YEAR}.csv",
)

# Family F: average of the speed-Stage (B) and depth-Stage (D) probabilities.
results["F"] = _blend(
    df1=results["B"],
    df2=results["D"],
    suf1=FAMILIES["B"][-1],
    suf2=FAMILIES["D"][-1],
    tag="F",
    csv_name=f"Stage6F_BlendStage_RISE_Predictions_{SWAN_YEAR}.csv",
)

# ══════════════════════ 6 · EXPORT FOR DOWNSTREAM ════════════════
# Alias exposing every family's dataframe (None where a family is absent).
data_stage_6 = results

log.info("🎉  STAGE 06 complete – artefacts in %s", ST06)



2025-06-15 15:10:03,019 | INFO    | Snapshot FY-2007 rows: 974
2025-06-15 15:10:03,069 | INFO    | 🎉 Stage 06 complete — artefacts in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\2025-06-15\stage06
