In [13]:
#!/usr/bin/env python
# ======================================================================
#  STAGE 15 · MASTER NARRATIVE & CONSOLIDATED TABLES   (pipeline-v2)
# ======================================================================

from __future__ import annotations
import os, io, logging, importlib.util
from pathlib import Path
import pandas as pd
from IPython.display import display   # inline in notebooks

# ── logging ─────────────────────────────────────────────────────────
# Root logger: INFO and above, timestamp | padded level | message.
logging.basicConfig(
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("stage15")

# ── env config ──────────────────────────────────────────────────────
# OUTPUT_ROOT : root folder of the pipeline outputs (default "outputs_rff")
# SWAN_YEAR   : event year, used to pick the `event=<year>` sub-folder
OUTPUT_ROOT = Path(os.getenv("OUTPUT_ROOT", "outputs_rff")).expanduser()
SWAN_YEAR = int(os.getenv("SWAN_YEAR", 2008))

def _env_path(var: str) -> Path | None:
    """Read environment variable *var* as an existing filesystem path.

    Returns ``None`` when the variable is unset or empty; otherwise the
    user-expanded path.

    Raises:
        FileNotFoundError: the variable is set but the path does not exist.
    """
    raw = os.getenv(var)
    if not raw:
        return None
    candidate = Path(raw).expanduser()
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"{var}='{candidate}' not found")

# Resolve the run folder. An explicit RUN_DIR environment variable wins
# (it is looked up once, so the existence check is not repeated); otherwise
# use the run whose Stage-14 sector/year summary was modified most recently.
RUN_DIR = _env_path("RUN_DIR")
if RUN_DIR is None:
    latest = sorted(
        (OUTPUT_ROOT / f"event={SWAN_YEAR}").glob("*/stage14/Stage14_SectorYearSummary.csv"),
        key=lambda p: p.stat().st_mtime,
        reverse=True,
    )
    if not latest:
        raise RuntimeError("No Stage-14 outputs found – run Stage 14 first.")
    # <run>/stage14/<file>  →  parents[1] is the <run> folder
    RUN_DIR = latest[0].parents[1]

# Output folders for this stage; the master-summary folder (and any missing
# parents) is created up front.
STAGE_DIR = RUN_DIR.joinpath("stage15")
MS_DIR = STAGE_DIR.joinpath("master_summary")
MS_DIR.mkdir(exist_ok=True, parents=True)
log.info("Stage 15 run folder: %s", RUN_DIR)

# ── helpers ─────────────────────────────────────────────────────────
def _maybe(rel: str, **kw) -> pd.DataFrame | None:
    """Load the CSV at ``RUN_DIR / rel`` if that file exists.

    Extra keyword arguments are forwarded to ``pd.read_csv``. Returns
    ``None`` when the path is not a regular file.
    """
    target = RUN_DIR / rel
    if not target.is_file():
        return None
    return pd.read_csv(target, **kw)

def _first(df: pd.DataFrame | None, cols: list[str]):
    """Return the first-row value of the first usable column in *cols*.

    Columns are tried in the order given; a column is usable when it exists
    in *df* and its first-row value is not missing.

    Returns ``None`` when *df* is ``None``, has no rows or columns, or no
    candidate column yields a non-missing first value.
    """
    # An empty frame has no row 0; bail out instead of letting .iloc[0] raise.
    if df is None or df.empty:
        return None
    for c in cols:
        if c not in df.columns:
            continue
        value = df[c].iloc[0]
        if pd.notna(value):
            return value
    return None

def _xl_engine() -> str | None:
    """Return the first importable Excel writer backend, or ``None``.

    ``xlsxwriter`` is preferred over ``openpyxl``.
    """
    importable = (
        name
        for name in ("xlsxwriter", "openpyxl")
        if importlib.util.find_spec(name)
    )
    return next(importable, None)

# ── load artefacts ──────────────────────────────────────────────────
# Every table is optional: a missing CSV leaves its name bound to None.

# Stage 11 (metadata keeps its first CSV column as the index)
meta = _maybe("stage11/Stage11_RunMetadata.csv", index_col=0)
model_q = _maybe("stage11/Stage11_ModelQuality.csv")

# Stage 12
pca_var = _maybe("stage12/Stage12A_PCA_Variance.csv")

# Stage 13
cluster_sum = _maybe("stage13/Stage13_ClusterSummary.csv")
bimodal_secs = _maybe("stage13/Stage13_BimodalSectors.csv")
kw_sector = _maybe("stage13/Stage13_Sector_Kruskal.csv")
anova_clust = _maybe("stage13/Stage13_Cluster_ANOVA.csv")

# Stage 14
sector_year = _maybe("stage14/Stage14_SectorYearSummary.csv")
friedman = _maybe("stage14/Stage14_FriedmanPerSector.csv")
mixed_coef = _maybe("stage14/Stage14_MixedModel_Coefficients.csv")

# ── KPI table ───────────────────────────────────────────────────────
# Rows of the executive KPI table, appended in call order by `_add`.
kpi_rows: list[dict[str, str]] = []


def _add(label, val):
    """Append one KPI row to ``kpi_rows`` unless *val* is missing.

    Args:
        label: text stored under the "KPI" key.
        val: value to report; ``None`` / NaN values are skipped.

    Formatting:
        * integers  → exact, with thousands separators (no precision loss
          for large counts);
        * other numbers → 4 significant digits (``,.4g``);
        * anything that does not accept a numeric format (e.g. a string
          cell) → ``str(val)`` instead of raising.
    """
    if val is None or pd.isna(val):
        return
    if pd.api.types.is_integer(val):
        text = f"{int(val):,d}"
    else:
        try:
            text = f"{val:,.4g}"
        except (TypeError, ValueError):
            # Non-numeric value: report it verbatim.
            text = str(val)
    kpi_rows.append({"KPI": label, "Value": text})

def _col_max(df, col):
    """Return ``df[col].max()``, or None when the frame or column is absent."""
    if df is None or col not in df.columns:
        return None
    return df[col].max()


# NOTE(review): `_first` reads row 0 of the "Value" column — confirm that the
# first metadata row really is the unique-firm count.
_add("Unique firms (Stage3)",  _first(meta, ["Value"]))

# A missing AUROC column skips that KPI instead of raising KeyError.
_add("Best AUROC – domain",    _col_max(model_q, "AUROC_domain"))
_add("Best AUROC – stage",     _col_max(model_q, "AUROC_stage"))
_add("Best AUROC – lasso",     _col_max(model_q, "AUROC_lasso"))

# NOTE(review): `_first` reads row 0 of the coefficient table — confirm that
# row is the year term and not the intercept.
_add("Mixed-model year slope", _first(mixed_coef, ["Coef.", "Coefficient", "coef"]))
_add("Kruskal p (sector)",     _first(kw_sector, ["p_value"]))
_add("ANOVA p (cluster)",      _first(anova_clust, ["p_value"]))
_add("No. bimodal sectors",    0 if bimodal_secs is None else len(bimodal_secs))

# Explicit columns keep the CSV header even when no KPI could be computed.
kpis = pd.DataFrame(kpi_rows, columns=["KPI", "Value"])
kpis.to_csv(MS_DIR / "KeyFindings.csv", index=False)
display(kpis)                    # ← inline
print("\nExecutive KPIs saved → KeyFindings.csv")

# ── Excel workbook (optional) ───────────────────────────────────────
# Only tables that were actually loaded become sheets.
_sheets = [
    (name, df)
    for name, df in [("Metadata", meta), ("ModelQuality", model_q),
                     ("SectorYear", sector_year), ("ClusterSummary", cluster_sum),
                     ("BimodalSectors", bimodal_secs), ("PCA_Var", pca_var),
                     ("MixedModel", mixed_coef), ("Friedman", friedman)]
    if df is not None
]

if not (eng := _xl_engine()):
    print("Excel skipped (xlsxwriter / openpyxl not installed)")
elif not _sheets:
    # Nothing to write; avoid creating a workbook with no sheets.
    print("Excel skipped (no tables available)")
else:
    with pd.ExcelWriter(MS_DIR / "MasterTables.xlsx", engine=eng) as xl:
        for name, df in _sheets:
            # Frames loaded with `index_col` (the metadata table) carry their
            # labels in the index; write it so those labels are not dropped.
            # Default RangeIndex frames are written without an index column.
            keep_index = not isinstance(df.index, pd.RangeIndex)
            # Sheet names are truncated to 31 characters.
            df.to_excel(xl, sheet_name=name[:31], index=keep_index)
    print("Excel workbook saved → MasterTables.xlsx")

# ── inline previews of other major tables ──────────────────────────
# Show the first 10 rows of each table that was loaded and is non-empty.
for lbl, _src in (
    ("Model Quality (top 10)", model_q),
    ("Sector-Year Summary (head)", sector_year),
    ("Cluster Summary (head)", cluster_sum),
    ("Friedman Results (head)", friedman),
):
    df = None if _src is None else _src.head(10)
    if df is None or df.empty:
        continue
    print(f"\n### {lbl}")
    display(df)

# ── markdown storyboard ─────────────────────────────────────────────
# In-memory buffer for the storyboard: title, generation timestamp, then one
# bullet per KPI row, closed by a horizontal rule.
md = io.StringIO()
md.write(
    "# Master Narrative – Resilience Project\n"
    f"_Generated: {pd.Timestamp.now():%Y-%m-%d %H:%M}_\n\n"
    "## Executive KPIs\n"
)
md.writelines(f"* **{row['KPI']}**: {row['Value']}\n" for row in kpi_rows)
md.write("\n---\n")

def _section(ttl, df, n=12):
    """Append a ``## ttl`` section with the first *n* rows of *df* to ``md``.

    Nothing is written when *df* is ``None`` or empty. The table is rendered
    as Markdown when ``tabulate`` is importable, otherwise as a fenced
    plain-text block.

    Both renderings end with a blank line so the next heading starts on its
    own line (``to_markdown`` itself emits no trailing newline).
    """
    if df is None or df.empty:
        return
    head = df.head(n)
    if importlib.util.find_spec("tabulate"):
        body = head.to_markdown(index=False) + "\n\n"
    else:
        body = "```\n" + head.to_string(index=False) + "\n```\n\n"
    md.write(f"## {ttl}\n")
    md.write(body)

# Table sections, in storyboard order (PCA is capped at its first 10 rows).
for _ttl, _tbl in (
    ("Model quality (AUROC)", model_q),
    ("Sector trajectory (summary)", sector_year),
    ("Cluster summary", cluster_sum),
    ("PCA variance (first 10 PCs)", None if pca_var is None else pca_var.head(10)),
    ("Bimodal sectors (ΔBIC > 10)", bimodal_secs),
):
    _section(_ttl, _tbl)

# File names of every PNG found directly under the stage13 / stage14
# folders, sorted alphabetically.
figs = sorted(
    png.name
    for stage in ("stage13", "stage14")
    for png in (RUN_DIR / stage).glob("*.png")
)
if figs:
    md.write("## Figures generated\n")
    md.write("\n".join(f"* {p}" for p in figs))
    md.write("\n")

(MS_DIR / "StoryBoard.md").write_text(md.getvalue(), encoding="utf-8")
print("Markdown storyboard saved → StoryBoard.md")

# ── console snapshot ────────────────────────────────────────────────
# `_echo` is not defined in this cell. When no earlier cell has provided it,
# use a minimal fallback so the stage also runs standalone. The fallback
# mirrors the layout of this cell's captured output: title bar, "(empty)" for
# a missing/empty table, otherwise the first rows plus a "more rows" footer.
if "_echo" not in globals():
    def _echo(title, df, n=6):
        """Print a titled plain-text preview of the first *n* rows of *df*."""
        print(f"\n── {title} " + "─" * 70)
        if df is None or df.empty:
            print("(empty)")
            return
        print(df.head(n).to_string(index=False))
        if len(df) > n:
            print(f"... ({len(df) - n} more rows)")

# Use the frame loaded by this cell (`model_q`); `model_quality` is never
# assigned here and would raise NameError or show a stale frame.
for title, df in [("MODEL QUALITY",  model_q),
                  ("SECTOR SUMMARY", sector_year),
                  ("CLUSTER SUMMARY", cluster_sum),
                  ("BIMODAL SECTORS", bimodal_secs)]:
    _echo(title, df)

# List the figure file names collected earlier, one per line, under a rule.
if figs:
    print("\n── FIGURES SAVED " + "─"*70, *figs, sep="\n")


# ── DONE ────────────────────────────────────────────────────────────
# Final confirmation on stdout, plus a log record naming the output folder.
print("\n✓ Stage 15 complete – see master_summary folder for all outputs.\n")
log.info("Stage 15 artefacts written to %s", MS_DIR)


2025-06-10 21:08:34,219 | INFO    | Stage 15 run folder: outputs_rff\event=2008\20250609


Unnamed: 0,KPI,Value
0,Mixed-model year slope,0.747
1,Kruskal p (sector),7.575e-69
2,ANOVA p (cluster),2.661e-07
3,No. bimodal sectors,2.0



Executive KPIs saved → KeyFindings.csv
Excel workbook saved → MasterTables.xlsx

### Sector-Year Summary (head)


Unnamed: 0,sectorname,year,count,median,mean,std
0,Basic Materials,2004,199,0.6788,0.6888,0.1406
1,Basic Materials,2005,114,0.7175,0.7012,0.1354
2,Basic Materials,2006,226,0.6798,0.6921,0.1368
3,Basic Materials,2007,301,0.6913,0.688,0.0652
4,Communication Services,2004,20,0.7791,0.7239,0.1701
5,Communication Services,2005,6,0.8889,0.8048,0.2091
6,Communication Services,2006,13,0.7843,0.755,0.1575
7,Communication Services,2007,21,0.7608,0.7598,0.0662
8,Consumer Cyclical,2004,42,0.7746,0.7323,0.1459
9,Consumer Cyclical,2005,26,0.792,0.7596,0.1378



### Cluster Summary (head)


Unnamed: 0,cluster,count,median,mean,std,min,max
0,0.0,66,0.7055,0.7022,0.048,0.6006,0.7934
1,1.0,875,0.7546,0.7473,0.0693,0.5341,0.8481



### Friedman Results (head)


Unnamed: 0,sector,n_firms,Friedman_χ²,p_value
0,Energy,12,16.0,0.001134
1,Financial Services,87,15.207,0.001648
2,Consumer Defensive,5,15.0,0.001817
3,Healthcare,22,4.909,0.178576
4,Consumer Cyclical,9,3.0,0.391625
5,Real Estate,6,2.0,0.572407
6,Basic Materials,44,1.091,0.779269
7,Industrials,17,0.176,0.981295
8,Technology,12,0.0,1.0


2025-06-10 21:08:34,423 | INFO    | Stage 15 artefacts written to outputs_rff\event=2008\20250609\stage15\master_summary


Markdown storyboard saved → StoryBoard.md

── MODEL QUALITY ─────────────────────────────────────────────────────────────────────────────
(empty)

── SECTOR SUMMARY ────────────────────────────────────────────────────────────────────────────
            sectorname  year  count  median   mean    std
       Basic Materials  2004    199  0.6788 0.6888 0.1406
       Basic Materials  2005    114  0.7175 0.7012 0.1354
       Basic Materials  2006    226  0.6798 0.6921 0.1368
       Basic Materials  2007    301  0.6913 0.6880 0.0652
Communication Services  2004     20  0.7791 0.7239 0.1701
Communication Services  2005      6  0.8889 0.8048 0.2091
... (38 more rows)

── CLUSTER SUMMARY ───────────────────────────────────────────────────────────────────────────
 cluster  count  median   mean    std    min    max
     0.0     66  0.7055 0.7022 0.0480 0.6006 0.7934
     1.0    875  0.7546 0.7473 0.0693 0.5341 0.8481

── BIMODAL SECTORS ─────────────────────────────────────────────────────────────