In [None]:
#!/usr/bin/env python
"""
STAGE-07 · RISE-PROBABILITY EXPLORATION & LEADERBOARDS
v4.4 – 2025-06-20   (pathless-ExcelWriter fix)

▪ Produces leader-boards, distribution plots, optional Excel Top-10 workbooks.
▪ Runs even if neither `xlsxwriter` nor `openpyxl` is installed
  (Excel workbook is simply skipped in that case).
"""

from __future__ import annotations
import os, logging, math, warnings
from pathlib import Path
from typing import Dict, List

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pipeline_utils import load_cfg, resolve_run_dir

# Process-wide defaults: silence RuntimeWarning and fix the figure DPI / seaborn theme
# used by every plot this stage produces.
warnings.filterwarnings("ignore", category=RuntimeWarning)
plt.rcParams["figure.dpi"] = 110
sns.set_style("whitegrid")

# ═════════ 0 · CONFIG ══════════════════════════════════════════════
CFG = load_cfg()
# Event keys are normalised to strings so they compare equal to the env-var value.
EVENTS = {str(key): val for key, val in CFG["events"].items()}
# A non-empty SWAN_YEAR env var wins; otherwise the first event key of the config is used.
SWAN = os.getenv("SWAN_YEAR") or next(iter(EVENTS))

# NOTE(review): resolve_run_dir lives in pipeline_utils; presumably it returns the run
# folder that contains the `must_have` file — confirm against that helper.
RUN = resolve_run_dir(
    swan_year=SWAN,
    run_tag=os.getenv("RUN_TAG"),
    must_have=f"stage06a/Stage6A_Temporal_RISE_Predictions_{SWAN}.csv",
)

# All artefacts of this stage (CSV, PNG, XLSX, log) are written below RUN/stage07.
OUT = RUN / "stage07"
OUT.mkdir(exist_ok=True)

# Log both to a per-run file (overwritten on every run) and to the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(OUT / "stage07.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger(__name__)
log.info("==========  STAGE-07  (SWAN %s) ==========", SWAN)

# ═════════ 1 · FAMILY DEFINITIONS ═══════════════════════════════════
# Each family entry names the prediction CSV to read ("file"), the suffix that marks
# its probability columns ("suf") and the heading used on its plots ("title").
S06A = RUN / "stage06a"
S06B = RUN / "stage06b"

FLAV: Dict[str, Dict] = {
    # DOMAIN-weighted
    "temporal": {
        "file": S06A / f"Stage6A_Temporal_RISE_Predictions_{SWAN}.csv",
        "suf": "_TempRISE_prob",
        "title": "Temporal – domain",
    },
    "impact": {
        "file": S06A / f"Stage6A_Impact_RISE_Predictions_{SWAN}.csv",
        "suf": "_ImpactRISE_prob",
        "title": "Impact – domain",
    },
    "dynamic": {
        "file": S06A / f"Stage6A_Dynamic_RISE_Predictions_{SWAN}.csv",
        "suf": "_DynamicRISE_prob",
        "title": "Dynamic – domain",
    },
    # STAGE-weighted
    "temporalStage": {
        "file": S06B / f"Stage6B_TemporalStage_RISE_Predictions_{SWAN}.csv",
        "suf": "_StageTempRISE_prob",
        "title": "Temporal – stage",
    },
    "impactStage": {
        "file": S06B / f"Stage6B_ImpactStage_RISE_Predictions_{SWAN}.csv",
        "suf": "_StageImpactRISE_prob",
        "title": "Impact – stage",
    },
    "dynamicStage": {
        "file": S06B / f"Stage6B_DynamicStage_RISE_Predictions_{SWAN}.csv",
        "suf": "_StageDynRISE_prob",
        "title": "Dynamic – stage",
    },
}

# Identifier and date columns copied into every Top-10 workbook sheet.
ID_COL = "Symbol"
DATE_COL = "ReportDate"

# ═════════ 2 · HELPERS ═════════════════════════════════════════════
def leaderboard(df: pd.DataFrame, cols: List[str], suf: str) -> pd.DataFrame:
    """Rank the given probability columns by their column mean.

    Args:
        df: Frame holding the prediction columns.
        cols: Names of the columns to average.
        suf: Text stripped from each column name to form the metric label
            (every literal occurrence is removed, not only a trailing one).

    Returns:
        A frame with the columns ``Metric`` and ``MeanProb``, ordered by
        ``MeanProb`` from highest to lowest, on a fresh 0..n-1 index.
    """
    # One mean per column, highest first.
    ranked = df[cols].mean().rename("MeanProb").sort_values(ascending=False)
    # The former column names arrive in a column called "index".
    table = ranked.reset_index().rename(columns={"index": "Metric"})
    table["Metric"] = table["Metric"].str.replace(suf, "", regex=False)
    return table

def hist_grid(df: pd.DataFrame, cols: List[str], title: str,
              out: Path, suf: str) -> None:
    """Save a four-column grid of histograms, one panel per column in *cols*.

    Args:
        df: Frame holding the columns to plot.
        cols: Columns to draw; nothing is written when this is empty.
        title: Figure-level heading.
        out: Destination image path.
        suf: Text removed from each column name for the panel title.
    """
    if not cols:
        return

    n_cols = 4
    n_rows = math.ceil(len(cols) / n_cols)
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows))
    panels = axs.flatten()

    for panel, col in zip(panels, cols):
        df[col].dropna().hist(bins=20, edgecolor="k", ax=panel)
        panel.set_xlim(0, 1)
        panel.set_title(col.replace(suf, ""), fontsize=8)

    # The grid is rectangular, so the last row may contain unused panels: hide them.
    for panel in panels[len(cols):]:
        panel.axis("off")

    fig.suptitle(title, y=1.02, fontsize=12)
    fig.tight_layout()
    fig.savefig(out, dpi=110)
    plt.close(fig)

def box_plot(df: pd.DataFrame, cols: List[str], title: str, out: Path) -> None:
    """Save one figure with a box-plot per column in *cols*.

    Args:
        df: Frame holding the columns to plot.
        cols: Columns to draw; nothing is written when this is empty.
        title: Axes heading.
        out: Destination image path.
    """
    if not cols:
        return

    # Figure widens with the number of boxes, but never below 6 inches.
    width = max(6, 0.6 * len(cols))
    fig, ax = plt.subplots(figsize=(width, 5))
    df[cols].boxplot(rot=45, ax=ax)
    ax.set_ylim(0, 1)
    ax.set_ylabel("Predicted probability")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out, dpi=110)
    plt.close(fig)

def excel_writer(path: Path):
    """Open an ``ExcelWriter`` for *path* with the first engine that loads.

    Engines are tried in the order ``xlsxwriter``, ``openpyxl``; an engine
    whose construction raises ``ModuleNotFoundError`` is skipped.

    Returns:
        The opened ``pd.ExcelWriter``, or ``None`` when neither engine could
        be used.
    """
    for engine in ("xlsxwriter", "openpyxl"):
        try:
            writer = pd.ExcelWriter(path, engine=engine)
        except ModuleNotFoundError:
            continue
        else:
            return writer
    return None

# ═════════ 3 · PROCESS EACH FAMILY ═════════════════════════════════
# For every family in FLAV: read its prediction CSV, select the columns that end
# with the family suffix, then write the leaderboard CSV, the histogram grid, the
# box-plot and (when possible) the Top-10 workbook.  The leaderboards are kept in
# `leaderboards` so later sections can compare families.
leaderboards: Dict[str, pd.DataFrame] = {}

for flav, info in FLAV.items():
    fp, suf = info["file"], info["suf"]
    if not fp.exists() or fp.stat().st_size == 0:
        log.warning("Family '%s' skipped – %s missing/empty", flav, fp.name)
        continue

    df = pd.read_csv(fp)
    cols = [c for c in df.columns if c.endswith(suf)]
    if not cols:
        log.warning("Family '%s' has no *%s columns", flav, suf)
        continue

    # leaderboard CSV + graphics
    lb = leaderboard(df, cols, suf)
    lb.to_csv(OUT / f"Stage7_Rank_{flav}_{SWAN}.csv", index=False)
    hist_grid(df, cols, f"{info['title']} – distribution", OUT / f"Stage7_HistGrid_{flav}_{SWAN}.png", suf)
    box_plot(df, cols, f"{info['title']} – box-plot", OUT / f"Stage7_Boxplot_{flav}_{SWAN}.png")

    # Excel Top-10 workbook (optional)
    wrk_path = OUT / f"Stage7_Top10_{flav}_{SWAN}.xlsx"
    # The workbook is optional, so a CSV without the identifier/date columns must
    # not abort the stage: check before a writer (and its file) is opened.
    missing = [c for c in (ID_COL, DATE_COL) if c not in df.columns]
    ew = None if missing else excel_writer(wrk_path)
    if missing:
        log.warning("Workbook for '%s' skipped – column(s) %s missing",
                    flav, ", ".join(missing))
    elif ew is None:
        log.warning("Excel engines missing – workbook for '%s' skipped", flav)
    else:
        # Without a CompanyName column the symbol doubles as the company name.
        name_col = "CompanyName" if "CompanyName" in df.columns else ID_COL
        used_sheets = set()
        with ew as xw:
            for col in cols:
                metric = col.replace(suf, "")

                # Excel limits sheet names to 31 characters and requires them to
                # be unique (case-insensitively); disambiguate truncation clashes.
                sheet = metric[:31]
                dup = 1
                while sheet.lower() in used_sheets:
                    tag = f"~{dup}"
                    sheet = metric[:31 - len(tag)] + tag
                    dup += 1
                used_sheets.add(sheet.lower())

                # Select each source column once: when name_col == ID_COL a plain
                # four-item selection would duplicate "Symbol" and the sheet would
                # end up with two "Symbol" headers and no "CompanyName".
                wanted = list(dict.fromkeys((name_col, ID_COL, DATE_COL, col)))
                top = df[wanted].sort_values(col, ascending=False).head(10)
                pd.DataFrame({
                    "CompanyName": top[name_col],
                    "Symbol": top[ID_COL],
                    "ReportDate": top[DATE_COL],
                    "Probability": top[col],
                }).to_excel(xw, sheet_name=sheet, index=False)
        log.info("✓ workbook %s written", wrk_path.name)

    leaderboards[flav] = lb
    log.info("✓ family '%s' processed", flav)

# ═════════ 4 · TEMPORAL vs IMPACT (domain) ═════════════════════════
# Runs only when both domain-weighted families produced a leaderboard.  Metrics
# present in both are joined, their mean probabilities subtracted, and the result
# is written as a CSV plus a bar chart of the differences.
if all(family in leaderboards for family in ("temporal", "impact")):
    comp = leaderboards["temporal"].merge(
        leaderboards["impact"],
        on="Metric",
        suffixes=("_Temporal", "_Impact"),
    )
    comp = comp.assign(Diff=comp["MeanProb_Temporal"] - comp["MeanProb_Impact"])
    comp = comp.sort_values("MeanProb_Temporal", ascending=False)
    comp.to_csv(OUT / f"Stage7_Temporal_vs_Impact_{SWAN}.csv", index=False)

    plt.figure(figsize=(10, 4))
    sns.barplot(data=comp, x="Metric", y="Diff", palette="vlag")
    plt.title("Temporal – Impact (mean probability) – domain family")
    # Zero line separates metrics where temporal is higher from those where impact is.
    plt.axhline(0, color="k")
    plt.ylabel("Δ")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(OUT / f"Stage7_TempImpact_DiffBar_{SWAN}.png", dpi=110)
    plt.close()
    log.info("✓ temporal vs impact comparison written")

log.info("🎉  STAGE-07 complete — artefacts in %s", OUT)


