# In [None]:
#!/usr/bin/env python
"""
STAGE-04D · TOP-20 RATIO SUMMARY PER METRIC
v1.9 – 2025-06-20
• Compatible with the v3.1 naming convention
  (Stage4A_ / Stage4B_ / Stage4C_ files).
• Still guards against locked files and missing score columns.
"""
from __future__ import annotations
import logging, os, time, tempfile
from pathlib import Path
from typing  import Dict, List

import pandas as pd
from pipeline_utils import load_cfg, resolve_run_dir

# ── helper: atomic writer tolerant of locked files ──────────────
def safe_to_csv(df: pd.DataFrame, path: Path, max_retry: int = 3) -> Path:
    """Write *df* as CSV through a temp file, tolerating a locked destination.

    The frame is written (without its index) to a temporary ``.csv`` file
    created in ``path.parent`` and then moved into place with ``os.replace``.
    If the move raises ``PermissionError`` the next candidate name
    ``<stem>_retry<k><suffix>`` is tried, for at most *max_retry* attempts
    in total (the first attempt always targets *path* itself).

    Args:
        df: Frame to serialise.
        path: Preferred destination; its parent directory must exist.
        max_retry: Total number of destination names to try.

    Returns:
        The path that actually holds the data: *path*, one of the
        ``_retry<k>`` siblings, or the temporary file if every attempt failed.

    Raises:
        Whatever ``DataFrame.to_csv`` or ``tempfile.mkstemp`` raise; the
        temporary file is removed again when the write itself fails.
    """
    logger = logging.getLogger(__name__)

    fd, tmp_name = tempfile.mkstemp(suffix=".csv", dir=path.parent)
    # mkstemp returns an *open* descriptor.  pandas reopens the file by name,
    # so ours is closed immediately: keeping it open leaks one descriptor per
    # call and, on Windows, an open handle generally prevents the rename below.
    os.close(fd)
    tmp = Path(tmp_name)

    try:
        df.to_csv(tmp, index=False)
    except BaseException:
        # Do not leave a half-written temp file next to the real outputs.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

    for attempt in range(max_retry):
        if attempt == 0:
            target = path
        else:
            target = path.with_name(f"{path.stem}_retry{attempt}{path.suffix}")
        try:
            os.replace(tmp, target)
        except PermissionError:
            # Pause briefly before the next candidate; no point in waiting
            # once the attempts are exhausted.
            if attempt + 1 < max_retry:
                time.sleep(0.5)
            continue
        if target != path:
            logger.warning(
                "Destination %s is locked; wrote %s instead.",
                path.name, target.name)
        return target

    logger.warning(
        "Destination %s remained locked; using temp file %s instead.",
        path.name, tmp.name)
    return tmp


# ── bootstrap & paths ───────────────────────────────────────────
# Event keys are coerced to strings so they can be compared with the
# year string taken from the environment.
CFG = load_cfg()
EVENTS = dict((str(key), val) for key, val in CFG["events"].items())

# SWAN_YEAR wins when set and non-empty; otherwise fall back to the first
# event key in the configuration.
YEAR = str(os.environ.get("SWAN_YEAR") or next(iter(EVENTS)))

# NOTE(review): presumably resolve_run_dir only accepts a run that already
# contains the `must_have` file — confirm in pipeline_utils.
RUN_DIR = resolve_run_dir(
    swan_year=YEAR,
    run_tag=os.environ.get("RUN_TAG"),
    must_have=f"stage04c/Stage4C_winsor_RatioRanking_{YEAR}.csv",
)

# All artefacts of this stage (CSV files and the log) go here.
OUT_DIR = RUN_DIR / "stage04d"
OUT_DIR.mkdir(exist_ok=True)

# Log to a fresh per-run file (truncated on start) and to the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(OUT_DIR / "stage04d.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger(__name__)
log.info("==========  STAGE-04D  (SWAN %s) ==========", YEAR)

# ── locate available ranking tables ─────────────────────────────
# One ranking CSV is expected per dimension; each entry below gives the
# dimension label, the stage folder and the Stage4<letter> file prefix.
CANDIDATES: Dict[str, Path] = {
    label: RUN_DIR / folder / f"Stage4{letter}_winsor_RatioRanking_{YEAR}.csv"
    for label, folder, letter in (
        ("Temporal", "stage04", "A"),
        ("Impact", "stage04b", "B"),
        ("Dynamic", "stage04c", "C"),
    )
}

# Read every candidate that exists and is non-empty; the others are only
# reported so that the stage can run on a partial set of inputs.
tables: Dict[str, pd.DataFrame] = {}
for dim, fp in CANDIDATES.items():
    if not fp.is_file() or fp.stat().st_size == 0:
        log.warning("⏭️  %s missing or empty – skipped", fp.name)
        continue
    loaded = pd.read_csv(fp)
    tables[dim] = loaded
    log.info("Loaded %s (%d rows)", fp.name, len(loaded))

# Without at least one table there is nothing to aggregate.
if len(tables) == 0:
    raise RuntimeError("No ranking tables found – nothing to summarise")

# Distinct metric names across all loaded tables.  Blank Metric cells are
# read by pandas as NaN; they are dropped here because sorting a mix of
# strings and NaN floats raises TypeError, and a NaN metric could never
# match the equality filter below anyway.
metrics = sorted({m for t in tables.values() for m in t["Metric"].dropna()})
# Score columns in order of preference; the first usable one ranks the rows.
preferred_scores = ["AbsRho", "|rho|", "AUROC", "PseudoR2"]
all_rows: List[Dict] = []

# ── per-metric aggregation ─────────────────────────────────────
# For every metric: take the top-20 ratios of each dimension, stack them,
# write one CSV per metric and remember the rows for the overall summary.
for met in metrics:
    rows: List[Dict] = []
    for dim, tbl in tables.items():
        sub = tbl[tbl["Metric"] == met]
        if sub.empty:
            continue
        # Score columns this table actually provides, in preference order.
        score_cols = [c for c in preferred_scores if c in sub.columns]
        if not score_cols:
            continue  # no known score column → nothing to rank by
        # Rank by the first preferred score that has data for this metric.
        # A column that exists but is entirely NaN here would make the
        # "top 20" arbitrary; if every score is empty, keep the previous
        # behaviour and use the first available column.
        score = next(
            (c for c in score_cols if sub[c].notna().any()),
            score_cols[0],
        )
        top = (sub.nlargest(20, score)
                  .loc[:, ["Ratio", *score_cols]]
                  .round(3))
        top.insert(0, "Dimension", dim)
        rows.extend(top.to_dict("records"))

    if rows:
        df_out = pd.DataFrame(rows)
        final = safe_to_csv(df_out, OUT_DIR / f"Stage4D_{met}_{YEAR}.csv")
        # Log the name actually written; it can differ from the requested
        # one when the destination was locked.
        log.info("→ %s (%d rows)", final.name, len(df_out))
        all_rows.extend(df_out.assign(Metric=met).to_dict("records"))

# ── stacked summary across all metrics ─────────────────────────
# Combine the rows of every metric into one table with the identifying
# columns first, followed by whichever score columns are present.
if all_rows:
    df_all = pd.DataFrame(all_rows)
    ordered_cols = ["Metric", "Dimension", "Ratio"]
    ordered_cols.extend(c for c in preferred_scores if c in df_all.columns)
    df_all = df_all.loc[:, ordered_cols]
    safe_to_csv(df_all, OUT_DIR / f"Stage4D_AllMetrics_{YEAR}.csv")
    n_rows, n_cols = df_all.shape
    log.info("Stacked summary saved (%d rows, %d cols).", n_rows, n_cols)

log.info("✅  STAGE-04D complete — artefacts in %s", OUT_DIR)
