In [None]:
#!/usr/bin/env python
"""
STAGE-05A · BEST-RATIO SELECTION & YEAR-WISE QUINTILES  (flavour-separated)
v5.2 – 2025-07-04
"""
from __future__ import annotations
import inspect, logging, os, shutil, warnings, yaml
from pathlib import Path
from typing  import Dict, List
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from  pipeline_utils import load_cfg, resolve_run_dir
warnings.filterwarnings("ignore", category=RuntimeWarning)
plt.rcParams["figure.dpi"] = 110
sns.set_style("whitegrid")

# ═════════ 0 · CONFIG ═════════════════════════════════════════════
# Resolves the event year, the run directory, the stage output folder,
# logging, and the ratio-selection knobs (all overridable via env vars).
CFG     = load_cfg()
# Event keys are normalised to strings; a missing *or null* "events" entry
# is treated as "no events" instead of crashing on `.items()`.
EVENTS  = {str(k): v for k, v in (CFG.get("events") or {}).items()}
# SWAN_YEAR wins when set; otherwise fall back to the first configured event.
SWAN    = os.getenv("SWAN_YEAR") or next(iter(EVENTS), None)
if SWAN is None:
    # Previously surfaced as a bare StopIteration with no message.
    raise ValueError(
        "No event year available: set SWAN_YEAR or define at least one "
        "entry under 'events' in the config"
    )

RUN_DIR = resolve_run_dir(
            swan_year = SWAN,
            run_tag   = os.getenv("RUN_TAG"),
            must_have = f"stage04/Stage4A_winsor_RatioRanking_{SWAN}.csv",
)
OUT      = RUN_DIR / "stage05a";  OUT.mkdir(exist_ok=True)
ST03_CSV = RUN_DIR / "stage03" / f"Stage3_Data_WithRatios_{SWAN}.csv"
ST03_PQ  = ST03_CSV.with_suffix(".parquet")

# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers (e.g. the cell is re-run in the same kernel) — in that case the
# stage05a.log file handler below is not installed.
logging.basicConfig(
    level   = logging.INFO,
    format  = "%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[logging.FileHandler(OUT / "stage05a.log", "w", "utf-8"),
              logging.StreamHandler()],
)
log = logging.getLogger(__name__)
log.info("==========  STAGE-05A  (SWAN %s) ==========", SWAN)

SEL_STAT  = os.getenv("SELECT_STAT", "AvgRho")           # AvgRho | AvgAUROC | AvgPR2 | Composite
COMP_WTS  = tuple(float(x) for x in os.getenv("COMPOS_WTS", "0.5,0.3,0.2").split(","))
# The composite score unpacks exactly three weights (rho, AUROC, pseudo-R2);
# fail here with a readable message rather than deep inside the flavour loop.
if SEL_STAT.lower() == "composite" and len(COMP_WTS) != 3:
    raise ValueError(
        f"COMPOS_WTS must hold exactly 3 comma-separated weights, got {len(COMP_WTS)}"
    )
MIN_AUROC = float(os.getenv("MIN_AUROC", 0.55))
MIN_PR2   = float(os.getenv("MIN_PR2",   0.01))
# ═════════ 1 · BUCKET MAP ════════════════════════════════════════
# Locate ratio_domain_stage_map.yaml, make sure a copy lives in the run
# directory, and load it as {ratio: [bucket, ...]}.
import pipeline_utils as _pu

repo_root = Path(inspect.getfile(_pu)).resolve().parent

# Search order: run directory first, then next to pipeline_utils, then its
# stage04 sub-folder. The first existing file wins.
yaml_src = None
for _candidate in (
    RUN_DIR / "ratio_domain_stage_map.yaml",
    repo_root / "ratio_domain_stage_map.yaml",
    repo_root / "stage04" / "ratio_domain_stage_map.yaml",
):
    if _candidate.is_file():
        yaml_src = _candidate
        break
if yaml_src is None:
    raise FileNotFoundError("ratio_domain_stage_map.yaml not found")

# Keep a copy inside the run directory; an existing copy is never overwritten.
MAP_YAML = RUN_DIR / "ratio_domain_stage_map.yaml"
if not MAP_YAML.exists():
    shutil.copy2(yaml_src, MAP_YAML)

with MAP_YAML.open(encoding="utf-8") as fh:
    raw = yaml.safe_load(fh) or {}

# A scalar bucket entry is wrapped in a one-element list so every value
# can be handled uniformly downstream.
bucket_map: Dict[str, List[str]] = {}
for _ratio, _entry in raw.items():
    if isinstance(_entry, (list, tuple)):
        bucket_map[_ratio] = _entry
    else:
        bucket_map[_ratio] = [_entry]

# ═════════ 2 · LOAD STAGE-03 ═════════════════════════════════════
def _read_stage3() -> pd.DataFrame:
    """Load the Stage-03 table, preferring the parquet file over the CSV.

    The CSV is read only when the parquet sibling does not exist; in that
    case ``ReportDate`` is parsed as a date column.
    """
    if not ST03_PQ.exists():
        return pd.read_csv(ST03_CSV, parse_dates=["ReportDate"], low_memory=False)
    return pd.read_parquet(ST03_PQ)

df = _read_stage3()
# Keep only rows whose Symbol is exactly three upper-case letters.
# `.copy()` detaches the filtered rows from the original frame so the column
# assignment below is a plain write rather than a write onto a slice
# (which triggers SettingWithCopyWarning / chained-assignment behaviour).
df = df[df["Symbol"].astype(str).str.fullmatch(r"[A-Z]{3}")].copy()
# Nullable Int16 so rows with an unparseable/missing ReportDate keep <NA>.
df["Year"] = pd.to_datetime(df["ReportDate"]).dt.year.astype("Int16")

# ═════════ 3 · LOOP OVER FLAVOURS ════════════════════════════════
# For every ranking flavour that exists on disk:
#   1. aggregate the ranking rows into one stats row per mapped ratio,
#   2. pick the best ratio per bucket (non-weak ratios preferred),
#   3. add a year-wise quintile column to `df` for each winning ratio.
FLAVOURS = {
    "Tmp": RUN_DIR / "stage04"  / f"Stage4A_winsor_RatioRanking_{SWAN}.csv",
    "Imp": RUN_DIR / "stage04b" / f"Stage4B_winsor_RatioRanking_{SWAN}.csv",
    "Dyn": RUN_DIR / "stage04c" / f"Stage4C_winsor_RatioRanking_{SWAN}.csv",
}

# Ranking-table columns that feed the composite score.
_METRIC_COLS = ["|rho|", "AUROC", "PseudoR2"]


def _q5(s: pd.Series) -> pd.Series:
    """Return quintile labels 1..5 for one group of values.

    Values are ranked with ``method="first"`` so ties never collapse bin
    edges. Missing inputs stay ``<NA>``; a group with fewer than five
    distinct non-null values is returned entirely ``<NA>``. The result is
    always nullable ``Int8`` and aligned to ``s.index``.
    """
    unlabelled = pd.Series(pd.NA, index=s.index, dtype="Int8")
    v = s.dropna()
    if v.nunique() < 5:
        return unlabelled
    try:
        # labels=False yields integer bin codes 0..4 directly
        codes = pd.qcut(v.rank(method="first"), 5, labels=False, duplicates="drop")
    except ValueError:
        return unlabelled
    return (codes + 1).astype("Int8").reindex(s.index)


qcols_all: List[str] = []
for tag, fpath in FLAVOURS.items():
    if not fpath.exists():
        log.warning("⏭️  %s ranking missing – flavour skipped", tag)
        continue
    rk = pd.read_csv(fpath)

    # Percentile ranks must be taken across the WHOLE ranking table: ranking
    # inside a single ratio's own rows averages to ~(n+1)/(2n) for every
    # ratio, which makes the composite score unable to discriminate.
    use_composite = SEL_STAT.lower() == "composite"
    pct = rk[_METRIC_COLS].rank(pct=True) if use_composite else None

    # ── build per-ratio stats inside this flavour ───────────────
    rows = []
    for ratio, buckets in bucket_map.items():
        sub = rk[rk["Ratio"] == ratio]
        if sub.empty:
            continue
        row = dict(Ratio=ratio,
                   Buckets=buckets,
                   AvgRho=sub["|rho|"].mean(),
                   AvgAUROC=sub["AUROC"].mean(),
                   AvgPR2=sub["PseudoR2"].mean(),
                   Coverage=sub["Coverage%"].max())
        # Weak only when BOTH discrimination metrics fall below their floors.
        row["Weak"] = (row["AvgAUROC"] < MIN_AUROC) and (row["AvgPR2"] < MIN_PR2)
        if use_composite:
            w_rho, w_auc, w_pr2 = COMP_WTS
            sub_pct = pct.loc[sub.index]
            row["Composite"] = (w_rho * sub_pct["|rho|"].mean() +
                                w_auc * sub_pct["AUROC"].mean() +
                                w_pr2 * sub_pct["PseudoR2"].mean())
        rows.append(row)
    stats = pd.DataFrame(rows)
    if stats.empty:
        log.warning("No ratios for flavour %s", tag)
        continue

    # ── pick winner per bucket (strong→weak fallback) ────────────
    # Exact column name first (original behaviour), then a case-insensitive
    # match so SELECT_STAT=composite actually sorts by "Composite"; anything
    # unrecognised falls back to AvgRho.
    if SEL_STAT in stats.columns:
        sel_col = SEL_STAT
    else:
        by_lower = {str(c).lower(): c for c in stats.columns}
        sel_col = by_lower.get(SEL_STAT.lower(), "AvgRho")

    def _pick(g):
        """Best row of one bucket: highest sel_col, ties broken by Coverage."""
        strong = g[~g["Weak"]]
        tgt = strong if not strong.empty else g
        return tgt.sort_values([sel_col, "Coverage"], ascending=[False, False]).iloc[0]

    exploded = (stats.explode("Buckets", ignore_index=True)
                     .rename(columns={"Buckets": "Bucket"}))
    # Iterate the groups explicitly instead of groupby.apply: whether apply
    # hands the grouping column to the function depends on the pandas version,
    # and the winners table needs "Bucket" in every row.
    picks = [_pick(g) for _, g in exploded.groupby("Bucket")]
    if not picks:
        log.warning("No buckets for flavour %s", tag)
        continue
    win = pd.DataFrame(picks).reset_index(drop=True)

    win.to_csv(OUT / f"{tag}_Stage5A_BestRatioPerBucket_{SWAN}.csv", index=False)

    # ── derive year-wise quintiles for winners ──────────────────
    for _, r in win.iterrows():
        b, ratio = r["Bucket"], r["Ratio"]
        qcol = f"{tag}_{b}_Q"
        qcols_all.append(qcol)
        if ratio in df.columns:
            # transform applies _q5 to each Year group and re-aligns the
            # labels to df's index (passing the GroupBy object itself to
            # _q5 fails: it is not a Series).
            df[qcol] = df.groupby("Year")[ratio].transform(_q5)
            log.info("%s · quintiled %-34s → %s", tag, ratio, qcol)
        else:
            df[qcol] = np.nan
            log.warning("%s · ratio %s missing from Stage-03 data – %s left empty",
                        tag, ratio, qcol)

# ═════════ 4 · WRITE OUTPUTS ════════════════════════════════════
# Persist the Stage-03 frame with all quintile columns appended.
out_csv = OUT / f"Stage5A_QuintilesAndScores_{SWAN}.csv"
df.to_csv(out_csv, index=False)
log.info("Saved combined quintiles → %s  (cols=%d)", out_csv.name, len(qcols_all))

# optional coverage heat-map per flavour
for tag in FLAVOURS:
    _prefix = f"{tag}_"
    cols = [name for name in qcols_all if name.startswith(_prefix)]
    if not cols:
        continue
    # Per year: percentage of rows that carry a quintile label, then
    # transposed so quintile columns become heat-map rows and years columns.
    cov = df.groupby("Year")[cols].apply(lambda x: x.notna().mean()*100).T
    fig, ax = plt.subplots(figsize=(0.42 * len(cov) + 2, 6))
    sns.heatmap(
        cov,
        cmap="YlGnBu",
        vmin=0,
        vmax=100,
        cbar_kws={"label": "% with quintile"},
        ax=ax,
    )
    ax.set_title(f"{tag} quintile coverage — SWAN {SWAN}")
    fig.tight_layout()
    fig.savefig(OUT / f"{tag}_Stage5A_Coverage_{SWAN}.png")
    plt.close(fig)

log.info("✅ STAGE-05A complete — artefacts in %s", OUT)



FileNotFoundError: No run directories under C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008