# In [None]:
#!/usr/bin/env python
"""
STAGE-04C · DYNAMIC-RESILIENCE RANKINGS   v3.1 – 2025-06-20
Ranks candidate financial ratios against Dynamic resilience metrics
(RateDown/Up, Asymmetry, Convexity).  “Stability” was dropped.
"""

from __future__ import annotations
import inspect, logging, os, warnings, yaml
from pathlib  import Path
from typing   import Dict, List, Literal

import numpy as np
import pandas as pd
import statsmodels.api as sm
from   scipy.stats   import spearmanr
from   numpy.linalg  import LinAlgError
from   statsmodels.tools.sm_exceptions import MissingDataError

from pipeline_utils import (
    load_cfg, resolve_run_dir, ensure_three_letter_tickers as _flt3
)

# ═════════════════ 1 · CONFIG ═══════════════════════════════════
CFG        = load_cfg()
# Keys are coerced to str so YR is always a string, whether it comes from the
# environment or from the config.  `or {}` tolerates an explicit null section.
EVENTS     = {str(k): v for k, v in (CFG.get("events") or {}).items()}
# Stage-4C settings; falls back to the "stage4" section when "stage4c" is absent.
C4         = CFG.get("stage4c", CFG.get("stage4", {}))
DEF        = CFG["defaults"]

# SWAN_YEAR takes precedence; otherwise use the first configured event key.
YR         = os.getenv("SWAN_YEAR") or next(iter(EVENTS), None)
if YR is None:
    # Previously surfaced as a bare StopIteration with no hint at the cause.
    raise RuntimeError(
        "No event year available: set SWAN_YEAR or define 'events' in the config"
    )
YR_I       = int(YR)

# On-disk format of pipeline artefacts; the environment overrides the config.
SAVE_FMT: Literal["csv", "parquet"] = os.getenv(
    "SAVE_FORMAT", DEF.get("SAVE_FORMAT", "csv")
).lower()

DATE_COL   = C4.get("date_col", "ReportDate")
# Winsorisation tail fraction (0.01 clips at the 1st and 99th percentiles).
WIN_PCT    = float(C4.get("winsor_pct", 0.01))
# Minimum share of non-null rows, in percent, for a ratio to be ranked.
MIN_COV    = float(os.getenv("MIN_COVERAGE", C4.get("min_coverage", 30)))

# ═════════════════ 2 · RUN DIR & STAGE-03 FILE ══════════════════
# Resolve the run directory for this event year.
# NOTE(review): `must_have` presumably restricts the search to runs that already
# contain the Stage-03 output — confirm against pipeline_utils.resolve_run_dir.
RUN_DIR = resolve_run_dir(
    swan_year=YR,
    run_tag=os.getenv("RUN_TAG"),
    must_have=f"stage03/Stage3_Data_WithRatios_{YR}.{SAVE_FMT}",
)

# Stage-03 table read by this stage.
STAGE3_F = RUN_DIR / "stage03" / f"Stage3_Data_WithRatios_{YR}.{SAVE_FMT}"

# All Stage-04C artefacts (CSVs and the log file) are written here.
OUT_DIR = RUN_DIR / "stage04c"
OUT_DIR.mkdir(exist_ok=True)

# ═════════════════ 3 · BUCKET / STAGE / DOMAIN MAPS ═════════════
import pipeline_utils as _pu

# Directory that holds pipeline_utils; searched as a fallback for the map file.
repo_root = Path(inspect.getfile(_pu)).resolve().parent

# Candidate locations in priority order; the first existing file wins, so a
# copy placed in the run directory overrides the repository ones.
search_yaml = [
    RUN_DIR  / "ratio_domain_stage_map.yaml",
    repo_root / "ratio_domain_stage_map.yaml",
    repo_root / "stage04" / "ratio_domain_stage_map.yaml",
]
yaml_src = next((p for p in search_yaml if p.is_file()), None)
if yaml_src is None:
    raise FileNotFoundError(
        "ratio_domain_stage_map.yaml not found in:\n  " +
        "\n  ".join(map(str, search_yaml))
    )

with yaml_src.open("r", encoding="utf-8", errors="replace") as fh:
    bucket_raw = yaml.safe_load(fh) or {}
if not isinstance(bucket_raw, dict):
    # A list or scalar at the YAML root would otherwise fail on `.items()`.
    raise TypeError(
        f"{yaml_src} must contain a mapping of ratio -> bucket(s), "
        f"got {type(bucket_raw).__name__}"
    )


def _as_bucket_list(value: object) -> List[str]:
    """Normalise one YAML value to a list of bucket-label strings."""
    if value is None:                      # key present but left empty
        return []
    if isinstance(value, (list, tuple, set, dict)):
        return [str(b) for b in value]
    return [str(value)]                    # single bucket given as a scalar


# ratio name -> bucket labels.  Labels are always str after normalisation, so
# the prefix/suffix tests below are safe for both maps.
bucket_map: Dict[str, List[str]] = {
    r: _as_bucket_list(v) for r, v in bucket_raw.items()
}

STAGE_NAMES  = ["Prepare", "Absorb", "Recover", "Adapt"]
DOMAIN_NAMES = ["Phys", "Info", "Cog", "Soc"]

# stage -> ratios having at least one bucket label ending in "-<stage>"
STAGE_MAP  = {s: [r for r, bs in bucket_map.items()
                  if any(b.endswith(f"-{s}") for b in bs)]
              for s in STAGE_NAMES}
# domain -> ratios having at least one bucket label starting with "<domain>"
DOMAIN_MAP = {d: [r for r, bs in bucket_map.items()
                  if any(b.startswith(d) for b in bs)]
              for d in DOMAIN_NAMES}

# ═════════════════ 4 · LOGGER ═══════════════════════════════════
# Send every record both to a per-run log file (truncated on open) and to the
# default StreamHandler stream.
_file_handler    = logging.FileHandler(
    OUT_DIR / "stage04C.log", mode="w", encoding="utf-8"
)
_console_handler = logging.StreamHandler()

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[_file_handler, _console_handler],
)
log = logging.getLogger(__name__)

# Suppresses every RuntimeWarning for the rest of the process.
warnings.filterwarnings("ignore", category=RuntimeWarning)

log.info("==========  STAGE-04C (Dynamic) — SWAN %s ==========", YR)
log.info("Loading Stage-03 file: %s", STAGE3_F.name)

# ═════════════════ 5 · LOAD & PREP ══════════════════════════════
df3  = (pd.read_parquet(STAGE3_F) if SAVE_FMT == "parquet"
        else pd.read_csv(STAGE3_F, parse_dates=[DATE_COL], low_memory=False))
df3  = _flt3(df3, "Symbol")

# read_parquet does no date parsing, and read_csv hands the column back as
# object dtype when a value cannot be parsed.  Coerce here so the `.dt`
# accessor below works for both formats; unparseable values become NaT and
# therefore drop out of the pre-event filter.
if not pd.api.types.is_datetime64_any_dtype(df3[DATE_COL]):
    _parsed_dates = pd.to_datetime(df3[DATE_COL], errors="coerce")
    _n_bad = int(df3[DATE_COL].notna().sum() - _parsed_dates.notna().sum())
    if _n_bad:
        log.warning("%d unparseable %s value(s) set to NaT", _n_bad, DATE_COL)
    # assign() returns a new frame, so this is safe even if _flt3 gave a view.
    df3 = df3.assign(**{DATE_COL: _parsed_dates})

# Only rows reported strictly before the event year feed the rankings.
pre  = df3[df3[DATE_COL].dt.year < YR_I].copy()
if pre.empty:
    raise RuntimeError("No pre-event rows found")

# A ratio qualifies when both "<name>_raw" and "<name>" columns are present.
# The isinstance guard skips non-string column labels instead of crashing.
_pre_cols   = set(pre.columns)
ratio_names = sorted(
    c[:-4] for c in _pre_cols
    if isinstance(c, str) and c.endswith("_raw") and c[:-4] in _pre_cols
)
raw_cols, win_cols = [f"{r}_raw" for r in ratio_names], ratio_names

def _winsor(s: pd.Series, p: float = WIN_PCT) -> pd.Series:
    """Clip *s* to its [p, 1 - p] percentile band.

    The series is returned untouched when it has fewer than three non-null
    values, or when the two percentile bounds are not strictly ordered
    (e.g. a constant series).  *p* is a fraction, not a percentage.
    """
    if s.count() < 3:
        return s

    lower, upper = np.nanpercentile(s.dropna(), [p * 100, (1 - p) * 100])
    # Written as `not (a < b)` so NaN bounds also take the no-op path.
    if not (lower < upper):
        return s
    return s.clip(lower, upper)

# Winsorise each un-suffixed ratio column independently; every other column
# of the pre-event frame is carried over unchanged.
_winsorised = pre[win_cols].apply(_winsor)
wins = pre.copy()
wins[win_cols] = _winsorised

# ═════════════════ 6 · METRICS & HELPERS ════════════════════════
# Base financial metrics.  Each is combined with every entry of SUFFIXES to
# form a target column name of the form "<suffix>_<metric>".
METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]

# Dynamic-resilience measures (stability removed).
SUFFIXES = [
    "RateDown",
    "RateUp",
    "Asymmetry",
    "Convexity",
]

def _abs_rho(x: pd.Series, y: pd.Series, min_obs: int = 100) -> float:
    """Return |Spearman rho| between *x* and *y* over pairwise-complete rows.

    Parameters
    ----------
    x, y : pd.Series
        Series to correlate; rows where either value is missing are ignored.
    min_obs : int, default 100
        Minimum number of complete pairs required; below this NaN is returned.

    Returns
    -------
    float
        Absolute rank correlation, or NaN when there are too few complete
        pairs or either side is constant (the coefficient is undefined).
    """
    ok = x.notna() & y.notna()
    if ok.sum() < min_obs:
        return float("nan")

    xs, ys = x[ok], y[ok]
    # spearmanr yields NaN (and emits a warning) for constant input; return
    # the same NaN directly and keep the log free of per-pair warnings.
    if xs.nunique() < 2 or ys.nunique() < 2:
        return float("nan")

    return float(abs(spearmanr(xs, ys).correlation))

# ═════════════════ 7 · DIAGNOSTICS ══════════════════════════════
def _diag(df: pd.DataFrame, cols: List[str], tag: str) -> None:
    """Write per-calendar-year summary statistics of *cols* to a CSV.

    For every year of ``DATE_COL`` the mean, std, median and count of each
    column are computed.  The statistic level is then moved into the rows, so
    the file has one row per (year, statistic) and one column per ratio.
    Nothing is written when *cols* is empty.  *tag* only affects the output
    file name.
    """
    if not cols:
        return

    yearly     = df.assign(Y=df[DATE_COL].dt.year).groupby("Y")[cols]
    summary    = yearly.agg(['mean', 'std', 'median', 'count'])
    long_table = summary.stack(level=1).reset_index()

    target = OUT_DIR / f"Stage4C_{tag}_RatioDiagnostics_{YR}.csv"
    long_table.to_csv(target, index=False)

# ═════════════════ 8 · RANKER ═══════════════════════════════════
def _rank(tag: str, cols: List[str], base: pd.DataFrame) -> None:
    """Rank ratio columns by |Spearman rho| against every dynamic target.

    Parameters
    ----------
    tag : str
        Variant label used in the output file names (e.g. "raw", "winsor").
    cols : List[str]
        Ratio columns of *base* to evaluate.  Columns whose non-null share is
        below ``MIN_COV`` percent are skipped.
    base : pd.DataFrame
        Frame holding the ratio columns and the "<suffix>_<metric>" targets;
        targets missing from the frame are ignored.

    Writes to ``OUT_DIR``
    ---------------------
    * ``Stage4C_<tag>_RatioRanking_<YR>.csv``  – every (ratio, target) pair
    * ``Stage4C_<tag>_{Bucket,Stage,Domain}Top3_<YR>.csv`` – top three ratios
      per group and target (a file is skipped when it would be empty)
    * ``Stage4C_<tag>_OverallTop3_<YR>.csv``   – top three ratios per target
    """
    out_cols = ["Ratio", "Metric", "CoveragePct", "|rho|"]

    # Resolve the available targets once instead of once per ratio column.
    targets = [f"{suf}_{m}" for m in METRICS for suf in SUFFIXES
               if f"{suf}_{m}" in base]

    rows: List[Dict] = []
    for col in cols:
        cov = base[col].notna().mean() * 100
        if cov < MIN_COV:
            continue
        # Strip only a trailing "_raw"; str.replace would also remove an
        # occurrence in the middle of a ratio name.
        ratio = col[:-4] if col.endswith("_raw") else col
        for target in targets:
            rho = _abs_rho(base[col], base[target])
            rows.append({
                "Ratio":       ratio,
                "Metric":      target,
                "CoveragePct": round(cov, 1),
                "|rho|":       round(rho, 3),
            })

    # Explicit columns keep the header (and column lookups) valid when empty.
    rk = pd.DataFrame(rows, columns=out_cols)
    rk.to_csv(OUT_DIR / f"Stage4C_{tag}_RatioRanking_{YR}.csv", index=False)

    if rk.empty:
        # Previously crashed with KeyError / "No objects to concatenate".
        log.warning(
            "No rankable ratio/target pairs for '%s' (coverage threshold "
            "%.1f%%); top-3 tables skipped", tag, MIN_COV
        )
        return

    metrics = rk["Metric"].unique()

    # ---- helper: top-3 tables ---------------------------------
    def _top3(d: pd.DataFrame) -> pd.DataFrame:
        return d.nlargest(3, "|rho|")

    def _leader(mapping: Dict[str, List[str]], stem: str) -> None:
        """Write the top-3 ratios per (group, target); *mapping* is group -> ratios."""
        frames: List[pd.DataFrame] = []
        for group, members in mapping.items():
            sub = rk[rk["Ratio"].isin(members)]
            for t in metrics:
                top = _top3(sub[sub["Metric"] == t])
                if not top.empty:
                    frames.append(top.assign(**{stem: group}))
        if frames:
            pd.concat(frames, ignore_index=True).to_csv(
                OUT_DIR / f"Stage4C_{tag}_{stem}Top3_{YR}.csv", index=False
            )

    # bucket_map is ratio -> buckets, whereas _leader needs group -> ratios
    # (the shape STAGE_MAP / DOMAIN_MAP already have).  Passing it directly
    # compared ratio names against bucket labels and never matched anything.
    ratios_by_bucket: Dict[str, List[str]] = {}
    for ratio_name, buckets in bucket_map.items():
        for bucket in buckets or []:
            ratios_by_bucket.setdefault(str(bucket), []).append(ratio_name)

    _leader(ratios_by_bucket, "Bucket")
    _leader(STAGE_MAP , "Stage")
    _leader(DOMAIN_MAP, "Domain")

    pd.concat([_top3(rk[rk["Metric"] == t]) for t in metrics]) \
      .to_csv(OUT_DIR / f"Stage4C_{tag}_OverallTop3_{YR}.csv", index=False)

# ═════════════════ 9 · RUN ══════════════════════════════════════
# (tag, ratio columns, frame) for the two variants that are analysed.
_VARIANTS = (
    ("raw",    raw_cols, pre),
    ("winsor", win_cols, wins),
)

# Diagnostics for both variants first, then the rankings, in that order.
for _tag, _cols, _frame in _VARIANTS:
    _diag(_frame, _cols, _tag)

for _tag, _cols, _frame in _VARIANTS:
    _rank(_tag, _cols, _frame)

log.info("✅  STAGE-04C complete — artefacts in %s", OUT_DIR)

