# In [None]:  (notebook cell marker left over from export)
#!/usr/bin/env python
"""
STAGE-05C · SNAPSHOT LOGIT REGRESSIONS  (flavour-separated)
v5.2 — 2025-07-04
────────────────────────────────────────────────────────────
For each resilience flavour:

    • Temporal →  flags  FlagTemporal_<Metric>
    • Impact   →  flags  FlagImpact_<Metric>
    • Dynamic  →  flags  FlagDynamic_<Metric>   (skip if missing)

…fits one-year (FY SWAN-1) logits using either
    ▸ domain predictors  (Physical_…, Information_…, …)   or
    ▸ stage  predictors  (Prepare_…, Absorb_…, …)

Inputs   :  Tmp_/Imp_/Dyn_05B_AllScores_<SWAN>.csv
Outputs  :  <flav>_Domain_<Metric>_<SWAN>_Coefficients.csv
            <flav>_Stage_<Metric>_<SWAN>_Coefficients.csv
            <flav>_…_<SWAN>_Summary.txt     (statsmodels .summary())
"""

from __future__ import annotations
import logging, os, warnings
from pathlib import Path
from typing  import List, Dict

import numpy as np
import pandas as pd
import statsmodels.api as sm

from pipeline_utils import load_cfg, resolve_run_dir

# Silence every RuntimeWarning for the rest of the run.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ═════════════ 0 · CONFIG ══════════════════════════════════════
# Analysis switches.
GOOD_HIGH = True   # flip 1↔5 so higher = better
MIN_ROWS = 80      # min obs per metric × flavour

# Pipeline configuration; event keys are normalised to strings.
CFG = load_cfg()
EVENTS = {str(year): payload for year, payload in CFG["events"].items()}

# SWAN year: the SWAN_YEAR environment variable wins when set and non-empty,
# otherwise the first key of EVENTS is used.
SWAN = os.getenv("SWAN_YEAR")
if not SWAN:
    SWAN = next(iter(EVENTS))
SWAN_I = int(SWAN)
PRE_YEAR = SWAN_I - 1   # the fiscal year immediately before the SWAN year

# NOTE(review): `must_have` presumably makes the resolver pick a run that
# already holds this stage-05B file — confirm against pipeline_utils.
RUN_DIR = resolve_run_dir(
    swan_year=SWAN,
    run_tag=os.getenv("RUN_TAG"),
    must_have=f"stage05b/Tmp_05B_AllScores_{SWAN}.csv",  # any one file
)
ST05B = RUN_DIR / "stage05b"
OUT_DIR = RUN_DIR / "stage05c"
OUT_DIR.mkdir(exist_ok=True)

# Log to both a fresh UTF-8 file inside OUT_DIR and the console.
logging.basicConfig(
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    level=logging.INFO,
    handlers=[
        logging.FileHandler(OUT_DIR / "stage05c.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger(__name__)
log.info("==========  STAGE-05C  (SWAN %s) ==========", SWAN)

# ═════════════ 1 · CONSTANTS ═══════════════════════════════════
# Financial metrics; each one has a flag column per flavour (prefix + metric).
METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]

# Predictor base names; the flavour tag is prepended at fit time.
DOMAIN_BASE = [
    "Physical_Score",
    "Information_Score",
    "Cognitive_Score",
    "Social_Score",
]
STAGE_BASE = [
    "Prepare_Score",
    "Absorb_Score",
    "Recover_Score",
    "Adapt_Score",
]

# tag → {flag-prefix, score-file}; insertion order is the processing order.
FLAV: Dict[str, Dict] = {}
FLAV["Tmp"] = {
    "prefix": "FlagTemporal_",
    "file": ST05B / f"Tmp_05B_AllScores_{SWAN}.csv",
}
FLAV["Imp"] = {
    "prefix": "FlagImpact_",
    "file": ST05B / f"Imp_05B_AllScores_{SWAN}.csv",
}
FLAV["Dyn"] = {
    "prefix": "FlagDynamic_",
    "file": ST05B / f"Dyn_05B_AllScores_{SWAN}.csv",
}

# Column names used in the score files.
ID_COL = "Symbol"
YEAR_COL = "Year"

# ═════════════ 2 · HELPER ─ FIT ONE MODEL ══════════════════════
def _fit(df: pd.DataFrame, metric: str,
         preds: List[str], flav_tag: str, model_tag: str,
         flag_col: str) -> None:
    """fit logit & write CSV/TXT"""
    if flag_col not in df.columns:
        return
    if any(p not in df.columns for p in preds):
        return

    sub = df[[flag_col] + preds].dropna()
    if len(sub) < MIN_ROWS or sub[flag_col].nunique() < 2:
        log.info("skip %-3s %-7s (%s) – n=%d",
                 flav_tag, metric, model_tag, len(sub))
        return

    X = sub[preds].apply(lambda s: (s - s.mean()) / (s.std(ddof=0)+1e-9))
    X = sm.add_constant(X, prepend=True)
    y = sub[flag_col].astype(int)

    try:
        mdl = sm.Logit(y, X).fit(disp=False)
    except Exception as exc:
        log.warning("fit failed  %-3s %-7s (%s) – %s",
                    flav_tag, metric, model_tag, exc)
        return

    stem = f"{flav_tag}_{model_tag}_{metric}_{SWAN}"

    coef = (mdl.summary2().tables[1]
              .rename_axis("Term").reset_index()
              .rename(columns={"Coef.":"Coefficient",
                               "Std.Err.":"StdErr",
                               "P>|z|":"PValue"}))
    coef["OddsRatio"] = np.exp(coef["Coefficient"])
    coef.round(6).to_csv(OUT_DIR / f"{stem}_Coefficients.csv", index=False)

    (OUT_DIR / f"{stem}_Summary.txt").write_text(mdl.summary().as_text())

    log.info("✓ %-3s %-7s (%s)  obs=%d", flav_tag, metric, model_tag, len(sub))

# ═════════════ 3 · MAIN LOOP ═══════════════════════════════════
# Each flavour is processed independently, in FLAV insertion order.
for tag, info in FLAV.items():
    scores_csv = info["file"]
    if not scores_csv.is_file():
        log.warning("⏭️  %s file missing – flavour skipped", tag)
        continue

    # keep only the rows of the year preceding the SWAN year
    df = pd.read_csv(scores_csv).query(f"{YEAR_COL} == @PRE_YEAR").copy()
    if df.empty:
        log.warning("⏭️  %s no rows for FY-%d", tag, PRE_YEAR)
        continue

    # flip 1–5 so higher = better (applies to every *_Score / *_Q column)
    if GOOD_HIGH:
        score_cols = df.filter(regex=r"(_Score$|_Q$)").columns
        for col in score_cols:
            df[col] = 6 - df[col]

    # predictor sets for this flavour: (model label, flavour-prefixed columns)
    model_specs = (
        ("Domain", [f"{tag}_{base}" for base in DOMAIN_BASE]),
        ("Stage", [f"{tag}_{base}" for base in STAGE_BASE]),
    )

    # one Domain fit, then one Stage fit, per metric
    for metric_name in METRICS:
        flag_name = f"{info['prefix']}{metric_name}"
        for label, columns in model_specs:
            _fit(df, metric_name, columns, tag, label, flag_name)

log.info("✅  STAGE-05C complete — artefacts in %s", OUT_DIR)

