In [3]:
#!/usr/bin/env python
"""
Stage 05B · Snapshot Logit Regressions
=====================================

Replaces the legacy “Stage 5C” and “Stage 5D”.

Given the quintile / score table produced by Stage 05A, run two **pre-Swan
(FY-SWAN-1)** logit models for every resilience metric:

A. Domain scores   (Physical / Information / Cognitive / Social)  
B. Stage  scores   (Prepare / Absorb / Recover / Adapt)

Inputs  (auto-discovered if RUN_DIR / RUN_DATE not provided)
-----------------------------------------------------------
<run>/stage05a/Stage5A_QuintilesAndScores.csv

Outputs
-------
<run>/stage05b/
   ├─ Stage5C_<Metric>_Coefficients.csv      (domain scores)
   ├─ Stage5D_<Metric>_Coefficients.csv      (stage  scores)
   └─ stage05b.log
"""

from __future__ import annotations
from pathlib import Path
import os, sys, logging, yaml, warnings
from typing import Dict, List

import numpy as np
import pandas as pd
import statsmodels.api as sm

# ╔══════════════════════════════════════════════════════════════════╗
# 0 · PIPELINE CONFIG & PATH DISCOVERY                               #
# ╚══════════════════════════════════════════════════════════════════╝
# Pipeline configuration and run-directory discovery.
#
# Environment variables read here:
#   PIPELINE_CFG – path to the YAML config (default: ./pipeline_config.yaml)
#   SWAN_YEAR    – event year; defaults to the first key of the config's
#                  `events` mapping
#   RUN_DIR      – explicit run directory (takes precedence over RUN_DATE)
#   RUN_DATE     – run tag resolved as <OUTPUT_ROOT>/event=<SWAN_YEAR>/<RUN_DATE>
#
# Names defined for the rest of the script: CFG_FILE, CFG, DEFAULTS, EVENTS,
# SWAN_YEAR, OUTPUT_ROOT, EVENT_DIR, RUN_DIR, QS_FILE, RUN_DATE, STAGE5B_DIR.
CFG_FILE = Path(os.getenv("PIPELINE_CFG", "pipeline_config.yaml")).expanduser()
if not CFG_FILE.is_file():
    raise FileNotFoundError(f"pipeline_config.yaml not found at {CFG_FILE}")

# `or {}` covers an empty YAML document as well as sections that are present
# but null (e.g. a bare `events:` line), which would otherwise fail on
# `.items()` / `[...]` with an unrelated AttributeError/TypeError.
CFG: Dict = yaml.safe_load(CFG_FILE.read_text()) or {}
DEFAULTS  = CFG.get("defaults") or {}
EVENTS    = {str(k): v for k, v in (CFG.get("events") or {}).items()}

# Resolve the event year. The config-derived default is computed lazily so an
# empty `events` section yields a clear KeyError instead of a bare
# StopIteration raised while building the getenv() default.
_swan_env = os.getenv("SWAN_YEAR")
if _swan_env is None:
    if not EVENTS:
        raise KeyError(
            "SWAN_YEAR is not set and pipeline_config defines no events to default to"
        )
    _swan_env = next(iter(EVENTS))
SWAN_YEAR = int(_swan_env)
if str(SWAN_YEAR) not in EVENTS:
    raise KeyError(
        f"SWAN_YEAR={SWAN_YEAR} not present in pipeline_config events: {sorted(EVENTS)}"
    )

if "OUTPUT_ROOT" not in DEFAULTS:
    raise KeyError("OUTPUT_ROOT missing from pipeline_config 'defaults' section")
OUTPUT_ROOT = Path(DEFAULTS["OUTPUT_ROOT"]).expanduser()
EVENT_DIR   = OUTPUT_ROOT / f"event={SWAN_YEAR}"

# explicit override: RUN_DIR wins over RUN_DATE
RUN_DIR: Path | None = None
_run_dir_env  = os.getenv("RUN_DIR")
_run_date_env = os.getenv("RUN_DATE")
if _run_dir_env:
    RUN_DIR = Path(_run_dir_env).expanduser()
elif _run_date_env:
    RUN_DIR = EVENT_DIR / _run_date_env

# auto-detect latest Stage05A output if still None
if RUN_DIR is None:
    if not EVENT_DIR.is_dir():
        raise FileNotFoundError(f"No outputs for event={SWAN_YEAR} in {EVENT_DIR}")
    cand = list(EVENT_DIR.glob("*/stage05a/Stage5A_QuintilesAndScores.csv"))
    if not cand:
        raise FileNotFoundError(
            f"Stage5A_QuintilesAndScores.csv not found anywhere under {EVENT_DIR}. "
            "Run Stage 05A first."
        )
    # "latest" = most recently modified Stage 05A table
    QS_FILE = max(cand, key=lambda p: p.stat().st_mtime)
    RUN_DIR = QS_FILE.parents[1]          # …/<run_tag>/
else:
    QS_FILE = RUN_DIR / "stage05a" / "Stage5A_QuintilesAndScores.csv"
    if not QS_FILE.is_file():
        raise FileNotFoundError(f"{QS_FILE} not found – run Stage 05A first")

RUN_DATE    = RUN_DIR.name
STAGE5B_DIR = RUN_DIR / "stage05b"
STAGE5B_DIR.mkdir(parents=True, exist_ok=True)

# ╔══════════════════════════════════════════════════════════════════╗
# 1 · LOGGER                                                         #
# ╚══════════════════════════════════════════════════════════════════╝
# Route log records to both <run>/stage05b/stage05b.log (truncated on every
# run, mode="w") and stdout.
#
# force=True: basicConfig() silently does nothing when the root logger already
# has handlers – e.g. when an earlier stage configured logging in the same
# notebook kernel. Without it this stage's records would keep flowing to the
# previous handlers and stage05b.log would never be written.
logging.basicConfig(
    level   = logging.INFO,
    format  = "%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(STAGE5B_DIR / "stage05b.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
    force   = True,
)
logger = logging.getLogger(__name__)
logger.info("==========  STAGE 05B: SNAPSHOT LOGIT REGRESSIONS ==========")
logger.info("SWAN_YEAR=%s  RUN_DATE=%s  RUN_DIR=%s", SWAN_YEAR, RUN_DATE, RUN_DIR)
# Silence RuntimeWarning process-wide for the remainder of the run.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ╔══════════════════════════════════════════════════════════════════╗
# 2 · LOAD QUINTILES & SCORES                                        #
# ╚══════════════════════════════════════════════════════════════════╝
# Read the complete Stage 05A quintile/score table and report its size.
df = pd.read_csv(QS_FILE)
n_rows, n_cols = df.shape
logger.info("Loaded Stage05A table: %d rows × %d cols", n_rows, n_cols)

# The snapshot is selected by fiscal year, so the column is mandatory.
if "Year" not in df.columns:
    raise KeyError("'Year' column missing – Stage 05A should write it.")

# Keep only the fiscal year immediately preceding the event year.
PRE_YEAR = SWAN_YEAR - 1
is_pre_year = df["Year"].eq(PRE_YEAR)
snap = df.loc[is_pre_year].copy()
if snap.empty:
    raise ValueError(f"No FY-{PRE_YEAR} rows – check SWAN_YEAR & Stage 05A output.")
n_snap = len(snap)
logger.info("Snapshot sample (FY-%d): %d rows", PRE_YEAR, n_snap)

# ╔══════════════════════════════════════════════════════════════════╗
# 3 · MODEL HELPERS                                                  #
# ╚══════════════════════════════════════════════════════════════════╝
# Metrics to model; each one is matched to a `Flag_<metric>` column in the
# snapshot table when a model is fitted.
METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]

# Predictor sets: every score column is named `<Name>_Score`.
DOMAINS = [
    f"{domain}_Score"
    for domain in ("Physical", "Information", "Cognitive", "Social")
]
STAGES = [
    f"{stage}_Score"
    for stage in ("Prepare", "Absorb", "Recover", "Adapt")
]

def _fit_and_save(metric: str, predictors: List[str], tag: str, *,
                  min_obs: int = 50) -> None:
    """Fit one snapshot logit for *metric* and write its coefficient table.

    The dependent variable is the ``Flag_<metric>`` column of the module-level
    ``snap`` frame; the regressors are *predictors* plus an added constant.
    Rows with a missing value in any of those columns are dropped first.

    Parameters
    ----------
    metric
        Metric name; the dependent variable is ``Flag_<metric>``.
    predictors
        Score columns used as regressors (any sequence of column names).
    tag
        Output-file prefix, also shown in the printed header and log messages.
    min_obs
        Minimum number of complete rows required to attempt a fit.

    Side effects
    ------------
    Prints the statsmodels summary and writes
    ``STAGE5B_DIR/<tag>_<metric>_Coefficients.csv``. The model is skipped
    (with a log entry, no exception) when the flag column is absent, when
    fewer than *min_obs* complete rows remain, when the flag has fewer than
    two distinct values, or when the fit raises.
    """
    flag = f"Flag_{metric}"
    if flag not in snap.columns:
        logger.warning("%s: %s missing – skipped", metric, flag)
        return

    cols = list(predictors)               # accept tuples etc., not only lists
    sub = snap[[flag, *cols]].dropna()
    if len(sub) < min_obs or sub[flag].nunique() < 2:
        logger.warning("%s: insufficient data – skipped", metric)
        return

    y = sub[flag].astype(int)
    X = sm.add_constant(sub[cols])
    try:
        mdl = sm.Logit(y, X).fit(disp=False)
    except Exception as e:                # deliberate best-effort: one failed
        # fit must not abort the remaining metrics
        logger.error("Fit error %s (%s): %s", metric, tag, e)
        return

    # A non-converged optimiser still returns a results object; record that in
    # the log so the saved coefficients are not mistaken for a clean fit.
    retvals = getattr(mdl, "mle_retvals", None) or {}
    if not retvals.get("converged", True):
        logger.warning("%s (%s): optimizer did not converge – "
                       "coefficients may be unreliable", metric, tag)

    # notebook-friendly summary
    print(f"\n===== LOGIT ({tag}) – {metric}  (FY-{PRE_YEAR}) =====\n")
    print(mdl.summary())

    # Coefficient table from summary2(), with friendlier column names.
    coef = (mdl.summary2().tables[1]
              .rename_axis("Term").reset_index()
              .rename(columns={"Coef.":"Coefficient",
                               "Std.Err.":"StdErr",
                               "P>|z|":"PValue"}))
    coef["OddsRatio"] = np.exp(coef["Coefficient"])   # exp(log-odds coefficient)
    coef = coef[["Term","Coefficient","StdErr","z","PValue",
                 "[0.025","0.975]","OddsRatio"]]
    out_csv = STAGE5B_DIR / f"{tag}_{metric}_Coefficients.csv"
    coef.to_csv(out_csv, index=False)
    print(f"→ saved {out_csv.name}")

# ╔══════════════════════════════════════════════════════════════════╗
# 4 · DOMAIN-SCORE MODELS (legacy 5C)                                #
# ╚══════════════════════════════════════════════════════════════════╝
# Both model families follow the same recipe: verify that the score columns
# exist in the snapshot, announce the family, then fit one logit per metric.
# Domain scores (legacy 5C) run first, stage scores (legacy 5D) second; each
# family's column check happens immediately before its own fits.
for family, score_cols, file_tag, banner in (
    ("Domain", DOMAINS, "Stage5C", "Running domain-score logits …"),
    ("Stage",  STAGES,  "Stage5D", "Running stage-score logits …"),
):
    absent = [col for col in score_cols if col not in snap.columns]
    if absent:
        raise RuntimeError(f"{family} score columns missing: {absent}")

    logger.info(banner)
    for metric_name in METRICS:
        _fit_and_save(metric_name, score_cols, file_tag)

logger.info("✅  STAGE 05B complete – outputs in %s", STAGE5B_DIR)
print(f"\n✅ Stage 05B complete – coefficient CSVs saved to {STAGE5B_DIR}\n")

2025-06-10 13:09:22,528 | INFO    | SWAN_YEAR=2008  RUN_DATE=20250609  RUN_DIR=C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609
2025-06-10 13:09:24,796 | INFO    | Loaded Stage05A table: 34862 rows × 686 cols
2025-06-10 13:09:24,809 | INFO    | Snapshot sample (FY-2007): 974 rows
2025-06-10 13:09:24,812 | INFO    | Running domain-score logits …

===== LOGIT (Stage5C) – NetIncome  (FY-2007) =====

                           Logit Regression Results                           
Dep. Variable:         Flag_NetIncome   No. Observations:                  903
Model:                          Logit   Df Residuals:                      898
Method:                           MLE   Df Model:                            4
Date:                Tue, 10 Jun 2025   Pseudo R-squ.:                 0.04488
Time:                        13:09:24   Log-Likelihood:                -522.83
converged:                       True   LL-Null:                       -547.40
Covarianc