In [2]:
"""
STAGE 2 · RESILIENCE METRICS & BASELINE RATIOS
────────────────────────────────────────────────────────────────────────
Self-contained, works both as a stand-alone script and inside the same
Python session right after Stage 1.

Key features
• No manual parameters – everything comes from pipeline_config.yaml.
• Robust when __file__ is missing (e.g. Jupyter).
• Accepts either int or str keys under events: in the YAML.
• Re-uses `data_stage_1` if it exists in memory; otherwise loads the
  latest Stage 1 CSV.
• Produces  
  <OUTPUT_ROOT>/event=<SWAN_YEAR>/<RUN_DATE>/stage02/Stage2_Data_WithMetrics.csv
"""

from __future__ import annotations
from pathlib import Path
import os, sys, logging, yaml, io
from typing import Dict, List

import numpy as np
import pandas as pd

# ──────────────────────────────────────────────────────────────────────
# 0 · LOAD PIPELINE CONFIG  (handles missing __file__)
# ──────────────────────────────────────────────────────────────────────
if os.getenv("PIPELINE_CFG"):
    cfg_file = Path(os.getenv("PIPELINE_CFG")).expanduser()
else:
    try:                        # script run
        cfg_file = Path(__file__).resolve().parent / "pipeline_config.yaml"
    except NameError:           # Jupyter / interactive
        cfg_file = Path.cwd() / "pipeline_config.yaml"
cfg_file = cfg_file.expanduser()

if not cfg_file.is_file():
    raise FileNotFoundError(f"pipeline_config.yaml not found at {cfg_file}")

with cfg_file.open(encoding="utf-8") as fh:
    CFG: Dict = yaml.safe_load(fh) or {}

defaults: Dict = CFG.get("defaults", {})
events:   Dict = CFG.get("events",   {})
st2_cfg:  Dict = CFG.get("stage2",   {})

# helper: treat YAML keys as both str and int
event_keys_str = {str(k): v for k, v in events.items()}

# ──────────────────────────────────────────────────────────────────────
# 1 · CORE PARAMS
# ──────────────────────────────────────────────────────────────────────
SWAN_YEAR = os.getenv("SWAN_YEAR", next(iter(event_keys_str)))        # str
if SWAN_YEAR not in event_keys_str:
    raise KeyError(f"SWAN_YEAR={SWAN_YEAR} not present in YAML `events:` block")

MAX_YEARS = int(st2_cfg.get("max_years", 4))
DATE_COL  = st2_cfg.get("date_col", "ReportDate")
ID_COL    = st2_cfg.get("id_col",   "Symbol")

METRICS: List[str] = st2_cfg.get(
    "metrics",
    [
        "NetIncome", "EarningBeforeInterestAndTax", "OperatingIncome", "EBITDA",
        "OperatingCashFlow", "FreeCashFlow", "Cash", "CashAndCashEquivalents",
        "TotalRevenue", "GrossProfit",
    ],
)
METRIC_SIGN: Dict[str, bool] = {m: True for m in METRICS}   # True ⇒ higher better

# ──────────────────────────────────────────────────────────────────────
# 2 · PATHS & RUN_DATE
# ──────────────────────────────────────────────────────────────────────
OUTPUT_ROOT = Path(defaults["OUTPUT_ROOT"]).expanduser()
EVENT_DIR   = OUTPUT_ROOT / f"event={SWAN_YEAR}"

if os.getenv("RUN_DATE"):
    RUN_DATE = os.getenv("RUN_DATE")
elif "RUN_DATE" in globals():
    RUN_DATE = globals()["RUN_DATE"]
else:
    if not EVENT_DIR.is_dir():
        raise FileNotFoundError(f"No Stage 1 outputs at {EVENT_DIR}")
    candidates = sorted(p.name for p in EVENT_DIR.iterdir()
                        if p.is_dir() and p.name.isdigit())
    if not candidates:
        raise FileNotFoundError(f"No dated run folders in {EVENT_DIR}. Run Stage 1 first.")
    RUN_DATE = candidates[-1]

STAGE1_FILE = EVENT_DIR / RUN_DATE / "stage01" / "stage01_cleaned.csv"
OUT_DIR     = EVENT_DIR / RUN_DATE / "stage02"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ──────────────────────────────────────────────────────────────────────
# 3 · LOGGER
# ──────────────────────────────────────────────────────────────────────
if "logger" in globals() and isinstance(globals()["logger"], logging.Logger):
    logger = logging.getLogger(__name__)
    if not any(isinstance(h, logging.FileHandler) and h.baseFilename.endswith("stage02.log")
               for h in logger.handlers):
        logger.addHandler(logging.FileHandler(OUT_DIR / "stage02.log", mode="w", encoding="utf-8"))
else:
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)-7s | %(message)s",
        handlers=[
            logging.FileHandler(OUT_DIR / "stage02.log", mode="w", encoding="utf-8"),
            logging.StreamHandler(sys.stdout),
        ],
    )
    logger = logging.getLogger(__name__)

logger.info("==========  STAGE 2: RESILIENCE METRICS ==========")
logger.info("Config file      : %s", cfg_file)
logger.info("SWAN_YEAR=%s  RUN_DATE=%s  MAX_YEARS=%s", SWAN_YEAR, RUN_DATE, MAX_YEARS)
logger.info("DATE_COL / ID_COL = %s / %s", DATE_COL, ID_COL)

# ──────────────────────────────────────────────────────────────────────
# 4 · LOAD STAGE 1 DATA  (memory → disk fallback)
# ──────────────────────────────────────────────────────────────────────
if "data_stage_1" in globals():
    df = globals()["data_stage_1"].copy()
    logger.info("Stage 1 data reused from memory.")
else:
    if not STAGE1_FILE.is_file():
        raise FileNotFoundError(f"Stage 1 CSV not found at {STAGE1_FILE}")
    df = pd.read_csv(STAGE1_FILE, parse_dates=[DATE_COL], low_memory=False)
    logger.info("Stage 1 CSV loaded: %s rows", f"{len(df):,}")

df["Year"] = df[DATE_COL].dt.year.astype("Int16")

# ──────────────────────────────────────────────────────────────────────
# 5 · DERIVED COLUMNS
# ──────────────────────────────────────────────────────────────────────
if {"PretaxIncome", "EffectiveTaxRateAsReported"}.issubset(df.columns):
    df["IncomeTaxExpense"] = df["PretaxIncome"] * df["EffectiveTaxRateAsReported"]
    med = df.groupby(["SectorName", "Year"])["IncomeTaxExpense"].transform("median")
    df["IncomeTaxExpense"].fillna(med, inplace=True)
else:
    df["IncomeTaxExpense"] = np.nan
    logger.warning("IncomeTaxExpense derivation skipped (missing inputs)")

if {"TotalAssets", "TotalLiabilitiesAsReported"}.issubset(df.columns):
    df["NetAssets"] = df["TotalAssets"] - df["TotalLiabilitiesAsReported"]
elif {"TotalAssets", "TotalLiabilities"}.issubset(df.columns):
    df["NetAssets"] = df["TotalAssets"] - df["TotalLiabilities"]
else:
    df["NetAssets"] = np.nan
    logger.warning("NetAssets derivation skipped (missing inputs)")

# ──────────────────────────────────────────────────────────────────────
# 6 · HELPERS
# ──────────────────────────────────────────────────────────────────────
SWAN_YEAR_INT = int(SWAN_YEAR)           # for numeric comparison

def _last_pre_swan(series: pd.Series) -> float:
    pre = series.dropna()
    pre = pre[pre.index < SWAN_YEAR_INT]
    return pre.iloc[-1] if not pre.empty else np.nan

def _first_recovery(series: pd.Series, baseline: float, higher_ok: bool) -> float:
    if pd.isna(baseline):
        return np.nan
    cond = series >= baseline if higher_ok else series <= baseline
    cand = series[(series.index >= SWAN_YEAR_INT) & cond]
    return cand.index.min() if not cand.empty else np.nan

# ──────────────────────────────────────────────────────────────────────
# 7 · METRIC LOOP
# ──────────────────────────────────────────────────────────────────────
for metric in METRICS:
    if metric not in df.columns:
        logger.warning("⏭️ %-30s missing — skipped", metric)
        continue

    grp = df.set_index("Year").groupby(ID_COL)[metric]

    baseline = grp.apply(_last_pre_swan).rename("Baseline")
    rec_year = grp.apply(
        lambda s: _first_recovery(s, baseline.get(s.name), METRIC_SIGN[metric])
    ).rename("RecYear")

    rp = (rec_year - SWAN_YEAR_INT + 1).clip(lower=1, upper=MAX_YEARS) \
                                       .fillna(MAX_YEARS).astype("int16")
    score = ((rp - 1) / (MAX_YEARS - 1)).round(4)     # 0 best … 1 worst
    flag  = (rp < rp.median()).astype("int8")

    df = (
        df.merge(rp.rename(f"RP_{metric}"),       on=ID_COL, how="left")
          .merge(score.rename(f"Score_{metric}"), on=ID_COL, how="left")
          .merge(flag.rename(f"Flag_{metric}"),   on=ID_COL, how="left")
    )

    logger.info("%-30s baseline %.1f%% | recovery %.1f%%",
                metric, baseline.notna().mean()*100, rp.notna().mean()*100)

# ──────────────────────────────────────────────────────────────────────
# 8 · EXPORT
# ──────────────────────────────────────────────────────────────────────
out_csv = OUT_DIR / "Stage2_Data_WithMetrics.csv"
df.to_csv(out_csv, index=False)

buf = io.StringIO(); df.info(buf=buf)
logger.info("Final DataFrame info:\n%s", buf.getvalue())
logger.info("Saved Stage 2 CSV → %s", out_csv)

data_stage_2 = df.copy()
logger.info("✅ STAGE 2 complete — `data_stage_2` ready")

2025-06-10 12:45:03,695 | INFO    | Config file      : c:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\pipeline_config.yaml
2025-06-10 12:45:03,696 | INFO    | SWAN_YEAR=2008  RUN_DATE=20250609  MAX_YEARS=4
2025-06-10 12:45:03,697 | INFO    | DATE_COL / ID_COL = ReportDate / Symbol
2025-06-10 12:45:04,693 | INFO    | Stage 1 CSV loaded: 34,862 rows
2025-06-10 12:45:05,640 | INFO    | NetIncome                      baseline 48.9% | recovery 100.0%
2025-06-10 12:45:06,494 | INFO    | EarningBeforeInterestAndTax    baseline 48.9% | recovery 100.0%
2025-06-10 12:45:07,598 | INFO    | OperatingIncome                baseline 48.9% | recovery 100.0%
2025-06-10 12:45:08,824 | INFO    | EBITDA                         baseline 48.9% | recovery 100.0%
2025-06-10 12:45:09,819 | INFO    | OperatingCashFlow              baseline 48.9% | recovery 100.0%
2025-06-10 12:45:10,938 | INFO    | FreeCashFlow                   baseline 48.9% | recovery 100.0%
2025-06-10 12:45:12,034 | INFO    | Cash  