# In [None]:
#!/usr/bin/env python
"""
STAGE 02 · RESILIENCE METRICS
────────────────────────────
Adds recovery‑speed and draw‑down resilience outcomes to the cleaned Stage‑01
financials and saves a wide CSV for Stage‑03.

Outputs
=======
<OUTPUT_ROOT>/event=<SWAN_YEAR>/<RUN_TAG>/stage02/
    └─ Stage2_Data_WithMetrics_<SWAN_YEAR>.csv

Outcome families created per *metric* (EBITDA, OpCF, …):
    • RP_<metric>             – integer years to recovery (1 … MAX_YEARS);
                                no recovery / no baseline ⇒ MAX_YEARS
    • Score_<metric>          – scaled 0 (best) → 1 (worst) recovery‑speed
    • Flag_<metric>           – 1 if faster than median, else 0
    • DD_<metric>             – draw‑down depth as a fraction of the baseline
                                (0 = no draw‑down; not capped at 1)
    • ScoreDepth_<metric>     – min‑max normalised draw‑down, rounded to 4 d.p.
    • FlagDepth_<metric>      – 1 if draw‑down ≤ median, else 0

Down‑stream notebooks treat any column starting with Score*/Flag* as a resilience
outcome, so no further code changes outside this stage are required.
"""
from __future__ import annotations
import os, sys, io, logging
from pathlib import Path
from typing import Dict, List

import pandas as pd
import numpy as np

from pipeline_utils import load_cfg, resolve_run_dir

# ──────────────────────────────────────────────────────────
# 1 · CONFIG & PATHS
# ──────────────────────────────────────────────────────────
# Parsed pipeline configuration.  `or {}` also covers a key that is present
# in the YAML but holds None/empty, which a plain `.get(key, {})` lets through.
CFG      = load_cfg()                                       # pipeline_config.yaml
EVENTS   = {str(k): v for k, v in (CFG.get("events") or {}).items()}
ST2_CFG  = CFG.get("stage2") or {}                          # optional YAML block

# – runtime parameters ––––––––––––––––––––––––––––––––––––
# Fail with a readable message instead of the bare StopIteration that
# `next(iter(EVENTS))` would raise on an empty mapping.
if not EVENTS:
    raise KeyError("YAML `events:` block is missing or empty – cannot choose a SWAN_YEAR")

# The environment variable wins; otherwise fall back to the first configured event.
SWAN_YEAR = str(os.getenv("SWAN_YEAR") or next(iter(EVENTS)))
if SWAN_YEAR not in EVENTS:
    raise KeyError(f"SWAN_YEAR={SWAN_YEAR} not in YAML `events:` block")

RUN_DIR = resolve_run_dir(                                  # …/event=<YEAR>/<RUN_TAG>/
            swan_year=SWAN_YEAR,
            must_have=f"stage01/stage01_cleaned_{SWAN_YEAR}.csv")

STAGE_DIR = RUN_DIR / "stage02"
STAGE_DIR.mkdir(parents=True, exist_ok=True)

STAGE1_CSV = RUN_DIR / "stage01" / f"stage01_cleaned_{SWAN_YEAR}.csv"
OUT_CSV    = STAGE_DIR / f"Stage2_Data_WithMetrics_{SWAN_YEAR}.csv"

DATE_COL  = ST2_CFG.get("date_col", "ReportDate")
ID_COL    = ST2_CFG.get("id_col",   "Symbol")
MAX_YEARS = int(ST2_CFG.get("max_years", 4))
# The recovery period is clipped to 1 … MAX_YEARS, so a window shorter than
# one year cannot produce meaningful outcomes.
if MAX_YEARS < 1:
    raise ValueError(f"stage2.max_years must be >= 1, got {MAX_YEARS}")

# – metrics to score ––––––––––––––––––––––––––––––––––––––
METRICS: List[str] = ST2_CFG.get(
    "metrics",
    [
        "NetIncome", "EarningBeforeInterestAndTax", "OperatingIncome",
        "EBITDA", "OperatingCashFlow", "FreeCashFlow",
        "Cash", "CashAndCashEquivalents", "TotalRevenue", "GrossProfit",
    ],
)
# True ⇒ higher is better (currently every metric is treated that way)
METRIC_SIGN: Dict[str, bool] = {m: True for m in METRICS}

# ──────────────────────────────────────────────────────────
# 2 · LOGGER
# ──────────────────────────────────────────────────────────
# Log to both <STAGE_DIR>/stage02.log (overwritten each run) and stdout.
# force=True replaces handlers left on the root logger by an earlier run in
# the same interpreter session; without it basicConfig() silently does
# nothing and the freshly opened FileHandler would never be attached.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(STAGE_DIR / "stage02.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
    force=True,
)
log = logging.getLogger(__name__)
log.info("==========  STAGE 02  ==========")
log.info("RUN_DIR   : %s", RUN_DIR)
log.info("SWAN_YEAR : %s", SWAN_YEAR)

# ──────────────────────────────────────────────────────────
# 3 · LOAD STAGE‑01 DATA
# ──────────────────────────────────────────────────────────
# Prefer a Stage‑01 frame already present in this session; otherwise read the CSV.
if "data_stage_1" in globals():
    df = globals()["data_stage_1"].copy()
    log.info("Re‑used Stage‑01 DataFrame from memory.")
else:
    df = pd.read_csv(STAGE1_CSV, parse_dates=[DATE_COL], low_memory=False)
    log.info("Loaded Stage‑01 CSV: %s rows", f"{len(df):,}")

# The in‑memory frame (or a CSV whose dates could not be parsed) may still
# hold DATE_COL as strings; the `.dt` accessor below needs a datetime dtype.
if not pd.api.types.is_datetime64_any_dtype(df[DATE_COL]):
    df[DATE_COL] = pd.to_datetime(df[DATE_COL])

# Nullable Int16 keeps rows with a missing date as <NA> instead of failing.
df["Year"] = df[DATE_COL].dt.year.astype("Int16")
SWAN_YEAR_INT = int(SWAN_YEAR)

# ──────────────────────────────────────────────────────────
# 4 · HELPERS
# ──────────────────────────────────────────────────────────

def last_pre_swan(series: pd.Series, swan_year: int | None = None) -> float:
    """Return the most recent non‑NA value observed before the crisis year.

    Parameters
    ----------
    series
        Metric values indexed by fiscal year.  Rows need not be sorted.
    swan_year
        Crisis year; observations with ``index < swan_year`` qualify.
        Defaults to the module‑level ``SWAN_YEAR_INT``.

    Returns
    -------
    float
        The value belonging to the latest qualifying year (the last such row
        if that year appears more than once), or ``np.nan`` if there is none.
    """
    if swan_year is None:
        swan_year = SWAN_YEAR_INT
    pre = series[series.index < swan_year].dropna()
    if pre.empty:
        return np.nan
    # Select by year rather than by position so unsorted input still yields
    # the value closest to the crisis.
    latest = pre[pre.index == pre.index.max()]
    return latest.iloc[-1]


def first_recovery(series: pd.Series, baseline: float, higher_ok: bool,
                   swan_year: int | None = None) -> float:
    """Return the earliest year ≥ the crisis year in which the metric is back at baseline.

    Parameters
    ----------
    series
        Metric values indexed by fiscal year.
    baseline
        Pre‑crisis reference level; NA means there is nothing to recover to.
    higher_ok
        True if larger values are better (recovered when ``value >= baseline``),
        False if smaller values are better (``value <= baseline``).
    swan_year
        Crisis year; only observations with ``index >= swan_year`` count.
        Defaults to the module‑level ``SWAN_YEAR_INT``.

    Returns
    -------
    float
        The smallest qualifying year, or ``np.nan`` when the baseline is NA
        or the metric never gets back to it.
    """
    if pd.isna(baseline):
        return np.nan  # no baseline ⇒ no recovery
    if swan_year is None:
        swan_year = SWAN_YEAR_INT
    # NA observations compare False either way, so they never count as recovered.
    recovered = series >= baseline if higher_ok else series <= baseline
    candidates = series[(series.index >= swan_year) & recovered]
    if candidates.empty:
        return np.nan
    return candidates.index.min()

# ──────────────────────────────────────────────────────────
# 5 · CORE LOOP – one metric at a time
# ──────────────────────────────────────────────────────────
# For every configured metric, derive six per‑company outcome columns
# (RP_, Score_, Flag_, DD_, ScoreDepth_, FlagDepth_) and broadcast them back
# onto each row of `df` via the company identifier.
for metric in METRICS:

    if metric not in df.columns:
        log.warning("⏭️  %-28s not found – skipped", metric)
        continue

    higher_ok = METRIC_SIGN[metric]

    # one Series per company, indexed by fiscal Year
    g = df.set_index("Year").groupby(ID_COL)[metric]

    # ── speed / recovery period ───────────────────────────
    baseline = g.apply(last_pre_swan)
    rec_year = g.apply(lambda s: first_recovery(s, baseline.get(s.name), higher_ok))

    # Years to recovery, counting the crisis year as 1.  Companies with no
    # baseline or no recovery inside the data receive the worst value.
    rp = ((rec_year - SWAN_YEAR_INT + 1)
          .clip(lower=1, upper=MAX_YEARS)
          .fillna(MAX_YEARS)
          .astype("int16"))

    # max(…, 1) avoids 0/0 when MAX_YEARS == 1 (every rp is then 1 ⇒ score 0).
    speed_span  = max(MAX_YEARS - 1, 1)
    score_speed = ((rp - 1) / speed_span).round(4)       # 0 best … 1 worst
    flag_speed  = (rp < rp.median()).astype("int8")

    # ── depth / draw‑down ─────────────────────────────────
    # Lowest value inside the MAX_YEARS window that starts at the crisis year.
    # NOTE: taking the minimum assumes higher‑is‑better, which METRIC_SIGN
    # currently declares for every metric.
    trough = g.apply(lambda s: s[(s.index >= SWAN_YEAR_INT) &
                                 (s.index <  SWAN_YEAR_INT + MAX_YEARS)].min())

    # Divide by |baseline|: with a negative baseline the signed denominator
    # flips the sign, turning a deterioration into "no draw‑down" and an
    # improvement into a positive draw‑down.
    dd_raw = (baseline - trough) / baseline.abs()
    dd_raw = dd_raw.replace([np.inf, -np.inf], np.nan).clip(lower=0)

    # min‑max normalisation; left as‑is when every company has the same value
    dd_min, dd_max = dd_raw.min(), dd_raw.max()
    if dd_max == dd_min:
        score_depth = dd_raw.round(4)
    else:
        score_depth = ((dd_raw - dd_min) / (dd_max - dd_min)).round(4)

    flag_depth = (dd_raw <= dd_raw.median()).astype("int8")

    # ── merge into df ────────────────────────────────────
    # All six Series share the per‑company index, so combine them and merge once.
    outcomes = pd.concat(
        {
            f"RP_{metric}":         rp,
            f"Score_{metric}":      score_speed,
            f"Flag_{metric}":       flag_speed,
            f"DD_{metric}":         dd_raw,
            f"ScoreDepth_{metric}": score_depth,
            f"FlagDepth_{metric}":  flag_depth,
        },
        axis=1,
    ).rename_axis(ID_COL)
    df = df.merge(outcomes, on=ID_COL, how="left")

    # Coverage shares.  Recovery uses rec_year because rp is NA‑free after
    # fillna() and would always report 100 %.
    log.info("%-28s  baseline %.1f%% | recovery %.1f%% | depth %.1f%%",
             metric,
             baseline.notna().mean()*100,
             rec_year.notna().mean()*100,
             dd_raw.notna().mean()*100)

# ──────────────────────────────────────────────────────────
# 6 · SAVE & FINISH
# ──────────────────────────────────────────────────────────
# Persist the wide table for Stage‑03 (row index is not written).
df.to_csv(OUT_CSV, index=False)

# DataFrame.info() prints to a stream, so capture it in a buffer for the log.
buf = io.StringIO()
df.info(buf=buf)
log.info("Final DataFrame info:\n%s", buf.getvalue())
log.info("Saved → %s", OUT_CSV)

# Keep an independent copy in memory (DataFrame.copy() is deep by default)
# so Stage‑03 can pick it up when run within the same Python session.
data_stage_2 = df.copy(deep=True)
log.info("✅  STAGE 02 complete")

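# Captured cell output.  NOTE(review): the file name in the message has no
# _<SWAN_YEAR> suffix, unlike the `must_have` path requested above, so this
# presumably comes from an earlier revision of the cell – confirm.
# FileNotFoundError: No run contains stage01/stage01_cleaned.csv in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2000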