# In [None]:
#!/usr/bin/env python
"""
STAGE 02 · RESILIENCE METRICS
────────────────────────────
Adds Year, RP_*, Score_* and Flag_* columns to the Stage-01 data and saves it to

  <OUTPUT_ROOT>/event=<SWAN_YEAR>/<RUN_TAG>/stage02/Stage2_Data_WithMetrics.csv
"""
from __future__ import annotations
import os, sys, io, logging
from pathlib import Path
from typing import Dict, List

import pandas as pd
import numpy as np

from pipeline_utils import load_cfg, resolve_run_dir


# ──────────────────────────────────────────────────────────
# 1. CONFIG & PATHS
# ──────────────────────────────────────────────────────────
CFG      = load_cfg()                                       # pipeline_config.yaml
# `or {}` also covers a key that is present but empty (presumably parsed as
# None by the YAML loader), which `.get(key, {})` would pass straight through.
EVENTS   = {str(k): v for k, v in (CFG.get("events") or {}).items()}
ST2_CFG  = CFG.get("stage2") or {}                          # optional YAML block

# – runtime parameters ––––––––––––––––––––––––––––––––––––
# Fail with a readable message instead of the bare StopIteration that
# `next(iter(EVENTS))` raises when no events are configured.
if not EVENTS:
    raise KeyError("YAML `events:` block is missing or empty – cannot select SWAN_YEAR")

# The SWAN_YEAR environment variable wins; otherwise use the first configured event.
SWAN_YEAR = str(os.getenv("SWAN_YEAR") or next(iter(EVENTS)))
if SWAN_YEAR not in EVENTS:
    raise KeyError(f"SWAN_YEAR={SWAN_YEAR} not in YAML `events:` block")

RUN_DIR = resolve_run_dir(                                  # …/event=<YEAR>/<RUN_TAG>/
            swan_year=SWAN_YEAR,
            must_have="stage01/stage01_cleaned.csv")

STAGE_DIR = RUN_DIR / "stage02"
STAGE_DIR.mkdir(parents=True, exist_ok=True)

STAGE1_CSV = RUN_DIR / "stage01" / "stage01_cleaned.csv"
OUT_CSV    = STAGE_DIR / "Stage2_Data_WithMetrics.csv"

# Column names and scoring window, overridable from the `stage2:` YAML block.
DATE_COL  = ST2_CFG.get("date_col", "ReportDate")
ID_COL    = ST2_CFG.get("id_col",   "Symbol")
MAX_YEARS = int(ST2_CFG.get("max_years", 4))

# – metrics to score ––––––––––––––––––––––––––––––––––––––
METRICS: List[str] = ST2_CFG.get(
    "metrics",
    [
        "NetIncome", "EarningBeforeInterestAndTax", "OperatingIncome",
        "EBITDA", "OperatingCashFlow", "FreeCashFlow",
        "Cash", "CashAndCashEquivalents", "TotalRevenue", "GrossProfit",
    ],
)
# True ⇒ higher is better (every metric is currently treated that way)
METRIC_SIGN: Dict[str, bool] = {m: True for m in METRICS}



# ──────────────────────────────────────────────────────────
# 2. LOGGER
# ──────────────────────────────────────────────────────────
# force=True: basicConfig() is a no-op when the root logger already has
# handlers (e.g. the cell is re-run, or an earlier stage configured logging in
# the same session). Without it, stage02.log would not be (re)created for this
# run. Existing root handlers are removed and closed first.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(STAGE_DIR / "stage02.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
    force=True,
)
log = logging.getLogger(__name__)
log.info("==========  STAGE 02  ==========")
log.info("RUN_DIR   : %s", RUN_DIR)
log.info("SWAN_YEAR : %s", SWAN_YEAR)

# ──────────────────────────────────────────────────────────
# 3. LOAD STAGE-01 DATA
# ──────────────────────────────────────────────────────────
# Prefer a DataFrame left in memory under the name `data_stage_1`;
# only read the Stage-01 CSV when no such object exists.
if "data_stage_1" not in globals():
    df = pd.read_csv(
        STAGE1_CSV,
        parse_dates=[DATE_COL],
        low_memory=False,
    )
    log.info("Loaded Stage-01 CSV: %s rows", f"{len(df):,}")
else:
    df = data_stage_1.copy()
    log.info("Re-used Stage-01 DataFrame from memory.")

# Calendar year of each report, stored as nullable Int16.
df["Year"] = df[DATE_COL].dt.year.astype("Int16")
# Integer form of the crisis year, used by the helpers and the scoring loop.
SWAN_YEAR_INT = int(SWAN_YEAR)

# ──────────────────────────────────────────────────────────
# 4. HELPERS
# ──────────────────────────────────────────────────────────
def last_pre_swan(series: pd.Series, swan_year: int | None = None) -> float:
    """Return the most recent non-NA value observed before the crisis year.

    Parameters
    ----------
    series
        Metric values indexed by year. The index does not have to be sorted.
    swan_year
        Crisis year used as the exclusive cut-off. When omitted, the
        module-level ``SWAN_YEAR_INT`` is used.

    Returns
    -------
    float
        The value for the latest year strictly before the crisis year, or
        ``np.nan`` when no non-NA pre-crisis observation exists.
    """
    cutoff = SWAN_YEAR_INT if swan_year is None else swan_year
    pre = series[series.index < cutoff].dropna()
    if pre.empty:
        return np.nan
    # Sort by year so "last" means chronologically latest rather than last in
    # row order; mergesort is stable, so rows within one year keep their order.
    return pre.sort_index(kind="mergesort").iloc[-1]

def first_recovery(
    series: pd.Series,
    baseline: float,
    higher_ok: bool,
    swan_year: int | None = None,
) -> float:
    """Return the earliest year >= the crisis year where the metric is back at baseline.

    Parameters
    ----------
    series
        Metric values indexed by year.
    baseline
        Pre-crisis reference value; NA means there is nothing to recover to.
    higher_ok
        ``True`` when larger values are better (recovered means
        ``value >= baseline``); ``False`` flips the test to ``value <= baseline``.
    swan_year
        Crisis year from which recovery is searched (inclusive). When omitted,
        the module-level ``SWAN_YEAR_INT`` is used.

    Returns
    -------
    float
        The smallest qualifying index label (a year), or ``np.nan`` when the
        baseline is NA or no observation qualifies.
    """
    if pd.isna(baseline):                   # no baseline ⇒ no recovery
        return np.nan
    start = SWAN_YEAR_INT if swan_year is None else swan_year
    # NA observations compare as False, so they never count as a recovery.
    cond = series >= baseline if higher_ok else series <= baseline
    candidates = series[(series.index >= start) & cond]
    return candidates.index.min() if not candidates.empty else np.nan

# ──────────────────────────────────────────────────────────
# 5. CORE LOOP – one metric at a time
# ──────────────────────────────────────────────────────────
# For every metric present in the data, derive three per-ID columns:
#   RP_<metric>    recovery period in years, clipped to 1 … MAX_YEARS
#   Score_<metric> RP rescaled to 0 (best) … 1 (worst)
#   Flag_<metric>  1 when the ID recovered faster than the median ID, else 0
for metric in METRICS:

    if metric not in df.columns:
        log.warning("⏭️  %-28s not found – skipped", metric)
        continue

    # Per-ID series of the metric, indexed by year (what the helpers expect).
    g = df.set_index("Year").groupby(ID_COL)[metric]

    baseline  = g.apply(last_pre_swan)
    # `s.name` is the group key, used to look up that ID's own baseline.
    rec_year  = g.apply(lambda s: first_recovery(
                            s, baseline.get(s.name), METRIC_SIGN[metric]))

    # Recovery period in years (1 = immediate, MAX_YEARS = never)
    rp = (rec_year - SWAN_YEAR_INT + 1) \
           .clip(lower=1, upper=MAX_YEARS) \
           .fillna(MAX_YEARS).astype("int16")

    # max(..., 1) keeps MAX_YEARS == 1 from producing 0/0 = NaN scores.
    score = ((rp - 1) / max(MAX_YEARS - 1, 1)).round(4)      # 0 best … 1 worst
    flag  = (rp < rp.median()).astype("int8")                 # simple binary flag

    df = (df.merge(rp.rename(f"RP_{metric}"),       on=ID_COL, how="left")
            .merge(score.rename(f"Score_{metric}"), on=ID_COL, how="left")
            .merge(flag.rename(f"Flag_{metric}"),   on=ID_COL, how="left"))

    # Coverage is taken from `rec_year`: `rp` has had its NAs filled with
    # MAX_YEARS, so measuring it would always report 100 %.
    log.info("%-28s  baseline %.1f%% | recovery %.1f%%",
             metric, baseline.notna().mean()*100, rec_year.notna().mean()*100)

# ──────────────────────────────────────────────────────────
# 6. SAVE & FINISH
# ──────────────────────────────────────────────────────────
# Write the enriched table without the index column.
df.to_csv(OUT_CSV, index=False)

# DataFrame.info() is given a text buffer here so that its summary can be
# routed into the log.
buf = io.StringIO()
df.info(buf=buf)
log.info("Final DataFrame info:\n%s", buf.getvalue())
log.info("Saved → %s", OUT_CSV)

# Leave an independent copy in memory so a later stage running in the same
# Python session can pick it up.
data_stage_2 = df.copy()
log.info("✅  STAGE 02 complete")

# FileNotFoundError: No run contains stage01/stage01_cleaned.csv in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2000