# In [1]:  (Jupyter cell prompt left over from the notebook export)
#!/usr/bin/env python
"""
STAGE 02 · RESILIENCE METRICS   v3.0 – 2025-06-20
──────────────────────────────────────────────────
Adds three resilience dimensions to the cleaned Stage-01 data and
writes a wide file for Stage-03.

Temporal  → RP_<M>, ScoreTemporal_<M>, FlagTemporal_<M>
Impact    → DD_<M>, ScoreImpact_<M>,   FlagImpact_<M>
Dynamic   → RateDown_<M>, RateUp_<M>, Asymmetry_<M>, Convexity_<M>,
            ScoreDynamic_<M>, FlagDynamic_<M>
"""
from __future__ import annotations
import io, logging, math, os, sys
from pathlib import Path
from typing import Dict, List, Tuple, Literal

import numpy as np
import pandas as pd
from pipeline_utils import load_cfg, resolve_run_dir
from sklearn.preprocessing import StandardScaler   # ← new

# ─── 1 · CONFIG ─────────────────────────────────────────────────
# Everything in this section is resolved once, at import time, from the
# pipeline config returned by ``load_cfg`` and from environment variables.
# Where both are consulted, a non-empty environment variable wins.
CFG       = load_cfg()
EVENTS    = {str(k): v for k, v in CFG.get("events", {}).items()}
C2        = CFG.get("stage2", {})

# Event year: SWAN_YEAR env var, otherwise the first event in the config.
# Fail with a readable message instead of a bare StopIteration when neither
# source provides one.
_swan_year = os.getenv("SWAN_YEAR") or next(iter(EVENTS), None)
if _swan_year is None:
    raise RuntimeError(
        "No event year available: set the SWAN_YEAR environment variable "
        "or define at least one entry under 'events' in the config."
    )
SWAN_YEAR = str(_swan_year)
SWAN_INT  = int(SWAN_YEAR)

# Storage format shared with Stage-01 (it is part of the input file name).
# The 'defaults' section is optional, like 'events' and 'stage2' above.
SAVE_FMT : Literal["csv", "parquet"] = str(
    os.getenv("SAVE_FORMAT") or CFG.get("defaults", {}).get("SAVE_FORMAT", "csv")
).lower()
if SAVE_FMT not in ("csv", "parquet"):
    raise ValueError(
        f"Unsupported SAVE_FORMAT {SAVE_FMT!r}; expected 'csv' or 'parquet'."
    )

# Single spelling of the Stage-01 file name, used both to locate the run
# directory and to build the input path.
_STAGE1_NAME = f"stage01_cleaned_{SWAN_YEAR}.{SAVE_FMT}"

RUN_DIR = resolve_run_dir(
    swan_year = SWAN_YEAR,
    must_have = f"stage01/{_STAGE1_NAME}",
    run_tag   = os.getenv("RUN_TAG"),
)
OUT_DIR   = RUN_DIR / "stage02"
OUT_DIR.mkdir(parents=True, exist_ok=True)
STAGE1_F  = RUN_DIR / "stage01" / _STAGE1_NAME
OUT_F     = OUT_DIR  / f"Stage2_Data_WithMetrics_{SWAN_YEAR}.{SAVE_FMT}"

DATE_COL  = C2.get("date_col", "ReportDate")
ID_COL    = C2.get("id_col",   "Symbol")
MAX_YEARS = int(C2.get("max_years", 4))          # t₀ … t₀+3
if MAX_YEARS < 1:
    # The observation window is SWAN_INT … SWAN_INT + MAX_YEARS - 1, so it
    # must contain at least the event year itself.
    raise ValueError(f"stage2.max_years must be >= 1, got {MAX_YEARS}.")

METRICS: List[str] = C2.get(
    "metrics",
    ["NetIncome","EarningBeforeInterestAndTax","OperatingIncome","EBITDA",
     "OperatingCashFlow","FreeCashFlow","Cash","CashAndCashEquivalents",
     "TotalRevenue","GrossProfit"],
)
# True → a value at or above the baseline counts as recovered
# (passed as ``higher_ok`` to ``_first_recovery``).  Every metric is
# currently treated that way.
METRIC_SIGN: Dict[str, bool] = {m: True for m in METRICS}

# ─── 2 · LOGGER ─────────────────────────────────────────────────
# INFO and above go to two sinks: a log file inside the stage output
# directory (opened in "w" mode, i.e. truncated on every run) and stdout.
_file_sink    = logging.FileHandler(OUT_DIR / "stage02.log", mode="w", encoding="utf-8")
_console_sink = logging.StreamHandler(sys.stdout)

logging.basicConfig(
    handlers=[_file_sink, _console_sink],
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    level=logging.INFO,
)

log = logging.getLogger(__name__)

# Run banner: which event is processed, what is read and what is written.
log.info("==========  STAGE 02  (SWAN %s) ==========", SWAN_YEAR)
log.info("Load  : %s", STAGE1_F.name)
log.info("Save  : %s", OUT_F.name)

# ─── 3 · LOAD DATA ─────────────────────────────────────────────
# Read the Stage-01 output in the format it was saved in.  Only the CSV
# reader needs to be told which column holds dates.
if SAVE_FMT == "parquet":
    df = pd.read_parquet(STAGE1_F)
else:
    df = pd.read_csv(STAGE1_F, parse_dates=[DATE_COL], low_memory=False)

# Keep only rows whose identifier is exactly three upper-case ASCII letters
# (logged below as the "primary-listing filter").  ``.copy()`` gives an
# independent frame, so adding the Year column does not write into a slice
# of the loaded data (which raises SettingWithCopyWarning).
df = df[df[ID_COL].astype(str).str.fullmatch(r"[A-Z]{3}")].copy()

# Calendar year of the report date; nullable Int16 so missing dates stay <NA>.
df["Year"] = pd.to_datetime(df[DATE_COL]).dt.year.astype("Int16")
log.info("Rows after primary-listing filter: %s", f"{len(df):,}")

# ─── 4 · HELPERS ───────────────────────────────────────────────
def _last_pre_swan(s: pd.Series) -> Tuple[float, int | None]:
    """Return the most recent non-missing observation before the event year.

    Parameters
    ----------
    s : Series of one metric indexed by year.

    Returns
    -------
    ``(value, year)`` of the latest non-NaN entry whose index is strictly
    below ``SWAN_INT``, or ``(np.nan, None)`` when no such entry exists.
    """
    pre = s[s.index < SWAN_INT].dropna()
    if pre.empty:
        return np.nan, None
    # "Last" must mean latest year, not last row: order by year first.
    # A stable sort keeps the incoming row order within a year, so input
    # that is already chronological yields exactly the same result.
    pre = pre.sort_index(kind="mergesort")
    return pre.iloc[-1], pre.index[-1]

def _first_recovery(s: pd.Series, baseline: float, higher_ok: bool):
    """Return the first year from the event onward in which *s* is back at baseline.

    Parameters
    ----------
    s         : Series of one metric indexed by year.
    baseline  : reference level to get back to; may be missing.
    higher_ok : True  → recovered when ``s >= baseline``;
                False → recovered when ``s <= baseline``.

    Returns
    -------
    The smallest index label ``>= SWAN_INT`` that satisfies the condition,
    or ``np.nan`` when the baseline is missing or no year qualifies.
    """
    # pd.isna covers None and pd.NA as well as float NaN; math.isnan raises
    # TypeError on None, which is what a failed ``.get`` lookup hands over.
    if pd.isna(baseline):
        return np.nan
    cond = s >= baseline if higher_ok else s <= baseline
    cand = s[(s.index >= SWAN_INT) & cond]
    return cand.index.min() if len(cand) else np.nan

def _safe_idxmin(s: pd.Series):
    """Return the year of the minimum of *s* inside the observation window.

    The window is ``SWAN_INT`` … ``SWAN_INT + MAX_YEARS - 1`` inclusive.
    Missing values are ignored; ``np.nan`` is returned when the window holds
    no data.
    """
    # Boolean mask rather than a label slice: a ``.loc[a:b]`` slice needs a
    # sorted index (or both bounds present) and raises KeyError otherwise.
    # This is also the same window test used for the trough value.
    in_window = (s.index >= SWAN_INT) & (s.index < SWAN_INT + MAX_YEARS)
    w = s[in_window].dropna()
    return w.idxmin() if len(w) else np.nan

def _rate(a, b, yrs):
    try:   return (a / b) ** (1 / yrs) - 1
    except Exception: return np.nan

def _as_df(series: pd.Series, name: str) -> pd.DataFrame:
    """Convert a Series into a two-column frame that can be merged on ``ID_COL``.

    The values go into a column called *name*; the index becomes an ordinary
    column.  An unnamed index comes out of ``reset_index`` as ``"index"`` and
    is relabelled to ``ID_COL``; a named index keeps its own name.
    """
    frame = series.to_frame(name=name).reset_index()
    relabelled = frame.rename(columns={"index": ID_COL})
    return relabelled

# ─── 5 · PER-METRIC LOOP ───────────────────────────────────────
def _zscore(v: pd.Series) -> pd.Series:
    """Standardise *v* with its own mean and population std (ddof=0)."""
    return (v - v.mean()) / v.std(ddof=0)


def _convex(s: pd.Series, rec_year: pd.Series) -> float:
    """Return the quadratic coefficient of a degree-2 fit to the recovery path.

    The path is every non-missing observation of *s* from ``SWAN_INT`` up to
    and including the group's recovery year (looked up in *rec_year* by
    ``s.name``).  A group without a recovery year has no upper bound, so the
    fit then runs over everything from the event year onward.  Fewer than
    six points, or a failed fit, gives ``np.nan``.
    """
    r_y = rec_year.get(s.name, np.nan)
    in_seg = s.index >= SWAN_INT
    if not pd.isna(r_y):
        # Explicit mask instead of a label slice ending in NaN.
        in_seg = in_seg & (s.index <= r_y)
    seg = s[in_seg].dropna()
    if len(seg) < 6:   # need ≥6 points for a half-decent curve
        return np.nan
    try:
        return np.polyfit(np.arange(len(seg)), seg.to_numpy(dtype=float), 2)[0]
    except (np.linalg.LinAlgError, TypeError, ValueError):
        return np.nan


for metric in METRICS:
    if metric not in df:
        log.warning("⏭️  %-28s missing – skipped", metric)
        continue

    # Only the three columns needed; avoids copying the ever-wider frame.
    grp = df[["Year", ID_COL, metric]].set_index("Year").groupby(ID_COL)[metric]
    higher_ok = METRIC_SIGN[metric]

    # Baseline = last pre-event value, b_year = the year it was observed.
    # Both are indexed by the apply result itself, so they line up with every
    # other per-group series below.  float64 turns a missing year (None)
    # into NaN even when no group has pre-event data.
    pre_swan = grp.apply(_last_pre_swan)
    baseline = pd.Series([val for val, _ in pre_swan],
                         index=pre_swan.index, dtype="float64")
    b_year   = pd.Series([yr for _, yr in pre_swan],
                         index=pre_swan.index, dtype="float64")

    rec_year = grp.apply(
        lambda s: _first_recovery(s, baseline.get(s.name), higher_ok)
    )

    # ── Temporal (recovery time) ──────────────────────────────
    # rp = years to recovery counted from the event year (event year = 1),
    # capped at MAX_YEARS; groups that never recover also get MAX_YEARS.
    rp = ((rec_year - SWAN_INT + 1).clip(lower=1, upper=MAX_YEARS)
          .fillna(MAX_YEARS).astype("int16"))
    # max(..., 1) keeps the score at 0 instead of 0/0 when MAX_YEARS == 1.
    score_temporal = ((rp - 1) / max(MAX_YEARS - 1, 1)).round(4)
    flag_temporal  = (rp < rp.median()).astype("int8")

    # ── Impact (draw-down depth) ──────────────────────────────
    trough_val = grp.apply(lambda s: s[(s.index >= SWAN_INT) &
                                       (s.index < SWAN_INT + MAX_YEARS)].min())
    # Relative to |baseline|: dividing by a negative baseline flips the sign,
    # so a deeper loss would be clipped to 0 and read as "no draw-down".
    dd_raw = ((baseline - trough_val) / baseline.abs())\
              .replace([np.inf, -np.inf], np.nan).clip(lower=0)
    score_impact = ((dd_raw - dd_raw.min()) / (dd_raw.max() - dd_raw.min())
                    if dd_raw.max() != dd_raw.min() else dd_raw).round(4)
    # NOTE: comparisons with NaN are False, so a missing draw-down gets flag 0.
    flag_impact  = (dd_raw <= dd_raw.median()).astype("int8")

    # ── Dynamic (speed & shape) ───────────────────────────────
    trough_year = grp.apply(_safe_idxmin)

    yrs_down = (trough_year - b_year).where(trough_year > b_year, np.nan)
    yrs_up   = (rec_year    - trough_year).where(rec_year > trough_year, np.nan)

    symbols = baseline.index
    # Infinite rates are treated as missing: a single inf would turn the
    # mean and std used for the z-scores below into NaN for every group.
    rate_down = pd.Series(
        {i: _rate(trough_val[i], baseline[i], yrs_down[i]) for i in symbols},
        dtype="float64",
    ).replace([np.inf, -np.inf], np.nan)
    rate_up = pd.Series(
        {i: _rate(baseline[i], trough_val[i], yrs_up[i]) for i in symbols},
        dtype="float64",
    ).replace([np.inf, -np.inf], np.nan)
    asymmetry = (rate_up - rate_down).abs()

    convexity = grp.apply(lambda s: _convex(s, rec_year))

    # ── Composite + flag (Stability removed) ──────────────────
    # NOTE: a group missing any one component gets a NaN composite, and
    # therefore flag 0 (NaN >= median is False).
    comp = (
        + _zscore(rate_up)                   # faster rise good
        - _zscore(rate_down.abs())           # steeper fall bad
        - _zscore(asymmetry)                 # bigger gap bad
        - _zscore(convexity.abs())           # more curve bad
    )
    score_dyn  = comp.round(4)
    flag_dyn   = (comp >= comp.median()).astype("int8")

    # ── MERGE ALL NEW COLUMNS ─────────────────────────────────
    # One frame, one left merge per metric (same columns, same order as
    # merging each series separately).
    new_cols = pd.DataFrame({
        f"RP_{metric}":            rp,
        f"ScoreTemporal_{metric}": score_temporal,
        f"FlagTemporal_{metric}":  flag_temporal,
        f"DD_{metric}":            dd_raw,
        f"ScoreImpact_{metric}":   score_impact,
        f"FlagImpact_{metric}":    flag_impact,
        f"RateDown_{metric}":      rate_down,
        f"RateUp_{metric}":        rate_up,
        f"Asymmetry_{metric}":     asymmetry,
        f"Convexity_{metric}":     convexity,
        f"ScoreDynamic_{metric}":  score_dyn,
        f"FlagDynamic_{metric}":   flag_dyn,
    })
    df = df.merge(new_cols.rename_axis(ID_COL).reset_index(),
                  on=ID_COL, how="left")

    log.info("%-28s  temporal ✓  impact ✓  dynamic ✓", metric)

# ─── 6 · SAVE ───────────────────────────────────────────────────
# Write the widened frame without its index; parquet when asked for,
# CSV in every other case.
if SAVE_FMT != "parquet":
    df.to_csv(OUT_F, index=False)
else:
    df.to_parquet(OUT_F, index=False)

# DataFrame.info() writes to a stream instead of returning text, so it is
# captured in an in-memory buffer and sent through the logger.
buf = io.StringIO()
df.info(buf=buf)
log.info("Final DataFrame info:\n%s", buf.getvalue())
log.info("Saved → %s", OUT_F.name)

# Independent copy of the result, kept under its own module-level name.
data_stage_2 = df.copy()          # keep in-memory cache
log.info("✅  STAGE 02 complete")


# Output captured from the notebook run (presumably raised by resolve_run_dir — confirm):
# FileNotFoundError: No run directories under C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008