In [None]:
#!/usr/bin/env python
"""
STAGE 03 · RATIO FABRICATION   v3.0 – 2025-06-20
─────────────────────────────────────────────────
• Reads Stage-02 wide file and fabricates every ratio defined in
  *ratio_library.py* (~400 base + optional Log_/Sqrt_ variants).

• Creates two columns per ratio
      <Ratio>_raw      – un-winsorised, unclipped
      <Ratio>          – two-sided winsorised at stage3.winsor_pct (default 1 % / 99 %)

Output
------
<OUTPUT_ROOT>/event=<SWAN_YEAR>/<RUN_TAG>/stage03/
    └─ Stage3_Data_WithRatios_<SWAN_YEAR>.(csv|parquet)
"""
from __future__ import annotations
import io, logging, os, warnings
from pathlib import Path
from typing import Callable, Dict, List, Literal

import numpy as np
import pandas as pd

from pipeline_utils import load_cfg, resolve_run_dir, ensure_three_letter_tickers
from ratio_library   import (
    ratio_funcs, derived_ratio_funcs, _ensure_core_columns, winsor
)

# ─── 1 · CONFIG ─────────────────────────────────────────────────
# Everything below is resolved once at import/cell-run time from the project
# config (load_cfg) with environment variables taking precedence where noted.
CFG          = load_cfg()
# `or {}` also covers a key that is present but null in the config file.
EVENTS       = {str(k): v for k, v in (CFG.get("events") or {}).items()}
C3           = CFG.get("stage3") or {}
DEF          = CFG["defaults"]

# SWAN_YEAR: the env var wins; otherwise fall back to the first configured
# event.  Fail with an explicit message instead of a bare StopIteration when
# neither source provides a value.
_swan_year = os.getenv("SWAN_YEAR") or next(iter(EVENTS), None)
if _swan_year is None:
    raise RuntimeError(
        "SWAN_YEAR is not set and the config defines no 'events' to fall back on"
    )
SWAN_YEAR    = str(_swan_year)

# File format shared by the Stage-02 input and the Stage-03 output.
SAVE_FMT: Literal["csv", "parquet"] = os.getenv(
    "SAVE_FORMAT", DEF.get("SAVE_FORMAT", "csv")
).lower()
# When True the derived (Log_/Sqrt_) ratio functions are added to the roster.
INCLUDE_TR   = bool(DEF.get("include_transforms", True))

# Locate the run directory that already contains the Stage-02 output.
RUN_DIR = resolve_run_dir(
    swan_year = SWAN_YEAR,
    run_tag   = os.getenv("RUN_TAG"),
    must_have = f"stage02/Stage2_Data_WithMetrics_{SWAN_YEAR}.{SAVE_FMT}",
)
STAGE2_F = RUN_DIR / "stage02" / f"Stage2_Data_WithMetrics_{SWAN_YEAR}.{SAVE_FMT}"

# Stage-03 output folder is created up front because the log file lives there.
OUT_DIR  = RUN_DIR / "stage03"
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_F    = OUT_DIR / f"Stage3_Data_WithRatios_{SWAN_YEAR}.{SAVE_FMT}"

DATE_COL = C3.get("date_col", "ReportDate")
ID_COL   = C3.get("id_col",   "Symbol")
# Tail fraction handed to ratio_library.winsor for every ratio.
WINSOR_PCT = float(C3.get("winsor_pct", 0.01))

# ─── 2 · LOGGER ─────────────────────────────────────────────────
# force=True: logging.basicConfig() silently does nothing when the root logger
# already has handlers (this cell re-run, or another stage configured logging
# earlier in the same kernel).  Forcing removes and closes the old handlers so
# this stage always writes to its own stage03.log.
logging.basicConfig(
    level   = logging.INFO,
    format  = "%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[logging.FileHandler(OUT_DIR / "stage03.log", "w", "utf-8"),
              logging.StreamHandler()],
    force   = True,
)
log = logging.getLogger(__name__)
log.info("==========  STAGE 03  (SWAN %s) ==========", SWAN_YEAR)
log.info("Load file  : %s", STAGE2_F.name)
log.info("Save file  : %s", OUT_F.name)
log.info("Include transforms: %s", INCLUDE_TR)

# Silences every RuntimeWarning for the rest of the process — presumably the
# numpy divide/invalid warnings raised by ratio arithmetic (NOTE(review):
# confirm nothing else relevant is being hidden).
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ══════════ 3 · LOAD + CORE COLS ═══════════════════════════════
# Read the Stage-02 wide file in the configured format; only the CSV reader
# is told explicitly which column to parse as dates.
df = (
    pd.read_parquet(STAGE2_F)
    if SAVE_FMT == "parquet"
    else pd.read_csv(STAGE2_F, parse_dates=[DATE_COL], low_memory=False)
)

# Normalise tickers, order rows by entity then report date, and let the ratio
# library add its core helper columns (NetAssets etc.) before any ratio runs.
df = ensure_three_letter_tickers(df, ID_COL)
df = df.sort_values([ID_COL, DATE_COL])
df = _ensure_core_columns(df)

log.info("Rows loaded: %s", f"{len(df):,}")

# ══════════ 4 · RATIO FUNCTION ROSTER ══════════════════════════
# Build the name -> callable map of every ratio to fabricate.
# `derived_ratio_funcs` may be empty/None; guard it once so BOTH branches
# below behave the same way (previously only the include-branch was guarded
# and the exclude-branch crashed on None).
_derived: Dict[str, Callable] = dict(derived_ratio_funcs or {})

all_funcs: Dict[str, Callable] = dict(ratio_funcs)          # base ≈400
if INCLUDE_TR:
    all_funcs.update(_derived)                              # Log_/Sqrt_
else:
    # strip Log_/Sqrt_ if user disabled transforms (only matters when a
    # derived name is also registered among the base functions)
    for n in _derived:
        all_funcs.pop(n, None)

# Alphabetical order makes the processing sequence deterministic.
ratio_names: List[str] = sorted(all_funcs)
log.info("Total candidate ratios: %d", len(ratio_names))

# ══════════ 5 · MAIN LOOP ═════════════════════════════════════
# For every ratio: call its function on the full frame, coerce the result to
# a float64 Series, then store the raw values as <name>_raw and the
# winsorised values as <name>.
#
# NOTE(review): columns are inserted into `df` one at a time because later
# ratio functions may read earlier results from `df` — confirm against
# ratio_library before batching the inserts with a single pd.concat.
computed: List[str] = []          # ratios that were actually written to df

for name in ratio_names:
    fn = all_funcs[name]
    # The whole per-ratio pipeline sits inside the try so that one bad ratio
    # (exception in fn, wrong-length array, non-numeric dtype, winsor error)
    # is logged and skipped instead of aborting the entire stage.
    try:
        res = fn(df)                     # pass entire DF

        # A one-column DataFrame is treated as that column; wider frames
        # cannot be mapped onto a single ratio and are rejected below.
        if isinstance(res, pd.DataFrame) and res.shape[1] == 1:
            res = res.iloc[:, 0]

        # normalise to Series
        if isinstance(res, pd.Series):
            series = res
        elif isinstance(res, (np.ndarray, int, float, np.number)):
            # arrays must match len(df); scalars are broadcast to every row
            series = pd.Series(res, index=df.index, name=name)
        else:
            log.warning("⚠️  %-40s returned unsupported %s – skipped",
                        name, type(res))
            continue

        series = series.astype("float64")
        clipped = winsor(series, WINSOR_PCT)

        df[f"{name}_raw"] = series
        df[name]          = clipped
    except Exception as exc:
        log.warning("❌ %-40s failed (%s)", name, exc)
        continue

    computed.append(name)

# Count what this loop produced rather than pattern-matching column names,
# which would also pick up look-alike columns inherited from Stage 02.
log.info("Ratios computed: raw=%d | winsor=%d",
         len(computed),
         len(computed))

# ══════════ 6 · SAVE & LOG ═════════════════════════════════════
# Persist the enriched frame in the same format it was loaded in; the row
# index is never written.
_write = df.to_parquet if SAVE_FMT == "parquet" else df.to_csv
_write(OUT_F, index=False)

# DataFrame.info() prints to a stream, so capture it in memory and route the
# text through the logger.
buf = io.StringIO()
df.info(buf=buf)
log.info("Final DataFrame info:\n%s", buf.getvalue())
log.info("Saved → %s", OUT_F.name)

# Keep an independent copy available to downstream notebooks.
data_stage_3 = df.copy()
log.info("✅  STAGE 03 complete")


2025-06-14 12:47:52,511 | INFO    | RUN_DIR        : C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2000\2025-06-13
2025-06-14 12:47:52,512 | INFO    | SWAN_YEAR=2000  RUN_DATE=2025-06-13  WINSOR_PCT=0.01
2025-06-14 12:47:52,512 | INFO    | DATE_COL / ID_COL = ReportDate / Symbol
2025-06-14 12:47:53,556 | INFO    | Stage-02 CSV loaded: 34862 rows
2025-06-14 12:47:54,222 | INFO    |    (base) computed  25 / 219 ratios
2025-06-14 12:47:59,161 | INFO    |    (base) computed  50 / 219 ratios
2025-06-14 12:47:59,284 | INFO    |    (base) computed  75 / 219 ratios


  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)


2025-06-14 12:48:09,291 | INFO    |    (base) computed 100 / 219 ratios


KeyboardInterrupt: 