In [1]:
#!/usr/bin/env python
"""
Stage 08 · Univariate Ratio Ranking + Pre-Swan L1-Logit R-Scores
================================================================
Artefacts written to  <run>/stage08/ :

    08_flag_Ranking.csv
    08_score_Ranking.csv
    08_RP_Ranking.csv
    08_pre<YEAR>_<Metric>_Coefficients.csv
    08_pre<YEAR>_AllMetrics_RScores.csv
    08_pre<YEAR>_CoefficientSummary.csv
"""
from __future__ import annotations

# ── stdlib / 3rd-party imports ─────────────────────────────────────
from pathlib import Path
from typing  import List
import os, sys, glob, logging, warnings

import numpy as np
import pandas as pd
import yaml, statsmodels.api as sm
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score
from numpy.linalg import LinAlgError
from statsmodels.tools.sm_exceptions import MissingDataError
from sklearn.pipeline      import Pipeline
from sklearn.impute        import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold

# ── shared helpers ─────────────────────────────────────────────────
from pipeline_utils import load_cfg, resolve_run_dir       # NEW

warnings.filterwarnings("ignore", category=RuntimeWarning)

# ╔══════════════════════════════════════════════════════════════════╗
# 0 · BOOTSTRAP  (cfg + run-folder + logger)                          #
# ╚══════════════════════════════════════════════════════════════════╝
CFG               = load_cfg()
EVENTS            = {str(k): v for k, v in CFG.get("events", {}).items()}

SWAN_YEAR_STR     = os.getenv("SWAN_YEAR") or next(iter(EVENTS))
if SWAN_YEAR_STR not in EVENTS:
    raise KeyError(f"SWAN_YEAR={SWAN_YEAR_STR} missing in events block")
SWAN_YEAR         = int(SWAN_YEAR_STR)           # numeric copy used later

# pick latest run that already contains Stage-03 data
RUN_DIR           = resolve_run_dir(
    must_have="stage03/Stage3_Data_WithRatios.csv")
RUN_DATE          = RUN_DIR.name

STAGE3_FILE       = RUN_DIR / "stage03" / "Stage3_Data_WithRatios.csv"
if not STAGE3_FILE.is_file():
    raise FileNotFoundError(f"{STAGE3_FILE} missing – run Stage 03 first")

STAGE_DIR         = RUN_DIR / "stage08"
STAGE_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level   = logging.INFO,
    format  = "%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(STAGE_DIR / "stage08.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
logger.info("==========  STAGE 08 ==========")
logger.info("RUN_DIR=%s  SWAN_YEAR=%s  RUN_DATE=%s", RUN_DIR, SWAN_YEAR, RUN_DATE)

# ═══════════════════════════════════════════════════════════════════
# 1 · PARAMETERS  (two defaults changed vs legacy)
# ═══════════════════════════════════════════════════════════════════
SNAPSHOT_YR  = int(os.getenv("SNAPSHOT_YEAR", SWAN_YEAR - 1))
USE_SNAPSHOT = os.getenv("USE_SNAPSHOT", "True").lower() == "true"

MIN_OBS, COVER_PCT = 100, 60
TOP_N      = int(os.getenv("TOP_N", 80))          # was 40
MIN_ROWS   = int(os.getenv("MIN_ROWS", 100))
CV_SPLITS  = int(os.getenv("CV_SPLITS", 3))
MIN_CLASS  = CV_SPLITS
MIN_CV_AUC = float(os.getenv("MIN_CV_AUC", 0.55)) # was 0.60
CS_GRID    = np.logspace(-4, 2, 25)
SEED       = 42

DATE_COL, ID_COL = "ReportDate", "Symbol"

METRICS = ["NetIncome","EarningBeforeInterestAndTax","OperatingIncome","EBITDA",
           "OperatingCashFlow","FreeCashFlow","Cash","CashAndCashEquivalents",
           "TotalRevenue","GrossProfit"]

# ═══════════════════════════════════════════════════════════════════
# 2 · LOAD DATA
# ═══════════════════════════════════════════════════════════════════
logger.info("Load Stage-03 CSV → %s", STAGE3_FILE)
df_full = pd.read_csv(STAGE3_FILE, low_memory=False)
df_full[DATE_COL] = pd.to_datetime(df_full[DATE_COL], errors="coerce")

df = df_full.copy()
if USE_SNAPSHOT:
    df = df[df[DATE_COL].dt.year == SNAPSHOT_YR].copy()
    logger.info("Snapshot mode: %d rows kept (FY-%d)", len(df), SNAPSHOT_YR)
else:
    logger.info("All-years mode: %d rows", len(df))

# ═══════════════════════════════════════════════════════════════════
# 3 · UNIVARIATE RANKING  (unchanged logic)
# ═══════════════════════════════════════════════════════════════════
valid_ratios = {c for c in df.columns if "_" in c}
ratio_cols   = [c for c in valid_ratios
                if pd.api.types.is_numeric_dtype(df[c])
                and not c.endswith("_raw")]

def _safe_spearman(x, y):
    m = x.notna() & y.notna()
    return np.nan if m.sum() < MIN_OBS else abs(spearmanr(x[m], y[m]).correlation)

def _logit_stats(x, y_flag):
    m = x.notna() & y_flag.isin([0,1])
    if m.sum() < MIN_OBS or y_flag[m].nunique() < 2:
        return np.nan, np.nan
    try:
        mdl = sm.Logit(y_flag[m], sm.add_constant(x[m])).fit(disp=False)
        return mdl.prsquared, roc_auc_score(y_flag[m], mdl.predict())
    except (ValueError, LinAlgError, MissingDataError):
        return np.nan, np.nan

rows_flag, rows_score, rows_rp = [], [], []

for ratio in ratio_cols:
    cov = df[ratio].notna().mean() * 100
    if cov < COVER_PCT:
        continue
    x = df[ratio]
    for met in METRICS:
        f, s, rp = f"Flag_{met}", f"Score_{met}", f"RP_{met}"
        if f in df.columns:
            pr2, auc = _logit_stats(x, df[f])
            rows_flag.append({"Ratio":ratio,"Metric":met,"Coverage%":round(cov,1),
                              "PseudoR2":round(pr2,3) if pd.notna(pr2) else np.nan,
                              "AUROC":   round(auc,3) if pd.notna(auc) else np.nan})
        if s in df.columns:
            rows_score.append({"Ratio":ratio,"Metric":met,"Coverage%":round(cov,1),
                               "|rho|":round(_safe_spearman(x, df[s]),3)})
        if rp in df.columns:
            rows_rp.append({"Ratio":ratio,"Metric":met,"Coverage%":round(cov,1),
                            "|rho|":round(_safe_spearman(x, df[rp]),3)})

pd.DataFrame(rows_flag ).to_csv(STAGE_DIR/"08_flag_Ranking.csv",  index=False)
pd.DataFrame(rows_score).to_csv(STAGE_DIR/"08_score_Ranking.csv", index=False)
pd.DataFrame(rows_rp   ).to_csv(STAGE_DIR/"08_RP_Ranking.csv",    index=False)
logger.info("✓ Univariate ranking tables written")

# ═══════════════════════════════════════════════════════════════════
# 4 · L1-LOGIT R-SCORES
# ═══════════════════════════════════════════════════════════════════
rank   = pd.read_csv(STAGE_DIR/"08_flag_Ranking.csv")
train  = df_full[df_full[DATE_COL].dt.year == (SWAN_YEAR - 1)].copy()
if train.empty:
    raise RuntimeError("No snapshot rows – cannot train.")

prob_cols: List[str] = []

def _fit(metric: str):
    flag = f"Flag_{metric}"
    if flag not in train.columns:
        logger.warning("%s missing – skipped", flag); return

    y_all = train[flag]
    if y_all.value_counts().min() < MIN_CLASS:
        logger.warning("%s: minority class too small – skipped", metric); return

    cand = (rank.query("Metric == @metric")
                .nlargest(TOP_N, "AUROC")["Ratio"].unique().tolist())
    cand = [c for c in cand if c in train.columns]
    if not cand:
        logger.warning("%s: no predictor candidates", metric); return

    sub = train[[flag] + cand].dropna(subset=[flag])
    if sub.shape[0] < MIN_ROWS:
        logger.warning("%s: < %d rows – skipped", metric, MIN_ROWS); return

    y = sub[flag]
    if isinstance(y, pd.DataFrame):   # duplicate name safety
        y = y.iloc[:, 0]
    y = y.astype(int)
    if y.nunique() < 2:
        logger.warning("%s: flag constant – skipped", metric); return

    X_df = sub[cand].loc[:, ~sub[cand].columns.duplicated()]

    pipe = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc",  StandardScaler()),
        ("clf", LogisticRegressionCV(
            Cs       = CS_GRID,
            penalty  = "l1",
            solver   = "saga",
            scoring  = "roc_auc",
            cv       = StratifiedKFold(CV_SPLITS, shuffle=True, random_state=SEED),
            max_iter = 5000,
            n_jobs   = -1,
            refit    = True,
            random_state = SEED)),
    ])
    pipe.fit(X_df, y)

    clf    = pipe.named_steps["clf"]
    cv_auc = clf.scores_[1].mean()
    if cv_auc < MIN_CV_AUC:
        logger.warning("%s: cv-AUC %.3f < %.2f – discarded", metric, cv_auc, MIN_CV_AUC); return

    beta = clf.coef_[0]
    keep = beta != 0
    if keep.sum() == 0:
        logger.warning("%s: all β shrunk to 0 – skipped", metric); return

    # full coefficient list
    full_coef = pd.DataFrame({"Ratio": X_df.columns,
                              "Coefficient": beta})
    full_coef["OddsRatio"] = np.exp(full_coef["Coefficient"])
    full_coef.to_csv(STAGE_DIR/f"08_pre{SWAN_YEAR}_{metric}_Coefficients.csv",
                     index=False)

    # summary – top |β|
    top = full_coef.copy(); top["abscoef"] = top["Coefficient"].abs()
    (top.nlargest(20, "abscoef")
        .drop(columns="abscoef")
        .to_csv(STAGE_DIR/f"08_pre{SWAN_YEAR}_{metric}_TopCoefs.csv", index=False))

    # predicted probabilities (for every row in df_full)
    df_full[f"RScoreProb_{metric}_pre{SWAN_YEAR}"] = \
        pipe.predict_proba(df_full[X_df.columns])[:, 1]
    prob_cols.append(f"RScoreProb_{metric}_pre{SWAN_YEAR}")

    logger.info("%-25s n=%4d kept=%2d cv-AUC=%.3f",
                metric, len(sub), keep.sum(), cv_auc)

for m in METRICS:
    _fit(m)

# ── consolidate probabilities & coefficient summary ────────────────
if prob_cols:
    df_full[[ID_COL, DATE_COL] + prob_cols]\
        .to_csv(STAGE_DIR/f"08_pre{SWAN_YEAR}_AllMetrics_RScores.csv", index=False)

    frames = []
    for f in glob.glob(str(STAGE_DIR/f"08_pre{SWAN_YEAR}_*_Coefficients.csv")):
        metric = Path(f).stem.split("_")[2]
        tmp = pd.read_csv(f); tmp["Metric"] = metric; frames.append(tmp)
    (pd.concat(frames, ignore_index=True)
         [["Metric","Ratio","Coefficient","OddsRatio"]]
         .to_csv(STAGE_DIR/f"08_pre{SWAN_YEAR}_CoefficientSummary.csv", index=False))

logger.info("✅ STAGE 08 complete – artefacts in %s", STAGE_DIR)
print(f"\n✅ Stage 08 complete – outputs in {STAGE_DIR}\n")

2025-06-11 10:33:31,380 | INFO    | RUN_DIR=C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250610  SWAN_YEAR=2008  RUN_DATE=20250610
2025-06-11 10:33:31,381 | INFO    | Load Stage-03 CSV → C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250610\stage03\Stage3_Data_WithRatios.csv
2025-06-11 10:33:35,860 | INFO    | Snapshot mode: 974 rows kept (FY-2007)




2025-06-11 10:33:52,318 | INFO    | ✓ Univariate ranking tables written
2025-06-11 10:33:54,374 | INFO    | NetIncome                 n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:33:56,354 | INFO    | EarningBeforeInterestAndTax n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:33:58,251 | INFO    | OperatingIncome           n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:00,387 | INFO    | EBITDA                    n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:02,402 | INFO    | OperatingCashFlow         n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:06,040 | INFO    | FreeCashFlow              n= 973 kept= 2 cv-AUC=0.860
2025-06-11 10:34:07,809 | INFO    | Cash                      n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:09,652 | INFO    | CashAndCashEquivalents    n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:13,211 | INFO    | TotalRevenue              n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:15,253 | INFO    | GrossProfit               n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:15,783 | 