# In [None]:
#!/usr/bin/env python
# =====================================================================
#  STAGE-08 · UNIVARIATE RANKING + SPARSE (L1) “R-SCORE” MODELS
#  v2.4 · 2025-06-17
#     – depth/∞ sanitiser (v2.3)
#     – ratio-column filter fix  ⟵ NEW
# =====================================================================
"""
Builds probability families for every *fundamental* metric:

    Speed  →  RScoreProb_<Metric>_preYYYY
    Depth  →  RScoreDepthProb_<Metric>_preYYYY

Key changes
-----------
• `ratio_cols` now contains **only** derived ratios:
      <ratio>_raw  +  <ratio>
  i.e. both columns must exist, so standalone raw financial series are
  excluded.

• ±∞ values are converted to NaN globally (df_full) and locally
  (X_pred) to prevent crashes in SimpleImputer.transform().
"""
from __future__ import annotations
from pathlib import Path
from typing  import Dict, List
import glob, logging, os, warnings

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.pipeline      import Pipeline
from sklearn.impute        import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LogisticRegressionCV
from sklearn.metrics       import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from numpy.linalg import LinAlgError
import statsmodels.api as sm
from statsmodels.tools.sm_exceptions import (
    MissingDataError, PerfectSeparationError)

from pipeline_utils import load_cfg, resolve_run_dir
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ══════════════════════ 0 · BOOTSTRAP ════════════════════════════
# Pipeline configuration; event keys are normalised to strings.
CFG: Dict    = load_cfg()
EVENTS: Dict = {str(k): v for k, v in CFG.get("events", {}).items()}

# Event year: the SWAN_YEAR environment variable wins, otherwise the first
# event key in the config.  Fail with an explicit message (instead of a bare
# StopIteration from next()) when neither source provides a year.
_swan_year = os.getenv("SWAN_YEAR") or next(iter(EVENTS), None)
if _swan_year is None:
    raise RuntimeError(
        "SWAN_YEAR is not set and the config defines no 'events' to fall back on"
    )
SWAN_YEAR = str(_swan_year)
SWAN_INT  = int(SWAN_YEAR)

# NOTE(review): resolve_run_dir is defined in pipeline_utils; presumably it
# returns the run directory that contains the `must_have` file – confirm.
RUN_DIR = resolve_run_dir(
    swan_year = SWAN_YEAR,
    run_tag   = os.getenv("RUN_TAG"),
    must_have = f"stage03/Stage3_Data_WithRatios_{SWAN_YEAR}.csv",
)
# All artefacts of this stage are written below <run dir>/stage08.
STAGE_DIR = RUN_DIR / "stage08"
STAGE_DIR.mkdir(parents=True, exist_ok=True)

# Log to both a file (truncated on every run, mode "w") and the console.
logging.basicConfig(
    level   = logging.INFO,
    format  = "%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[logging.FileHandler(STAGE_DIR/"stage08.log", "w", "utf-8"),
              logging.StreamHandler()],
)
log = logging.getLogger("stage08")
log.info("==========  STAGE 08 (SWAN %s) ==========", SWAN_YEAR)

# —— tunables ————————————————————————————————————————————————
ST8_CFG        = CFG.get("stage8", {})
# Snapshot (training) year defaults to the year before the event.
SNAPSHOT_YR    = int(os.getenv("SNAPSHOT_YEAR",  SWAN_INT - 1))
# Only the literal string "true" (any casing) enables snapshot mode.
USE_SNAPSHOT   = os.getenv("USE_SNAPSHOT", "True").lower() == "true"
MIN_OBS        = ST8_CFG.get("min_obs",           100)   # min usable rows per univariate fit
COVER_PCT      = ST8_CFG.get("min_coverage_pct",   30)   # min % non-null rows per ratio
TOP_N          = ST8_CFG.get("top_n",              80)   # candidates kept per metric
MIN_ROWS       = ST8_CFG.get("min_rows",          100)   # min training rows per sparse model
CV_SPLITS      = ST8_CFG.get("cv_splits",           3)   # folds for StratifiedKFold
MIN_CV_AUC     = ST8_CFG.get("min_cv_auc",       0.55)   # AUC gate for keeping a model
CS_GRID        = np.logspace(-4, 2, 25)                  # grid of C values for the L1 logit
SEED           = 42

DATE_COL, ID_COL = "ReportDate", "Symbol"

# Fundamental metrics for which flag/score columns are looked up.
METRICS = [
    "NetIncome","EarningBeforeInterestAndTax","OperatingIncome","EBITDA",
    "OperatingCashFlow","FreeCashFlow","Cash","CashAndCashEquivalents",
    "TotalRevenue","GrossProfit",
]

# ═════════════ 1 · LOAD STAGE-03 DATA ════════════════════════════
# Read the stage-03 ratio table and parse the report date (bad dates → NaT).
SRC = RUN_DIR / "stage03" / f"Stage3_Data_WithRatios_{SWAN_YEAR}.csv"
df_full = pd.read_csv(SRC, low_memory=False)
df_full[DATE_COL] = pd.to_datetime(df_full[DATE_COL], errors="coerce")

# --- ∞→NaN (global) -----------------------------------------------------
df_full.replace([np.inf, -np.inf], np.nan, inplace=True)

# `df` is the working frame for the univariate rankings: either the
# snapshot-year rows only, or an independent copy of every row.
if USE_SNAPSHOT:
    df = df_full.loc[df_full[DATE_COL].dt.year.eq(SNAPSHOT_YR)].copy()
    log.info("Snapshot FY-%d → %s rows", SNAPSHOT_YR, format(len(df), ","))
else:
    df = df_full.copy()
    log.info("All-years mode → %s rows", format(len(df), ","))

# ═════════════ 2 · PICK TRUE RATIO COLUMNS ═══════════════════════
# Columns whose root name starts with one of these prefixes are outcomes,
# never predictors.
OUTCOME_PREFIXES = ("Score_", "Flag_", "RP_", "ScoreDepth_", "FlagDepth_", "DD_")

# A "<root>_raw" column counts as a ratio only when its twin "<root>" is also
# present (winsorised twin exists) and the root is not an outcome column.
raw_ratio_cols = []
for col in df.columns:
    if not col.endswith("_raw"):
        continue
    root = col[:-4]
    if root in df.columns and not root.startswith(OUTCOME_PREFIXES):
        raw_ratio_cols.append(col)

winsor_ratio_cols = [raw_name[:-len("_raw")] for raw_name in raw_ratio_cols]
ratio_cols        = [*raw_ratio_cols, *winsor_ratio_cols]
log.info("Using %d ratio columns (raw + winsorised)", len(ratio_cols))

# ═════════════ 3 · UNIVARIATE RANKINGS ═══════════════════════════
def _safe_spearman(x, y):
    """Return |Spearman rho| of *x* and *y* over rows where both are non-null.

    NaN is returned when fewer than ``MIN_OBS`` rows have both values.
    """
    both_present = x.notna() & y.notna()
    if both_present.sum() < MIN_OBS:
        return np.nan
    rho = spearmanr(x[both_present], y[both_present]).correlation
    return abs(rho)

def _logit_stats(x, flag):
    """Fit a univariate logit of *flag* on *x* and return ``(pseudo-R², AUROC)``.

    Only rows where *x* is non-null and *flag* is 0 or 1 are used.  The pair
    ``(nan, nan)`` is returned when fewer than ``MIN_OBS`` such rows exist,
    when the flag has a single class, or when the fit raises one of the
    handled numerical / data errors.
    """
    nothing = (np.nan, np.nan)

    usable = x.notna() & flag.isin([0, 1])
    target = flag[usable]
    if usable.sum() < MIN_OBS or target.nunique() < 2:
        return nothing

    try:
        design = sm.add_constant(x[usable])
        fitted = sm.Logit(target, design).fit(disp=False)
        pseudo_r2 = fitted.prsquared
        # predict() without arguments scores the rows the model was fitted on
        auroc = roc_auc_score(target, fitted.predict())
    except (ValueError, LinAlgError, MissingDataError, PerfectSeparationError):
        return nothing
    return pseudo_r2, auroc

def write_rankings(flag_prefix: str, score_prefix: str, tag: str):
    """Rank every ratio column against each metric's flag and score columns.

    For each ratio whose non-null coverage in ``df`` is at least ``COVER_PCT``
    percent, and for each metric in ``METRICS``:

    * if ``<flag_prefix><metric>`` exists, record pseudo-R² and AUROC from
      ``_logit_stats``;
    * if ``<score_prefix><metric>`` exists, record |rho| from
      ``_safe_spearman``.

    Non-empty result sets are written to
    ``08_<tag>flag_Ranking_<year>.csv`` and ``08_<tag>score_Ranking_<year>.csv``
    inside ``STAGE_DIR``.
    """
    flag_records, score_records = [], []

    for ratio in ratio_cols:
        series = df[ratio]
        coverage = series.notna().mean() * 100
        if coverage < COVER_PCT:
            continue
        coverage_1dp = round(coverage, 1)

        for metric in METRICS:
            flag_name = flag_prefix + metric
            score_name = score_prefix + metric

            if flag_name in df.columns:
                pseudo_r2, auroc = _logit_stats(series, df[flag_name])
                flag_records.append({
                    "Ratio": ratio,
                    "Metric": metric,
                    "Coverage%": coverage_1dp,
                    "PseudoR2": np.nan if pd.isna(pseudo_r2) else round(pseudo_r2, 3),
                    "AUROC":    np.nan if pd.isna(auroc) else round(auroc, 3),
                })

            if score_name in df.columns:
                rho = _safe_spearman(series, df[score_name])
                score_records.append({
                    "Ratio": ratio,
                    "Metric": metric,
                    "Coverage%": coverage_1dp,
                    "|rho|":     round(rho, 3),
                })

    # One CSV per non-empty record list; nothing is written for an empty one.
    outputs = (
        (flag_records,  f"08_{tag}flag_Ranking_{SWAN_YEAR}.csv"),
        (score_records, f"08_{tag}score_Ranking_{SWAN_YEAR}.csv"),
    )
    for records, filename in outputs:
        if records:
            pd.DataFrame(records).to_csv(STAGE_DIR / filename, index=False)

# speed flavour
write_rankings(flag_prefix="Flag_", score_prefix="Score_", tag="")
# depth flavour
write_rankings(flag_prefix="FlagDepth_", score_prefix="ScoreDepth_", tag="Depth_")
log.info("✓ Univariate rankings written")

# ═════════════ 4 · SPARSE (L1) LOGIT MODELS ═════════════════════=
def build_sparse(flag_prefix: str,
                 rank_csv: Path,
                 flavour_tag: str,
                 prob_prefix: str):
    """Fit one L1-penalised logistic model per metric and export its artefacts.

    Parameters
    ----------
    flag_prefix : str
        Prefix of the binary target columns (``<flag_prefix><metric>``).
    rank_csv : Path
        Univariate ranking file written by ``write_rankings``; the top
        ``TOP_N`` ratios per metric (by ``AUROC`` when present, else
        ``|rho|``) become the candidate predictors.
    flavour_tag : str
        ``""`` for the speed flavour, non-empty (e.g. ``"Depth_"``) otherwise;
        used in output file names.
    prob_prefix : str
        Prefix of the probability columns added to ``df_full``.

    Side effects
    ------------
    * Adds ``<prob_prefix><metric>_pre<year>`` columns to the module-level
      ``df_full`` for every metric whose model is kept.
    * Writes per-metric ``*_Coefficients.csv`` / ``*_TopCoefs.csv`` and one
      ``*AllMetrics_RScores.csv`` per flavour into ``STAGE_DIR``.

    A metric is skipped silently when its flag column is missing, has too few
    rows or classes, has no usable candidates, fails the ``MIN_CV_AUC`` gate,
    or ends up with all-zero coefficients.
    """
    # write_rankings only writes a ranking file when it produced rows, so the
    # file can legitimately be absent (e.g. no depth flags in the data).
    rank_csv = Path(rank_csv)
    if not rank_csv.exists():
        log.warning("Ranking file %s not found – skipping %s models",
                    rank_csv, flag_prefix)
        return

    rank = pd.read_csv(rank_csv)
    rank_by = "AUROC" if "AUROC" in rank.columns else "|rho|"

    # training snapshot (FY - 1), first occurrence of any duplicated column
    in_snapshot = df_full[DATE_COL].dt.year == SNAPSHOT_YR
    train = df_full.loc[in_snapshot, ~df_full.columns.duplicated()].copy()

    prob_cols: List[str] = []

    def _fit(metric: str):
        """Fit, gate and export the sparse model for a single metric."""
        flag_col = f"{flag_prefix}{metric}"
        if flag_col not in train.columns:
            return
        y = pd.to_numeric(train[flag_col], errors="coerce")
        # every class needs at least one row per CV fold
        if y.value_counts().min() < CV_SPLITS:
            return

        cand = (rank.query("Metric == @metric")
                    .nlargest(TOP_N, rank_by)
                    ["Ratio"].unique().tolist())
        cand = [c for c in cand if c in train.columns]
        if not cand:
            return

        sub = train[[flag_col] + cand].dropna(subset=[flag_col])
        if sub.shape[0] < MIN_ROWS:
            return
        ybin = sub[flag_col].astype(int)
        if ybin.nunique() < 2:
            return

        # SimpleImputer discards features that are entirely missing at fit
        # time, which would leave coef_ shorter than the candidate list and
        # break the coefficient export below – drop such columns up front.
        cand = [c for c in cand if sub[c].notna().any()]
        if not cand:
            return
        X_df = sub[cand]

        pipe = Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc",  StandardScaler()),
            ("clf", LogisticRegressionCV(
                Cs       = CS_GRID,
                penalty  = "l1",
                solver   = "saga",
                scoring  = "roc_auc",
                cv       = StratifiedKFold(CV_SPLITS, shuffle=True, random_state=SEED),
                max_iter = 5000,
                n_jobs   = -1,
                refit    = True,
                random_state = SEED))
        ])
        pipe.fit(X_df, ybin)
        clf = pipe.named_steps["clf"]

        # Genuine cross-validated AUC: average the held-out fold scores
        # (axis 0 = folds) and take the best grid point, which is the one
        # LogisticRegressionCV selects when refit=True.  pipe.score() on the
        # training rows would report the optimistic in-sample AUC instead.
        fold_scores = clf.scores_
        if isinstance(fold_scores, dict):          # keyed by class label
            fold_scores = next(iter(fold_scores.values()))
        fold_scores = np.asarray(fold_scores, dtype=float)
        cv_auc = float(np.nanmax(np.nanmean(fold_scores, axis=0)))
        if not cv_auc >= MIN_CV_AUC:               # also rejects NaN
            return

        beta = clf.coef_[0]
        kept = int((beta != 0).sum())
        if kept == 0:
            return

        flav = flavour_tag.rstrip("_") or "Speed"
        base = f"08_pre{SWAN_YEAR}_{metric}_{flav}"

        # coefficient export
        coef = pd.DataFrame({
            "Flavour":    flav,
            "Metric":     metric,
            "Term":       X_df.columns,
            "Coefficient": beta,
            "OddsRatio":   np.exp(beta)
        })
        coef.to_csv(STAGE_DIR/f"{base}_Coefficients.csv", index=False)

        (coef.assign(abscoef=lambda d: d.Coefficient.abs())
             .nlargest(20, "abscoef")
             .drop(columns="abscoef")
             .to_csv(STAGE_DIR/f"{base}_TopCoefs.csv", index=False))

        # predict on full DF (∞ fix local)
        prob_col = f"{prob_prefix}{metric}_pre{SWAN_YEAR}"
        X_pred = (df_full[X_df.columns]
                  .replace([np.inf, -np.inf], np.nan))
        df_full[prob_col] = pipe.predict_proba(X_pred)[:, 1]
        prob_cols.append(prob_col)

        log.info("%-6s %-12s rows=%4d kept=%2d cv-AUC=%.3f",
                 flav, metric, len(sub), kept, cv_auc)

    for m in METRICS:
        _fit(m)

    # dump all-metric probability matrix (per flavour)
    if prob_cols:
        out_tag = "Depth_" if flavour_tag else ""
        (df_full[[ID_COL, DATE_COL] + prob_cols]
            .to_csv(STAGE_DIR/f"08_pre{SWAN_YEAR}_{out_tag}AllMetrics_RScores.csv",
                    index=False))

# speed flavour
build_sparse(
    flag_prefix="Flag_",
    rank_csv=STAGE_DIR / f"08_flag_Ranking_{SWAN_YEAR}.csv",
    flavour_tag="",
    prob_prefix="RScoreProb_",
)

# depth flavour
build_sparse(
    flag_prefix="FlagDepth_",
    rank_csv=STAGE_DIR / f"08_Depth_flag_Ranking_{SWAN_YEAR}.csv",
    flavour_tag="Depth_",
    prob_prefix="RScoreDepthProb_",
)

# ═════════════ 5 · CONSOLIDATED COEFFICIENT SUMMARY ═════════════
# Stack every per-metric coefficient file found in STAGE_DIR for this year.
# glob.glob() returns matches in arbitrary order, so the paths are sorted to
# give the summary the same row order on every run.
frames = []
for fp in sorted(glob.glob(str(STAGE_DIR/f"08_pre{SWAN_YEAR}_*_Coefficients.csv"))):
    frames.append(pd.read_csv(fp)[["Flavour","Metric","Term","Coefficient","OddsRatio"]])

# Nothing is written when no coefficient file exists.
if frames:
    (pd.concat(frames, ignore_index=True)
       .to_csv(STAGE_DIR/f"08_pre{SWAN_YEAR}_CoefficientSummary.csv", index=False))

log.info("🎉  Stage-08 complete – artefacts in %s", STAGE_DIR)
print(f"\n✅ Stage-08 complete - outputs in {STAGE_DIR}\n")

# [captured notebook output] SyntaxError: unexpected character after line continuation character (4071752701.py, line 283)