In [None]:
#!/usr/bin/env python
"""
STAGE 08 · Univariate Ratio Ranking  +  L1-Logit R-Scores
──────────────────────────────────────────────────────────
• Creates univariate rankings for both *recovery-speed* and *draw-down depth*
  outcomes.

• Builds sparse (L1-logit) “R-Score” models **separately** for the two families:
      ─ speed   → RScoreProb_<Metric>_preYYYY
      ─ depth   → RScoreDepthProb_<Metric>_preYYYY

All coefficient files now carry the canonical column **Term**.

No predictor column is ever taken from the outcome set.
"""
from __future__ import annotations
from pathlib import Path
from typing  import Dict, List
import glob, logging, os, warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats                    import spearmanr
from sklearn.metrics                import roc_auc_score
from statsmodels.tools.sm_exceptions import (
    MissingDataError, PerfectSeparationError)
from numpy.linalg                   import LinAlgError
from sklearn.pipeline               import Pipeline
from sklearn.impute                 import SimpleImputer
from sklearn.preprocessing          import StandardScaler
from sklearn.linear_model           import LogisticRegressionCV
from sklearn.model_selection        import StratifiedKFold

from pipeline_utils import load_cfg, resolve_run_dir
warnings.filterwarnings("ignore", category=RuntimeWarning)   # silenced process-wide

# ═════════════════════════════ 0 · BOOTSTRAP ══════════════════════
CFG: Dict = load_cfg()
# Event keys are coerced to str so they compare cleanly with the env variable.
EVENTS: Dict = {str(key): val for key, val in CFG.get("events", {}).items()}

# Event year: SWAN_YEAR env variable when set (and non-empty), otherwise the
# first key found in the configured events.
SWAN_YEAR = str(os.getenv("SWAN_YEAR") or next(iter(EVENTS)))
SWAN_INT = int(SWAN_YEAR)

# NOTE(review): presumably returns the run folder that contains the Stage-03
# ratio file named in `must_have` – confirm against pipeline_utils.
RUN_DIR = resolve_run_dir(
    swan_year=SWAN_YEAR,
    must_have=f"stage03/Stage3_Data_WithRatios_{SWAN_YEAR}.csv",
    run_tag=os.getenv("RUN_TAG"),
)
STAGE_DIR = RUN_DIR / "stage08"
STAGE_DIR.mkdir(exist_ok=True)

# Log to <STAGE_DIR>/stage08.log (truncated on every run) and to the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(STAGE_DIR / "stage08.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("stage08")
log.info("==========  STAGE 08  (%s) ==========", SWAN_YEAR)

# ― tunables ―
SNAPSHOT_YR = int(os.getenv("SNAPSHOT_YEAR", SWAN_INT - 1))    # default: year before the event
USE_SNAPSHOT = os.getenv("USE_SNAPSHOT", "True").lower() == "true"
MIN_OBS = 100                       # min usable rows for a univariate statistic
COVER_PCT = 60                      # min % of non-null values for a ratio to be ranked
TOP_N = 80                          # top-ranked ratios offered to each sparse model
MIN_ROWS = 100                      # min training rows for a sparse model
CV_SPLITS = 3                       # stratified CV folds
MIN_CV_AUC = 0.55                   # sparse models below this CV AUC are discarded
CS_GRID = np.logspace(-4, 2, 25)    # inverse-regularisation grid searched by the CV logit
SEED = 42                           # random_state for the CV splitter and the solver

DATE_COL = "ReportDate"
ID_COL = "Symbol"

METRICS = [
    "NetIncome",
    "EarningBeforeInterestAndTax",
    "OperatingIncome",
    "EBITDA",
    "OperatingCashFlow",
    "FreeCashFlow",
    "Cash",
    "CashAndCashEquivalents",
    "TotalRevenue",
    "GrossProfit",
]

# ═════════════ 1 · LOAD STAGE-03 DATA (no leakage) ════════════════
src = RUN_DIR / "stage03" / f"Stage3_Data_WithRatios_{SWAN_YEAR}.csv"
df_full = pd.read_csv(src, low_memory=False)
# Unparseable dates become NaT instead of raising.
df_full[DATE_COL] = pd.to_datetime(df_full[DATE_COL], errors="coerce")

# `df` is the working frame for the univariate rankings; `df_full` keeps every
# row so the fitted models can score all years later on.
if USE_SNAPSHOT:
    df = df_full.loc[df_full[DATE_COL].dt.year == SNAPSHOT_YR].copy()
    log.info("Snapshot FY-%d → %s rows", SNAPSHOT_YR, f"{len(df):,}")
else:
    df = df_full.copy()
    log.info("All-years mode → %s rows", f"{len(df):,}")

# ── predictor candidates (purged of outcomes) ─────────────────────
OUTCOME_PREFIXES = (
    "Score_", "Flag_", "RP_",
    "ScoreDepth_", "FlagDepth_", "DD_",
)

# A candidate is any numeric column whose name contains "_", does not end in
# "_raw" and does not start with one of the outcome prefixes.
ratio_cols = []
for _col in df.columns:
    if "_" not in _col or _col.endswith("_raw"):
        continue
    if not pd.api.types.is_numeric_dtype(df[_col]):
        continue
    if _col.startswith(OUTCOME_PREFIXES):
        continue
    ratio_cols.append(_col)

# ═════════════ 2 · UNIVARIATE RANKINGS  (speed & depth) ═══════════
def _safe_spearman(x, y):
    """
    Absolute Spearman rank correlation between two pandas Series.

    Only rows where both *x* and *y* are non-null are used.  Returns ``np.nan``
    when fewer than ``MIN_OBS`` such rows exist.
    """
    both = x.notna() & y.notna()
    if both.sum() < MIN_OBS:
        return np.nan
    rho = spearmanr(x[both], y[both]).correlation
    return abs(rho)

def _logit_stats(x, flag):
    """
    Univariate logit of *flag* on *x* (plus a constant).

    Rows are used when *x* is non-null and *flag* is exactly 0 or 1.

    Returns
    -------
    (pseudo_r2, auroc) : McFadden pseudo-R² of the fit and the in-sample AUROC
        of its predictions; ``(nan, nan)`` when fewer than ``MIN_OBS`` rows are
        usable, the flag has a single class, or the fit raises one of the
        handled estimation errors.
    """
    usable = x.notna() & flag.isin([0, 1])
    target = flag[usable]
    if usable.sum() < MIN_OBS or target.nunique() < 2:
        return np.nan, np.nan

    try:
        design = sm.add_constant(x[usable])
        fitted = sm.Logit(target, design).fit(disp=False)
        pseudo_r2 = fitted.prsquared
        auroc = roc_auc_score(target, fitted.predict())
    except (ValueError, LinAlgError, MissingDataError, PerfectSeparationError):
        return np.nan, np.nan
    else:
        return pseudo_r2, auroc

def write_rankings(flag_prefix: str, score_prefix: str, tag: str):
    """
    Rank every candidate ratio against each metric's outcome columns.

    For each ratio in ``ratio_cols`` with at least ``COVER_PCT`` % non-null
    values in ``df`` and each metric in ``METRICS``:

    • ``<flag_prefix><Metric>``  → univariate logit pseudo-R² and AUROC
    • ``<score_prefix><Metric>`` → absolute Spearman correlation

    Outcome columns missing from ``df`` are skipped.  Results go to
    ``08_<tag>flag_Ranking_<SWAN_YEAR>.csv`` and
    ``08_<tag>score_Ranking_<SWAN_YEAR>.csv`` in ``STAGE_DIR``; a file is only
    written when it would contain at least one row.
    """
    flag_records, score_records = [], []

    for ratio in ratio_cols:
        coverage = df[ratio].notna().mean() * 100
        if coverage < COVER_PCT:
            continue
        series = df[ratio]
        coverage_1dp = round(coverage, 1)

        for metric in METRICS:
            flag_col = f"{flag_prefix}{metric}"
            score_col = f"{score_prefix}{metric}"

            if flag_col in df.columns:
                pseudo_r2, auroc = _logit_stats(series, df[flag_col])
                flag_records.append({
                    "Ratio": ratio,
                    "Metric": metric,
                    "Coverage%": coverage_1dp,
                    "PseudoR2": np.nan if pd.isna(pseudo_r2) else round(pseudo_r2, 3),
                    "AUROC": np.nan if pd.isna(auroc) else round(auroc, 3),
                })

            if score_col in df.columns:
                abs_rho = _safe_spearman(series, df[score_col])
                score_records.append({
                    "Ratio": ratio,
                    "Metric": metric,
                    "Coverage%": coverage_1dp,
                    "|rho|": round(abs_rho, 3),
                })

    outputs = (
        (flag_records, f"08_{tag}flag_Ranking_{SWAN_YEAR}.csv"),
        (score_records, f"08_{tag}score_Ranking_{SWAN_YEAR}.csv"),
    )
    for records, file_name in outputs:
        if records:
            pd.DataFrame(records).to_csv(STAGE_DIR / file_name, index=False)

# Recovery-speed outcomes → 08_flag_Ranking_* / 08_score_Ranking_*
write_rankings(flag_prefix="Flag_", score_prefix="Score_", tag="")
# Draw-down depth outcomes → 08_Depth_flag_Ranking_* / 08_Depth_score_Ranking_*
write_rankings(flag_prefix="FlagDepth_", score_prefix="ScoreDepth_", tag="Depth_")
log.info("✓ Univariate rankings written (speed & depth)")

# ═════════════ 3 · SPARSE (L1) LOGITS – helper ════════════════════
def build_sparse(flag_prefix:str,
                 rank_csv:Path,
                 flavour_tag:str,          # "" or "Depth_"
                 prob_prefix:str):         # "RScoreProb_" | "RScoreDepthProb_"
    """
    Fit one sparse (L1) logit per metric and score every row of ``df_full``.

    For each metric in ``METRICS`` the model is trained on the ``SNAPSHOT_YR``
    rows only, with the ``TOP_N`` best-ranked ratios from *rank_csv* as
    candidate predictors.  A model is kept only when the cross-validated AUC
    at the selected regularisation strength reaches ``MIN_CV_AUC`` and at
    least one coefficient is non-zero.

    Parameters
    ----------
    flag_prefix : prefix of the binary outcome columns (``<prefix><Metric>``).
    rank_csv    : univariate ranking table with columns ``Ratio``, ``Metric``
                  and ``AUROC`` (``|rho|`` is used when ``AUROC`` is absent).
    flavour_tag : "" for speed, "Depth_" for depth; drives artefact names.
    prob_prefix : prefix of the probability columns added to ``df_full``.

    Side effects
    ------------
    • adds ``<prob_prefix><Metric>_pre<SWAN_YEAR>`` columns to the module-level
      ``df_full``;
    • writes ``*_Coefficients.csv`` and ``*_TopCoefs.csv`` for every kept model
      and one ``*AllMetrics_RScores.csv`` into ``STAGE_DIR``.

    Returns ``None``.  A missing *rank_csv* is logged and skipped.
    """
    rank_csv = Path(rank_csv)
    flav     = flavour_tag.rstrip("_")      # "" → "", "Depth_" → "Depth"

    # write_rankings() only emits a table when it produced rows, so a missing
    # file means "nothing to model" rather than a reason to abort the stage.
    if not rank_csv.is_file():
        log.warning("Ranking table %s not found – %s R-Scores skipped",
                    rank_csv, flav or "Speed")
        return

    rank     = pd.read_csv(rank_csv)
    sort_col = "AUROC" if "AUROC" in rank.columns else "|rho|"

    # One de-duplicated frame feeds both training and scoring, so a repeated
    # column name can never hand the fitted pipeline extra columns.
    frame = df_full.loc[:, ~df_full.columns.duplicated()]
    train = frame[frame[DATE_COL].dt.year == SNAPSHOT_YR].copy()

    prob_cols : List[str] = []

    def _fit(metric:str):
        """Train, gate, persist and apply the model for one metric."""
        flag_col = f"{flag_prefix}{metric}"
        if flag_col not in train.columns: return

        # Outcome rows = flag exactly 0 or 1 (same rule as the univariate
        # logits); anything else, including unparseable values, is dropped.
        y    = pd.to_numeric(train[flag_col], errors="coerce")
        keep = y.isin([0, 1])
        ybin = y[keep].astype(int)
        if ybin.shape[0] < MIN_ROWS: return
        counts = ybin.value_counts()
        # both classes must be present and large enough for stratified folds
        if len(counts) < 2 or counts.min() < CV_SPLITS: return

        # top-N candidate ratios
        cand = (rank.query("Metric == @metric")
                    .nlargest(TOP_N, sort_col)
                    ["Ratio"].unique().tolist())
        cand = [c for c in cand if c in train.columns]
        if not cand: return

        # ±inf is rejected by the imputer → treat it as missing.  Columns with
        # no observed value in the training rows are removed up front because
        # the median imputer would drop them and misalign coef_ with Term.
        X_df = (train.loc[keep, cand]
                     .replace([np.inf, -np.inf], np.nan)
                     .dropna(axis=1, how="all"))
        if X_df.shape[1] == 0: return

        pipe = Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc" , StandardScaler()),
            ("clf", LogisticRegressionCV(
                Cs       = CS_GRID,
                penalty  = "l1",
                solver   = "saga",
                scoring  = "roc_auc",
                cv       = StratifiedKFold(CV_SPLITS, shuffle=True, random_state=SEED),
                max_iter = 5000,
                n_jobs   = -1,
                refit    = True,
                random_state = SEED))
        ])
        pipe.fit(X_df, ybin)
        clf = pipe.named_steps["clf"]

        # scores_[1] is (n_folds, n_Cs).  With refit=True the chosen C is the
        # one with the best fold-averaged score, so that maximum is the CV AUC
        # of the model actually kept.  Averaging over the whole grid would mix
        # in the over-regularised, coefficient-free fits.
        cv_auc = clf.scores_[1].mean(axis=0).max()
        if cv_auc < MIN_CV_AUC: return

        beta = clf.coef_[0]
        if (beta != 0).sum() == 0: return

        # ---- artefacts --------------------------------------------
        base = f"08_pre{SWAN_YEAR}_{metric}{'_'+flav if flav else ''}"

        coef = pd.DataFrame({"Term": X_df.columns,
                             "Coefficient": beta,
                             "OddsRatio": np.exp(beta)})
        coef.to_csv(STAGE_DIR/f"{base}_Coefficients.csv", index=False)
        (coef.assign(abscoef=lambda d: d.Coefficient.abs())
             .nlargest(20,"abscoef")
             .drop(columns="abscoef")
             .to_csv(STAGE_DIR/f"{base}_TopCoefs.csv", index=False))

        # Score every row (all years); `frame` shares df_full's row order, so
        # the positional assignment lines up.
        prob_col = f"{prob_prefix}{metric}_pre{SWAN_YEAR}"
        X_all    = frame[list(X_df.columns)].replace([np.inf, -np.inf], np.nan)
        df_full[prob_col] = pipe.predict_proba(X_all)[:,1]
        prob_cols.append(prob_col)

        log.info("%-6s %-12s rows=%4d kept=%2d cv-AUC=%.3f",
                 flav or "Speed", metric, len(X_df),
                 (beta!=0).sum(), cv_auc)

    for m in METRICS:
        _fit(m)

    if prob_cols:
        out_tag = "Depth_" if flavour_tag else ""
        df_full[[ID_COL, DATE_COL]+prob_cols]\
            .to_csv(STAGE_DIR/f"08_pre{SWAN_YEAR}_{out_tag}AllMetrics_RScores.csv",
                    index=False)

# Recovery-speed models → RScoreProb_<Metric>_preYYYY
build_sparse(
    flag_prefix="Flag_",
    rank_csv=STAGE_DIR / f"08_flag_Ranking_{SWAN_YEAR}.csv",
    flavour_tag="",
    prob_prefix="RScoreProb_",
)

# Draw-down depth models → RScoreDepthProb_<Metric>_preYYYY
build_sparse(
    flag_prefix="FlagDepth_",
    rank_csv=STAGE_DIR / f"08_Depth_flag_Ranking_{SWAN_YEAR}.csv",
    flavour_tag="Depth_",
    prob_prefix="RScoreDepthProb_",
)

# ═════════════ 4 · CONSOLIDATED COEFFICIENT SUMMARY ═══════════════
# Coefficient files are named  08_pre<YEAR>_<Metric>[_Depth]_Coefficients.csv
# (see `base` in build_sparse).  The metric is whatever sits between the year
# token and the trailing "Coefficients"; a "Depth" token – in either position –
# only marks the flavour and is never part of the metric name.
frames = []
# sorted() → deterministic row order in the summary regardless of the OS
for fp in sorted(STAGE_DIR.glob(f"08_pre{SWAN_YEAR}_*_Coefficients.csv")):
    tokens  = fp.stem.split("_")[2:-1]
    flavour = "Depth" if "Depth" in tokens else "Speed"
    metric  = "_".join(t for t in tokens if t != "Depth")
    tmp = pd.read_csv(fp)
    if "Ratio" in tmp.columns and "Term" not in tmp.columns:   # safety for old runs
        tmp = tmp.rename(columns={"Ratio":"Term"})
    tmp["Metric"]  = metric
    tmp["Flavour"] = flavour
    frames.append(tmp)

# Stack all per-model tables into one long file; skipped when no model was kept.
if frames:
    (pd.concat(frames, ignore_index=True)
        [["Flavour","Metric","Term","Coefficient","OddsRatio"]]
        .to_csv(STAGE_DIR/f"08_pre{SWAN_YEAR}_CoefficientSummary.csv", index=False))

log.info("🎉  STAGE 08 complete – artefacts in %s", STAGE_DIR)



2025-06-11 10:33:31,380 | INFO    | RUN_DIR=C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250610  SWAN_YEAR=2008  RUN_DATE=20250610
2025-06-11 10:33:31,381 | INFO    | Load Stage-03 CSV → C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250610\stage03\Stage3_Data_WithRatios.csv
2025-06-11 10:33:35,860 | INFO    | Snapshot mode: 974 rows kept (FY-2007)




2025-06-11 10:33:52,318 | INFO    | ✓ Univariate ranking tables written
2025-06-11 10:33:54,374 | INFO    | NetIncome                 n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:33:56,354 | INFO    | EarningBeforeInterestAndTax n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:33:58,251 | INFO    | OperatingIncome           n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:00,387 | INFO    | EBITDA                    n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:02,402 | INFO    | OperatingCashFlow         n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:06,040 | INFO    | FreeCashFlow              n= 973 kept= 2 cv-AUC=0.860
2025-06-11 10:34:07,809 | INFO    | Cash                      n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:09,652 | INFO    | CashAndCashEquivalents    n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:13,211 | INFO    | TotalRevenue              n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:15,253 | INFO    | GrossProfit               n= 973 kept= 1 cv-AUC=0.860
2025-06-11 10:34:15,783 | 