In [None]:
import numpy as np
import pandas as pd
from pyfiles.scrpt3_data_engineering import *

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    average_precision_score,  # PR-AUC
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    balanced_accuracy_score,
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# XGBoost (optional): the "xgboost" model is only added to the CV run when this import succeeds.
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    # Any failure while importing (not only a missing package) disables the XGBoost model.
    HAS_XGB = False


# Seed shared by the fold splitter and every estimator; the resamplers use it offset by the fold number.
RANDOM_STATE = 42
# Number of stratified cross-validation folds.
N_SPLITS = 3
# Threshold-selection rule handed to choose_threshold(); "f1" is the only rule it implements.
THRESH_METHOD = "f1"


@dataclass
class FoldResult:
    """Evaluation record for one model on one cross-validation fold."""
    # Key of the estimator in the per-fold estimator dict (e.g. "logreg").
    model_name: str
    # Fold number, counted from 1.
    fold: int
    # Probability cut-off used to turn scores into 0/1 predictions.
    threshold: float
    # Average precision (area under the precision-recall curve).
    pr_auc: float
    # ROC AUC; NaN when it could not be computed for the fold.
    roc_auc: float
    # Point metrics evaluated at `threshold`.
    f1: float
    precision: float
    recall: float
    balanced_acc: float
    # Confusion-matrix cells at `threshold`, with labels ordered [0, 1].
    tn: int
    fp: int
    fn: int
    tp: int


def make_y_binary(y_series: pd.Series) -> np.ndarray:
    """Convert a target column into a 0/1 integer array.

    Text targets are mapped case-insensitively, ignoring surrounding
    whitespace: ``"default" -> 1`` and ``"nondefault" -> 0``. A target counts
    as text when its dtype is object or any pandas string dtype, or when it is
    a categorical whose categories are text. Any other dtype (int, bool,
    numeric categorical, ...) is cast straight to ``int``.

    Parameters
    ----------
    y_series : pd.Series
        Raw target column.

    Returns
    -------
    np.ndarray
        Integer array aligned with ``y_series``.

    Raises
    ------
    ValueError
        If a text target contains a value (including missing values) that is
        neither "default" nor "nondefault".
    """
    dtype = y_series.dtype
    if isinstance(dtype, pd.CategoricalDtype):
        # A categorical is textual when its categories are; inspect those instead.
        dtype = dtype.categories.dtype
    is_textual = pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)

    if not is_textual:
        return y_series.to_numpy(dtype=int)

    labels = y_series.astype("string").str.strip().str.lower()
    y = labels.map({"default": 1, "nondefault": 0})
    unmapped = y.isna()
    if unmapped.any():
        # Report the original (un-normalised) values so they are easy to locate.
        bad_vals = y_series[unmapped].unique()
        raise ValueError(f"Unmapped target values found: {bad_vals}")
    return y.to_numpy(dtype=int)


def choose_threshold(y_true: np.ndarray, y_prob: np.ndarray, method: str = "f1") -> float:
    """Pick a probability cut-off from the precision-recall curve.

    With ``method="f1"`` the threshold whose precision/recall pair gives the
    highest F1 is returned. For any other method, or when the curve yields no
    thresholds, the function falls back to 0.5.
    """
    fallback = 0.5
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    if thresholds.size == 0 or method != "f1":
        return fallback

    # The curve arrays carry one trailing point with no threshold; drop it so
    # the F1 values line up with `thresholds`.
    p = precision[:-1]
    r = recall[:-1]
    f1_curve = 2 * (p * r) / (p + r + 1e-12)  # epsilon guards against 0/0
    best = int(np.nanargmax(f1_curve))
    return float(thresholds[best])


def compute_metrics(y_true: np.ndarray, y_prob: np.ndarray, thr: float):
    """Score probabilities against true labels at a given threshold.

    Returns a 10-tuple:
    ``(pr_auc, roc_auc, f1, precision, recall, balanced_acc, tn, fp, fn, tp)``.
    The two AUCs are computed from the raw probabilities; everything else uses
    hard predictions ``y_prob >= thr``. ``roc_auc`` is NaN when
    ``roc_auc_score`` raises.
    """
    hard_pred = (y_prob >= thr).astype(int)

    # Ranking metrics (threshold-free).
    average_precision = float(average_precision_score(y_true, y_prob))
    try:
        auc = float(roc_auc_score(y_true, y_prob))
    except Exception:
        auc = float("nan")

    # Point metrics at the chosen threshold.
    f1_at_thr = float(f1_score(y_true, hard_pred, zero_division=0))
    precision_at_thr = float(precision_score(y_true, hard_pred, zero_division=0))
    recall_at_thr = float(recall_score(y_true, hard_pred, zero_division=0))
    balanced = float(balanced_accuracy_score(y_true, hard_pred))

    # Fixing labels=[0, 1] keeps the matrix 2x2 even if a class is absent.
    cells = confusion_matrix(y_true, hard_pred, labels=[0, 1]).ravel()
    tn, fp, fn, tp = (int(cell) for cell in cells)

    return (
        average_precision,
        auc,
        f1_at_thr,
        precision_at_thr,
        recall_at_thr,
        balanced,
        tn,
        fp,
        fn,
        tp,
    )


def undersample_xy(X: np.ndarray, y: np.ndarray, random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Balance a binary dataset by randomly dropping rows of the larger class.

    Every row of the smaller class is kept, and an equally sized random
    sample (without replacement) is drawn from the larger class. The combined
    rows are returned in shuffled order.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, indexable by an integer row array.
    y : np.ndarray
        0/1 labels aligned with the rows of ``X``.
    random_state : int
        Seed for the sampling and the final shuffle.

    Returns
    -------
    (X_bal, y_bal)
        Balanced subset. If one of the classes is absent there is nothing to
        balance against, so ``X`` and ``y`` are returned unchanged (same
        convention as ``oversample_xy``).
    """
    rng = np.random.default_rng(random_state)
    idx_pos = np.where(y == 1)[0]
    idx_neg = np.where(y == 0)[0]
    if len(idx_pos) == 0 or len(idx_neg) == 0:
        return X, y

    # Downsample whichever class is larger, not always the negatives.
    if len(idx_pos) <= len(idx_neg):
        minority, majority = idx_pos, idx_neg
    else:
        minority, majority = idx_neg, idx_pos

    sampled = rng.choice(majority, size=len(minority), replace=False)
    keep = np.concatenate([minority, sampled])
    rng.shuffle(keep)
    return X[keep], y[keep]


def oversample_xy(X: np.ndarray, y: np.ndarray, random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Balance a binary dataset by duplicating rows of the smaller class.

    Rows of the smaller class are re-drawn with replacement until both classes
    have the same count; the result is returned in shuffled order. When either
    class is missing, ``X`` and ``y`` come back untouched.
    """
    rng = np.random.default_rng(random_state)
    positives = np.nonzero(y == 1)[0]
    negatives = np.nonzero(y == 0)[0]

    if min(len(positives), len(negatives)) == 0:
        return X, y

    # Ties count the negatives as the "minority"; the top-up is then empty.
    if len(positives) < len(negatives):
        minority, majority = positives, negatives
    else:
        minority, majority = negatives, positives

    shortfall = len(majority) - len(minority)
    extra = rng.choice(minority, size=shortfall, replace=True)
    order = np.concatenate([majority, minority, extra])
    rng.shuffle(order)
    return X[order], y[order]


def build_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    """Create an unfitted preprocessing transformer matched to ``X``'s columns.

    Numeric columns are median-imputed and standardised; all remaining columns
    are imputed with the most frequent value and one-hot encoded (categories
    unseen at fit time are ignored at transform time). Columns not assigned to
    either group are dropped.
    """
    numeric_features = X.select_dtypes(include=["number"]).columns.tolist()
    numeric_lookup = set(numeric_features)
    categorical_features = [col for col in X.columns if col not in numeric_lookup]

    numeric_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore")),
    ])

    branches = [
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
    return ColumnTransformer(
        transformers=branches,
        remainder="drop",
        sparse_threshold=0.3,
    )


def get_proba(model, X_valid_np) -> np.ndarray:
    """Return positive-class scores in [0, 1] for the rows of ``X_valid_np``.

    Uses column 1 of ``model.predict_proba`` when the model has it. Otherwise
    the output of ``model.decision_function`` is squashed through a logistic
    sigmoid.

    Raises
    ------
    ValueError
        If the model offers neither ``predict_proba`` nor ``decision_function``.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_valid_np)[:, 1]
    if hasattr(model, "decision_function"):
        s = np.asarray(model.decision_function(X_valid_np), dtype=float)
        # Sign-split sigmoid: exp() only ever sees non-positive arguments, so
        # large |s| cannot overflow (the naive 1/(1+exp(-s)) does for s << 0).
        out = np.empty_like(s)
        nonneg = s >= 0
        out[nonneg] = 1.0 / (1.0 + np.exp(-s[nonneg]))
        exp_neg = np.exp(s[~nonneg])
        out[~nonneg] = exp_neg / (1.0 + exp_neg)
        return out
    raise ValueError("Model has neither predict_proba nor decision_function.")


# Feature columns used when the caller does not pass `feature_cols`.
_DEFAULT_FEATURE_COLS = [
    "brrfc", "brrtc", "brrta",
    "brrdt", "brrtt", "brrtb",
    "borrower_fico_dti_interxn",
    "borrower_fico_tliab_interxn",
    "borrower_fico_lasset_interxn",
    "borrower_tliab_to_tincome",
    "borrower_tliab_to_tasset",
    "borrower_lasset_to_tasset",
    "borrower_dti_to_liab",
    "borrower_tliab_to_tassetsLn",
    "borrower_tincome_to_tassetLn",
    "borrower_tliab_to_tincomeLn",
]


def _make_estimators(y_tr: np.ndarray) -> Dict[str, object]:
    """Build a fresh, unfitted set of named estimators for one training fold."""
    estimators: Dict[str, object] = {}

    # Logistic baseline (class-weighted).
    estimators["logreg"] = LogisticRegression(
        max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE
    )

    # Unweighted logistic models; the caller fits them on resampled data.
    estimators["logreg_undersample"] = LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)
    estimators["logreg_oversample"] = LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)

    # Decision tree
    estimators["decision_tree"] = DecisionTreeClassifier(
        max_depth=5, min_samples_leaf=25, class_weight="balanced", random_state=RANDOM_STATE
    )

    # Random forest
    estimators["random_forest"] = RandomForestClassifier(
        n_estimators=400,
        min_samples_leaf=10,
        class_weight="balanced_subsample",
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )

    # XGBoost (only when the optional import succeeded)
    if HAS_XGB:
        # scale_pos_weight = neg/pos in the training fold; max(1, ...) avoids 0/0.
        pos = max(1, int((y_tr == 1).sum()))
        neg = max(1, int((y_tr == 0).sum()))
        estimators["xgboost"] = XGBClassifier(
            n_estimators=600,
            max_depth=4,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            objective="binary:logistic",
            eval_metric="logloss",
            random_state=RANDOM_STATE,
            n_jobs=-1,
            scale_pos_weight=neg / pos,
        )

    # Neural net (MLP) - "epochs" analogue: max_iter=10
    estimators["nn_mlp"] = MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        alpha=1e-4,
        learning_rate_init=1e-3,
        max_iter=10,
        early_stopping=False,
        random_state=RANDOM_STATE,
    )
    return estimators


def run_cv_models_no_imblearn(
    df: pd.DataFrame,
    target_col: str,
    id_col: Optional[str] = None,
    drop_cols: Optional[List[str]] = None,
    out_dir: str = ".",
    feature_cols: Optional[List[str]] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Cross-validate several classifiers on ``df`` and write the results to CSV.

    For each of ``N_SPLITS`` stratified folds the preprocessor is fitted on the
    training part only, every estimator is fitted (the two resampled logistic
    variants on under-/over-sampled training data), and the validation part is
    scored.

    NOTE(review): the decision threshold is chosen on the same validation fold
    it is then scored on, so the threshold-dependent metrics (f1, precision,
    recall, balanced accuracy, confusion counts) are optimistic. PR-AUC and
    ROC-AUC are not affected.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified.
    target_col : str
        Target column, converted with ``make_y_binary``.
    id_col : str, optional
        Column whose values label the rows in the predictions output. When
        omitted, the DataFrame index (as strings) is used.
    drop_cols : list of str, optional
        Columns that must not be used as features; any that appear in the
        feature list are removed from it.
    out_dir : str
        Directory for the three CSV files; created if it does not exist.
    feature_cols : list of str, optional
        Candidate feature columns. Defaults to ``_DEFAULT_FEATURE_COLS``.
        ``target_col``, ``id_col`` and ``drop_cols`` are always excluded.

    Returns
    -------
    (metrics_df, preds_df, summary_df)
        Per-fold metrics, per-row validation predictions, and the mean/std of
        the metrics per model.

    Raises
    ------
    ValueError
        If no feature column is left after the exclusions.
    """
    from pathlib import Path

    excluded = set(drop_cols or [])
    excluded.add(target_col)
    if id_col is not None:
        excluded.add(id_col)

    candidates = list(feature_cols) if feature_cols is not None else list(_DEFAULT_FEATURE_COLS)
    model_cols = [c for c in candidates if c not in excluded]
    if not model_cols:
        raise ValueError("No feature columns left after removing id, target and drop columns.")

    y = make_y_binary(df[target_col])

    # Row identifiers come from the full frame, so they do not depend on the
    # identifier column being part of the feature selection.
    if id_col is None:
        ids = df.index.astype(str).to_numpy()
    else:
        ids = df[id_col].astype(str).to_numpy()

    X_model = df[model_cols]
    preprocessor = build_preprocessor(X_model)

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

    fold_results: List[FoldResult] = []
    pred_rows = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_model, y), start=1):
        X_tr_df, X_va_df = X_model.iloc[tr_idx], X_model.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        id_va = ids[va_idx]

        # Fit preprocessor on train only, transform both
        X_tr_np = preprocessor.fit_transform(X_tr_df)
        X_va_np = preprocessor.transform(X_va_df)

        for model_name, model in _make_estimators(y_tr).items():
            # Resample the training data for the two sampling variants only;
            # the seed varies by fold so each fold draws a different sample.
            if model_name == "logreg_undersample":
                X_fit, y_fit = undersample_xy(X_tr_np, y_tr, random_state=RANDOM_STATE + fold)
            elif model_name == "logreg_oversample":
                X_fit, y_fit = oversample_xy(X_tr_np, y_tr, random_state=RANDOM_STATE + fold)
            else:
                X_fit, y_fit = X_tr_np, y_tr

            model.fit(X_fit, y_fit)

            y_prob = get_proba(model, X_va_np)
            thr = choose_threshold(y_va, y_prob, method=THRESH_METHOD)

            pr_auc, roc_auc, f1, prec, rec, bal_acc, tn, fp, fn, tp = compute_metrics(y_va, y_prob, thr)
            fold_results.append(FoldResult(
                model_name=model_name,
                fold=fold,
                threshold=thr,
                pr_auc=pr_auc,
                roc_auc=roc_auc,
                f1=f1,
                precision=prec,
                recall=rec,
                balanced_acc=bal_acc,
                tn=tn, fp=fp, fn=fn, tp=tp
            ))

            pred_rows.append(pd.DataFrame({
                "row_id": id_va,
                "model": model_name,
                "fold": fold,
                "y_true": y_va,
                "y_prob": y_prob,
                "threshold": thr,
                "y_pred": (y_prob >= thr).astype(int),
            }))

    metrics_df = pd.DataFrame([vars(r) for r in fold_results])
    preds_df = pd.concat(pred_rows, ignore_index=True)

    summary_df = (
        metrics_df
        .groupby("model_name")[["pr_auc", "roc_auc", "f1", "precision", "recall", "balanced_acc"]]
        .agg(["mean", "std"])
        .reset_index()
    )

    Path(out_dir).mkdir(parents=True, exist_ok=True)
    metrics_path = f"{out_dir}/cv_metrics_by_fold_{target_col}_4.csv"
    preds_path = f"{out_dir}/cv_predictions_{target_col}_4.csv"
    summary_path = f"{out_dir}/cv_metrics_summary_{target_col}_4.csv"

    metrics_df.to_csv(metrics_path, index=False)
    preds_df.to_csv(preds_path, index=False)
    summary_df.to_csv(summary_path, index=False)

    print(f"Saved:\n- {metrics_path}\n- {summary_path}\n- {preds_path}")

    return metrics_df, preds_df, summary_df


  map_tmoinfo["original_column"].astype(str).str.contains(pattern, na=False),
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)




Empirical Distribution of Default 90 DPD
      default_type90  count       pct
0         nondefault    729  0.825595
1  default w/in 2yrs     81  0.091733
2   default w/in 1yr     43  0.048698
3  default w/in 3yrs     25  0.028313
4  default w/in 4yrs      5  0.005663


Default Definition Counts
PD90_1yr
nondefault    840
default        43
Name: count, dtype: int64
PD90_2yr
nondefault    759
default       124
Name: count, dtype: int64
Nesting violations: 0


In [4]:
ID_COL = "account"  # change if needed

DROP_COLS = [
    # targets and label helpers
    "PD90_1yr",
    "PD90_2yr",
    "default_type90",
    "default_type120",
    # leakage / timing fields
    "lstdd",
    "lncnt",
    "mtrty",
    "pdffd",
    # add any cutoff-derived fields (days_since_paid, etc.) if present
]

# Everything except the target is identical for the two horizons.
_shared_cv_args = dict(
    df=df_loantape_anly,
    id_col=ID_COL,
    drop_cols=DROP_COLS,
    out_dir=".",
)

# 2-year horizon first, then 1-year (same order as the saved outputs).
metrics_2yr, preds_2yr, summary_2yr = run_cv_models_no_imblearn(
    target_col="PD90_2yr", **_shared_cv_args
)
metrics_1yr, preds_1yr, summary_1yr = run_cv_models_no_imblearn(
    target_col="PD90_1yr", **_shared_cv_args
)




Saved:
- ./cv_metrics_by_fold_PD90_2yr_4.csv
- ./cv_metrics_summary_PD90_2yr_4.csv
- ./cv_predictions_PD90_2yr_4.csv




Saved:
- ./cv_metrics_by_fold_PD90_1yr_4.csv
- ./cv_metrics_summary_PD90_1yr_4.csv
- ./cv_predictions_PD90_1yr_4.csv


