In [15]:
import numpy as np
import pandas as pd
from pyfiles.scrpt3_data_engineering import *

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    average_precision_score,  # PR-AUC
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    balanced_accuracy_score,
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

def label_pop_v1_graduate_to_repay(
    df: pd.DataFrame,
    cutoff,
    *,
    graduated_col: str = "grdtd",
    graduation_date_col: str = "graduation_date",
    first_payment_date_col: str = "first_payment_date",
    days_past_due_col: str = "days_past_due",   # if missing, you can map to days_late
    grace_months: int = 6,
    repay_window_months: int = 6,
    max_dpd_in_window: int = 30,
    treat_N_as_negative: bool = True,
) -> pd.DataFrame:
    """
    PoP v1: Graduate-to-Repay Transition

    Goal:
      Create an auditable proxy for "placement" using only fields you have:
      - grdtd (Y/N/NaN)
      - graduation_date
      - first_payment_date
      - days_past_due (or days_late)

    Definition (recommended):
      - Eligible window begins at graduation_date + grace_months
      - Borrower is label-eligible if:
          a) grdtd == 'Y' and cutoff >= graduation_date + grace_months + repay_window_months
         OR
          b) (optional) grdtd == 'N' treated as negative immediately (not censored)
      - Placed (PoP=1) if, among eligible:
          grdtd == 'Y'
          AND first_payment_date <= graduation_date + grace_months
          AND days_past_due <= max_dpd_in_window (proxy for "no early payment failure")
      - Not placed (PoP=0):
          grdtd == 'N' (if treat_N_as_negative)
          OR grdtd == 'Y' but fails repayment transition condition

    Outputs:
      Adds columns:
        - pop_label_eligible (1/0)
        - pop_label (1/0, NaN if not eligible)
        - pop_reason (audit trail)

    Notes:
      - This uses a *single snapshot* `days_past_due` as an approximation of early-window delinquency.
        If you later have month-by-month DPD, we can replace this with "max DPD in months 6–12 post-grad".
    """

    out = df.copy()
    cutoff = pd.Timestamp(cutoff)

    # Normalize graduation indicator
    grd = out[graduated_col].astype("string").str.upper().str.strip()
    grad_date = pd.to_datetime(out[graduation_date_col], errors="coerce")
    fpd = pd.to_datetime(out[first_payment_date_col], errors="coerce")

    # If days_past_due missing, fallback to days_late if present
    if days_past_due_col not in out.columns:
        if "days_late" in out.columns:
            days = pd.to_numeric(out["days_late"], errors="coerce")
            used_days_col = "days_late"
        else:
            raise KeyError(f"Missing `{days_past_due_col}` and no fallback `days_late` found.")
    else:
        days = pd.to_numeric(out[days_past_due_col], errors="coerce")
        used_days_col = days_past_due_col

    # Key dates
    grace_end = grad_date + pd.DateOffset(months=grace_months)
    window_end = grace_end + pd.DateOffset(months=repay_window_months)

    # Eligibility:
    # - Graduated AND enough time has passed to observe grace+repay window
    eligible_grad = (
        (grd == "Y") &
        grad_date.notna() &
        window_end.notna() &
        (cutoff >= window_end)
    ).fillna(False)

    # - Not graduated (N) optionally treated as negative label and eligible immediately
    if treat_N_as_negative:
        eligible_ng = (grd == "N").fillna(False)
    else:
        eligible_ng = pd.Series(False, index=out.index)

    eligible = (eligible_grad | eligible_ng).fillna(False)


    # Placement success (for graduated eligibles):
    # 1) entered repayment by end of grace
    paid_by_grace = (
        fpd.notna() &
        grace_end.notna() &
        (fpd <= grace_end)
    ).fillna(False)

    stable = (days.fillna(0) <= max_dpd_in_window).fillna(False)

    placed = eligible_grad & paid_by_grace & stable

    # Negative cases (eligible but not placed)
    not_placed = eligible & (~placed)

    # Assign label
    pop_label = pd.Series(np.nan, index=out.index, dtype="float")
    pop_label.loc[placed] = 1.0
    pop_label.loc[not_placed] = 0.0

    # Audit reason codes
    reason = pd.Series(pd.NA, index=out.index, dtype="string")
    reason.loc[~eligible] = "CENSORED_OR_INELIGIBLE"

    reason.loc[eligible_ng] = "NOT_GRADUATED"

    # For graduated eligibles:
    reason.loc[eligible_grad & (~paid_by_grace)] = "NO_PAYMENT_BY_END_OF_GRACE"
    reason.loc[eligible_grad & paid_by_grace & (~stable)] = f"EARLY_DELINQUENCY_{used_days_col}_GT_{max_dpd_in_window}"
    reason.loc[placed] = "PLACED_GRADUATED_AND_REPAYMENT_OK"

    out["pop_label_eligible"] = eligible.astype(int)
    out["pop_label"] = pop_label
    out["pop_reason"] = reason

    # Helpful audit columns
    out["pop_cutoff"] = cutoff
    out["pop_grace_end"] = grace_end
    out["pop_window_end"] = window_end
    out["pop_paid_by_grace"] = paid_by_grace.astype(int)
    out["pop_days_metric_used"] = used_days_col
    out["pop_days_metric_value"] = days

    return out


In [16]:
df_loantape_anly = label_pop_v1_graduate_to_repay(
    df_loantape_anly,
    cutoff="2025-12-01",
    graduated_col="grdtd",                 # your field
    graduation_date_col="grdtn",
    first_payment_date_col="fstpy",
    days_past_due_col="dysps",     # or set to "days_late" if that’s your available field
    grace_months=6,
    repay_window_months=6,
    max_dpd_in_window=30,
    treat_N_as_negative=True
)

# Sanity checks
print(df_loantape_anly["pop_label_eligible"].value_counts(dropna=False))
print(df_loantape_anly["pop_label"].value_counts(dropna=False))
print(df_loantape_anly["pop_reason"].value_counts(dropna=False).head(15))


pop_label_eligible
0    756
1    127
Name: count, dtype: int64
pop_label
NaN    756
0.0     78
1.0     49
Name: count, dtype: int64
pop_reason
CENSORED_OR_INELIGIBLE               756
NOT_GRADUATED                         52
PLACED_GRADUATED_AND_REPAYMENT_OK     49
EARLY_DELINQUENCY_dysps_GT_30         26
Name: count, dtype: Int64


In [17]:
df_pop = df_loantape_anly[df_loantape_anly["pop_label_eligible"] == 1].copy()
df_pop = df_pop[df_pop["pop_label"].notna()].copy()

target_col = "pop_label"
RANDOM_STATE = 42
N_SPLITS = 3
THRESH_METHOD = "accuracy"

y = df_pop[target_col].astype(int).to_numpy()

loantape_applicant_cols_short1 = ["account", "brrgy", "ctzna", "usrsd", "vsnla",  
                                    "cbrr", "csgny", "brrfc", "brrtc", "brrta", 
                                    "brrdt", "brrtt", "brrtb", "cbrau_age", "cbrra", 
                                    "cbrab", "cbrrb", "cbrrd", "cbrrt", "cbrrc"]

loantape_applicant_cols_short2 = ["account", "brrgy", "ctzna", "usrsd", "vsnla",  
                                    "cbrr", "csgny", "brrfc", "brrtc", "brrta", 
                                    "brrdt", "brrtt", "brrtb", "cbrra", 
                                    "cbrab", "cbrrb", "cbrrd", "cbrrt", "cbrrc"]

loantape_applicant_cols_short3 = ["account", "brrfc", "brrtc", "brrta", 
                                    "brrdt", "brrtt", "brrtb"]

loantape_applicant_cols_short4 = ["account", "brrfc", "brrtc", "brrta", 
                                    "brrdt", "brrtt", "brrtb", "borrower_fico_dti_interxn",
                                    "borrower_fico_tliab_interxn",
                                    "borrower_fico_lasset_interxn",
                                    "borrower_tliab_to_tincome",
                                    "borrower_tliab_to_tasset",
                                    "borrower_lasset_to_tasset",
                                    "borrower_dti_to_liab",
                                    "borrower_tliab_to_tassetsLn",
                                    "borrower_tincome_to_tassetLn",
                                    "borrower_tliab_to_tincomeLn"
                                ]


In [18]:
def compute_metrics_pop(y_true: np.ndarray, y_prob: np.ndarray, thr: float):
    y_pred = (y_prob >= thr).astype(int)

    pr_auc = float(average_precision_score(y_true, y_prob))
    roc_auc = float(roc_auc_score(y_true, y_prob))

    precision = float(precision_score(y_true, y_pred, zero_division=0))
    recall = float(recall_score(y_true, y_pred, zero_division=0))
    f1 = float(f1_score(y_true, y_pred, zero_division=0))
    accuracy = float((y_pred == y_true).mean())

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0

    return {
        "pr_auc": pr_auc,
        "roc_auc": roc_auc,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,          # sensitivity
        "specificity": specificity,
        "f1": f1,
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "tp": int(tp),
    }


In [19]:
def build_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    num_cols = X.select_dtypes(include=["number"]).columns.tolist()
    cat_cols = [c for c in X.columns if c not in num_cols]

    num_tf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    cat_tf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore")),
    ])

    return ColumnTransformer(
        transformers=[
            ("num", num_tf, num_cols),
            ("cat", cat_tf, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=0.3
    )

def get_proba(model, X_valid_np) -> np.ndarray:
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_valid_np)[:, 1]
    if hasattr(model, "decision_function"):
        s = model.decision_function(X_valid_np)
        return 1 / (1 + np.exp(-s))
    raise ValueError("Model has neither predict_proba nor decision_function.")

def choose_threshold(y_true: np.ndarray, y_prob: np.ndarray, method: str = "f1") -> float:
    prec, rec, thr = precision_recall_curve(y_true, y_prob)
    if thr.size == 0:
        return 0.5
    if method == "f1":
        f1_vals = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
        return float(thr[int(np.nanargmax(f1_vals))])
    return 0.5

def run_cv_models_pop(
    df: pd.DataFrame,
    target_col: str = "pop_label",
    id_col: str = "account",
    out_dir: str = ".",
):

    # -----------------------------
    # 1) Restrict to eligible only
    # -----------------------------
    df = df[df["pop_label_eligible"] == 1].copy()
    df = df[df[target_col].notna()].copy()

    y = df[target_col].astype(int).to_numpy()

    # Feature set (same as applicants)
    X = df[loantape_applicant_cols_short4].copy()
    ids = X[id_col].astype(str).to_numpy()
    X_model = X.drop(columns=[id_col])

    preprocessor = build_preprocessor(X_model)

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

    fold_rows = []
    pred_rows = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_model, y), start=1):
        X_tr_df, X_va_df = X_model.iloc[tr_idx], X_model.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        id_va = ids[va_idx]

        X_tr_np = preprocessor.fit_transform(X_tr_df)
        X_va_np = preprocessor.transform(X_va_df)

        models = {
            "logreg": LogisticRegression(
                max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE
            ),
            "decision_tree": DecisionTreeClassifier(
                max_depth=4, min_samples_leaf=10, class_weight="balanced", random_state=RANDOM_STATE
            ),
            "random_forest": RandomForestClassifier(
                n_estimators=300,
                min_samples_leaf=10,
                class_weight="balanced_subsample",
                random_state=RANDOM_STATE,
                n_jobs=-1
            ),
        }

        for name, model in models.items():
            model.fit(X_tr_np, y_tr)
            y_prob = get_proba(model, X_va_np)
            thr = choose_threshold(y_va, y_prob, method="f1")

            m = compute_metrics_pop(y_va, y_prob, thr)
            m.update({"model": name, "fold": fold, "threshold": thr})
            fold_rows.append(m)

            pred_rows.append(pd.DataFrame({
                "account": id_va,
                "model": name,
                "fold": fold,
                "y_true": y_va,
                "y_prob": y_prob,
                "threshold": thr,
                "y_pred": (y_prob >= thr).astype(int)
            }))

    metrics_df = pd.DataFrame(fold_rows)
    preds_df = pd.concat(pred_rows, ignore_index=True)

    summary_df = (
        metrics_df
        .groupby("model")[[
            "accuracy", "recall", "specificity",
            "precision", "f1", "roc_auc", "pr_auc"
        ]]
        .agg(["mean", "std"])
        .reset_index()
    )

    metrics_df.to_csv(f"{out_dir}/cv_metrics_by_fold_PoP.csv", index=False)
    preds_df.to_csv(f"{out_dir}/cv_predictions_PoP.csv", index=False)
    summary_df.to_csv(f"{out_dir}/cv_metrics_summary_PoP.csv", index=False)

    return metrics_df, preds_df, summary_df


In [20]:
pop_metrics, pop_preds, pop_summary = run_cv_models_pop(
    df=df_loantape_anly,
    target_col="pop_label",
    id_col="account",
    out_dir="."
)

print(pop_summary)




           model  accuracy              recall           specificity  \
                      mean       std      mean       std        mean   
0  decision_tree  0.440015  0.102299  0.960784  0.067924    0.115385   
1         logreg  0.646364  0.090029  0.856618  0.157832    0.512821   
2  random_forest  0.543559  0.157213  0.939951  0.058862    0.294872   

            precision                  f1             roc_auc            \
        std      mean       std      mean       std      mean       std   
0  0.199852  0.410218  0.050690  0.571898  0.034942  0.577418  0.047883   
1  0.225364  0.543979  0.092359  0.653254  0.036216  0.642298  0.055833   
2  0.270145  0.469448  0.096497  0.621308  0.081631  0.615196  0.135759   

     pr_auc            
       mean       std  
0  0.449515  0.029285  
1  0.514438  0.065112  
2  0.503011  0.134649  


### Calibration of PoP

In [21]:
from sklearn.calibration import CalibratedClassifierCV

cal_pop = CalibratedClassifierCV(
    base_estimator=logreg_model,
    method="sigmoid",
    cv=3
)


NameError: name 'logreg_model' is not defined