In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    average_precision_score,  
    roc_auc_score, brier_score_loss,         
    precision_recall_curve, fbeta_score
)

In [3]:
# Load the raw dataset; the relative path means the CSV must sit in the
# kernel's current working directory.
df = pd.read_csv("BankChurners.csv")

In [5]:
# --- Experiment configuration -------------------------------------------
# Fraction of rows held out by train_test_split.
TEST_SIZE = 0.25
# Seed shared by the train/test split and the random forest.
RANDOM_STATE = 42
# Minimum precision required when picking the operating threshold.
PRECISION_FLOOR = 0.60
# True  -> keep only the FEATURE_PRIORITY columns that exist in df.
# False -> keep every column except the target and ID-like columns.
USE_FEATURE_SUBSET = True
# NOTE(review): not read by any cell visible here; presumably toggles a
# cross-validation step elsewhere -- confirm or remove.
DO_CV = False

# Candidate input columns, in priority order.
FEATURE_PRIORITY = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]

In [7]:
# --- Target -------------------------------------------------------------
# Resolve the binary target y (1 = churned) from the first matching column:
# an explicit 'Churn' column, the 'Attrition_Flag' labels, or any column
# whose name contains "churn".
if 'Churn' in df.columns:
    target_col = 'Churn'
    y = df[target_col].astype(int)
elif 'Attrition_Flag' in df.columns:
    target_col = 'Attrition_Flag'
    label_map = {'Attrited Customer': 1, 'Existing Customer': 0}
    y_mapped = df[target_col].map(label_map)
    # Labels outside label_map (including missing values) become NaN; fail
    # with an explicit message instead of an opaque int-cast error.
    unmapped = y_mapped.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, target_col].astype(str).unique())
        raise ValueError(f"Valores no reconocidos en {target_col}: {unknown}")
    y = y_mapped.astype(int)
else:
    cand = [c for c in df.columns if 'churn' in c.lower()]
    if not cand:
        raise ValueError("No hay columna objetivo (Churn o Attrition_Flag).")
    target_col = cand[0]
    y = df[target_col].astype(int)

# --- Feature selection --------------------------------------------------
# Identifier-like columns that must never be used as inputs.
drop_like = {'CLIENTNUM','CustomerId','customer_id','ID','id'}
# NOTE(review): the public BankChurners.csv ships precomputed
# 'Naive_Bayes_Classifier_*' columns derived from the target; presumably they
# are present here too -- confirm. They are excluded so they cannot leak the
# label when USE_FEATURE_SUBSET is False.
leak_prefixes = ('Naive_Bayes_Classifier_',)

if USE_FEATURE_SUBSET:
    cols = [c for c in FEATURE_PRIORITY if c in df.columns and c != target_col]
    # Surface requested features that are absent instead of dropping them silently.
    missing = [c for c in FEATURE_PRIORITY if c not in df.columns]
    if missing:
        print(f"Aviso: features de FEATURE_PRIORITY ausentes en df: {missing}")
else:
    cols = [
        c for c in df.columns
        if c != target_col
        and c not in drop_like
        and not str(c).startswith(leak_prefixes)
    ]

if not cols:
    raise ValueError("No hay columnas de entrada tras la selección; revisa nombres.")
X = df[cols].copy()

print(f"X shape: {X.shape} | y positivos: {y.sum()} ({y.mean():.2%}) | target: {target_col}")


X shape: (10127, 17) | y positivos: 1627 (16.07%) | target: Attrition_Flag


In [10]:
# Split inputs into numeric and categorical columns.
# Anything that is not a numeric dtype is treated as categorical, so
# `category` and pandas string dtypes are one-hot encoded rather than being
# sent to the scaler (an `== 'object'` check would miss them).
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# Preprocessor using the `sparse_output` keyword of recent scikit-learn
# releases. The one-hot block is sparse, so the scaler is configured with
# with_mean=False (no centering).
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
    ],
    remainder="drop"
)
print(f"Num: {len(num_cols)} | Cat: {len(cat_cols)}")

Num: 12 | Cat: 5


In [11]:
def recall_at_precision_floor(y_true, y_proba, floor=0.60):
    """Find the precision-recall operating point with the highest recall
    among those whose precision is at least ``floor``.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels.
    y_proba : array-like
        Scores / probabilities for the positive class. Any array-like is
        accepted (list, ndarray, Series); it is converted with ``np.asarray``.
    floor : float, default 0.60
        Minimum precision an operating point must reach to be eligible.

    Returns
    -------
    (recall, f2, threshold) : tuple of float
        Recall at the chosen point, the F2 score of the hard predictions
        ``y_proba >= threshold``, and the threshold itself. If no point
        reaches ``floor`` the result is ``(nan, nan, nan)``.

    Notes
    -----
    When several eligible points share the maximum recall, the first one in
    curve order is returned (``np.argmax`` keeps the first occurrence).
    """
    scores = np.asarray(y_proba)
    prec, rec, thr = precision_recall_curve(y_true, scores)

    # precision/recall carry one more entry than thresholds; drop the last
    # so that every remaining index lines up with an entry of `thr`.
    prec_at_thr, rec_at_thr = prec[:-1], rec[:-1]

    mask = prec_at_thr >= floor
    if not np.any(mask):
        return np.nan, np.nan, np.nan

    idxs = np.where(mask)[0]
    best_idx = idxs[np.argmax(rec_at_thr[mask])]
    best_thr = float(thr[best_idx])
    best_rec = float(rec_at_thr[best_idx])

    # Hard predictions at the selected threshold, used only for the F2 score.
    y_pred = (scores >= best_thr).astype(int)
    f2 = float(fbeta_score(y_true, y_pred, beta=2, zero_division=0))
    return best_rec, f2, best_thr


In [12]:
# Candidate classifiers keyed by a short label; both use balanced class
# weights. Insertion order (LR, then RF) is the order they are evaluated in.
MODELS = dict(
    LR=LogisticRegression(
        solver="lbfgs",
        class_weight="balanced",
        max_iter=500,
    ),
    RF=RandomForestClassifier(
        n_estimators=120,
        max_depth=None,
        class_weight="balanced",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    ),
)

In [13]:
# Stratified hold-out split so both partitions keep the positive rate of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Column suffix derived from the configured floor (0.60 -> "p60"), so the
# metric names stay truthful if PRECISION_FLOOR is changed.
floor_tag = f"p{PRECISION_FLOOR * 100:g}"

records = []
for name, clf in MODELS.items():
    # Preprocessing and classifier are fit together on the training split only.
    pipe = Pipeline(steps=[("pre", preprocess), ("clf", clf)])
    pipe.fit(X_train, y_train)
    proba = pipe.predict_proba(X_test)[:, 1]

    # Threshold-free metrics on the held-out split.
    pr_auc = float(average_precision_score(y_test, proba))
    roc = float(roc_auc_score(y_test, proba))
    brier = float(brier_score_loss(y_test, proba))

    # NOTE(review): the operating threshold is selected on the same test
    # split it is scored on, so recall/F2 at the floor are optimistic;
    # choose the threshold on a validation split for an unbiased estimate.
    rec_at_floor, f2_at_floor, thr_at_floor = recall_at_precision_floor(
        y_test, proba, floor=PRECISION_FLOOR
    )

    records.append({
        "model": name,
        "pr_auc": pr_auc,
        "roc_auc": roc,
        "brier": brier,
        f"recall_at_{floor_tag}": rec_at_floor,
        f"f2_at_{floor_tag}": f2_at_floor,
        f"thr_at_{floor_tag}": thr_at_floor,
    })

# One row per model, best PR-AUC first.
results = pd.DataFrame(records).sort_values("pr_auc", ascending=False).reset_index(drop=True)
results

Unnamed: 0,model,pr_auc,roc_auc,brier,recall_at_p60,f2_at_p60,thr_at_p60
0,RF,0.892181,0.974679,0.048984,0.955774,0.858341,0.158333
1,LR,0.667538,0.904424,0.118798,0.695332,0.674131,0.687937
