In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, classification_report,
    precision_recall_curve
)
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE


In [2]:
df = pd.read_csv(r"C:\Users\dell\OneDrive\Desktop\bank churn\resources\analytical_base_table.csv")  # replace with actual data path
X = df.drop(columns=['Exited'])
y = df['Exited']

# Split into train+val and test
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42)

# Define columns
scale_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
cat_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'NumOfProducts']


In [3]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), scale_cols),
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)
])


In [4]:
pipeline = ImbPipeline([
    ('preprocessor',preprocessor),
    ('model',RandomForestClassifier(class_weight='balanced'))
]
)

In [7]:
param_grid={
 'model__n_estimators' : [150,200,250],
 'model__min_samples_leaf' : [15,20,25],
 'model__min_samples_split' : [3, 4,5]
}

In [8]:
grid = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    scoring = 'recall',
    cv=5,n_jobs=-1,verbose=1
)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_
print("Best Params:", grid.best_params_)
print("Cross validation recall score" ,grid.best_score_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Params: {'model__min_samples_leaf': 25, 'model__min_samples_split': 3, 'model__n_estimators': 200}
Cross validation recall score 0.7438697318007663


In [9]:
probs_val = best_model.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, probs_val)

f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)  # avoid div-by-zero
best_idx = np.argmax(f1_scores)
best_thresh = thresholds[best_idx]

print(f"Optimal Threshold (F1 max): {best_thresh:.4f}")


Optimal Threshold (F1 max): 0.5554


In [10]:
recall_cutoff = 0.75
valid_idxs = np.where(recall >= recall_cutoff)[0]

# Of those, pick the one with highest precision
best_idx_recall = valid_idxs[np.argmax(precision[valid_idxs])]
best_thresh_recall = thresholds[best_idx_recall]

print(f"Threshold for Recall ≥ 70%: {best_thresh_recall:.4f}")

Threshold for Recall ≥ 70%: 0.4892


In [11]:
probs_test = best_model.predict_proba(X_test)[:, 1]
y_pred_test_recall = (probs_test >= best_thresh_recall).astype(int)

print(f"ROC-AUC on Test: {roc_auc_score(y_test, probs_test):.4f}")
print("Classification using Recall score threshold")
print(classification_report(y_test, y_pred_test_recall))

ROC-AUC on Test: 0.8579
Classification using Recall score threshold
              precision    recall  f1-score   support

           0       0.93      0.80      0.86      1593
           1       0.50      0.76      0.60       407

    accuracy                           0.79      2000
   macro avg       0.71      0.78      0.73      2000
weighted avg       0.84      0.79      0.81      2000



In [12]:
probs_test = best_model.predict_proba(X_test)[:, 1]
y_pred_test = (probs_test >= best_thresh).astype(int)

print(f"ROC-AUC on Test: {roc_auc_score(y_test, probs_test):.4f}")
print("Classification using F1 score threshold")
print(classification_report(y_test, y_pred_test))


ROC-AUC on Test: 0.8579
Classification using F1 score threshold
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1593
           1       0.56      0.68      0.62       407

    accuracy                           0.83      2000
   macro avg       0.74      0.77      0.75      2000
weighted avg       0.84      0.83      0.83      2000



In [13]:
from joblib import dump

model_package = {
    'model':best_model,
    'threshold':best_thresh_recall
}
dump(model_package,'../models/rforest_model.pkl')

['../models/rforest_model.pkl']

In [14]:
best_thresh_recall

0.48916555821411317

In [15]:
probs_train = best_model.predict_proba(X_train)[:, 1]
probs_val   = best_model.predict_proba(X_val)[:, 1]
probs_test  = best_model.predict_proba(X_test)[:, 1]


In [None]:
y_train_pred = (probs_train >= best_thresh_recall).astype(int)
y_val_pred   = (probs_val >= best_thresh_recall).astype(int)
y_test_pred  = (probs_test >= best_thresh_recall).astype(int)


In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(y_true, y_pred, name):
    print(f"\n--- {name} ---")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall   :", recall_score(y_true, y_pred))
    print("F1 Score :", f1_score(y_true, y_pred))

evaluate(y_train, y_train_pred, "Train")
evaluate(y_val, y_val_pred, "Validation")
evaluate(y_test, y_test_pred, "Test")



--- Train ---
Accuracy : 0.814375
Precision: 0.5295315682281059
Recall   : 0.7975460122699386
F1 Score : 0.6364749082007344

--- Validation ---
Accuracy : 0.7925
Precision: 0.4939516129032258
Recall   : 0.7515337423312883
F1 Score : 0.5961070559610705

--- Test ---
Accuracy : 0.7935
Precision: 0.4951923076923077
Recall   : 0.7592137592137592
F1 Score : 0.5994180407371484


In [18]:
#hence no overfiiting