In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, classification_report,
    precision_recall_curve
)
from sklearn.neighbors import KNeighborsClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE


In [2]:
# Location of the analytical base table, relative to the notebook's working directory.
# NOTE(review): assumes the notebook runs from a folder that sits next to
# `resources/` (the same layout the `../models/` save path used at the end of
# this notebook relies on) — adjust DATA_PATH if your checkout differs.
DATA_PATH = "../resources/analytical_base_table.csv"

df = pd.read_csv(DATA_PATH)
X = df.drop(columns=['Exited'])
y = df['Exited']

# Hold out 20% of all rows as the final test set, then carve 20% of the
# remainder off as a validation set (used later for threshold tuning).
# Both splits are stratified on the target and seeded for reproducibility.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42)

# Columns handed to the preprocessing step: numeric ones are scaled,
# categorical ones are one-hot encoded.
scale_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
cat_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'NumOfProducts']


In [3]:
# Feature preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets the encoder accept categories
# at predict time that were absent during fit; sparse_output=False yields a
# dense array. Columns in neither list fall under ColumnTransformer's default
# remainder handling (dropped).
numeric_step = ('num', StandardScaler(), scale_cols)
categorical_step = (
    'cat',
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
    cat_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [4]:
# Full modelling pipeline: preprocess -> oversample the minority class with
# SMOTE -> distance-weighted KNN. imblearn's Pipeline is used because it
# accepts a sampler as an intermediate step. n_neighbors=5 is only the
# starting value; the grid search later in the notebook overrides it.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', KNeighborsClassifier(n_neighbors=5, weights='distance')),
]
pipeline = ImbPipeline(steps=pipeline_steps)

In [11]:
# Hyper-parameter grid for the KNN step of the pipeline (keys use the
# `<step>__<param>` convention): try k = 10..15 with the default
# neighbour-search algorithm.
param_grid = dict(
    model__n_neighbors=list(range(10, 16)),
    model__algorithm=['auto'],
)


In [13]:
# 5-fold cross-validated grid search over the pipeline, selecting the candidate
# with the best mean recall on the positive class.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,   # use all available cores
    verbose=1,
)
grid.fit(X_train, y_train)

# Pipeline for the winning parameter combination (GridSearchCV's default
# refit=True retrains it on all of X_train).
best_model = grid.best_estimator_
print("Best Params:", grid.best_params_)
# best_score_ is reported in the metric given by `scoring`, i.e. recall —
# the label must say so (it previously claimed ROC AUC).
print("Cross validation recall score", grid.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params: {'model__algorithm': 'auto', 'model__n_neighbors': 15}
Cross validation roc auc score 0.6955496610669025


In [14]:
# Tune the decision threshold on the validation set: score every candidate
# threshold from the precision-recall curve by F1 and keep the best one.
probs_val = best_model.predict_proba(X_val)[:, 1]  # probability of the positive class
precision, recall, thresholds = precision_recall_curve(y_val, probs_val)

# F1 = 2PR / (P + R); the small epsilon guards against a zero denominator.
f1_numerator = 2 * (precision * recall)
f1_denominator = precision + recall + 1e-8
f1_scores = f1_numerator / f1_denominator

best_idx = f1_scores.argmax()
best_thresh = thresholds[best_idx]

print(f"Optimal Threshold (F1 max): {best_thresh:.4f}")


Optimal Threshold (F1 max): 0.5902


In [15]:
# Evaluate on the held-out test set using the F1-optimal threshold.
probs_test = best_model.predict_proba(X_test)[:, 1]  # probability of the positive class
is_positive = probs_test >= best_thresh
y_pred_test = is_positive.astype(int)

# ROC-AUC is computed from the raw probabilities, so it does not depend on the threshold.
test_auc = roc_auc_score(y_test, probs_test)
print(f"ROC-AUC on Test: {test_auc:.4f}")
print(classification_report(y_test, y_pred_test))


ROC-AUC on Test: 0.8145
              precision    recall  f1-score   support

           0       0.90      0.83      0.86      1593
           1       0.49      0.64      0.55       407

    accuracy                           0.79      2000
   macro avg       0.69      0.73      0.71      2000
weighted avg       0.82      0.79      0.80      2000



In [16]:
# Alternative operating point: among validation thresholds that achieve at
# least `recall_cutoff` recall, choose the one with the highest precision.
recall_cutoff = 0.75

# precision/recall carry one trailing entry (precision=1, recall=0) that has no
# matching threshold, so only the first len(thresholds) points are candidates.
valid_idxs = np.where(recall[:-1] >= recall_cutoff)[0]
if valid_idxs.size == 0:
    raise ValueError(f"No threshold reaches recall >= {recall_cutoff}")

# Of those, pick the one with highest precision
best_idx_recall = valid_idxs[np.argmax(precision[valid_idxs])]
best_thresh_recall = thresholds[best_idx_recall]

# Build the label from recall_cutoff so the message cannot drift from the value
# actually used (it previously said 70% while the cutoff was 0.75).
print(f"Threshold for Recall ≥ {recall_cutoff:.0%}: {best_thresh_recall:.4f}")

Threshold for Recall ≥ 70%: 0.4562


In [17]:
# Evaluate on the held-out test set using the recall-constrained threshold.
probs_test = best_model.predict_proba(X_test)[:, 1]  # probability of the positive class
is_positive_recall = probs_test >= best_thresh_recall
y_pred_test_recall = is_positive_recall.astype(int)

# ROC-AUC is computed from the raw probabilities, so it does not depend on the threshold.
test_auc = roc_auc_score(y_test, probs_test)
print(f"ROC-AUC on Test: {test_auc:.4f}")
print("Classification using Recall score threshold")
print(classification_report(y_test, y_pred_test_recall))

ROC-AUC on Test: 0.8145
Classification using Recall score threshold
              precision    recall  f1-score   support

           0       0.92      0.71      0.80      1593
           1       0.40      0.76      0.53       407

    accuracy                           0.72      2000
   macro avg       0.66      0.74      0.67      2000
weighted avg       0.82      0.72      0.75      2000



In [18]:
from joblib import dump

# Persist the fitted pipeline together with the recall-oriented decision
# threshold, so whoever loads the file can reproduce the same hard
# classifications from predict_proba output.
model_package = dict(
    model=best_model,
    threshold=best_thresh_recall,
)
dump(model_package, '../models/knn_model.pkl')

['../models/knn_model.pkl']