In [1]:
import os

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    roc_auc_score, classification_report,
    precision_recall_curve
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Import classifier
from sklearn.svm import SVC


In [2]:
# Location of the analytical base table. Set the CHURN_DATA_PATH environment
# variable to run on another machine; the fallback is the original local path.
DATA_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    r"C:\Users\dell\OneDrive\Desktop\bank churn\resources\analytical_base_table.csv",
)
TARGET_COL = 'Exited'
HOLDOUT_FRACTION = 0.2  # used for both the test split and the validation split
SEED = 42

df = pd.read_csv(DATA_PATH)
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Split into train+val and test; both splits are stratified on the target so
# every partition keeps the same class ratio.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, stratify=y, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=HOLDOUT_FRACTION, stratify=y_temp, random_state=SEED
)

# Define columns
scale_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
cat_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'NumOfProducts']

# Fail fast with a readable message if the table lacks an expected feature,
# instead of surfacing the problem later inside the pipeline fit.
missing_cols = sorted(set(scale_cols + cat_cols) - set(X.columns))
if missing_cols:
    raise KeyError(f"Columns missing from {DATA_PATH}: {missing_cols}")


In [3]:
# Feature preprocessing: standardise the columns in scale_cols and one-hot
# encode the columns in cat_cols. handle_unknown='ignore' lets the encoder
# accept categories at predict time that were not seen during fit.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            StandardScaler(),
            scale_cols,
        ),
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            cat_cols,
        ),
    ]
)


In [4]:
# Full modelling pipeline: preprocessing followed by an SVM classifier.
# No SMOTE resampling step is active; class imbalance is addressed through
# class_weight='balanced' on the SVC instead. probability=True is what makes
# predict_proba available on the fitted model.
pipeline = ImbPipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'model',
            SVC(
                class_weight='balanced',
                probability=True,
                random_state=42,
            ),
        ),
    ]
)


In [6]:
# Hyper-parameter search space for the 'model' step of the pipeline
# (the `model__` prefix routes each entry to that step): 1 x 2 x 2 = 4 candidates.
param_grid = {
    'model__kernel': [
        'rbf',
    ],
    'model__C': [
        5,
        10,
    ],
    'model__gamma': [
        0.1,
        0.01,
    ],
}

In [7]:
# Exhaustive 5-fold cross-validated search over param_grid, ranked by ROC-AUC
# and parallelised over all available cores. Only the training split is used,
# so the validation and test splits stay untouched for later steps.
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid.fit(X_train, y_train)

# best_estimator_ is the pipeline refit on all of X_train with the winning parameters.
best_model = grid.best_estimator_

print("Best Params:", grid.best_params_)
print("Cross validation roc auc score", grid.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Params: {'model__C': 10, 'model__gamma': 0.01, 'model__kernel': 'rbf'}
Cross validation roc auc score 0.8486892225962975


In [8]:
# Score the validation split with the tuned pipeline; column 1 of
# predict_proba is taken as the churn (class 1) probability.
probs_val = best_model.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, probs_val)

# F1 at every point of the precision-recall curve. The small epsilon keeps the
# division defined where precision and recall are both zero.
f1_denominator = precision + recall + 1e-8
f1_scores = 2 * (precision * recall) / f1_denominator

# Pick the decision threshold at which validation F1 peaks.
best_idx = f1_scores.argmax()
best_thresh = thresholds[best_idx]

print(f"Optimal Threshold (F1 max): {best_thresh:.4f}")


Optimal Threshold (F1 max): 0.3434


In [9]:
# Evaluate on the test split using the F1-optimal threshold chosen on validation.
probs_test = best_model.predict_proba(X_test)[:, 1]
y_pred_test = (probs_test >= best_thresh).astype(int)

# ROC-AUC is threshold-free, so it is computed from the raw probabilities.
test_auc = roc_auc_score(y_test, probs_test)
print(f"ROC-AUC on Test: {test_auc:.4f}")
print(classification_report(y_test, y_pred_test))


ROC-AUC on Test: 0.8590
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      1593
           1       0.61      0.63      0.62       407

    accuracy                           0.84      2000
   macro avg       0.76      0.76      0.76      2000
weighted avg       0.85      0.84      0.84      2000



In [10]:
# Minimum validation recall the operating threshold must achieve.
recall_cutoff = 0.75

# precision_recall_curve appends one final (precision=1, recall=0) point that
# has no entry in `thresholds`; drop it so every candidate index is a valid
# position in `thresholds`.
valid_idxs = np.where(recall[:-1] >= recall_cutoff)[0]
if valid_idxs.size == 0:
    raise ValueError(f"No threshold reaches recall >= {recall_cutoff}")

# Of those, pick the one with highest precision
best_idx_recall = valid_idxs[np.argmax(precision[:-1][valid_idxs])]
best_thresh_recall = thresholds[best_idx_recall]

# The message is derived from recall_cutoff so it cannot drift from the value used.
print(f"Threshold for Recall ≥ {recall_cutoff:.0%}: {best_thresh_recall:.4f}")

Threshold for Recall ≥ 70%: 0.1996


In [11]:
# Re-score the test split and binarise with the recall-driven threshold.
probs_test = best_model.predict_proba(X_test)[:, 1]
y_pred_test_recall = (probs_test >= best_thresh_recall).astype(int)

# ROC-AUC depends only on the probabilities, not on the chosen threshold.
test_auc = roc_auc_score(y_test, probs_test)
print(f"ROC-AUC on Test: {test_auc:.4f}")
print("Classification using Recall score threshold")
print(classification_report(y_test, y_pred_test_recall))

ROC-AUC on Test: 0.8590
Classification using Recall score threshold
              precision    recall  f1-score   support

           0       0.93      0.76      0.84      1593
           1       0.46      0.77      0.57       407

    accuracy                           0.77      2000
   macro avg       0.69      0.77      0.71      2000
weighted avg       0.83      0.77      0.79      2000



In [16]:
from joblib import dump

# Destination of the serialized artefact, relative to the notebook's working directory.
MODEL_PATH = '../models/svc_model.pkl'

# Bundle the fitted pipeline with the recall-based decision threshold so that
# consumers apply the same cut-off that was selected on the validation split.
model_package = {
    'model': best_model,
    'threshold': best_thresh_recall,
}

# Create the target directory if needed; dump() would otherwise fail when it is missing.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
dump(model_package, MODEL_PATH)

['../models/svc_model.pkl']

In [12]:
# Class-1 probabilities for every split, in train / validation / test order.
probs_train, probs_val, probs_test = (
    best_model.predict_proba(split_features)[:, 1]
    for split_features in (X_train, X_val, X_test)
)


In [13]:
# Hard 0/1 predictions for every split using the recall-based threshold.
y_train_pred, y_val_pred, y_test_pred = (
    (split_probs >= best_thresh_recall).astype(int)
    for split_probs in (probs_train, probs_val, probs_test)
)


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(y_true, y_pred, name):
    """Print accuracy, precision, recall and F1 for one data split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        name: Label for the split, shown in the section header.

    Returns:
        None. The metrics are written to stdout.
    """
    print(f"\n--- {name} ---")
    # Metrics are printed in a fixed order, each with its pre-padded label.
    labelled_metrics = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1 Score :", f1_score),
    )
    for label, metric_fn in labelled_metrics:
        print(label, metric_fn(y_true, y_pred))

# Report the same metrics for each split to compare train / validation / test behaviour.
for split_name, split_labels, split_preds in (
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    evaluate(split_labels, split_preds, split_name)



--- Train ---
Accuracy : 0.7809375
Precision: 0.47718808193668527
Recall   : 0.786042944785276
F1 Score : 0.5938586326767091

--- Validation ---
Accuracy : 0.763125
Precision: 0.45137614678899085
Recall   : 0.754601226993865
F1 Score : 0.5648679678530425

--- Test ---
Accuracy : 0.7665
Precision: 0.45652173913043476
Recall   : 0.773955773955774
F1 Score : 0.5742935278030994
