# Bank Customer Churn - Machine Learning



In [1]:
import pandas as pd
import numpy as np
import time
import xgboost as xgb

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
%pip install opendatasets
import opendatasets as od

#Fetching data from kaggle
dataset_url = 'https://www.kaggle.com/datasets/gauravtopre/bank-customer-churn-dataset'

od.download(dataset_url)

Skipping, found downloaded files in "./bank-customer-churn-dataset" (use force=True to force download)


# Data Preprocessing

In [3]:
# Load the dataset
data = pd.read_csv('bank-customer-churn-dataset/Bank Customer Churn Prediction.csv')

# Drop columns
data.drop(columns=['customer_id'], inplace=True)

X = data.drop(columns=['churn'])

# Scaling numerical columns
cols_to_scale = ['credit_score', 'balance', 'estimated_salary']
scaler = MinMaxScaler()
X[cols_to_scale] = scaler.fit_transform(X[cols_to_scale])

# Convert categorical columns to one-hot encoding
X = pd.get_dummies(X, columns=['country', 'gender'], drop_first=False)

#Ensure all columns are numeric
X = X.astype(int)

# Target variable
y = data['churn']

In [4]:
print("Complete Preprocessed Dataset for Churn Model:")
display(X.head(), y.head())

Complete Preprocessed Dataset for Churn Model:


Unnamed: 0,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country_France,country_Germany,country_Spain,gender_Female,gender_Male
0,0,42,2,0,1,1,1,0,1,0,0,1,0
1,0,41,1,0,1,0,1,0,0,0,1,1,0
2,0,42,8,0,3,1,0,0,1,0,0,1,0
3,0,39,1,0,2,0,0,0,1,0,0,1,0
4,0,43,2,0,1,1,1,0,0,0,1,1,0


Unnamed: 0,churn
0,1
1,0
2,1
3,0
4,0


# Splitting data

In [5]:
X_train_temp, X_temp, y_train_temp, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [6]:
# Apply SMOTE on the training data to handle class imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_temp, y_train_temp)

In [7]:
print(f'Original data: {X.shape[0]} rows')
print(f'Training data: {X_train_temp.shape[0]} rows')
print(f'Training data (SMOTE): {X_train_smote.shape[0]} rows')
print(f'Validation data: {X_val.shape[0]} rows')
print(f'Test data: {X_test.shape[0]} rows')

Original data: 10000 rows
Training data: 7000 rows
Training data (SMOTE): 11094 rows
Validation data: 1500 rows
Test data: 1500 rows


# Logistic Regression Model

#### We're not using SMOTE with Logistic Regression because it may cause overfitting by introducing synthetic data. Instead, we're using class_weight='balanced' to handle class imbalance, which is more suitable for this linear model.

In [8]:
# Logistic Regression pipeline
logreg_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression(
            penalty='l2',              # L2 regularization
            max_iter=1000,             # Ensure convergence
            class_weight='balanced',   # Handle class imbalance
            solver='saga',             # Use saga solver
            n_jobs=-1                  # Use all CPU cores
            ))
    ]
)

In [9]:
start_time = time.time()
logreg_pipeline.fit(X_train_temp, y_train_temp)
end_time = time.time()

train_time_logreg = end_time - start_time

print(f"LogReg Training time: {train_time_logreg:.2f} seconds")

LogReg Training time: 0.16 seconds


### Predict on the validation and test set

In [10]:
y_pred_val_logreg = logreg_pipeline.predict(X_val)
y_pred_test_logreg = logreg_pipeline.predict(X_test)

### Check Accuracy

In [11]:
val_acc_logreg = accuracy_score(y_val, y_pred_val_logreg)
test_acc_logreg = accuracy_score(y_test, y_pred_test_logreg)

print(f"LogReg Validation Accuracy: {val_acc_logreg:.2f}")
print(f"LogReg Test Accuracy: {test_acc_logreg:.2f}")

LogReg Validation Accuracy: 0.70
LogReg Test Accuracy: 0.72


###  Classification report for validation and test sets

In [13]:
report_val_logreg = classification_report(y_val, y_pred_val_logreg)
report_test_logreg = classification_report(y_test, y_pred_test_logreg)

print(f"LogReg Classification Report (Validation):\n{report_val_logreg}")
print(f"LogReg Classification Report (Test):\n{report_test_logreg}")

LogReg Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.91      0.70      0.79      1216
           1       0.35      0.69      0.46       284

    accuracy                           0.70      1500
   macro avg       0.63      0.69      0.63      1500
weighted avg       0.80      0.70      0.73      1500

LogReg Classification Report (Test):
              precision    recall  f1-score   support

           0       0.91      0.72      0.80      1200
           1       0.39      0.72      0.51       300

    accuracy                           0.72      1500
   macro avg       0.65      0.72      0.66      1500
weighted avg       0.81      0.72      0.74      1500



# XGBClassifier Model

In [14]:
xgb_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('xgb', XGBClassifier(
            objective='binary:logistic',     # Binary classification
            eval_metric='logloss',           # Evaluation metric
            n_estimators=200,                # Number of boosting rounds
            n_jobs=-1                        # Use all CPU cores
            ))
    ])

In [15]:
start_time = time.time()
xgb_pipeline.fit(X_train_temp, y_train_temp)
end_time = time.time()

train_time_xgb = end_time - start_time

print(f"XGBoost Training time: {train_time_xgb:.2f} seconds")

XGBoost Training time: 0.19 seconds


### Predictions and evalutaion

In [16]:
y_pred_val_xgb = xgb_pipeline.predict(X_val)
y_pred_test_xgb = xgb_pipeline.predict(X_test)
print(f"XGBoost Validation Accuracy (no SMOTE): {accuracy_score(y_val, y_pred_val_xgb):.2f}")
print(f"XGBoost Test Accuracy (no SMOTE): {accuracy_score(y_test, y_pred_test_xgb):.2f}")
print("XGBoost Classification Report (Test, no SMOTE):")
print(classification_report(y_test, y_pred_test_xgb))

XGBoost Validation Accuracy (no SMOTE): 0.85
XGBoost Test Accuracy (no SMOTE): 0.84
XGBoost Classification Report (Test, no SMOTE):
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      1200
           1       0.65      0.45      0.53       300

    accuracy                           0.84      1500
   macro avg       0.76      0.70      0.72      1500
weighted avg       0.83      0.84      0.83      1500



# XGBClassifier Model with SMOTE

In [17]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_temp, y_train_temp)

In [18]:
xgb_pipeline_smote = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('xgb', XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            n_estimators=200,
            n_jobs=-1
        ))
    ])

In [19]:
start_time = time.time()
xgb_pipeline_smote.fit(X_train_smote, y_train_smote)
end_time = time.time()
train_time_xgb_smote = end_time - start_time
print(f"XGBoost Training time (with SMOTE): {train_time_xgb_smote:.2f} seconds")

XGBoost Training time (with SMOTE): 0.92 seconds


### Predictions and evaluation with SMOTE

In [20]:
y_pred_val_xgb_smote = xgb_pipeline_smote.predict(X_val)
y_pred_test_xgb_smote = xgb_pipeline_smote.predict(X_test)
print(f"XGBoost Validation Accuracy (with SMOTE): {accuracy_score(y_val, y_pred_val_xgb_smote):.2f}")
print(f"XGBoost Test Accuracy (with SMOTE): {accuracy_score(y_test, y_pred_test_xgb_smote):.2f}")
print("XGBoost Classification Report (Test, with SMOTE):")
print(classification_report(y_test, y_pred_test_xgb_smote))

XGBoost Validation Accuracy (with SMOTE): 0.81
XGBoost Test Accuracy (with SMOTE): 0.81
XGBoost Classification Report (Test, with SMOTE):
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      1200
           1       0.53      0.55      0.54       300

    accuracy                           0.81      1500
   macro avg       0.71      0.71      0.71      1500
weighted avg       0.82      0.81      0.81      1500



- Without SMOTE: XGBoost achieves higher accuracy (84%), but it struggles with detecting churn cases (low recall of 45%). This means it's better at predicting non-churn (Klass 0) but misses many churn customers (Klass 1).

- With SMOTE: XGBoost's recall for churn cases improves to 55%, meaning it catches more churn customers. However, overall accuracy drops slightly to 81%, as the model becomes more balanced but may also produce more false positives for churn (lower precision).

# SVM Model

In [21]:
# SVM pipeline
svm_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svm', SVC(
            kernel='rbf',             # Radial basis function kernel
            C=1.0,                    # Regularization parameter
            class_weight='balanced',  # Handle class imbalance
            probability=True         # Enable probability estimates
        ))
    ]
)


In [22]:
start_time = time.time()
svm_pipeline.fit(X_train_temp, y_train_temp)
end_time = time.time()

train_time_svm = end_time - start_time

print(f"SVM Training time: {train_time_svm:.2f} seconds")

SVM Training time: 31.97 seconds


### Prediction and evaluation

In [23]:
val_acc_svm = svm_pipeline.predict(X_val)
test_acc_svm = svm_pipeline.predict(X_test)
print(f"SVM Validation Accuracy: {accuracy_score(y_val, val_acc_svm):.2f}")
print(f"SVM Test Accuracy: {accuracy_score(y_test, test_acc_svm):.2f}")
print("SVM Classification Report:")
print(classification_report(y_test, test_acc_svm))


SVM Validation Accuracy: 0.79
SVM Test Accuracy: 0.79
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.81      0.86      1200
           1       0.49      0.74      0.59       300

    accuracy                           0.79      1500
   macro avg       0.71      0.77      0.72      1500
weighted avg       0.84      0.79      0.81      1500



# SVM Model with SMOTE

In [24]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_temp, y_train_temp)

In [25]:
svm_pipeline_smote = Pipeline(
    steps=[
        ('scaler', StandardScaler()),  # Optional scaling step
        ('svm', SVC(
            kernel='rbf',
            C=1.0,
            class_weight='balanced',
            probability=True
        ))
    ])


In [26]:
start_time = time.time()
svm_pipeline_smote.fit(X_train_smote, y_train_smote)
end_time = time.time()
train_time_svm_smote = end_time - start_time
print(f"SVM Training time (with SMOTE): {train_time_svm_smote:.2f} seconds")

SVM Training time (with SMOTE): 21.46 seconds


### Predictions and evaluation with SMOTE

In [27]:
val_acc_svm_smote = svm_pipeline_smote.predict(X_val)
test_acc_svm_smote = svm_pipeline_smote.predict(X_test)
print(f"SVM Validation Accuracy (SMOTE): {accuracy_score(y_val, val_acc_svm_smote):.2f}")
print(f"SVM Test Accuracy (SMOTE): {accuracy_score(y_test, test_acc_svm_smote):.2f}")
print("SVM Classification Report (Test (SMOTE):")
print(classification_report(y_test, test_acc_svm_smote))

SVM Validation Accuracy (SMOTE): 0.82
SVM Test Accuracy (SMOTE): 0.82
SVM Classification Report (Test (SMOTE):
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      1200
           1       0.55      0.63      0.59       300

    accuracy                           0.82      1500
   macro avg       0.73      0.75      0.74      1500
weighted avg       0.83      0.82      0.83      1500



- Without SMOTE: SVM achieved a test accuracy of 79%, with a recall of 74% for churn cases (Class 1). However, the precision for Class 1 was relatively low at 49%, indicating a higher number of false positives.

- With SMOTE: SVM's test accuracy improved to 82%, and recall for Class 1 increased to 63%. Precision also improved slightly to 55%, resulting in a better balance between recall and precision for churn cases.

# We have found that SVM (SMOTE) is the best-performing model based on the comparison of accuracy, recall and precision.

We'll proceed with the following steps:

1. Hyperparameter Tuning for SVM using GridSearchCV.
2. Fit the GridSearchCV
3. Evaluate the Best Model


### 1. Hyperparameter tuning with GridSearchCV

In [28]:
# Define the parameter grid
param_grid = {
    'svm__C': [0.1, 1, 10],             # Regularization parameter
    'svm__kernel': ['rbf', 'linear'],    # Kernel type
    'svm__gamma': ['scale', 'auto']      # Kernel coefficient
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    scoring='f1',                        # Use f1-score to focus on class 1
    cv=3,                                # 3-fold cross-validation
    verbose=2,
    n_jobs=-1
)

### 2. Fit the GridSearchCV

In [29]:
# Fit the GridSearchCV
grid_search.fit(X_train_smote, y_train_smote)
print("Best hyperparameters:", grid_search.best_params_)

# Fit the model on the best found parameters
best_svm = grid_search.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best hyperparameters: {'svm__C': 10, 'svm__gamma': 'auto', 'svm__kernel': 'rbf'}


### 3. Evaluate the best model

In [30]:
y_pred_test = best_svm.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Classification Report (Test):")
print(classification_report(y_test, y_pred_test))

Test Accuracy: 0.8193333333333334
Classification Report (Test):
              precision    recall  f1-score   support

           0       0.91      0.86      0.88      1200
           1       0.54      0.65      0.59       300

    accuracy                           0.82      1500
   macro avg       0.72      0.75      0.74      1500
weighted avg       0.83      0.82      0.83      1500



### After fine-tuning, the model's performance remains similar to before. Test accuracy stayed nearly the same (0.82), with a slight drop in precision for class 1 (from 0.55 to 0.54) and a small improvement in recall (from 0.63 to 0.65). Overall, fine-tuning made minimal impact.