# Predicting Bank Customer Attrition
By Lance Belen

<h2><u>Data Retrieval and Preparation</u></h2>

In [1]:
import pandas as pd
df = pd.read_csv("BankChurners.csv")
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [2]:
df.shape

(10127, 23)

In [3]:
df.dtypes

CLIENTNUM                                                                                                                               int64
Attrition_Flag                                                                                                                         object
Customer_Age                                                                                                                            int64
Gender                                                                                                                                 object
Dependent_count                                                                                                                         int64
Education_Level                                                                                                                        object
Marital_Status                                                                                                                         object
Income

<h3><i>Data Pre-Processing</i></h3>

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df_cleaned = df.copy()
df_cleaned = df_cleaned.drop(['Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'], axis=1)
df_cleaned.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


<h4><u>Remove outliers</u></h4>

In [6]:
for col in df_cleaned.columns:
    if df[col].dtype == 'int':
        Q1 = df_cleaned[col].quantile(0.25)
        Q3 = df_cleaned[col].quantile(0.75)
        IQR = Q3 - Q1
        threshold = 1.5
        outliers = df_cleaned[(df_cleaned[col] < Q1 - threshold * IQR) | (df_cleaned[col] > Q3 + threshold * IQR)]
        df_cleaned = df_cleaned.drop(outliers.index)
        print(f'Dropped {len(outliers)} outliers in {col}.')

Dropped 0 outliers in CLIENTNUM.
Dropped 2 outliers in Customer_Age.
Dropped 0 outliers in Dependent_count.
Dropped 385 outliers in Months_on_book.
Dropped 0 outliers in Total_Relationship_Count.
Dropped 308 outliers in Months_Inactive_12_mon.
Dropped 584 outliers in Contacts_Count_12_mon.
Dropped 0 outliers in Total_Revolving_Bal.
Dropped 808 outliers in Total_Trans_Amt.
Dropped 0 outliers in Total_Trans_Ct.


<h4><u>Check for missing values</u></h4>

In [7]:
if (len(df_cleaned[df_cleaned[col].isnull() == True]) > 0):
    print(f'Missing value/s found!')
else:
    print('No missing value/s!')

No missing value/s!


<h4><u>Rename ambiguous columns</u></h4>

In [8]:
df_cleaned=df_cleaned.rename(
    columns={
        'Months_Inactive_12_mon': 'Months_Inactive_Past_12_Months',
        'Contacts_Count_12_mon': 'Contacts_Count_Past_12_Months',
        'Total_Revolving_Bal': 'Total_Revolving_Balance',
        'Total_Amt_Chng_Q4_Q1': 'Transaction_Amt_Change_Q4_over_Q1',
        'Total_Trans_Amt': 'Total_Transaction_Amount',
        'Total_Trans_Ct': 'Total_Transaction_Count',
        'Total_Ct_Chng_Q4_Q1': 'Transaction_Count_Change_Q4_over_Q1',
    }
)

df_cleaned=df_cleaned.drop(columns=["CLIENTNUM")

KeyError: "['CLIENTNUM'] not found in axis"

<h4><u>Export dataset for analysis and visualisation</u></h4>

In [None]:
df_cleaned.to_excel('BankChurnersCleaned.xlsx')

<h2><u>Data Exploration</u></h2>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df_cleaned.head()

In [None]:
df_cleaned.describe()

In [None]:
df_cleaned['Attrition_Flag'].value_counts()

In [None]:
sns.countplot(x='Attrition_Flag', data=df_cleaned, hue='Attrition_Flag', palette=['lightblue', 'steelblue'])
plt.title("Churn vs. Non-Churn Customers")
plt.xlabel("Attrition Flag (0 = Active, 1 = Churned)")
plt.ylabel("Count")
plt.show()

In [None]:
plt.figure(figsize=(8,5))
sns.histplot(df_cleaned['Customer_Age'], bins=30, kde=True, color="steelblue")
plt.title("Distribution of Customer Age")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

In [None]:
plt.figure(figsize=(8,5))
sns.histplot(df_cleaned['Credit_Limit'], bins=30, kde=True, color="steelblue")
plt.title("Distribution of Credit Limit")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

In [None]:
plt.figure(figsize=(8,5))
sns.countplot(x='Gender', hue='Attrition_Flag', data=df_cleaned, palette=['lightblue', 'steelblue'])
plt.title("Churn by Gender")
plt.show()

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Income_Category', hue='Attrition_Flag', data=df_cleaned, palette=['lightblue', 'steelblue'])
plt.title("Churn by Income Category")
plt.show()

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(x='Attrition_Flag', y='Credit_Limit', data=df_cleaned, hue='Attrition_Flag', palette=['lightblue', 'steelblue'])
plt.title("Credit Limit vs. Churn")
plt.show()

In [None]:
sns.pairplot(df[['Customer_Age', 'Credit_Limit', 'Total_Trans_Amt', 'Attrition_Flag']], 
             hue="Attrition_Flag", palette=['lightblue', 'steelblue'])
plt.show()

<h2><u>Data Modelling</u></h2>

<h3><i>Defining Features and Target Variable</i></h3>

In [None]:
X = df_cleaned.drop(columns=['Attrition_Flag'])
y = df_cleaned['Attrition_Flag']

In [None]:
X

In [None]:
y

<h3><i>One-Hot Encoding</i></h3>

In [None]:
X = pd.get_dummies(X, drop_first=True)
X

<h3><i>Train-Test Split</i></h3>

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [None]:
X_train

In [None]:
X_test

In [None]:
y_train

In [None]:
y_test

<h3><i>Feature Scaling for Logistic Regression</i></h3>

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h3><i>Train Models</i></h3>

<h4><u>Logistic Regression</u></h4>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression(max_iter=250)
log_reg.fit(X_train_scaled, y_train)

y_pred_log = log_reg.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))

<h4><u>Decision Tree</h4>

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))

<h4><u>XGBoost</u></h4>

In [None]:
from xgboost import XGBClassifier

y_train_xgb = y_train.map({'Attrited Customer': 1, 'Existing Customer': 0})
y_test_xgb = y_test.map({'Attrited Customer': 1, 'Existing Customer': 0})

xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train_xgb)

y_pred_xgb = xgb.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test_xgb, y_pred_xgb))

<h3><i>Model Evaluation</i></h3>

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

print("Logistic Regression:\n", classification_report(y_test, y_pred_log))
print("-------------------------------------------------------------")
print("Decision Tree:\n", classification_report(y_test, y_pred_dt))
print("-------------------------------------------------------------")
print("XGBoost:\n", classification_report(y_test_xgb, y_pred_xgb))
print("XGBoost ROC-AUC Score:", roc_auc_score(y_test_xgb, y_pred_xgb))

<h3><i>Model Improvement - XGBoost</i></h3>

<h4>Feature Engineering</h4>

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Get feature importance from trained XGBoost model
feature_importance = xgb.feature_importances_

# Create a DataFrame to sort & visualize
feature_imp_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
feature_imp_df = feature_imp_df.sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10,6))
plt.barh(feature_imp_df['Feature'], feature_imp_df['Importance'], color='skyblue')
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Feature Importance from XGBoost")
plt.gca().invert_yaxis()  # Flip to show the highest at the top
plt.show()

In [None]:
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier

xgb = XGBClassifier(eval_metric='logloss')

selector = RFE(xgb, n_features_to_select=10)  # Keep top 10 features
selector.fit(X_train, y_train_xgb)

selected_features = X_train.columns[selector.support_]

print('SELECTED FEATURES:')
for feature in selected_features:
    print(feature)
print()

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

xgb.fit(X_train_selected, y_train_xgb)
y_pred_xgb = xgb.predict(X_test_selected)

print("XGBoost Accuracy after feature selection:", accuracy_score(y_test_xgb, y_pred_xgb))

<h4>Handling class imbalance</h4>

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

y_train_xgb = y_train.map({'Attrited Customer': 1, 'Existing Customer': 0})
y_test_xgb = y_test.map({'Attrited Customer': 1, 'Existing Customer': 0})

scale_pos_weight = y_train_xgb.value_counts()[0] / y_train_xgb.value_counts()[1]

xgb = XGBClassifier(eval_metric='logloss', scale_pos_weight=scale_pos_weight)

xgb.fit(X_train, y_train_xgb)

y_pred_xgb = xgb.predict(X_test)

print("XGBoost Accuracy after handling class imbalance:", accuracy_score(y_test_xgb, y_pred_xgb))

<h4>Hyperparameter Tuning</h4>

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0),
        'scale_pos_weight': y_train_xgb.value_counts()[0] / y_train_xgb.value_counts()[1],  # Handle class imbalance
        'eval_metric': 'logloss'
    }
    
    # Train model
    model = XGBClassifier(**params)
    model.fit(X_train, y_train_xgb)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Return accuracy as the metric to optimize
    return accuracy_score(y_test_xgb, y_pred)

# Run Optuna optimization
study = optuna.create_study(direction='maximize')  # Maximize accuracy
study.optimize(objective, n_trials=30)  # Run 30 trials

# Print best parameters
print(f"Best parameters: {study.best_params} \n")

best_xgb = XGBClassifier(**study.best_params)
best_xgb.fit(X_train, y_train_xgb)

y_pred_xgb = best_xgb.predict(X_test)

print("Optimized XGBoost Accuracy:", accuracy_score(y_test_xgb, y_pred_xgb))

<h3>K-Folds Cross-Validation</h3>

In [None]:
from sklearn.model_selection import KFold, cross_val_score

<h4>Logistic Regression</h4>

In [None]:
kf = KFold(n_splits=5, random_state=1, shuffle=True)
cv_scores = cross_val_score(log_reg, X_train, y_train, cv=kf, scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Cross-Validation Score: {cv_scores.mean():.3f}")

<h4>Decision Tree</h4>

In [None]:
kf = KFold(n_splits=5, random_state=1, shuffle=True)
cv_scores = cross_val_score(dt, X_train, y_train, cv=kf, scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Cross-Validation Score: {cv_scores.mean():.3f}")

<h4>XGBoost</h4>

In [None]:
kf = KFold(n_splits=5, random_state=1, shuffle=True)
cv_scores = cross_val_score(xgb, X_train, y_train_xgb, cv=kf, scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Cross-Validation Score: {cv_scores.mean():.3f}")

In [None]:
xgb.save_model('xgboost_model.json')