In [2]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import classification_report 


## Training prediction models
This section is the most important part of the project it deals with the training of the models.

In [3]:
df = pd.read_csv("data/BankChurners_preprocessed.csv")

After I loaded the data, I have defined the axes and split the test and the train set.

In [4]:

X = df.drop(columns=['Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2', 'Attrited'], axis=1)
y=df[["Attrited"]]
print(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Index(['Customer_Age', 'Dependent_count', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Gender_F', 'Gender_M', 'Education_Level_College',
       'Education_Level_Doctorate', 'Education_Level_Graduate',
       'Education_Level_High School', 'Education_Level_Post-Graduate',
       'Education_Level_Uneducated', 'Education_Level_Unknown',
       'Marital_Status_Divorced', 'Marital_Status_Married',
       'Marital_Status_Single', 'Marital_Status_Unknown',
       'Income_Category_$120K +', 'Income_Category_$40K - $60K',
       'Income_Category_$60K - $80K', 'Income_Category_$80K - $120K',
       'Income_Category_Less than $40K', 'Income_Category_Unknown',
       'Card_Category_Blue', 'Card_Category_Gold', 'Card_Category_Platinum',
       'Car

As a next step, I trained the Decision Tree classifer and evaluated its result.

In [5]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.928923988153998
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96      2543
         1.0       0.78      0.78      0.78       496

    accuracy                           0.93      3039
   macro avg       0.87      0.87      0.87      3039
weighted avg       0.93      0.93      0.93      3039



The result is too good! this is because the dataset, including the test set, has mostly existing customers. ```RandomForestClassifier``` is a widely used ML model for its inherent ability to not favor the majority class ([source](https://semaphoreci.com/blog/imbalanced-data-machine-learning-python#:~:text=In%20a%20nutshell%2C%20imbalanced%20data,handling%20the%20less%20common%20cases)).

In [7]:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on train and test sets
y_train_pred = rf_classifier.predict(X_train)
y_test_pred = rf_classifier.predict(X_test)

# Calculate and print accuracy for the train set
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Accuracy on Train Set:", train_accuracy)

# Calculate and print accuracy for the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy on Test Set:", test_accuracy)
print(classification_report(y_test, y_test_pred))

  return fit_method(estimator, *args, **kwargs)


Accuracy on Train Set: 1.0
Accuracy on Test Set: 0.9506416584402764
              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97      2543
         1.0       0.93      0.75      0.83       496

    accuracy                           0.95      3039
   macro avg       0.94      0.87      0.90      3039
weighted avg       0.95      0.95      0.95      3039



This model gives very high scores as well; this means that the data needs to be resampled.

In [8]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score

After resampling, I train Decision Tree, KNN and Random Forest classifier and print their scores and classification report.

In [10]:

#split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#oversampling after the split
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_oversampled, y_oversampled = oversampler.fit_resample(X_train, y_train)

# Create Decision Tree classifer object
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train.values.ravel())
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy Decision Tree classifer:",metrics.accuracy_score(y_test, y_pred))
print(f"Report for Decision Tree classifer: \n", classification_report(y_test, y_pred))

# Create KNN classifer object
knn_clf = KNeighborsClassifier()
knn_clf = knn_clf.fit(X_train,y_train.values.ravel())
y_pred = knn_clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy KNN classifer:",metrics.accuracy_score(y_test, y_pred))
print(f"Report for KNN classifer: \n", classification_report(y_test, y_pred))


#Create RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train.values.ravel())
y_pred = rf_classifier.predict(X_test)
y_prob = rf_classifier.predict_proba(X_test)
print("Accuracy RandomForestClassifier classifer: ",metrics.accuracy_score(y_test, y_pred))
print(f"Report for RandomForestClassifier classifer: \n", classification_report(y_test, y_pred))



Accuracy Decision Tree classifer: 0.9282658769332017
Report for Decision Tree classifer: 
               precision    recall  f1-score   support

         0.0       0.95      0.96      0.96      2543
         1.0       0.79      0.77      0.78       496

    accuracy                           0.93      3039
   macro avg       0.87      0.86      0.87      3039
weighted avg       0.93      0.93      0.93      3039

Accuracy KNN classifer: 0.8845014807502468
Report for KNN classifer: 
               precision    recall  f1-score   support

         0.0       0.91      0.95      0.93      2543
         1.0       0.69      0.54      0.60       496

    accuracy                           0.88      3039
   macro avg       0.80      0.75      0.77      3039
weighted avg       0.88      0.88      0.88      3039

Accuracy RandomForestClassifier classifer:  0.9506416584402764
Report for RandomForestClassifier classifer: 
               precision    recall  f1-score   support

         0.0       

Indeed, Random Forest Classifier is the best model for the task. The results need to be saved for further visualization.

In [18]:
#insert the probabilities into the dataframe and save
X_test = X_test.drop(columns=["Churning_probability"], axis=1)
y_prob_all = rf_classifier.predict_proba(X_test)

df_1 = pd.DataFrame(y_prob_all, index=X_test.index, columns=["prob_exist", "prob_churn"])
X_test_pred = X_test.copy(deep=True)
X_test["Churning_probability"] = df_1["prob_churn"]
X_test["Attrited"] = y_test
X_test.to_csv("data/BankChurners_predictions.csv")

Another strategy of resampling is undersampling. It removes the data from the larger category (in our case, existing customers).

In [12]:

#split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#undersampling
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train, y_train= undersampler.fit_resample(X_train, y_train)
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train.values.ravel())
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy Decision Tree classifer:",metrics.accuracy_score(y_test, y_pred))
print(f"Report for Decision Tree classifer: \n", classification_report(y_test, y_pred))

# Create KNN classifer object
knn_clf = KNeighborsClassifier()
knn_clf = knn_clf.fit(X_train,y_train.values.ravel())
y_pred = knn_clf.predict(X_test)


# Model Accuracy, how often is the classifier correct?
print("Accuracy KNN classifer:",metrics.accuracy_score(y_test, y_pred))
print(f"Report for KNN classifer: \n", classification_report(y_test, y_pred))

#Create RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train.values.ravel())
y_pred = rf_classifier.predict(X_test)
print("Accuracy RandomForestClassifier classifer:",metrics.accuracy_score(y_test, y_pred))
print(f"Report for RandomForestClassifier classifer: \n", classification_report(y_test, y_pred))

Accuracy Decision Tree classifer: 0.8943731490621916
Report for Decision Tree classifer: 
               precision    recall  f1-score   support

         0.0       0.98      0.89      0.93      2543
         1.0       0.62      0.90      0.74       496

    accuracy                           0.89      3039
   macro avg       0.80      0.90      0.83      3039
weighted avg       0.92      0.89      0.90      3039

Accuracy KNN classifer: 0.8180322474498191
Report for KNN classifer: 
               precision    recall  f1-score   support

         0.0       0.95      0.82      0.88      2543
         1.0       0.47      0.80      0.59       496

    accuracy                           0.82      3039
   macro avg       0.71      0.81      0.74      3039
weighted avg       0.87      0.82      0.84      3039

Accuracy RandomForestClassifier classifer: 0.9279368213228035
Report for RandomForestClassifier classifer: 
               precision    recall  f1-score   support

         0.0       0

Oversampling has shown better results than undersampling, therefore, this type of preprocessing is preferred for this data.

## What are the most important features?

In [13]:

feature_scores = pd.Series(rf_classifier.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(feature_scores)



Total_Trans_Amt                   0.197753
Total_Trans_Ct                    0.193131
Total_Revolving_Bal               0.102252
Total_Ct_Chng_Q4_Q1               0.096499
Avg_Utilization_Ratio             0.062177
Total_Amt_Chng_Q4_Q1              0.059250
Total_Relationship_Count          0.041480
Credit_Limit                      0.035053
Avg_Open_To_Buy                   0.032530
Months_Inactive_12_mon            0.031252
Customer_Age                      0.024953
Months_on_book                    0.022667
Contacts_Count_12_mon             0.021360
Dependent_count                   0.011781
Gender_M                          0.006396
Gender_F                          0.006390
Marital_Status_Married            0.006226
Education_Level_Graduate          0.004913
Marital_Status_Single             0.004109
Income_Category_$80K - $120K      0.003541
Education_Level_Unknown           0.003205
Income_Category_$60K - $80K       0.003200
Education_Level_High School       0.003164
Income_Cate

The most important features are _Total_Trans_Amt (0.197753), Total_Trans_Ct (0.193131), Total_Revolving_Bal, (0.102252)._