In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [None]:
# Load the bank-churn table from the project-relative data folder.
churn = pd.read_csv("data/bank_churn.csv") 
# Cell output: (rows, columns) of the raw table.
churn.shape

In [None]:
# Show the loaded frame as the cell's rich output.
churn

In [None]:
# Drop the first three columns by position (not by name), mutating `churn` in place.
# NOTE(review): because this is positional and in place, re-running the cell drops
# three MORE columns — reload `churn` before executing it again.
churn.drop(churn.columns[0:3],axis = 1,inplace = True)

In [None]:
# Class balance of the target: pie chart of the counts plus the printed shares.
exited = churn['Exited']
exited.value_counts().plot.pie(title='Exited')
print(exited.value_counts(normalize=True))

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical (the numerical list still contains the target).
cat_features = churn.select_dtypes(include='object').columns.tolist()
print('there are {} categorical variables'.format(len(cat_features)))
print(cat_features)

num_features = churn.select_dtypes(include=['int64', 'float64']).columns.tolist()
print('there are {} numerical variables'.format(len(num_features)))
print(num_features)

In [None]:
# distribution plots for categorical features
# One panel per categorical feature instead of a fixed 1x2 grid; squeeze=False keeps
# `axs` two-dimensional even when there is a single feature.
fig, axs = plt.subplots(1, max(1, len(cat_features)), figsize=(12,4), squeeze=False)
for i, var in enumerate(cat_features):
  sns.countplot(x=var, data=churn, ax=axs[0, i])
  # Relative frequency of each category, printed next to the chart.
  print(churn[var].value_counts(normalize=True))

In [None]:
# distribution plots for categorical features grouped by the target
# The grid is sized from the feature list rather than hard-coded to two panels;
# squeeze=False keeps `axs` two-dimensional even when there is a single feature.
fig, axs = plt.subplots(1, max(1, len(cat_features)), figsize=(12,4), squeeze=False)
for i, var in enumerate(cat_features):
  sns.countplot(x=var, data=churn, ax=axs[0, i], hue='Exited')

Half of the customers are from France; the other half is split equally between Spain and Germany. Gender shows a slight predominance of males. When we group by 'Exited', it is clear that the proportion of customers who churned differs across groups: the churn rate in Germany is the highest among the countries, and the churn rate for females is higher than for males.

In [None]:
# distribution plots for numerical features
fig, axs = plt.subplots(2, 4, figsize=(16,5))
# Plot explanatory variables only (the target is the x axis) and cap the list at the
# grid size, so the cell keeps working after `num_features` gains extra columns.
plot_vars = [var for var in num_features if var != 'Exited'][:axs.size]
for ax, var in zip(axs.flat, plot_vars):
  sns.boxplot(x='Exited', y=var, data=churn, ax=ax)
fig.tight_layout()
plt.show()

The plots above show that there are outliers in 'CreditScore' and 'Age' ('NumOfProducts' = 4 doesn't seem to be an outlier). We can also see that the distributions differ for churned clients in the variables 'Age', 'Tenure' and 'Balance'.

Numerical Encoding for Categorical Features


In [None]:
# Integer-encode the two categorical columns; pd.factorize assigns codes in order
# of first appearance.
# NOTE(review): the codes are arbitrary integers, so 'GeographyInt' imposes an
# ordering on the countries — consider one-hot encoding for the linear models.
for source_col, encoded_col in (('Gender', 'GenderInt'), ('Geography', 'GeographyInt')):
  churn[encoded_col] = pd.factorize(churn[source_col])[0]
  # Guarded append: re-running the cell must not register the column twice.
  if encoded_col not in num_features:
    num_features.append(encoded_col)

In [None]:
# correlation heatmap function
def corr_heatmap(df, method):
  """Draw an annotated lower-triangle correlation heatmap.

  Only the columns of `df` from position 3 onward are used, and of those only
  the numeric/boolean ones, so the call works whether or not the installed
  pandas silently drops non-numeric columns in `DataFrame.corr`.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose columns are correlated.
  method : str
      Correlation method forwarded to `DataFrame.corr` (e.g. 'pearson').

  Returns
  -------
  None. The figure is shown with `plt.show()`.
  """
  df_corr = df.iloc[:,3:].select_dtypes(include=['number', 'bool']).corr(method=method)
  # Hide the upper triangle and the diagonal, which only repeat the lower triangle.
  # The builtin `bool` replaces the `np.bool` alias that was removed from NumPy.
  mask = np.triu(np.ones_like(df_corr, dtype=bool))
  fig, ax = plt.subplots(figsize = (10, 10))
  sns.heatmap(df_corr, annot=True, vmin=-1, vmax=1, cmap='viridis', linewidths=.5, mask=mask, ax=ax)
  plt.show()

In [None]:
# Pearson Correlation analysis
# Calls corr_heatmap on the full frame with method='pearson'.
corr_heatmap(churn, 'pearson')

No variables are too highly correlated to be considered a problem. The variables with greater correlation to the target are 'Age', 'IsActiveMember', 'NumOfProducts' and 'Balance'.

In [None]:
# Separate X (explanatory features) from y (target)
y = churn['Exited'].copy()
X = churn[num_features].drop(columns='Exited')
print(X.shape)
print(y.shape)

### Data split

Now, for modeling purposes, before applying any transformation to the data, it is of capital importance to set aside some data for validation; otherwise we would incur data leakage.

In [None]:
# Stratified 75/25 train/validation split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=16, stratify=y
)
# Shapes of the four pieces, in the order X_train, X_valid, y_train, y_valid.
for split in (X_train, X_valid, y_train, y_valid):
  print(split.shape)

In [None]:
def outlier_detection(feature, df, low, up):
  """Return the (upper, lower) outlier fences for one column.

  The fences sit 1.5 inter-quantile ranges above the `up` quantile and below
  the `low` quantile of `df[feature]`.
  """
  lower_q, upper_q = (df[feature].quantile(q) for q in (low, up))
  margin = 1.5 * (upper_q - lower_q)
  return (upper_q + margin, lower_q - margin)

def outlier_replacement(feature, uf, lf):
  """Clamp a single value to the fences: above `uf` gives `uf`, below `lf` gives `lf`.

  Values inside the fences (and values that compare false against both, such
  as NaN) are returned unchanged.
  """
  if feature > uf:
    return uf
  if feature < lf:
    return lf
  return feature

In [None]:
# 'CreditScore' outlier treatment

# Fences are estimated on the training split only and then reused for validation.
uf, lf = outlier_detection('CreditScore', X_train, 0.25, 0.75)
print(uf, lf)
# Series.clip caps the column at the fences in one vectorised call. Unlike
# np.vectorize it does not infer the output dtype from the first element, so a
# fractional fence cannot be truncated to an integer.
clean = X_train['CreditScore'].clip(lower=lf, upper=uf)
print(len(clean),'=' , X_train.shape[0])
# assign returns a new frame instead of writing a column onto the split in place.
X_train = X_train.assign(CreditScore_clean=clean)
clean = X_valid['CreditScore'].clip(lower=lf, upper=uf)
print(len(clean),'=' , X_valid.shape[0])
X_valid = X_valid.assign(CreditScore_clean=clean)

# Row 0: training split, row 1: validation split; left: original, right: clipped.
fig, axs = plt.subplots(2, 2, figsize=(10,8))
for row, (split_name, split) in enumerate((('X_train', X_train), ('X_valid', X_valid))):
  sns.violinplot(x='CreditScore', data=split, ax=axs[row, 0])
  axs[row, 0].set_title(split_name + ' CreditScore original distribution')
  sns.violinplot(x='CreditScore_clean', data=split, ax=axs[row, 1])
  axs[row, 1].set_title(split_name + ' CreditScore clean distribution')
fig.tight_layout()
plt.show()

In [None]:
# 'Age' outlier treatment

# Fences are estimated on the training split only and then reused for validation.
uf, lf = outlier_detection('Age', X_train, 0.25, 0.75)
print(uf, lf)
# Series.clip caps the column at the fences in one vectorised call. Unlike
# np.vectorize it does not infer the output dtype from the first element, so a
# fractional fence cannot be truncated to an integer.
clean = X_train['Age'].clip(lower=lf, upper=uf)
print(len(clean),'=' , X_train.shape[0])
# assign returns a new frame instead of writing a column onto the split in place.
X_train = X_train.assign(Age_clean=clean)
clean = X_valid['Age'].clip(lower=lf, upper=uf)
print(len(clean),'=' , X_valid.shape[0])
X_valid = X_valid.assign(Age_clean=clean)

# Row 0: training split, row 1: validation split; left: original, right: clipped.
fig, axs = plt.subplots(2, 2, figsize=(10,8))
for row, (split_name, split) in enumerate((('X_train', X_train), ('X_valid', X_valid))):
  sns.violinplot(x='Age', data=split, ax=axs[row, 0])
  axs[row, 0].set_title(split_name + ' Age original distribution')
  sns.violinplot(x='Age_clean', data=split, ax=axs[row, 1])
  axs[row, 1].set_title(split_name + ' Age clean distribution')
fig.tight_layout()
plt.show()

In [None]:
# replace variables by their clean versions (i.e. without outliers)
# The list is rebuilt rather than edited with remove/append, so re-running the cell
# neither raises ValueError nor adds duplicate entries. The resulting order matches
# the original edits: the remaining features first, then the two *_clean columns.
clean_vars = ['CreditScore_clean', 'Age_clean']
superseded = {'CreditScore', 'Age', 'Exited'}
num_features = [
    var for var in dict.fromkeys(num_features)  # de-duplicate, keep order
    if var not in superseded and var not in clean_vars
] + clean_vars

In [None]:
# Keep only the modelling columns in both splits, then report the new shapes.
X_train, X_valid = X_train[num_features], X_valid[num_features]
print(X_train.shape)
print(X_valid.shape)

Scaling

In [None]:
# Standardise the continuous variables: fit on train, apply the same transform to validation
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

vars_scale = ['Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'CreditScore_clean', 'Age_clean']

# The scaler returns a bare array, so the original row index and column names are re-attached.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[vars_scale]),
    columns=vars_scale,
    index=X_train.index,
)
print(X_train_scaled.shape)

X_valid_scaled = pd.DataFrame(
    scaler.transform(X_valid[vars_scale]),
    columns=vars_scale,
    index=X_valid.index,
)
print(X_valid_scaled.shape)

In [None]:
# Final design matrices: unscaled flags/codes followed by the standardised variables.
passthrough_vars = ['HasCrCard', 'IsActiveMember', 'GenderInt', 'GeographyInt']
X_train_final = pd.concat([X_train[passthrough_vars], X_train_scaled], axis=1)
# The validation matrix is built the same way; every model cell scores on
# X_valid_final, which was previously never defined (NameError on a fresh run).
X_valid_final = pd.concat([X_valid[passthrough_vars], X_valid_scaled], axis=1)
print(X_train_final.shape)
print(X_valid_final.shape)
X_train_final.head(2)

In [None]:
from skopt import BayesSearchCV

In [None]:
from sklearn.linear_model import LogisticRegression

# Bayesian hyper-parameter search for an L2-penalised logistic regression.
# random_state seeds both the search and the stochastic solvers so a re-run
# reproduces the same result.
opt = BayesSearchCV(
    LogisticRegression(random_state=16),
    {
        'penalty': ['l2'],
        'C': (1e-1, 1e1, 'log-uniform'),
        'tol': (1e-6, 1e-3, 'log-uniform'),
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    },
    n_iter=32,
    cv=3,
    random_state=16
)

opt.fit(X_train_final, y_train)

# best_score_ is the cross-validation score of the best candidate; opt.score is
# computed on the hold-out split. The labels now match the other model cells.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# This cell was an exact copy of the classification report above. It now shows the
# confusion matrix, as the other model sections do after their report.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best logistic regression found by the search above.
reglog = opt.best_estimator_

## SVM

In [None]:
from sklearn.svm import SVC

# Bayesian hyper-parameter search for a support-vector classifier.
# random_state seeds the search so a re-run explores the same candidates.
opt = BayesSearchCV(
    SVC(),
    {
        'C': (1e-1, 1e1, 'log-uniform'),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': (1, 3),
        'tol': (1e-5, 1e-3, 'log-uniform'),
        #'gamma': ['scale', 'auto']
    },
    n_iter=32,
    cv=3,
    random_state=16
)

opt.fit(X_train_final, y_train)

# Cross-validation score of the best candidate, then its score on the hold-out split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best SVC found by the search above.
svm = opt.best_estimator_

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Bayesian hyper-parameter search for a random forest.
# random_state seeds both the search and the forest so a re-run reproduces the result.
opt = BayesSearchCV(
    RandomForestClassifier(random_state=16),
    {
        'n_estimators': [200, 400, 800, 1000, 1500, 2000],
        #'criterion': ['gini', 'entropy'],
        'min_samples_split': (2, 7),
        #'min_samples_leaf': (1, 7),
        # 'auto' was dropped: for classifiers it was an alias of 'sqrt' and
        # current scikit-learn releases reject it.
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False]
    },
    n_iter=32,
    cv=3,
    random_state=16
)

opt.fit(X_train_final, y_train)

# Cross-validation score of the best candidate, then its score on the hold-out split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best random forest found by the search above.
rf = opt.best_estimator_

XGboost

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBClassifier

# Bayesian hyper-parameter search for gradient-boosted trees.
# random_state seeds both the search and the booster (row subsampling) so a
# re-run reproduces the result.
opt = BayesSearchCV(
    XGBClassifier(random_state=16),
    {
        'learning_rate': (1e-5, 1e-1, 'log-uniform'),
        'min_split_loss': [0.05, 0.1, 0.3, 0.5, 0.75, 1],
        'max_depth': (3, 15),
        #'min_child_weight': (3, 7),
        'subsample': (1e-2, 0.9999, 'log-uniform'),
        #'colsample_bytree': (1e-2, 1, 'log-uniform'),
        #'reg_lambda': (1e-2, 1, 'log-uniform'),
        #'reg_alpha': (1e-2, 1, 'log-uniform'),
    },
    n_iter=32,
    cv=3,
    random_state=16
)

opt.fit(X_train_final, y_train)

# Cross-validation score of the best candidate, then its score on the hold-out split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best XGBoost model found by the search above.
xgb = opt.best_estimator_

In [None]:
# Sanity check: shapes of the feature matrices and targets for both splits.
for split in (X_train_final, y_train, X_valid_final, y_valid):
  print(split.shape)

In [None]:
# Class balance of the training target: absolute counts first, then shares.
for as_share in (False, True):
  print(y_train.value_counts(normalize=as_share))

### Oversampling
Let's apply SMOTE for oversampling the minority class

In [None]:
pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE
# Seeded so the synthetic minority samples are identical on every run.
oversample = SMOTE(random_state=16)
# Only the training split is resampled; the validation split keeps its original balance.
X_train_balanced, y_train_balanced = oversample.fit_resample(X_train_final, y_train)

print(X_train_balanced.shape)
print(y_train_balanced.shape)

print(y_train_balanced.value_counts())
print(y_train_balanced.value_counts(normalize=True))

In [None]:
%%time
opt = BayesSearchCV(
    LogisticRegression(),
    {
        'penalty': ['l2'],
        'C': (1e-1, 1e1, 'log-uniform'),
        'tol': (1e-6, 1e-3, 'log-uniform'),
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    },
    n_iter=32,
    cv=3
)

opt.fit(X_train_balanced, y_train_balanced)

print("test score: %s" % opt.best_score_)
print("valid score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best logistic regression found by the search above
# (this rebinds the `reglog` name set in the earlier section).
reglog = opt.best_estimator_

### SVM

In [None]:
%%time
opt = BayesSearchCV(
    SVC(),
    {
        'C': (1e-1, 1e1, 'log-uniform'),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': (1, 3),
        'tol': (1e-5, 1e-3, 'log-uniform'),
        #'gamma': ['scale', 'auto']
    },
    n_iter=32,
    cv=3
)

opt.fit(X_train_balanced, y_train_balanced)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best SVC found by the search above
# (this rebinds the `svm` name set in the earlier section).
svm = opt.best_estimator_


Random Forest


In [None]:
%%time
opt = BayesSearchCV(
    RandomForestClassifier(),
    {
        'n_estimators': [200, 400, 800, 1000, 1500, 2000],
        #'criterion': ['gini', 'entropy'],
        'min_samples_split': (2, 7),
        #'min_samples_leaf': (1, 7),
        'max_features': ['auto', 'sqrt', 'log2'],
        'bootstrap': [True, False]
    },
    n_iter=32,
    cv=3
)

opt.fit(X_train_balanced, y_train_balanced)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best random forest found by the search above
# (this rebinds the `rf` name set in the earlier section).
rf = opt.best_estimator_

XG boost

In [None]:
%%time
opt = BayesSearchCV(
    XGBClassifier(),
    {
        'learning_rate': (1e-5, 1e-1, 'log-uniform'),
        'min_split_loss': [0.05, 0.1, 0.3, 0.5, 0.75, 1],
        'max_depth': (3, 15),
        #'min_child_weight': (3, 7),
        'subsample': (1e-2, 0.9999, 'log-uniform'),
        #'colsample_bytree': (1e-2, 1, 'log-uniform'),
        #'reg_lambda': (1e-2, 1, 'log-uniform'),
        #reg_alpha': (1e-2, 1, 'log-uniform'),
    },
    n_iter=32,
    cv=3
)

opt.fit(X_train_balanced, y_train_balanced)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best XGBoost model found by the search above
# (this rebinds the `xgb` name set in the earlier section).
xgb = opt.best_estimator_

Undersampling the majority class

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Seeded so the randomly retained majority-class rows are identical on every run.
undersample = RandomUnderSampler(random_state=16)
# Rebinds X_train_balanced / y_train_balanced, replacing the oversampled versions.
X_train_balanced, y_train_balanced = undersample.fit_resample(X_train_final, y_train)

print(X_train_balanced.shape)
print(y_train_balanced.shape)

print(y_train_balanced.value_counts())
print(y_train_balanced.value_counts(normalize=True))

In [None]:
%%time
opt = BayesSearchCV(
    LogisticRegression(),
    {
        'penalty': ['l2'],
        'C': (1e-1, 1e1, 'log-uniform'),
        'tol': (1e-6, 1e-3, 'log-uniform'),
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    },
    n_iter=32,
    cv=3
)

opt.fit(X_train_balanced, y_train_balanced)

print("test score: %s" % opt.best_score_)
print("valid score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best logistic regression found by the search above
# (this rebinds the `reglog` name set in the earlier sections).
reglog = opt.best_estimator_

SVM

In [None]:
%%time
opt = BayesSearchCV(
    SVC(),
    {
        'C': (1e-1, 1e1, 'log-uniform'),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': (1, 3),
        'tol': (1e-5, 1e-3, 'log-uniform'),
        #'gamma': ['scale', 'auto']
    },
    n_iter=32,
    cv=3
)

opt.fit(X_train_balanced, y_train_balanced)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best SVC found by the search above
# (this rebinds the `svm` name set in the earlier sections).
svm = opt.best_estimator_

Random Forest

In [None]:
%%time
opt = BayesSearchCV(
    RandomForestClassifier(),
    {
        'n_estimators': [200, 400, 800, 1000, 1500, 2000],
        #'criterion': ['gini', 'entropy'],
        'min_samples_split': (2, 7),
        #'min_samples_leaf': (1, 7),
        'max_features': ['auto', 'sqrt', 'log2'],
        'bootstrap': [True, False]
    },
    n_iter=32,
    cv=3
)

opt.fit(X_train_balanced, y_train_balanced)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best random forest found by the search above
# (this rebinds the `rf` name set in the earlier sections).
rf = opt.best_estimator_

XGBoost

In [None]:
%%time
opt = BayesSearchCV(
    XGBClassifier(),
    {
        'learning_rate': (1e-5, 1e-1, 'log-uniform'),
        'min_split_loss': [0.05, 0.1, 0.3, 0.5, 0.75, 1],
        'max_depth': (3, 15),
        #'min_child_weight': (3, 7),
        'subsample': (1e-2, 0.9999, 'log-uniform'),
        #'colsample_bytree': (1e-2, 1, 'log-uniform'),
        #'reg_lambda': (1e-2, 1, 'log-uniform'),
        #reg_alpha': (1e-2, 1, 'log-uniform'),
    },
    n_iter=32,
    cv=3
)

opt.fit(X_train_balanced, y_train_balanced)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_valid_final, y_valid))
print("best params: %s" % str(opt.best_params_))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the hold-out split.
y_pred = opt.predict(X_valid_final)
print(classification_report(y_valid, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was removed from scikit-learn; from_estimator replaces it.
ConfusionMatrixDisplay.from_estimator(opt.best_estimator_, X_valid_final, y_valid)

In [None]:
# Keep a reference to the best XGBoost model found by the search above
# (this rebinds the `xgb` name set in the earlier sections).
xgb = opt.best_estimator_

Undersampling didn't improve results either. At best, they are very similar to the ones we got without adopting any sampling strategy. Again, SVM showed a small improvement.

**Final comments** <br>
Results show that XGBoost, a tree based method, presented the best results.