<a href="https://colab.research.google.com/github/datascience-uniandes/classification_tutorial/blob/master/churn/churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification: Detecting churn probability and causes

MINE-4101: Applied Data Science  
Universidad de los Andes  
  
Last update: October, 2023

In [None]:
!pip show shap

In [None]:
from joblib import dump

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score

from pandas_profiling import ProfileReport

import shap

In [None]:
# Display every column, and at most 100 rows, when a DataFrame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

### Loading the data

In [None]:
# Load the two labelled CSV files; paths are relative to the notebook's working directory.
# NOTE(review): when opened through the Colab badge these files presumably have to be
# fetched first — confirm.
# Training / validation data.
churn_df = pd.read_csv('./data/churn_train_val.csv')
# Labelled test data.
test_df = pd.read_csv('./data/churn_test_labeled.csv')

In [None]:
# (rows, columns) of the training/validation data.
churn_df.shape

In [None]:
# (rows, columns) of the test data.
test_df.shape

In [None]:
# Column types as inferred by read_csv.
churn_df.dtypes

In [None]:
# Preview the first rows of the training/validation data.
churn_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

### Profiling the data

In [None]:
# Build a profiling report for the training/validation data.
# NOTE(review): pandas_profiling has been renamed ydata-profiling upstream — confirm the
# import at the top still resolves in the target environment.
profile = ProfileReport(churn_df)

In [None]:
# Render the profiling report inline in the notebook.
profile.to_notebook_iframe()

### Analyzing features vs. target

In [None]:
# Horizontal box plot of credit score, one box per churn value.
fig, ax = plt.subplots(figsize=(15, 3))
sns.boxplot(data=churn_df, x='credit_score', y='churn', orient='h', ax=ax)
ax.set_title('Credit score')
plt.show()

In [None]:
# Share of each churn value within every country (each column sums to 1).
pd.crosstab(index=churn_df['churn'], columns=churn_df['country'], normalize='columns')

In [None]:
# Share of each churn value within every gender (each column sums to 1).
pd.crosstab(index=churn_df['churn'], columns=churn_df['gender'], normalize='columns')

In [None]:
# Horizontal box plot of age, one box per churn value.
fig, ax = plt.subplots(figsize=(15, 3))
sns.boxplot(data=churn_df, x='age', y='churn', orient='h', ax=ax)
ax.set_title('Age')
plt.show()

In [None]:
# Horizontal box plot of tenure, one box per churn value.
fig, ax = plt.subplots(figsize=(15, 3))
sns.boxplot(data=churn_df, x='tenure', y='churn', orient='h', ax=ax)
ax.set_title('Tenure')
plt.show()

In [None]:
# Horizontal box plot of balance, one box per churn value.
fig, ax = plt.subplots(figsize=(15, 3))
sns.boxplot(data=churn_df, x='balance', y='churn', orient='h', ax=ax)
ax.set_title('Balance')
plt.show()

In [None]:
# Share of each churn value for every number of products (each column sums to 1).
pd.crosstab(index=churn_df['churn'], columns=churn_df['products_number'], normalize='columns')

In [None]:
# Share of each churn value for every credit_card value (each column sums to 1).
pd.crosstab(index=churn_df['churn'], columns=churn_df['credit_card'], normalize='columns')

In [None]:
# Share of each churn value for every active_member value (each column sums to 1).
pd.crosstab(index=churn_df['churn'], columns=churn_df['active_member'], normalize='columns')

In [None]:
# Horizontal box plot of estimated salary, one box per churn value.
fig, ax = plt.subplots(figsize=(15, 3))
sns.boxplot(data=churn_df, x='estimated_salary', y='churn', orient='h', ax=ax)
ax.set_title('Estimated salary')
plt.show()

### Training a first set of models and selecting the best using F1

In [None]:
# Columns fed to the first set of models ('gender' and 'country' are added later).
features = [
    'credit_score',
    'age',
    'tenure',
    'balance',
    'products_number',
    'credit_card',
    'active_member',
    'estimated_salary',
]

In [None]:
# Base decision tree for the first grid search, with a fixed random seed.
tree = DecisionTreeClassifier(random_state = 10)

In [None]:
# Hyperparameter grid for the decision tree: split criterion, depth 2..10 and
# whether classes are re-weighted.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(2, 11)),
    'class_weight': ['balanced', None],
}

In [None]:
# 5-fold grid search over the tree hyperparameters. Precision, recall and F1 are
# recorded for every candidate (train scores included); the candidate with the best
# F1 is the one that gets refit.
grid = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring=['precision', 'recall', 'f1'],
    refit='f1',
    cv=5,
    n_jobs=1,
    return_train_score=True,
    verbose=2,
)

In [None]:
# Run the search on the columns listed in `features`, with 'churn' as the target.
grid.fit(churn_df[features], churn_df['churn'])

In [None]:
# Hyperparameters of the candidate selected by the refit metric (F1).
grid.best_params_

In [None]:
# Reshape the cv_results_ row of the selected candidate into long form:
# one row per (split, dataset, metric) with its score in 'value'.
best_results_df = (
    pd.DataFrame(grid.cv_results_)
    .iloc[grid.best_index_]
    .reset_index()
    .rename(columns = {'index': 'result', grid.best_index_: 'value'})
)
# Keep only the per-split scores (keys such as 'split0_test_f1'). The explicit copy
# makes the assignments below act on an independent frame instead of a slice.
best_results_df = best_results_df.loc[best_results_df['result'].str.contains('split')].copy()
# n = 2 keeps metric names that themselves contain underscores in one piece.
best_results_df[['split', 'dataset', 'metric']] = best_results_df['result'].str.split('_', n = 2, expand = True)
# GridSearchCV calls the held-out fold 'test'; relabel it to avoid confusion with test_df.
# Assign the result back: an inplace replace on a column selection is chained
# assignment and is not guaranteed to modify the frame.
best_results_df['dataset'] = best_results_df['dataset'].replace({'test': 'validation'})
# The scores come out of a mixed-type row as objects; make them numeric for plotting.
best_results_df['value'] = best_results_df['value'].astype(float)
best_results_df = best_results_df.drop(columns = 'result')

In [None]:
# Spread of the per-split scores of the selected candidate, by metric and dataset;
# the mean of each box is marked.
fig, ax = plt.subplots(figsize=(8, 10))
sns.boxplot(data=best_results_df, x='metric', y='value', hue='dataset', showmeans=True, ax=ax)
plt.show()

In [None]:
# Predict with the refit best tree. churn_df is the data the search was fitted on,
# so the 'Train/Val' figures computed from train_val_preds are in-sample.
train_val_preds = grid.best_estimator_.predict(churn_df[features])
test_preds = grid.best_estimator_.predict(test_df[features])

In [None]:
# Side-by-side confusion matrices (normalised over the true labels, i.e. per row)
# for the train/validation data and the test data.
class_labels = grid.best_estimator_.classes_
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
train_val_ax, test_ax = axes

train_val_cm = confusion_matrix(churn_df['churn'], train_val_preds, labels=class_labels, normalize='true')
test_cm = confusion_matrix(test_df['churn'], test_preds, labels=class_labels, normalize='true')

train_val_disp = ConfusionMatrixDisplay(confusion_matrix=train_val_cm, display_labels=class_labels)
test_disp = ConfusionMatrixDisplay(confusion_matrix=test_cm, display_labels=class_labels)

train_val_disp.plot(ax=train_val_ax)
train_val_ax.set_title('Train/Val')

test_disp.plot(ax=test_ax)
test_ax.set_title('Test')

plt.show()

In [None]:
# Print precision, recall and F1 for both datasets, one section per metric.
for heading, metric_fn in (
    ('Precision:', precision_score),
    ('\nRecall:', recall_score),
    ('\nF1:', f1_score),
):
    print(heading)
    print('- Train/Val:', metric_fn(churn_df['churn'], train_val_preds))
    print('- Test:', metric_fn(test_df['churn'], test_preds))

### Adding categorical features and training with new algorithms

In [None]:
# Feature set for the second round: the earlier columns plus the two categorical ones.
features2 = [
    'credit_score',
    'age',
    'tenure',
    'balance',
    'products_number',
    'credit_card',
    'active_member',
    'estimated_salary',
    'gender',
    'country',
]

In [None]:
# Preprocessing + model pipeline that the second grid search tunes.
# - 'transformer': ordinal-encodes gender and one-hot encodes country (categories not
#   seen during fit are ignored); every other column is passed through unchanged.
# - 'poly': disabled by default ('passthrough'); the grid may swap in PolynomialFeatures.
# - 'normalizer' / 'classifier': defaults that the grid may replace.
pipeline = Pipeline([
    ('transformer', ColumnTransformer([
        ('ordinal', OrdinalEncoder(), ['gender']),
        # `sparse_output` replaces `sparse`, which was deprecated in scikit-learn 1.2
        # and removed in 1.4.
        ('categorical', OneHotEncoder(sparse_output = False, handle_unknown = 'ignore'), ['country'])
    ], remainder = 'passthrough')),
    ('poly', 'passthrough'),
    ('normalizer', StandardScaler()),
    ('classifier',  DecisionTreeClassifier())
])

In [None]:
# Search space for the pipeline. Each dict in param_grid2 is an independent sub-grid.

# Settings shared by every logistic-regression sub-grid.
logreg_preprocessing = {
    'poly': [PolynomialFeatures()],
    'poly__degree': [1, 2, 3, 4],
    'normalizer': [StandardScaler(), MinMaxScaler()],
    'classifier__class_weight': ['balanced', None]
}

param_grid2 = [
    {
        # Decision tree: same hyperparameters as the first search, new seed.
        'classifier': [DecisionTreeClassifier(random_state = 20)],
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__max_depth' : [2, 3, 4, 5, 6, 7, 8, 9, 10],
        'classifier__class_weight': ['balanced', None]
    },
    {
        # Unpenalised logistic regression. The string 'none' is no longer accepted by
        # recent scikit-learn; None is the supported spelling. C has no effect without
        # a penalty, so it is not varied here.
        **logreg_preprocessing,
        'classifier': [LogisticRegression()],
        'classifier__penalty': [None]
    },
    {
        # L2-penalised logistic regression with the default solver.
        **logreg_preprocessing,
        'classifier': [LogisticRegression()],
        'classifier__penalty': ['l2'],
        'classifier__C': [0.001, 0.01, 0.1, 1.]
    },
    {
        # L1-penalised logistic regression. The default lbfgs solver rejects
        # penalty='l1' (every such fit would fail and score NaN), so use liblinear.
        **logreg_preprocessing,
        'classifier': [LogisticRegression(solver = 'liblinear', random_state = 20)],
        'classifier__penalty': ['l1'],
        'classifier__C': [0.001, 0.01, 0.1, 1.]
    }
]

In [None]:
# 5-fold grid search over the pipeline search space. Precision, recall and F1 are
# recorded for every candidate (train scores included); the candidate with the best
# F1 is the one that gets refit.
grid2 = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid2,
    scoring=['precision', 'recall', 'f1'],
    refit='f1',
    cv=5,
    n_jobs=1,
    return_train_score=True,
    verbose=2,
)

In [None]:
# Run the search on the columns listed in `features2`, with 'churn' as the target.
grid2.fit(churn_df[features2], churn_df['churn'])

In [None]:
# Pipeline steps and hyperparameters of the candidate selected by the refit metric (F1).
grid2.best_params_

In [None]:
# Reshape the cv_results_ row of the selected candidate into long form:
# one row per (split, dataset, metric) with its score in 'value'.
best_results_df = (
    pd.DataFrame(grid2.cv_results_)
    .iloc[grid2.best_index_]
    .reset_index()
    .rename(columns = {'index': 'result', grid2.best_index_: 'value'})
)
# Keep only the per-split scores (keys such as 'split0_test_f1'). The explicit copy
# makes the assignments below act on an independent frame instead of a slice.
best_results_df = best_results_df.loc[best_results_df['result'].str.contains('split')].copy()
# n = 2 keeps metric names that themselves contain underscores in one piece.
best_results_df[['split', 'dataset', 'metric']] = best_results_df['result'].str.split('_', n = 2, expand = True)
# GridSearchCV calls the held-out fold 'test'; relabel it to avoid confusion with test_df.
# Assign the result back: an inplace replace on a column selection is chained
# assignment and is not guaranteed to modify the frame.
best_results_df['dataset'] = best_results_df['dataset'].replace({'test': 'validation'})
# The scores come out of a mixed-type row as objects; make them numeric for plotting.
best_results_df['value'] = best_results_df['value'].astype(float)
best_results_df = best_results_df.drop(columns = 'result')

In [None]:
# Spread of the per-split scores of the selected pipeline, by metric and dataset;
# the mean of each box is marked.
fig, ax = plt.subplots(figsize=(8, 10))
sns.boxplot(data=best_results_df, x='metric', y='value', hue='dataset', showmeans=True, ax=ax)
plt.show()

In [None]:
# Predict with the refit best pipeline. churn_df is the data the search was fitted on,
# so the 'Train/Val' figures computed from train_val_preds are in-sample.
train_val_preds = grid2.best_estimator_.predict(churn_df[features2])
test_preds = grid2.best_estimator_.predict(test_df[features2])

In [None]:
# Side-by-side confusion matrices (normalised over the true labels, i.e. per row)
# for the train/validation data and the test data.
class_labels = grid2.best_estimator_.classes_
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
train_val_ax, test_ax = axes

train_val_cm = confusion_matrix(churn_df['churn'], train_val_preds, labels=class_labels, normalize='true')
test_cm = confusion_matrix(test_df['churn'], test_preds, labels=class_labels, normalize='true')

train_val_disp = ConfusionMatrixDisplay(confusion_matrix=train_val_cm, display_labels=class_labels)
test_disp = ConfusionMatrixDisplay(confusion_matrix=test_cm, display_labels=class_labels)

train_val_disp.plot(ax=train_val_ax)
train_val_ax.set_title('Train/Val')

test_disp.plot(ax=test_ax)
test_ax.set_title('Test')

plt.show()

In [None]:
# Print precision, recall and F1 for both datasets, one section per metric.
for heading, metric_fn in (
    ('Precision:', precision_score),
    ('\nRecall:', recall_score),
    ('\nF1:', f1_score),
):
    print(heading)
    print('- Train/Val:', metric_fn(churn_df['churn'], train_val_preds))
    print('- Test:', metric_fn(test_df['churn'], test_preds))

### Exporting the pipeline

### Explaining predictions

In [None]:
# Predicted probability of the second class in classes_ (presumably churn = 1 — confirm)
# for every test row.
# NOTE(review): this uses the model from the first search (`grid`), not `grid2` —
# confirm that is the model meant to be explained.
test_preds_proba = grid.best_estimator_.predict_proba(test_df[features])[:, 1]

In [None]:
# Store the probability next to each test row for inspection (adds a column to test_df).
test_df['prob'] = test_preds_proba

In [None]:
# Histogram of the predicted probabilities on the test data.
plt.hist(test_preds_proba)
# Title typo fixed ('predictios' -> 'predictions').
plt.title('Probability predictions for test dataset')
plt.show()

In [None]:
# First test rows whose predicted probability is below 0.1.
test_df.loc[test_preds_proba < 0.1].head()

In [None]:
# First test rows whose predicted probability is above 0.9.
test_df.loc[test_preds_proba > 0.9].head()

In [None]:
# First test rows whose predicted probability is close to 0.5 (between 0.47 and 0.53).
test_df.loc[(test_preds_proba > 0.47) & (test_preds_proba < 0.53)].head()

In [None]:
# Build a SHAP explainer from the first model's `predict` callable, passing the test
# features as the second argument, then compute SHAP values for every test row.
# NOTE(review): `predict` returns class labels, so the attributions explain the label
# rather than the probability — confirm this is intended.
explainer = shap.Explainer(grid.best_estimator_.predict, test_df[features])
shap_values = explainer(test_df[features])

**Explaining individual predictions:**

In [None]:
# Waterfall plot of the SHAP values for the test row at position 1.
# NOTE(review): presumably picked from the rows inspected above — confirm.
shap.plots.waterfall(shap_values[1])

In [None]:
# Waterfall plot of the SHAP values for the test row at position 148.
# NOTE(review): presumably picked from the rows inspected above — confirm.
shap.plots.waterfall(shap_values[148])

In [None]:
# Waterfall plot of the SHAP values for the test row at position 85.
# NOTE(review): presumably picked from the rows inspected above — confirm.
shap.plots.waterfall(shap_values[85])

**Explaining all instances:**

In [None]:
# Violin-style summary of the SHAP values over all explained test rows.
shap.summary_plot(shap_values, plot_type = 'violin')