# classification: Predict if customers will get churned

https://deepnote.com/project/d899ea24-a16c-48e2-9d9b-7483d07922b5#%2Fnotebook.ipynb

In [8]:
reset -fs

Load Data
-----

In [9]:
import numpy as np
import pandas as pd
from category_encoders import *
from sklearn.compose import *
from sklearn.impute import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import *
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import f1_score 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

## 1. Data

In [10]:
df = pd.read_csv('data.csv')

## 2. Feature Engineering

### Target transformations

In [11]:
def label_churn(row):
    if row['Attrition_Flag'] == 'Existing Customer' :
        return 0
    if row['Attrition_Flag'] == 'Attrited Customer':
        return 1

### train-test splitting

In [12]:
X = df[df.columns[2:]]
y = pd.DataFrame()
y['Attrition_Flag'] = df.apply(lambda row: label_churn(row), axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25) 

In [13]:
#  Ensure validation set has same properties as test set (e.g., size)
print(X_train.shape, X_val.shape, X_test.shape)

(6075, 19) (2026, 19) (2026, 19)


### Feature transformations 

In [14]:
# define catigorical variables groups and continuous variables groups 
# to fit into different pipeline
cat_vars = ['Gender', 'Education_Level', 'Income_Category', 'Marital_Status', 'Card_Category']
con_vars_1 = ['Total_Relationship_Count', 'Total_Trans_Ct']
con_vars_2 = ['Customer_Age', 'Credit_Limit']

# pipeline for continuous variables
# fill null values with median then standardize
con_pipe = Pipeline([('imputer', SimpleImputer(strategy='median', add_indicator=True)),
                     ('scaler', StandardScaler())])

# pipeline for categorical variables
# fill null values with mode then do OneHotEncoder
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent', add_indicator=True)),
                     ('ohe', OneHotEncoder(handle_unknown='ignore'))
                     ])

# put all feature transformations in a signle Pipeline
preprocessing = ColumnTransformer([('categorical', cat_pipe,  cat_vars),
                                   ('continuous',  con_pipe,  con_vars_1),
                                   ('qt', QuantileTransformer(output_distribution='normal'), con_vars_2)
                                   ])

## 3. Algorithms & Search

In [15]:
class DummyEstimator(BaseEstimator):
    "Pass through class, methods are present but do nothing."
    def fit(self): pass
    def score(self): pass

In [16]:
# Create space of candidate learning algorithms and their hyperparameters

# Setup pipeline with DummyEstimator to cv search across algorithms
pipe = Pipeline([("preprocessing", preprocessing),
                 ('pca', PCA()),
                 ('clf', DummyEstimator())])

# Go through each algorithm and a variety of hyperparameters
search_space = [{'clf': [LogisticRegression()], # Actual Estimator
                 'clf__penalty': ['l1', 'l2', 'elasticnet', 'none'],
                 'clf__C': np.logspace(0, 4, 10),
                 'clf__class_weight': [None, 'balanced'],
                 'clf__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                 'pca__n_components': range(1, 15)},
                
                {'clf': [RandomForestClassifier()],  # Actual Estimator
                 'clf__criterion': ['gini', 'entropy'],
                 'clf__max_depth': [None, 5, 10, 15, 20, 25],
                 'clf__min_samples_split': range(2,40),
                 'clf__min_samples_leaf': range(1,30),
                 'clf__max_features': ['auto', 'sqrt', 'log2'],
                 'clf__max_leaf_nodes': (range(2,50)),
                 'clf__max_samples': [None, 0.25, 0.33, 0.5, 0.67, 0.8],
                 'clf__class_weight': ['balanced', 'balanced_subsample'],
                 'pca__n_components': range(1, 15)},
                
                {'clf': [ExtraTreesClassifier()],
                 'clf__criterion': ['gini', 'entropy'],
                 'clf__max_depth': [None, 5, 10, 15, 20, 25],
                 'clf__min_samples_split': range(2,40),
                 'clf__min_samples_leaf': range(1,30),
                 'clf__max_features': ['auto', 'sqrt', 'log2'],
                 'clf__max_leaf_nodes': (range(2,50)),
                 'clf__max_samples': [None, 0.25, 0.33, 0.5, 0.67, 0.8],
                 'clf__class_weight': ['balanced', 'balanced_subsample'],
                 'pca__n_components': range(1, 15)
                },
                
                {'clf': [RidgeClassifier()],
                 'clf__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                 'clf__class_weight': [None, 'balanced'],
                 'clf__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'saga'],
                 'pca__n_components': range(1, 15)
                },
                
                {'clf': [KNeighborsClassifier()],
                 'clf__weights': ['uniform', 'distance'],
                 'clf__algorithm': ['ball_tree', 'kd_tree', 'brute'],
                 'clf__leaf_size': range(1, 60),
                 'clf__p' : [1, 2],
                 'pca__n_components': range(1, 15)
                },
                
                {'clf': [SVC()],
                 'clf__kernel': ['liner', 'poly', 'rbf', 'sigmoid', 'precomputed'],
                 'clf__degree': range(1,10),
                 'clf__gamma': ['scale', 'auto'],
                 'pca__n_components': range(1, 15)
                }
                
               ]


clf_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                    param_distributions=search_space, 
                                    n_iter=100,
                                    cv=5, 
                                    n_jobs=-1,
                                    verbose=1)

# Grid search
best_model = clf_algos_rand.fit(X_train, y_train.values.ravel());

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [17]:
best_model.best_estimator_.get_params()['pca']

PCA(n_components=13)

In [18]:
# View best model
best_model.best_estimator_.get_params()['clf']

RandomForestClassifier(class_weight='balanced_subsample', max_depth=25,
                       max_features='sqrt', max_leaf_nodes=45, max_samples=0.25,
                       min_samples_leaf=7)

## 4. Fit the best model

In [19]:
hyperparameters = {'class_weight': 'balanced_subsample', 
                   'max_depth': 25,
                   'max_features': 'sqrt',
                   'max_leaf_nodes': 45, 
                   'max_samples': 0.25,
                   'min_samples_leaf': 7}

pipe = Pipeline([("preprocessing", preprocessing),
                 ('pca', PCA(n_components=13)),
                 ('etc', RandomForestClassifier(**hyperparameters))])

pipe.fit(X_train, y_train.values.ravel());

## 5. Evaluation Metrics

In [20]:
y_pred = pipe.predict(X_test)

In [21]:
# accuracy: correct classfication rate
acc_test = accuracy_score(y_test.values.ravel(), y_pred)
acc_test

0.8460019743336624

In [22]:
# f1: that calculates the harmonic mean of the precision and recall
f1_test  = f1_score(y_test.values.ravel(), y_pred, average='weighted')
f1_test

0.855577003754181

In [23]:
from sklearn.metrics import roc_auc_score, precision_recall_curve
roc_auc_score(y_test.values.ravel(), pipe.predict_proba(X_test)[:,1])

0.8909289724507117

In [25]:
# Compute confusion matrix to evaluate the accuracy of a classification.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test.values.ravel(), y_pred)

array([[1483,  219],
       [  93,  231]])

From the confusion matrix, the true negative and true positive is high, but the false positive is false negative is also not low. The model doesn't do a good job on predicting the right class, but its ability to correctly predict class 0 is better than its ability to correctly predict class 1.

## 6. Conclusion

The best model I choose is:

In [26]:
pipe['etc']

RandomForestClassifier(class_weight='balanced_subsample', max_depth=25,
                       max_features='sqrt', max_leaf_nodes=45, max_samples=0.25,
                       min_samples_leaf=7)

The pipe with steps and non-default hyperparameters:  

In [27]:
pipe

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Gender', 'Education_Level',
                                                   'Income_Category',
                                                   'Marital_Status',
                                                   'Card_Category']),
                                                 ('continuous',
                                                  Pipeline(steps=[('imputer'

I built a pipeline to preprocess the data, and then use RandomizedSearch cross validation, I searched across six algorithms with specific lists of hyperparameters. I also performed automated hyperparameter search across to find the best n_componentsthe PCA . The result is that RandomForestClassifier works best for this dataset. Wrapping up the steps of preprocessing the data, principal component analysis and the final model into a pipeline, I train the model on the train set and evaluate it on the test set. 

The metrics accuracy, f1, and auc roc are not very bad, however, from the confusion matrix, I can see the model didn't do a very good job. 

Next steps / future directions: 
+ Gather more data. 
+ The dataset is a little bit imbalanced, because it has only 19.07% of customers who have churned. Next step, we can use SMOTE or bootstrapping techniques.