In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

import xgboost as xgb
from sklearn.svm import SVC

import os
BASE_DIR = os.path.dirname(os.path.abspath('')) # define base directory
DATA_DIR = os.path.join(BASE_DIR, 'Data')
OUT_DIR = os.path.join(BASE_DIR, 'Output')

In [2]:
df = pd.read_csv(os.path.join(DATA_DIR, 'data_processed.csv'))
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,1,9,1,0,0,0,2,0,2,2,0,1,1,3,65.6,593.3,0
1,1,0,0,0,9,1,2,0,0,0,0,0,0,2,0,0,3,59.9,542.4,0
2,1,1,0,0,4,1,0,1,0,0,2,0,0,0,0,1,2,73.9,280.85,1
3,1,1,1,0,13,1,0,1,0,2,2,0,2,2,0,1,2,98.0,1237.85,1
4,0,1,1,0,3,1,0,1,0,0,0,2,2,0,0,1,3,83.9,267.4,1


In [3]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
# Function to print confusion matrix
def conf_matrix(y_test, y_pred):
    matrix = confusion_matrix(y_test, y_pred)

    sns.heatmap(data=matrix, annot=True, fmt=".0f", xticklabels = ["Churn no", "Churn yes"], yticklabels = ["Churn no", "Churn yes"], cmap="Blues")
    plt.ylabel("True Class")
    plt.xlabel("Predicted Class")
    plt.title("Decision Tree Classifier")

In [6]:
def metrics(y_test, y_pred):
    return(classification_report(y_test, y_pred, target_names = ["Churn No", "Churn Yes"]))

#### 1.1 Randomforest classifier

In [18]:
# Hyperprameter tuning
#params = {'n_estimators' :[180, 200, 215, 230],
#         'max_depth': [8],
#         'min_samples_leaf': [18]}

params = {'n_estimators' :[300, 350, 400, 450, 500], 'max_depth': [3,5,7,9,11], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [10, 20, 30]}

random_grid = {'max_depth': [6,8,10], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [40, 50, 60], 'n_estimators': [ 300, 350,400]}

rf_model = RandomForestClassifier(random_state=5, oob_score=True)

gridsearch = GridSearchCV(rf_model, params,verbose=1, cv=5, n_jobs=-1, scoring = 'recall')
gridsearch.fit(X_train, y_train)
#gridsearch = RandomizedSearchCV(estimator = RandomForestClassifier(random_state=5, oob_score=True), param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring = 'roc_auc')

#gridsearch = RandomForestClassifier(random_state=5, oob_score=True, n_estimators=215, min_samples_leaf=18,max_depth=8,max_features='sqrt')

Fitting 5 folds for each of 150 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:  4.7min finished


GridSearchCV(cv=5,
             estimator=RandomForestClassifier(oob_score=True, random_state=5),
             n_jobs=-1,
             param_grid={'max_depth': [3, 5, 7, 9, 11],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [10, 20, 30],
                         'n_estimators': [300, 350, 400, 450, 500]},
             scoring='recall', verbose=1)

In [8]:
gridsearch.best_estimator_

RandomForestClassifier(max_depth=7, min_samples_leaf=10, n_estimators=300,
                       oob_score=True, random_state=5)

In [9]:
best_model = gridsearch.best_estimator_

#### 1.2 Random forest metrics

In [16]:
y_pred = cross_val_predict(estimator=best_model, X=X_test, y=y_test, cv=5)
cross_val_scores = cross_val_score(estimator=best_model, X=X_test, y=y_test, cv=5, scoring="recall")
print(f"Average ROC AUC score across 5-fold-CV: {cross_val_scores.mean().round(decimals=2)}")
cross_val_scores

Average ROC AUC score across 5-fold-CV: 0.59


array([0.53153153, 0.59459459, 0.55855856, 0.62162162, 0.66071429])

In [17]:
print(metrics(y_test, y_pred))

              precision    recall  f1-score   support

    Churn No       0.87      0.94      0.90      1554
   Churn Yes       0.77      0.59      0.67       556

    accuracy                           0.85      2110
   macro avg       0.82      0.76      0.78      2110
weighted avg       0.84      0.85      0.84      2110



In [13]:
y_pred2_CV = cross_val_predict(best_model, X_test, y_test, cv=5)
print(metrics(y_test, y_pred2_CV))

              precision    recall  f1-score   support

    Churn No       0.87      0.94      0.90      1554
   Churn Yes       0.77      0.62      0.69       556

    accuracy                           0.85      2110
   macro avg       0.82      0.78      0.79      2110
weighted avg       0.85      0.85      0.85      2110



### 2. Ensemble learning
### Voting classifier
- RandomForestClassifier
- XGBoost
- SVM classifier


In [16]:
# define all models 
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
rf_model = RandomForestClassifier(random_state=42)
svm_model = SVC(random_state=42, probability=True)

In [17]:
voting_clf = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svc', svm_model)],
voting='soft')
voting_model= voting_clf.fit(X_train, y_train)


In [23]:
y_pred3 = voting_model.predict(X_test)

In [24]:
print(metrics(y_test, y_pred3))

              precision    recall  f1-score   support

    Churn No       0.87      0.92      0.90      1554
   Churn Yes       0.75      0.63      0.68       556

    accuracy                           0.85      2110
   macro avg       0.81      0.78      0.79      2110
weighted avg       0.84      0.85      0.84      2110



In [20]:
y_pred3_CV = cross_val_predict(voting_model, X_test, y_test, n_jobs=-1, cv=5)

In [21]:
print(metrics(y_test, y_pred3_CV))

              precision    recall  f1-score   support

    Churn No       0.86      0.93      0.89      1554
   Churn Yes       0.74      0.59      0.65       556

    accuracy                           0.84      2110
   macro avg       0.80      0.76      0.77      2110
weighted avg       0.83      0.84      0.83      2110



In [43]:
rf_model2 = RandomForestClassifier(n_estimators = 20000, max_depth=5, min_samples_leaf=40, random_state=42, oob_score=True)
rf_model2.fit(X_train,y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=40, n_estimators=20000,
                       oob_score=True, random_state=42)

In [44]:
y_pred_rf = rf_model2.predict(X_test)
print(metrics(y_test, y_pred_rf))

              precision    recall  f1-score   support

    Churn No       0.87      0.93      0.90      1554
   Churn Yes       0.76      0.63      0.69       556

    accuracy                           0.85      2110
   macro avg       0.82      0.78      0.79      2110
weighted avg       0.84      0.85      0.84      2110



In [42]:
y_pred_rf = rf_model2.predict(X_train)
print(metrics(y_train, y_pred_rf))

              precision    recall  f1-score   support

    Churn No       0.89      0.95      0.92      3609
   Churn Yes       0.82      0.66      0.73      1313

    accuracy                           0.87      4922
   macro avg       0.85      0.81      0.82      4922
weighted avg       0.87      0.87      0.87      4922

