In [313]:
import pandas as pd
from sklearn import metrics

from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score , recall_score , classification_report , confusion_matrix , roc_auc_score , f1_score

import warnings
warnings.filterwarnings("ignore")


In [269]:
df = pd.read_csv('../data/processed/processed_churn.csv')
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,-1.277445,0,1,-1.172988,-0.994411,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0.066327,1,0,-0.275697,-0.173258,0,...,0,0,0,0,0,1,0,0,0,1
2,1,0,0,0,-1.236724,1,1,-0.375396,-0.959534,1,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0.514251,0,0,-0.740959,-0.19489,0,...,1,0,0,0,0,1,0,0,0,0
4,0,0,0,0,-1.236724,1,1,0.189564,-0.94055,1,...,0,0,0,0,0,0,0,0,1,0


Building Baseline Model

In [270]:
# Data Spliting

from sklearn.model_selection import train_test_split

X = df.drop('Churn', axis=1) # Features
y = df['Churn']              # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [271]:
# Smoteenn 

from imblearn.combine import SMOTEENN

smote = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X, y)
Xr_train,Xr_test,yr_train,yr_test=train_test_split(X_train_resampled, y_train_resampled,test_size=0.2)
print("Before Resampling:", yr_train.value_counts())
print("After Resampling:", yr_test.value_counts())

Before Resampling: Churn
1    2919
0    2266
Name: count, dtype: int64
After Resampling: Churn
1    740
0    557
Name: count, dtype: int64


Logistic Regression

In [272]:
# training the Baseliner Model with Logistic Regression Algorithm

model_LG = LogisticRegression(max_iter=1000)
model_LG.fit(X_train, y_train)

y_pred_LG = model_LG.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_LG))
print("Classification Report:\n", classification_report(y_test, y_pred_LG))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_LG))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_LG))


Accuracy: 0.8062455642299503
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.61       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

ROC AUC Score: 0.7280916582706863
Confusion Matrix:
 [[926 109]
 [164 210]]


In [273]:
# Logistic Regression Algorithm using smoteenn

from sklearn.linear_model import LogisticRegression

model_LG_bal = LogisticRegression(max_iter=1000)
model_LG_bal.fit(Xr_train, yr_train)

y_pred_LG_bal = model_LG_bal.predict(Xr_test)

print("Accuracy:", accuracy_score(yr_test, y_pred_LG_bal))
print("Classification Report:\n", classification_report(yr_test, y_pred_LG_bal))
print("ROC AUC Score:", roc_auc_score(yr_test, y_pred_LG_bal))
print("Confusion Matrix:\n", confusion_matrix(yr_test, y_pred_LG_bal))

Accuracy: 0.9298380878951427
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.90      0.92       557
           1       0.93      0.95      0.94       740

    accuracy                           0.93      1297
   macro avg       0.93      0.93      0.93      1297
weighted avg       0.93      0.93      0.93      1297

ROC AUC Score: 0.9260820515308845
Confusion Matrix:
 [[501  56]
 [ 35 705]]


Decision Tree

In [274]:


model_DT = DecisionTreeClassifier()
model_DT.fit(X_train, y_train)

y_pred = model_DT.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7274662881476224
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.82      0.82      1035
           1       0.49      0.47      0.48       374

    accuracy                           0.73      1409
   macro avg       0.65      0.65      0.65      1409
weighted avg       0.73      0.73      0.73      1409

ROC AUC Score: 0.646292851791573
Confusion Matrix:
 [[848 187]
 [197 177]]


In [275]:
# Decision Tree Algorithm using smoteenn

model_DT_bal = DecisionTreeClassifier()
model_DT_bal.fit(Xr_train, yr_train)

y_pred_DT_bal = model_DT_bal.predict(Xr_test)

print("Accuracy:", accuracy_score(yr_test, y_pred_DT_bal))
print("Classification Report:\n", classification_report(yr_test, y_pred_DT_bal))
print("ROC AUC Score:", roc_auc_score(yr_test, y_pred_DT_bal))
print("Confusion Matrix:\n", confusion_matrix(yr_test, y_pred_DT_bal))

Accuracy: 0.9313801079414032
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.90      0.92       557
           1       0.93      0.95      0.94       740

    accuracy                           0.93      1297
   macro avg       0.93      0.93      0.93      1297
weighted avg       0.93      0.93      0.93      1297

ROC AUC Score: 0.9280993740598769
Confusion Matrix:
 [[504  53]
 [ 36 704]]


Random Forest

In [276]:


model_RF = RandomForestClassifier()
model_RF.fit(X_train, y_train)

y_pred = model_RF.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7828246983676366
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.90      0.86      1035
           1       0.62      0.47      0.53       374

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.70      1409
weighted avg       0.77      0.78      0.77      1409

ROC AUC Score: 0.6822663979952982
Confusion Matrix:
 [[928 107]
 [199 175]]


In [277]:
# Random Forest Algorithm using smoteenn

model_RF_bal = RandomForestClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)
model_RF_bal.fit(Xr_train, yr_train)

y_pred_RF_bal = model_RF_bal.predict(Xr_test)

print("Accuracy:", accuracy_score(yr_test, y_pred_RF_bal))
print("Classification Report:\n", classification_report(yr_test, y_pred_RF_bal))
print("ROC AUC Score:", roc_auc_score(yr_test, y_pred_RF_bal))
print("Confusion Matrix:\n", confusion_matrix(yr_test, y_pred_RF_bal))

Accuracy: 0.9198149575944488
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.86      0.90       557
           1       0.90      0.96      0.93       740

    accuracy                           0.92      1297
   macro avg       0.92      0.91      0.92      1297
weighted avg       0.92      0.92      0.92      1297

ROC AUC Score: 0.9128584598961618
Confusion Matrix:
 [[481  76]
 [ 28 712]]


XG Boost

In [278]:


model_XGB = XGBClassifier()
model_XGB.fit(X_train, y_train)

y_pred_XGB = model_XGB.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_XGB))
print("Classification Report:\n", classification_report(y_test, y_pred_XGB))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_XGB))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_XGB))


Accuracy: 0.7778566359119943
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85      1035
           1       0.59      0.52      0.55       374

    accuracy                           0.78      1409
   macro avg       0.71      0.69      0.70      1409
weighted avg       0.77      0.78      0.77      1409

ROC AUC Score: 0.6942532744323027
Confusion Matrix:
 [[903 132]
 [181 193]]


In [279]:
# XGBoost Algorithm using smoteenn

model_XGB_bal = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, subsample=0.8, colsample_bytree=0.8, random_state=42, scale_pos_weight=3)
model_XGB_bal.fit(Xr_train, yr_train)

y_pred_XGB_bal = model_XGB_bal.predict(Xr_test)

print("Accuracy:", accuracy_score(yr_test, y_pred_XGB_bal))
print("Classification Report:\n", classification_report(yr_test, y_pred_XGB_bal))
print("ROC AUC Score:", roc_auc_score(yr_test, y_pred_XGB_bal))
print("Confusion Matrix:\n", confusion_matrix(yr_test, y_pred_XGB_bal))                                           #gjghjghk

Accuracy: 0.9336931380107941
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.88      0.92       557
           1       0.91      0.98      0.94       740

    accuracy                           0.93      1297
   macro avg       0.94      0.93      0.93      1297
weighted avg       0.94      0.93      0.93      1297

ROC AUC Score: 0.9265745548061526
Confusion Matrix:
 [[488  69]
 [ 17 723]]


In [280]:
# XGBoost Classifier with Smoteenn , RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV
model_XGB_CV =  XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_dist = {
    'n_estimators': [100, 200,300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1,0.2],
    'subsample': [0.6,0.8, 1.0],
    'colsample_bytree': [0.6,0.8, 1.0],
    'scale_pos_weight': [1,len(yr_train[yr_train==0]) / len(yr_train[yr_train==1])]
}

random_search = RandomizedSearchCV(model_XGB_CV, param_distributions=param_dist, n_iter=20, scoring='recall', cv=5, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(Xr_train, yr_train)
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'subsample': 0.8, 'scale_pos_weight': 1, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.01, 'colsample_bytree': 1.0}
Best Score: 0.9657405014215559


In [314]:
# Applying the best parameters obtained by RandomizedSearchCV

model_XGB_RSV_bal = XGBClassifier(subsample=1.0, scale_pos_weight=1, n_estimators=300, max_depth=9, learning_rate=0.05, colsample_bytree=1.0, use_label_encoder=False, eval_metric='logloss')
model_XGB_RSV_bal.fit(Xr_train, yr_train)

y_pred_XGB_RSV_BAL = model_XGB_RSV_bal.predict(Xr_test)

print("Accuracy:", accuracy_score(yr_test, y_pred_XGB_RSV_BAL))
print("Classification Report:\n", classification_report(yr_test, y_pred_XGB_RSV_BAL))
print("ROC AUC Score:", roc_auc_score(yr_test, y_pred_XGB_RSV_BAL))
print("Confusion Matrix:\n", confusion_matrix(yr_test, y_pred_XGB_RSV_BAL))
print('f1 Score:', f1_score(yr_test, y_pred_XGB_RSV_BAL))


Accuracy: 0.9545104086353122
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       557
           1       0.95      0.97      0.96       740

    accuracy                           0.95      1297
   macro avg       0.95      0.95      0.95      1297
weighted avg       0.95      0.95      0.95      1297

ROC AUC Score: 0.9523654713959919
Confusion Matrix:
 [[522  35]
 [ 24 716]]
f1 Score: 0.960429242119383


Support Vector Machine

In [282]:

model_SVC = SVC()
model_SVC.fit(X_train, y_train)

y_pred_SVC = model_SVC.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_SVC))
print("Classification Report:\n", classification_report(y_test, y_pred_SVC))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_SVC))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_SVC))


Accuracy: 0.794180269694819
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87      1035
           1       0.65      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.74      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409

ROC AUC Score: 0.6942648996357436
Confusion Matrix:
 [[939  96]
 [194 180]]


In [283]:
model_SVC_bal = SVC()
model_SVC_bal.fit(Xr_train, yr_train)

y_pred_SVC_bal = model_SVC_bal.predict(Xr_test)

print("Accuracy:", accuracy_score(yr_test, y_pred_SVC_bal))
print("Classification Report:\n", classification_report(yr_test, y_pred_SVC_bal))
print("ROC AUC Score:", roc_auc_score(yr_test, y_pred_SVC_bal))
print("Confusion Matrix:\n", confusion_matrix(yr_test, y_pred_SVC_bal))


Accuracy: 0.9406322282189669
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.91      0.93       557
           1       0.94      0.96      0.95       740

    accuracy                           0.94      1297
   macro avg       0.94      0.94      0.94      1297
weighted avg       0.94      0.94      0.94      1297

ROC AUC Score: 0.9370954437381726
Confusion Matrix:
 [[508  49]
 [ 28 712]]


Naive Bayes

In [284]:


model_NB = GaussianNB()
model_NB.fit(X_train, y_train)

Y_pred_NB = model_NB.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7828246983676366
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.90      0.86      1035
           1       0.62      0.47      0.53       374

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.70      1409
weighted avg       0.77      0.78      0.77      1409

ROC AUC Score: 0.6822663979952982
Confusion Matrix:
 [[928 107]
 [199 175]]


In [285]:
model_NB_bal = GaussianNB(var_smoothing=1e-9)
model_NB_bal.fit(Xr_train, yr_train)

Y_pred_NB_bal = model_NB_bal.predict(Xr_test)

print("Accuracy:", accuracy_score(yr_test, Y_pred_NB_bal))
print("Classification Report:\n", classification_report(yr_test, Y_pred_NB_bal))
print("ROC AUC Score:", roc_auc_score(yr_test, Y_pred_NB_bal))
print("Confusion Matrix:\n", confusion_matrix(yr_test, Y_pred_NB_bal))

Accuracy: 0.882035466461064
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.76      0.85       557
           1       0.84      0.97      0.90       740

    accuracy                           0.88      1297
   macro avg       0.90      0.87      0.88      1297
weighted avg       0.89      0.88      0.88      1297

ROC AUC Score: 0.8673188898054248
Confusion Matrix:
 [[425 132]
 [ 21 719]]


K Nearest Neighbour

In [286]:


model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)

Y_pred_knn = model_knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, Y_pred))
print("Classification Report:\n", classification_report(y_test, Y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, Y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, Y_pred))

Accuracy: 0.7643718949609652
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84      1035
           1       0.56      0.55      0.55       374

    accuracy                           0.76      1409
   macro avg       0.70      0.69      0.70      1409
weighted avg       0.76      0.76      0.76      1409

ROC AUC Score: 0.6944664031620553
Confusion Matrix:
 [[873 162]
 [170 204]]


In [287]:
model_knn_bal = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='minkowski')
model_knn_bal.fit(Xr_train, yr_train)

Y_pred_knn_bal = model_knn_bal.predict(Xr_test)

print("Accuracy:", accuracy_score(yr_test, Y_pred_knn_bal))
print("Classification Report:\n", classification_report(yr_test, Y_pred_knn_bal))

print("ROC AUC Score:", roc_auc_score(yr_test, Y_pred_knn_bal))
print("Confusion Matrix:\n", confusion_matrix(yr_test, Y_pred_knn_bal))

Accuracy: 0.969159599074788
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.93      0.96       557
           1       0.95      1.00      0.97       740

    accuracy                           0.97      1297
   macro avg       0.97      0.96      0.97      1297
weighted avg       0.97      0.97      0.97      1297

ROC AUC Score: 0.9647593284487359
Confusion Matrix:
 [[520  37]
 [  3 737]]


MODEL SELECTION

1. XGBoost + RandomSearchCV(Smoothed) --> Best overall balenced Model  

2. KNN --> Best raw performance but risk of Overfitting 

FINAL MODEL -->  XGBoost

NOTE : KNN , SVM also gave slightly better results but XGBoost was chosen because it's more Robust and Widely trusted. 

                                                                                                                                                                 .

FINAL TRAINING AND SAVE THE MODEL

In [288]:
import joblib

# Save the model
joblib.dump(model_XGB_RSV_bal, 'model_XGB_RSV_bal.pkl')

# Load the model
loaded_model_XGB_RSV_bal = joblib.load('model_XGB_RSV_bal.pkl')


Evaluate on Test Set

In [316]:
import pickle
filename = 'model_XGB_RSV_bal.pkl'
pickle.dump(model_XGB_RSV_bal, open(filename, 'wb'))
load_model = pickle.load(open(filename, 'rb'))
model_score_r1 = load_model.score(Xr_test, yr_test)
model_score_r1

0.9545104086353122

In [327]:
import sys
import os

sys.path.append(os.path.abspath("../src"))
os.makedirs("reports/figures", exist_ok=True)


from utils.metrics import (
    plot_confusion_matrix,
    plot_roc_curve,
    plot_pr_curve,
    save_classification_report
)

In [331]:

# Predictions
y_pred = model_XGB_RSV_bal.predict(Xr_test)
y_probs = model_XGB_RSV_bal.predict_proba(Xr_test)[:, 1]

# Save plots to reports/figures
plot_confusion_matrix(yr_test, y_pred, save_path="reports/figures/confusion_matrix.png")
plot_roc_curve(yr_test, y_probs, save_path="reports/figures/roc_curve.png")
plot_pr_curve(yr_test, y_probs, save_path="reports/figures/pr_curve.png")

# Save classification report
save_classification_report(yr_test, y_pred, save_path="reports/results.json")

{'0': {'precision': 0.9560439560439561,
  'recall': 0.9371633752244165,
  'f1-score': 0.9465095194922938,
  'support': 557.0},
 '1': {'precision': 0.9533954727030626,
  'recall': 0.9675675675675676,
  'f1-score': 0.960429242119383,
  'support': 740.0},
 'accuracy': 0.9545104086353122,
 'macro avg': {'precision': 0.9547197143735093,
  'recall': 0.952365471395992,
  'f1-score': 0.9534693808058383,
  'support': 1297.0},
 'weighted avg': {'precision': 0.9545328707145333,
  'recall': 0.9545104086353122,
  'f1-score': 0.9544513812841565,
  'support': 1297.0}}

In [333]:
print("Length of y_test:", len(y_test))
print("Length of y_pred:", len(y_pred))

Length of y_test: 1409
Length of y_pred: 1297
