
# **Model Building**

In [6]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

### **Reading the DataFrame**

In [9]:
churn_df= pd.read_csv('churn_data_cleaned.csv', index_col= 0)
churn_df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,0


### **Extracting Inputs and Outputs**

**x** will be our input and **y** will be our output (target variable). y would represent whether a customer is likely **churn(1)** or **not(0)**. 

In [12]:
x = churn_df.drop('Churn', axis = 1)
x.head() 

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0


In [13]:
y = churn_df['Churn']
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

### **Splitting into training and testing data**

Using 80% of the data as training data and rest as testing

In [39]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2) 

### **Model Building**

We know that an accuracy of a base Hardcoded model- One in which all values of target variable would be set to *zero*  would be 83%

### Decision Tree Classifier

In [40]:
# We have used gini as the criterion to split since its computationally less expensive in comparison to enytropy which calculates logarithms at each stage
dt_model = DecisionTreeClassifier(criterion = "gini", 
                                  random_state = 100, 
                                  max_depth = 6,
                                  min_samples_leaf = 8)

#### Fitting the model

In [41]:
dt_model.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

####  Making predictions

In [43]:
dt_preds = dt_model.predict(x_test)

#### Evaluating the model

In [87]:
# accuracy score of the model
dt_accuracy = dt_model.score(x_test, y_test)
dt_accuracy

0.7810945273631841

In [46]:
print(classification_report(y_test, dt_preds, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85      1014
           1       0.62      0.54      0.58       393

    accuracy                           0.78      1407
   macro avg       0.73      0.71      0.72      1407
weighted avg       0.77      0.78      0.78      1407



Here we can see extremely poor scores on all three metrics - Precision, recall and f1-score for the minority class i.e class of churners. We can also see that the accuracy score is not much higher in comparison to our hardcoded base model (with all values = 0, accuracy score = 73%).

The most likely cause for the poor performance is the **imbalance** in the dataset. To improve the results we will now Oversample the data set using **SMOTEENN** (Over sampling using SMOTE and cleaning using Editted Nearest Neighbour)

### Over Sampled Decision Tree Classifer

In [48]:
# Balancing by upsampling
sm = SMOTEENN()
X_balanced, y_balanced = sm.fit_resample(x,y) 

In [53]:
# Making training and test sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_balanced, y_balanced, test_size = 0.2) 

In [54]:
# Making a decision tree model on balanced data
dt_smote_model = DecisionTreeClassifier(criterion = "gini", 
                                  random_state = 100, 
                                  max_depth = 6,
                                  min_samples_leaf = 8)

dt_smote_model.fit(X_train2, y_train2)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [55]:
# Getting predictions
dt_smote_preds = dt_smote_model.predict(X_test2)

In [84]:
dt_smote_accuracy = dt_smote_model.score(X_test2, y_test2)
dt_smote_accuracy

0.9260832625318607

In [56]:
# Evaluating the model
print(classification_report(y_test2, dt_smote_preds, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.94      0.89      0.91       518
           1       0.92      0.95      0.94       659

    accuracy                           0.93      1177
   macro avg       0.93      0.92      0.92      1177
weighted avg       0.93      0.93      0.93      1177



In [57]:
print(metrics.confusion_matrix(y_test2, dt_smote_preds))

[[463  55]
 [ 32 627]]


We can see a considerable improvements in the Decision Tree's predictions after performing SMOTEENN. The new model has almost **93%** accuracy and better metrics for both churners and non-churners.

We will now try to implement some other models to test for better results.

### Random Forest Classifier


In [58]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(criterion = 'gini', 
                                  n_estimators = 100,  
                                  random_state = 100,
                                  max_depth = 6, 
                                  min_samples_leaf = 8)

In [59]:
rf_model.fit(X_train2, y_train2)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [60]:
rf_preds = rf_model.predict(X_test2)

In [61]:
# Evaluating the model
print(classification_report(y_test2, rf_preds, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.94      0.88      0.91       518
           1       0.91      0.96      0.93       659

    accuracy                           0.92      1177
   macro avg       0.93      0.92      0.92      1177
weighted avg       0.92      0.92      0.92      1177



In [62]:
print(metrics.confusion_matrix(y_test2, dt_smote_preds))


[[463  55]
 [ 32 627]]


In [83]:
rf_accuracy = rf_model.score(X_test2, y_test2)
rf_accuracy

0.923534409515718

We can see that the performance of Random Forest classifier is not much better than the Decision Tree classifer. In fact the accuracy score of Random Forest classifier is lesser than the Decision Tree Model

### XGBoost Model

In [74]:
from xgboost import XGBClassifier

xg_model = XGBClassifier(objective='binary:logistic')
xg_model.fit(X_train2, y_train2)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [72]:
xg_preds =  xg_model.predict(X_test2)

In [73]:
print(classification_report(y_test2, xg_preds, labels=[0,1]))


              precision    recall  f1-score   support

           0       0.94      0.95      0.95       518
           1       0.96      0.95      0.96       659

    accuracy                           0.95      1177
   macro avg       0.95      0.95      0.95      1177
weighted avg       0.95      0.95      0.95      1177



In [82]:
xg_accuracy = xg_model.score(X_test2, y_test2)
xg_accuracy

0.9532710280373832

We can see that the performance of XGBoost in all metrics is really good.

### **Model Comparison**

In [91]:
# Compare Several models according to their Accuracies
Model_Comparison = pd.DataFrame({
    'Model': ['Decision Tree', 'SMOTE Decision Tree', 'Random Forest Classifier', 'XGBoost'],
    'Score': [dt_accuracy, dt_smote_accuracy, rf_accuracy, xg_accuracy]})
Model_Comparison['Score'] = Model_Comparison['Score']*100
Model_Comparison_df = Model_Comparison.sort_values(by='Score', ascending=False)
Model_Comparison_df = Model_Comparison_df.set_index('Score')
Model_Comparison_df.reset_index()

Unnamed: 0,Score,Model
0,95.327103,XGBoost
1,92.608326,SMOTE Decision Tree
2,92.353441,Random Forest Classifier
3,78.109453,Decision Tree


### **Saving the Final Model**


In [104]:
# Saving the final model into a .sav file through pickle
import pickle

filename = 'model3.sav'
pickle.dump(rf_model, open(filename, 'wb'))


In [95]:
load_model = pickle.load(open(filename, 'rb'))

In [96]:
final_model_score = load_model.score(X_test2, y_test2)
final_model_score

0.9532710280373832