### Import Libraries

In [42]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from imblearn.over_sampling import SMOTE

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

### Import Dataset

In [2]:
df = pd.read_csv("Churn.csv")
df.sample(5)

Unnamed: 0,Pipeline,Amount,Churn,Contract.Length,Has.Ever.Been.Poc,Country,Tier,Industry,Created.On.Platform,No.Of.Conversations.Q1.To.Q3,...,Live.Campaigns.Q1.To.Q3,No.Of.Integrations.Q1.To.Q3,Previous.No.Of.Deals,Percentage.Going.Live,Distinct.Game.Types.Used,Distinct.Live.Game.Types.Used,Avg.Unique.Reg.Rate.Q1.To.Q3,Avg.Time.Spent.Q1.To.Q3,Days.Until.First.Campaign,Days.Until.First.Live.Campaign
754,Newbizz Pipeline,10752.688172,0,366,Yes,DK,A,Retailers,2019,5,...,6,3,0,0.75,2,2,0.559742,93.785714,13,35.0
291,Renewal sales pipeline,4480.241935,0,366,No,DK,C,Pharma / Healthcare,2016,1,...,2,1,0,0.666667,3,2,0.613744,71.0,36,34.0
6,Renewal sales pipeline,3360.215054,0,365,No,DK,C,Financial services,2018,0,...,0,1,1,0.0,1,0,0.367183,13.0,44,72.833333
342,Renewal sales pipeline,3360.215054,0,365,No,DK,C,B2B,2019,2,...,0,0,1,0.0,3,0,0.871181,63.444444,28,72.833333
520,Newbizz Pipeline,6016.129032,0,366,No,FI,B,Retailers,2020,16,...,7,7,0,0.304348,14,4,0.673754,29.44186,87,103.0


In [3]:
df["Tier"].value_counts()

B             268
A             225
C             216
Enterprise    112
Name: Tier, dtype: int64

**Data Information**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 821 entries, 0 to 820
Data columns (total 23 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Pipeline                               821 non-null    object 
 1   Amount                                 821 non-null    float64
 2   Churn                                  821 non-null    int64  
 3   Contract.Length                        821 non-null    int64  
 4   Has.Ever.Been.Poc                      821 non-null    object 
 5   Country                                821 non-null    object 
 6   Tier                                   821 non-null    object 
 7   Industry                               821 non-null    object 
 8   Created.On.Platform                    821 non-null    int64  
 9   No.Of.Conversations.Q1.To.Q3           821 non-null    int64  
 10  Avg.Conversation.Rating.Q1.To.Q3       821 non-null    float64
 11  Avg.No

In [5]:
df.describe()

Unnamed: 0,Amount,Churn,Contract.Length,Created.On.Platform,No.Of.Conversations.Q1.To.Q3,Avg.Conversation.Rating.Q1.To.Q3,Avg.No.Of.Conversation.Parts.Q1.To.Q3,Created.Campaigns.Q1.To.Q3,Live.Campaigns.Q1.To.Q3,No.Of.Integrations.Q1.To.Q3,Previous.No.Of.Deals,Percentage.Going.Live,Distinct.Game.Types.Used,Distinct.Live.Game.Types.Used,Avg.Unique.Reg.Rate.Q1.To.Q3,Avg.Time.Spent.Q1.To.Q3,Days.Until.First.Campaign,Days.Until.First.Live.Campaign
count,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0,821.0
mean,9657.051711,0.120585,372.934227,2018.937881,10.131547,4.703725,26.051568,20.292326,8.101096,2.752741,0.371498,0.399253,5.897686,2.809988,0.367183,39.317469,30.479903,72.833333
std,6828.748353,0.325842,53.226328,1.385391,13.067731,0.288429,11.07338,28.249538,15.797368,2.005725,0.569224,0.374597,4.133947,2.463479,0.138341,28.437184,42.704043,61.234633
min,0.134409,0.0,13.0,2014.0,0.0,3.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.053818,0.0,1.0,-305.0
25%,5376.344086,0.0,365.0,2018.0,1.0,4.666667,20.0,6.0,1.0,1.0,0.0,0.125,3.0,1.0,0.297884,20.458333,6.0,31.0
50%,7661.290323,0.0,365.0,2019.0,6.0,4.703725,26.051568,12.0,4.0,2.0,0.0,0.333333,5.0,2.0,0.367183,36.565217,15.0,67.0
75%,11424.731183,0.0,366.0,2020.0,14.0,5.0,28.4,23.0,8.0,4.0,1.0,0.588235,8.0,4.0,0.417306,53.6,34.0,85.0
max,71023.41129,1.0,638.0,2021.0,104.0,5.0,125.0,267.0,207.0,11.0,2.0,4.0,29.0,14.0,0.871181,233.0,267.0,426.0


### Label Encoding for Ordinal Features

In [6]:
# POS
def poc(x):
    if x == "No":
        return 0
    else:
        return 1

df["Has.Ever.Been.Poc"] = df["Has.Ever.Been.Poc"].apply(poc)

In [7]:
# Tier

def tier(x):
    if x == "A":
        return 3
    if x == "B":
        return 2
    if x == "C":
        return 1
    if x == "Enterprise":
        return 0

df["Tier"] = df["Tier"].apply(tier)

### One Hot Encoding

In [8]:
df = pd.get_dummies(df)

**Separate features and target variables**

In [9]:
# Features
features = df[df.columns.drop("Churn")]

# Target
target = df["Churn"]

### Logistic Regression Stats Model

In [10]:
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    target,
                                                    test_size = 0.2,
                                                    stratify = target,
                                                    random_state = 42)

In [11]:
# building the model and fitting the data
log_reg = sm.Logit(y_train, X_train).fit()

log_reg.summary()

         Current function value: 0.266427
         Iterations: 35




0,1,2,3
Dep. Variable:,Churn,No. Observations:,656.0
Model:,Logit,Df Residuals:,621.0
Method:,MLE,Df Model:,34.0
Date:,"Mon, 01 Nov 2021",Pseudo R-squ.:,0.2756
Time:,16:44:47,Log-Likelihood:,-174.78
converged:,False,LL-Null:,-241.26
Covariance Type:,nonrobust,LLR p-value:,1.22e-13

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Amount,-7.681e-05,3.23e-05,-2.379,0.017,-0.000,-1.35e-05
Contract.Length,0.0083,0.003,2.978,0.003,0.003,0.014
Has.Ever.Been.Poc,0.2414,0.432,0.559,0.576,-0.605,1.088
Tier,-0.3828,0.177,-2.162,0.031,-0.730,-0.036
Created.On.Platform,-0.1537,0.160,-0.962,0.336,-0.467,0.159
No.Of.Conversations.Q1.To.Q3,-0.0786,0.025,-3.173,0.002,-0.127,-0.030
Avg.Conversation.Rating.Q1.To.Q3,-0.5106,0.519,-0.984,0.325,-1.528,0.507
Avg.No.Of.Conversation.Parts.Q1.To.Q3,-0.0019,0.013,-0.143,0.886,-0.027,0.024
Created.Campaigns.Q1.To.Q3,0.0012,0.023,0.051,0.959,-0.044,0.046


### Select features which have P>|z| less than 0.05

In [12]:
X = df[["Amount", 
        "Contract.Length", 
        "Tier", 
        "No.Of.Conversations.Q1.To.Q3", 
        "Previous.No.Of.Deals"]]

y = df["Churn"]

### Apply SMOTE Method if target class is unballanced

In [13]:
df["Churn"].value_counts(normalize = True)

0    0.879415
1    0.120585
Name: Churn, dtype: float64

Target Class is highly unballanced so we apply SMOTE to balance the classes

### SMOTE

In [14]:
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1444 entries, 0 to 1443
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Amount                        1444 non-null   float64
 1   Contract.Length               1444 non-null   int64  
 2   Tier                          1444 non-null   int64  
 3   No.Of.Conversations.Q1.To.Q3  1444 non-null   int64  
 4   Previous.No.Of.Deals          1444 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 56.5 KB


In [16]:
X["Tier"] = X["Tier"].astype("category")

**Split Data into training set and testing set**

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    stratify = y,
                                                    random_state = 42)

# Logistic Regression

In [87]:
lr = LogisticRegression()

lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_lr}).sample(10)

Unnamed: 0,Actual,Predicted
783,1,1
718,0,0
803,0,0
1338,1,0
524,1,1
642,0,0
1167,1,1
348,0,1
70,1,1
738,0,0


**Logistic Regression Confusion Matrix**

In [88]:
pd.DataFrame(confusion_matrix(y_test, pred_lr))

Unnamed: 0,0,1
0,105,40
1,19,125


**Logistic Regression Classification Report**

In [89]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.85      0.72      0.78       145
           1       0.76      0.87      0.81       144

    accuracy                           0.80       289
   macro avg       0.80      0.80      0.79       289
weighted avg       0.80      0.80      0.79       289



### Logistic Regression with Tuned Parameters

In [90]:
params = {"C": [0.0001, 0.001, 0.01, 0.1, 0.5, 0.7, 0.9, 1]}

lr = LogisticRegression()

cv = GridSearchCV(lr, params)

cv.fit(X_train, y_train)

cv.best_params_

{'C': 0.7}

In [91]:
lr = LogisticRegression(C = 0.7)

lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_lr}).sample(10)

Unnamed: 0,Actual,Predicted
859,1,0
174,0,1
824,1,1
800,0,0
43,0,0
1002,1,1
1272,1,1
435,0,1
697,0,1
655,0,0


**Tuned Logistic Regression Confusion Matrix**

In [92]:
pd.DataFrame(confusion_matrix(y_test, pred_lr))

Unnamed: 0,0,1
0,105,40
1,19,125


**Tuned Logistic Regression Classification Report**

In [93]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.85      0.72      0.78       145
           1       0.76      0.87      0.81       144

    accuracy                           0.80       289
   macro avg       0.80      0.80      0.79       289
weighted avg       0.80      0.80      0.79       289



### Decision Tree Classification

In [95]:
dt = DecisionTreeClassifier()

dt.fit(X_train, y_train)

pred_dt = dt.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_dt}).sample(10)

Unnamed: 0,Actual,Predicted
1027,1,1
769,1,1
509,0,0
348,0,1
1419,1,1
267,0,0
1159,1,1
380,0,0
732,0,0
1156,1,1


**Decision Tree Confusion Matrix**

In [96]:
pd.DataFrame(confusion_matrix(y_test, pred_dt))

Unnamed: 0,0,1
0,121,24
1,21,123


**Decision Tree Classification Report**

In [98]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84       145
           1       0.84      0.85      0.85       144

    accuracy                           0.84       289
   macro avg       0.84      0.84      0.84       289
weighted avg       0.84      0.84      0.84       289



### Decision Tree Classification with Tuned Parameters

In [102]:
params = {"max_depth": list(range(1, 10)),
          "min_samples_leaf": [0.1, 0.2, 0.3, 0.4, 0.5]}

dt = DecisionTreeClassifier()

cv = GridSearchCV(dt, params)

cv.fit(X_train, y_train)

cv.best_params_

{'max_depth': 4, 'min_samples_leaf': 0.1}

In [103]:
dt = DecisionTreeClassifier(max_depth = 4,
                            min_samples_leaf = 0.1)

dt.fit(X_train, y_train)

pred_dt = dt.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_dt}).sample(10)

Unnamed: 0,Actual,Predicted
1382,1,1
239,0,0
510,0,0
422,0,1
331,1,1
1171,1,1
271,1,1
551,0,0
305,0,0
600,0,1


**Tuned Decision Tree Confusion Matrix**

In [104]:
pd.DataFrame(confusion_matrix(y_test, pred_dt))

Unnamed: 0,0,1
0,117,28
1,39,105


**Tuned Decision Tree Classification Report**

In [105]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.75      0.81      0.78       145
           1       0.79      0.73      0.76       144

    accuracy                           0.77       289
   macro avg       0.77      0.77      0.77       289
weighted avg       0.77      0.77      0.77       289



### Random Forest Classification

In [107]:
rf = RandomForestClassifier()

rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_rf}).sample(10)

Unnamed: 0,Actual,Predicted
228,0,0
774,0,0
905,1,1
904,1,1
503,0,0
600,0,1
1299,1,1
529,0,0
229,0,0
692,0,0


**Random Forest Confusion Matrix**

In [108]:
pd.DataFrame(confusion_matrix(y_test, pred_rf))

Unnamed: 0,0,1
0,123,22
1,11,133


**Random Forest Classification Report**

In [109]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.92      0.85      0.88       145
           1       0.86      0.92      0.89       144

    accuracy                           0.89       289
   macro avg       0.89      0.89      0.89       289
weighted avg       0.89      0.89      0.89       289



### Random Forest with Tuned Parameters

In [111]:
params = {"n_estimators": [50, 75, 100, 125, 150, 175, 200],
          "max_depth": list(range(1, 10)),
          "min_samples_leaf": [0.1, 0.2, 0.3, 0.4]}

rf = RandomForestClassifier()

cv = GridSearchCV(rf, params)

cv.fit(X_train, y_train)

cv.best_params_

{'max_depth': 4, 'min_samples_leaf': 0.1, 'n_estimators': 150}

In [112]:
rf = RandomForestClassifier(max_depth = 4, 
                            min_samples_leaf = 0.1, 
                            n_estimators = 150)

rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_rf}).sample(10)

Unnamed: 0,Actual,Predicted
815,0,1
35,0,1
633,0,0
421,0,0
995,1,1
548,1,1
732,0,0
150,0,0
1419,1,1
1067,1,1


**Tuned Random Forest Confusion Matrix**

In [113]:
pd.DataFrame(confusion_matrix(y_test, pred_rf))

Unnamed: 0,0,1
0,111,34
1,28,116


**Tuned Random Forest Classification Report**

In [115]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.80      0.77      0.78       145
           1       0.77      0.81      0.79       144

    accuracy                           0.79       289
   macro avg       0.79      0.79      0.79       289
weighted avg       0.79      0.79      0.79       289



### K Nearest Neighbors

In [116]:
knn = KNeighborsClassifier()

knn.fit(X_train, y_train)

pred_knn = knn.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_knn}).sample(10)

Unnamed: 0,Actual,Predicted
1382,1,1
1143,1,0
518,0,0
634,0,0
839,1,1
799,1,1
1357,1,1
1058,1,1
1395,1,1
732,0,1


**K Neighbors Classification Confusion Matrix**

In [117]:
pd.DataFrame(confusion_matrix(y_test, pred_knn))

Unnamed: 0,0,1
0,114,31
1,30,114


**K Nearest Neighbors Classification Report**

In [118]:
print(classification_report(y_test, pred_knn))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79       145
           1       0.79      0.79      0.79       144

    accuracy                           0.79       289
   macro avg       0.79      0.79      0.79       289
weighted avg       0.79      0.79      0.79       289



### Tuned K Nearest Neighbors 

In [119]:
params = {"n_neighbors": list(range(1, 10))}

knn = KNeighborsClassifier()

cv = GridSearchCV(knn, params)

cv.fit(X_train, y_train)

cv.best_params_

{'n_neighbors': 1}

In [120]:
knn = KNeighborsClassifier(n_neighbors = 1)

knn.fit(X_train, y_train)

pred_knn = knn.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_knn}).sample(10)

Unnamed: 0,Actual,Predicted
503,0,0
420,1,1
140,0,0
750,0,0
297,1,0
687,1,1
1406,1,1
319,0,0
957,1,1
430,0,0


**Tuned K Neighbors Classification Confusion Matrix**

In [121]:
pd.DataFrame(confusion_matrix(y_test, pred_knn))

Unnamed: 0,0,1
0,126,19
1,26,118


**Tuned K Nearest Neighbors Classification Report**

In [122]:
print(classification_report(y_test, pred_knn))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       145
           1       0.86      0.82      0.84       144

    accuracy                           0.84       289
   macro avg       0.85      0.84      0.84       289
weighted avg       0.85      0.84      0.84       289



### Support Vector Classification

In [124]:
svm = SVC()

svm.fit(X_train, y_train)

pred_svm = svm.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_svm}).sample(10)

Unnamed: 0,Actual,Predicted
995,1,1
1375,1,1
1170,1,1
832,1,1
482,1,1
933,1,1
328,0,0
1337,1,0
950,1,1
701,0,1


**Support Vector Classification Confusion Matrix**

In [125]:
pd.DataFrame(confusion_matrix(y_test, pred_svm))

Unnamed: 0,0,1
0,64,81
1,41,103


**Support Vector Classification Classification Report**

In [126]:
print(classification_report(y_test, pred_svm))

              precision    recall  f1-score   support

           0       0.61      0.44      0.51       145
           1       0.56      0.72      0.63       144

    accuracy                           0.58       289
   macro avg       0.58      0.58      0.57       289
weighted avg       0.58      0.58      0.57       289



### Tuned Support Vector Classification

In [20]:
params = {"C": [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}

svm = SVC()

cv = GridSearchCV(svm, params)

cv.fit(X_train, y_train)

cv.best_params_

{'C': 0.01}

In [21]:
svm = SVC(C = 0.01)

svm.fit(X_train, y_train)

pred_svm = svm.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_svm}).sample(10)

Unnamed: 0,Actual,Predicted
1255,1,1
383,1,1
1417,1,1
503,0,1
1258,1,1
565,1,1
994,1,1
228,0,1
1301,1,1
100,0,1


**Tuned Support Vector Classification Confusion Matrix**

In [22]:
pd.DataFrame(confusion_matrix(y_test, pred_svm))

Unnamed: 0,0,1
0,37,108
1,17,127


**Tuned Support Vector Classification Classification Report**

In [23]:
print(classification_report(y_test, pred_svm))

              precision    recall  f1-score   support

           0       0.69      0.26      0.37       145
           1       0.54      0.88      0.67       144

    accuracy                           0.57       289
   macro avg       0.61      0.57      0.52       289
weighted avg       0.61      0.57      0.52       289



### Adaboost Classification

In [25]:
adbc = AdaBoostClassifier()

adbc.fit(X_train, y_train)

pred_adbc = adbc.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_adbc}).sample(10)

Unnamed: 0,Actual,Predicted
13,0,0
555,0,0
406,0,0
655,0,0
532,0,0
435,0,0
86,0,1
978,1,1
634,0,0
1067,1,1


**AdaBoost Classification Confusion Matrix**

In [26]:
pd.DataFrame(confusion_matrix(y_test, pred_adbc))

Unnamed: 0,0,1
0,116,29
1,23,121


**AdaBoost Classifier Classification Report**

In [27]:
print(classification_report(y_test, pred_adbc))

              precision    recall  f1-score   support

           0       0.83      0.80      0.82       145
           1       0.81      0.84      0.82       144

    accuracy                           0.82       289
   macro avg       0.82      0.82      0.82       289
weighted avg       0.82      0.82      0.82       289



### Tuned Adaboost Classification

In [28]:
params = {"n_estimators": [50, 75, 100, 125, 150, 175, 200],
          "learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.5]}

adbc = AdaBoostClassifier()

cv = GridSearchCV(adbc, params)

cv.fit(X_train, y_train)

cv.best_params_

{'learning_rate': 0.5, 'n_estimators': 100}

In [29]:
adbc = AdaBoostClassifier(n_estimators = 100,
                          learning_rate = 0.5)

adbc.fit(X_train, y_train)

pred_adbc = adbc.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_adbc}).sample(10)

Unnamed: 0,Actual,Predicted
1084,1,1
229,0,0
526,1,1
1187,1,1
994,1,1
90,1,1
842,1,1
1299,1,1
800,0,0
719,1,1


**Tuned Adaboost Classification Confusion Matrix**

In [30]:
pd.DataFrame(confusion_matrix(y_test, pred_adbc))

Unnamed: 0,0,1
0,118,27
1,23,121


**Tuned Adaboost Classifier Classification Report**

In [31]:
print(classification_report(y_test, pred_adbc))

              precision    recall  f1-score   support

           0       0.84      0.81      0.83       145
           1       0.82      0.84      0.83       144

    accuracy                           0.83       289
   macro avg       0.83      0.83      0.83       289
weighted avg       0.83      0.83      0.83       289



### Gradient Boosting Classification

In [33]:
gbc = GradientBoostingClassifier()

gbc.fit(X_train, y_train)

pred_gbc = gbc.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_gbc}).sample(10)

Unnamed: 0,Actual,Predicted
644,0,0
554,0,1
638,0,0
117,0,0
830,1,1
952,1,1
14,0,0
1247,1,1
1241,1,1
1338,1,1


**Gradient Boosting Classifier Confusion Matrix**

In [34]:
pd.DataFrame(confusion_matrix(y_test, pred_gbc))

Unnamed: 0,0,1
0,120,25
1,11,133


**Gradient Boosting Classifier Classification Report**

In [35]:
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       0.92      0.83      0.87       145
           1       0.84      0.92      0.88       144

    accuracy                           0.88       289
   macro avg       0.88      0.88      0.88       289
weighted avg       0.88      0.88      0.88       289



### Tuned Gradient Boosting Classification

In [37]:
params = {"learning_rate": [0.0001, 0.001, 0.01],
          "n_estimators": [75, 150 , 200],
          "min_samples_leaf": [0.1, 0.2, 0.3, 0.4],
          "max_depth": [4, 6, 8]}

gbc = GradientBoostingClassifier()

cv = GridSearchCV(gbc, params)

cv.fit(X_train, y_train)

cv.best_params_

{'learning_rate': 0.01,
 'max_depth': 4,
 'min_samples_leaf': 0.1,
 'n_estimators': 200}

In [39]:
gbr = GradientBoostingClassifier(learning_rate = 0.01,
                                 max_depth = 4,
                                 min_samples_leaf = 0.1,
                                 n_estimators = 200)

gbr.fit(X_train, y_train)

pred_gr = gbr.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_gr}).sample(10)

Unnamed: 0,Actual,Predicted
331,1,1
1195,1,1
532,0,1
33,1,1
430,0,0
1159,1,1
928,1,1
722,0,1
774,0,0
421,0,1


**Tuned Gradient Boosting Classifier Confusion Matrix**

In [40]:
pd.DataFrame(confusion_matrix(y_test, pred_gr))

Unnamed: 0,0,1
0,112,33
1,18,126


**Tuned Gradient Boosting Classifier Classification Report**

In [41]:
print(classification_report(y_test, pred_gr))

              precision    recall  f1-score   support

           0       0.86      0.77      0.81       145
           1       0.79      0.88      0.83       144

    accuracy                           0.82       289
   macro avg       0.83      0.82      0.82       289
weighted avg       0.83      0.82      0.82       289



### Extra Trees Classifier

In [44]:
etc = ExtraTreesClassifier()

etc.fit(X_train, y_train)

pred_etc = etc.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_etc}).sample(10)

Unnamed: 0,Actual,Predicted
1272,1,1
913,1,1
567,0,1
797,0,1
420,1,1
639,0,1
548,1,1
652,0,0
1424,1,1
1110,1,1


### Extra Trees Classifier Confusion Matrix

In [45]:
pd.DataFrame(confusion_matrix(y_test, pred_etc))

Unnamed: 0,0,1
0,117,28
1,13,131


### Extra Trees Classifier Classification Report

In [46]:
print(classification_report(y_test, pred_etc))

              precision    recall  f1-score   support

           0       0.90      0.81      0.85       145
           1       0.82      0.91      0.86       144

    accuracy                           0.86       289
   macro avg       0.86      0.86      0.86       289
weighted avg       0.86      0.86      0.86       289



### Tuned Extra Trees Classifier

In [47]:
params = {"n_estimators": [50, 100, 150, 200],
          "max_depth": [3, 4, 5, 6],
          "min_samples_leaf": [0.1, 0.2, 0.3, 0.4]}

etc = ExtraTreesClassifier()

cv = GridSearchCV(etc, params)

cv.fit(X_train, y_train)

cv.best_params_

{'max_depth': 6, 'min_samples_leaf': 0.1, 'n_estimators': 100}

In [48]:
etc = ExtraTreesClassifier(max_depth = 6, min_samples_leaf = 0.1, n_estimators = 100)

etc.fit(X_train, y_train)

pred_etc = etc.predict(X_test)
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_etc}).sample(10)

Unnamed: 0,Actual,Predicted
859,1,1
797,0,0
1294,1,1
1234,1,1
441,0,0
504,0,1
867,1,0
1350,1,1
1067,1,1
952,1,1


**Tuned Extra Trees Classifier Confusion Matrix**

In [49]:
pd.DataFrame(confusion_matrix(y_test, pred_etc))

Unnamed: 0,0,1
0,106,39
1,18,126


**Tuned Extra Trees Classifier Classification Report**

In [50]:
print(classification_report(y_test, pred_etc))

              precision    recall  f1-score   support

           0       0.85      0.73      0.79       145
           1       0.76      0.88      0.82       144

    accuracy                           0.80       289
   macro avg       0.81      0.80      0.80       289
weighted avg       0.81      0.80      0.80       289

