In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
%matplotlib inline
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import os

print(os.getcwd())

print(os.listdir(os.getcwd()))

C:\Users\jlim7\OneDrive\Documents\GitHub\Capstone2
['.git', '.gitignore', '.ipynb_checkpoints', '42343_72434_bundle_archive', '42343_72434_bundle_archive.zip', 'Classification Models.ipynb', 'Initial EDA.ipynb', 'LICENSE', 'README.md', 'WA_Fn-UseC_-Telco-Customer-Churn.xlsx']


<IPython.core.display.Javascript object>

In [44]:
def print_vif(x):
    """Utility for checking multicollinearity assumption
    
    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned the VIFs are printed as a pandas series
    """
    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    vifs = []
    for i in range(x.shape[1]):
        vif = variance_inflation_factor(np.array(x), i)
        vifs.append(vif)

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

# Model Preparation

In [4]:
pd.set_option("display.max_columns", None)
churn = pd.read_excel("WA_Fn-UseC_-Telco-Customer-Churn.xlsx")
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


<IPython.core.display.Javascript object>

In [5]:
churn = pd.get_dummies(churn, columns=["Churn"], drop_first=True)
churn = churn.drop(columns=["customerID", "TotalCharges"])

<IPython.core.display.Javascript object>

In [6]:
bin_cols = [
    "SeniorCitizen",
]
cat_cols = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]
drop_cats = [
    "Male",
    "No",
    "No",
    "Yes",
    "No",
    "Fiber optic",
    "No",
    "No",
    "No",
    "No",
    "No",
    "No",
    "Month-to-month",
    "Yes",
    "Electronic check",
]
num_cols = ["tenure", "MonthlyCharges"]

<IPython.core.display.Javascript object>

In [7]:
X = churn.drop(columns=["Churn_Yes"])
y = churn["Churn_Yes"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=13, stratify=y
)

<IPython.core.display.Javascript object>

In [8]:
preprocessing = ColumnTransformer(
    [
        ("scale", StandardScaler(), num_cols),
        ("one_hot", OneHotEncoder(drop=drop_cats), cat_cols),
    ],
    remainder="passthrough",
)

<IPython.core.display.Javascript object>

# K Nearest Neighbor Classifier

In [9]:
pipeline = Pipeline([("preprocessing", preprocessing), ("knn", KNeighborsClassifier())])

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['tenure', 'MonthlyCharges']),
                                                 ('one_hot',
                                                  OneHotEncoder(categories='auto',
                                                                drop=['Male',
                                                                      'No',
                                                                      'No',
 

<IPython.core.display.Javascript object>

In [10]:
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.8358182463613774
Test score: 0.7771469127040455


<IPython.core.display.Javascript object>

In [11]:
y_pred = pipeline.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[890 145]
 [169 205]]
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1035
           1       0.59      0.55      0.57       374

    accuracy                           0.78      1409
   macro avg       0.71      0.70      0.71      1409
weighted avg       0.77      0.78      0.77      1409



<IPython.core.display.Javascript object>

#### Add a grid search to optimize KNN model

In [12]:
grid = {
    "knn__n_neighbors": [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
    "knn__weights": ["distance", "uniform"],
}

model = GridSearchCV(pipeline, grid, cv=2)
model.fit(X_train, y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('scale',
                                                                         StandardScaler(copy=True,
                                                                                        with_mean=True,
                                                                                        with_std=True),
                                                                         ['tenure',
                                                                       

<IPython.core.display.Javascript object>

In [13]:
model.best_params_

{'knn__n_neighbors': 21, 'knn__weights': 'uniform'}

<IPython.core.display.Javascript object>

In [14]:
model.score(X_train, y_train)

0.811324103656372

<IPython.core.display.Javascript object>

In [15]:
model.score(X_test, y_test)

0.7991483321504613

<IPython.core.display.Javascript object>

In [16]:
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[890 145]
 [169 205]]
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1035
           1       0.59      0.55      0.57       374

    accuracy                           0.78      1409
   macro avg       0.71      0.70      0.71      1409
weighted avg       0.77      0.78      0.77      1409



<IPython.core.display.Javascript object>

# Support Vector Classifier

In [17]:
pipeline = Pipeline([("preprocessing", preprocessing), ("SVC", SVC())])

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['tenure', 'MonthlyCharges']),
                                                 ('one_hot',
                                                  OneHotEncoder(categories='auto',
                                                                drop=['Male',
                                                                      'No',
                                                                      'No',
 

<IPython.core.display.Javascript object>

In [18]:
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.8134540291089812
Test score: 0.8161816891412349


<IPython.core.display.Javascript object>

In [19]:
y_pred = pipeline.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[964  71]
 [188 186]]
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      1035
           1       0.72      0.50      0.59       374

    accuracy                           0.82      1409
   macro avg       0.78      0.71      0.74      1409
weighted avg       0.81      0.82      0.80      1409



<IPython.core.display.Javascript object>

#### Add a grid search to optimize SVC model

In [20]:
grid = {
    "SVC__kernel": ["linear", "rbf", "poly"],
    "SVC__C": [1, 10, 100, 1000],
    "SVC__degree": [3, 5, 7, 10],
}

model = GridSearchCV(pipeline, grid, cv=2)
model.fit(X_train, y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('scale',
                                                                         StandardScaler(copy=True,
                                                                                        with_mean=True,
                                                                                        with_std=True),
                                                                         ['tenure',
                                                                       

<IPython.core.display.Javascript object>

In [21]:
model.best_params_

{'SVC__C': 1, 'SVC__degree': 3, 'SVC__kernel': 'rbf'}

<IPython.core.display.Javascript object>

In [22]:
model.score(X_train, y_train)

0.8134540291089812

<IPython.core.display.Javascript object>

In [23]:
model.score(X_test, y_test)

0.8161816891412349

<IPython.core.display.Javascript object>

In [24]:
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[964  71]
 [188 186]]
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      1035
           1       0.72      0.50      0.59       374

    accuracy                           0.82      1409
   macro avg       0.78      0.71      0.74      1409
weighted avg       0.81      0.82      0.80      1409



<IPython.core.display.Javascript object>

# Random Forest Classifier

In [25]:
pipeline = Pipeline(
    [("preprocessing", preprocessing), ("rf", RandomForestClassifier())]
)

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['tenure', 'MonthlyCharges']),
                                                 ('one_hot',
                                                  OneHotEncoder(categories='auto',
                                                                drop=['Male',
                                                                      'No',
                                                                      'No',
 

<IPython.core.display.Javascript object>

In [26]:
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.9971600993965212
Test score: 0.7991483321504613


<IPython.core.display.Javascript object>

In [27]:
y_pred = pipeline.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[938  97]
 [186 188]]
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1035
           1       0.66      0.50      0.57       374

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409



<IPython.core.display.Javascript object>

#### Add a grid search to optimize Random Forest model

In [28]:
grid = {
    "rf__max_depth": [3, 5, 7, 10, 15],
    "rf__n_estimators": [1, 10, 100, 1000, 10000],
    "rf__min_samples_leaf": [1, 3, 5, 7, 10],
    "rf__criterion": ["gini", "entropy"],
}

model = GridSearchCV(pipeline, grid, cv=2, n_jobs=-1)
model.fit(X_train, y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('scale',
                                                                         StandardScaler(copy=True,
                                                                                        with_mean=True,
                                                                                        with_std=True),
                                                                         ['tenure',
                                                                       

<IPython.core.display.Javascript object>

In [29]:
model.best_params_

{'rf__criterion': 'entropy',
 'rf__max_depth': 15,
 'rf__min_samples_leaf': 7,
 'rf__n_estimators': 1000}

<IPython.core.display.Javascript object>

In [30]:
model.score(X_train, y_train)

0.8363507277245297

<IPython.core.display.Javascript object>

In [31]:
model.score(X_test, y_test)

0.8069552874378992

<IPython.core.display.Javascript object>

In [42]:
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[940  95]
 [185 189]]
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.67      0.51      0.57       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



<IPython.core.display.Javascript object>

# Gradient Boosted Classifier

In [33]:
pipeline = Pipeline([("preprocessing", preprocessing), ("xgb", XGBClassifier())])

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['tenure', 'MonthlyCharges']),
                                                 ('one_hot',
                                                  OneHotEncoder(categories='auto',
                                                                drop=['Male',
                                                                      'No',
                                                                      'No',
 

<IPython.core.display.Javascript object>

In [34]:
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.9290024849130281
Test score: 0.7970191625266146


<IPython.core.display.Javascript object>

In [35]:
y_pred = pipeline.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[923 112]
 [174 200]]
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.64      0.53      0.58       374

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



<IPython.core.display.Javascript object>

#### Add a grid search to optimize XGB model

In [36]:
grid = {
    "xgb__subsample": [0.5, 0.75, 1.0],
    "xgb__colsample_bytree": [0.5, 0.75, 1.0],
    "xgb__max_depth": [5, 7, 10],
    "xgb__n_trees": [100, 150, 200],
    "xgb__learning_rate": [0.2, 0.4, 0.6, 0.8, 1],
}
model = GridSearchCV(pipeline, grid, cv=2, n_jobs=-1)
model.fit(X_train, y_train)



Parameters: { n_trees } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('scale',
                                                                         StandardScaler(copy=True,
                                                                                        with_mean=True,
                                                                                        with_std=True),
                                                                         ['tenure',
                                                                       

<IPython.core.display.Javascript object>

In [37]:
model.best_params_

{'xgb__colsample_bytree': 0.75,
 'xgb__learning_rate': 0.2,
 'xgb__max_depth': 5,
 'xgb__n_trees': 100,
 'xgb__subsample': 1.0}

<IPython.core.display.Javascript object>

In [38]:
model.score(X_train, y_train)

0.8635072772452964

<IPython.core.display.Javascript object>

In [39]:
model.score(X_test, y_test)

0.801277501774308

<IPython.core.display.Javascript object>

In [41]:
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[940  95]
 [185 189]]
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.67      0.51      0.57       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



<IPython.core.display.Javascript object>