In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
%matplotlib inline
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

from sklearn.decomposition import PCA

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import os

print(os.getcwd())

print(os.listdir(os.getcwd()))

C:\Users\jlim7\OneDrive\Documents\GitHub\Capstone2
['.git', '.gitignore', '.ipynb_checkpoints', '42343_72434_bundle_archive', '42343_72434_bundle_archive.zip', 'Classification Models with PCA.ipynb', 'Classification Models with SelectKBest.ipynb', 'Initial Classification Models.ipynb', 'Initial EDA.ipynb', 'LICENSE', 'README.md', 'WA_Fn-UseC_-Telco-Customer-Churn.xlsx']


<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    """Utility for checking multicollinearity assumption
    
    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned the VIFs are printed as a pandas series
    """
    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    vifs = []
    for i in range(x.shape[1]):
        vif = variance_inflation_factor(np.array(x), i)
        vifs.append(vif)

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
pd.set_option("display.max_columns", None)
churn = pd.read_excel("WA_Fn-UseC_-Telco-Customer-Churn.xlsx")
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


<IPython.core.display.Javascript object>

In [5]:
churn = pd.get_dummies(churn, columns=["Churn"], drop_first=True)
churn = churn.drop(columns=["customerID", "TotalCharges"])

<IPython.core.display.Javascript object>

In [6]:
X = churn.drop(columns=["Churn_Yes"])
y = churn["Churn_Yes"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=13, stratify=y
)

<IPython.core.display.Javascript object>

In [7]:
churn["OnlineSecurity"] = churn["OnlineSecurity"] == "Yes"
churn["OnlineBackup"] = churn["OnlineBackup"] == "Yes"
churn["DeviceProtection"] = churn["DeviceProtection"] == "Yes"
churn["TechSupport"] = churn["TechSupport"] == "Yes"
churn["StreamingTV"] = churn["StreamingTV"] == "Yes"
churn["StreamingMovies"] = churn["StreamingMovies"] == "Yes"

<IPython.core.display.Javascript object>

In [8]:
col = [
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]

for c in col:
    churn[col] = churn[col].astype(int)

<IPython.core.display.Javascript object>

In [9]:
bin_cols = [
    "SeniorCitizen",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]
cat_cols = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]
drop_cats = [
    "Male",
    "No",
    "No",
    "Yes",
    "No",
    "Fiber optic",
    "Month-to-month",
    "Yes",
    "Electronic check",
]
num_cols = ["tenure", "MonthlyCharges"]

<IPython.core.display.Javascript object>

In [10]:
churn = pd.get_dummies(churn, columns=cat_cols, drop_first=True)

<IPython.core.display.Javascript object>

In [11]:
col = churn.drop(columns=["Churn_Yes"])

<IPython.core.display.Javascript object>

In [12]:
X = churn.drop(columns=["Churn_Yes"])
y = churn["Churn_Yes"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=13, stratify=y
)

scaler = StandardScaler()
pca = PCA()

<IPython.core.display.Javascript object>

# K Nearest Neighbor Classifier with PCA

In [13]:
pipeline = Pipeline(
    [("scaler", StandardScaler()), ("pca", PCA()), ("knn", KNeighborsClassifier()),]
)
pipeline = pipeline.fit(X_train, y_train)

<IPython.core.display.Javascript object>

In [14]:
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.8301384451544196
Test score: 0.7707594038325053


<IPython.core.display.Javascript object>

In [15]:
y_pred = pipeline.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual No Churn", "Actual Churn"],
    columns=["Predicted No Churn", "Predicted Churn"],
)
print(con_mat)
print(classification_report(y_test, y_pred))

                 Predicted No Churn  Predicted Churn
Actual No Churn                 887              148
Actual Churn                    175              199
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1035
           1       0.57      0.53      0.55       374

    accuracy                           0.77      1409
   macro avg       0.70      0.69      0.70      1409
weighted avg       0.77      0.77      0.77      1409



<IPython.core.display.Javascript object>

Adding a grid search to KNN

In [16]:
grid = {
    "pca__n_components": [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
    "knn__n_neighbors": [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
    "knn__weights": ["distance", "uniform"],
}

model = GridSearchCV(pipeline, grid, cv=2)
model.fit(X_train, y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('knn',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkows

<IPython.core.display.Javascript object>

In [17]:
model.best_params_

{'knn__n_neighbors': 19, 'knn__weights': 'uniform', 'pca__n_components': 11}

<IPython.core.display.Javascript object>

In [18]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.8024494142705005
Test score: 0.7828246983676366


<IPython.core.display.Javascript object>

In [19]:
y_pred = model.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual No Churn", "Actual Churn"],
    columns=["Predicted No Churn", "Predicted Churn"],
)
print(con_mat)
print(classification_report(y_test, y_pred))

                 Predicted No Churn  Predicted Churn
Actual No Churn                 925              110
Actual Churn                    196              178
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.62      0.48      0.54       374

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.70      1409
weighted avg       0.77      0.78      0.77      1409



<IPython.core.display.Javascript object>

# Support Vector Classifier

In [20]:
pipeline = Pipeline([("scaler", StandardScaler()), ("pca", PCA()), ("svc", SVC()),])
pipeline = pipeline.fit(X_train, y_train)

<IPython.core.display.Javascript object>

In [21]:
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.820021299254526
Test score: 0.8133427963094393


<IPython.core.display.Javascript object>

In [22]:
y_pred = pipeline.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual No Churn", "Actual Churn"],
    columns=["Predicted No Churn", "Predicted Churn"],
)
print(con_mat)
print(classification_report(y_test, y_pred))

                 Predicted No Churn  Predicted Churn
Actual No Churn                 955               80
Actual Churn                    183              191
              precision    recall  f1-score   support

           0       0.84      0.92      0.88      1035
           1       0.70      0.51      0.59       374

    accuracy                           0.81      1409
   macro avg       0.77      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409



<IPython.core.display.Javascript object>

Adding a grid search to SVC

In [23]:
grid = {
    "pca__n_components": [3, 5, 7, 9, 11, 13, 15, 17],
    "svc__C": [1, 10, 100, 500],
    "svc__kernel": ["linear", "rbf", "poly"],
    "svc__degree": [1, 2, 3],
}

model = GridSearchCV(pipeline, grid, cv=2)
model.fit(X_train, y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('svc',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                           

<IPython.core.display.Javascript object>

In [24]:
model.best_params_

{'pca__n_components': 15,
 'svc__C': 1,
 'svc__degree': 1,
 'svc__kernel': 'linear'}

<IPython.core.display.Javascript object>

In [25]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.7976570820021299
Test score: 0.7991483321504613


<IPython.core.display.Javascript object>

In [26]:
y_pred = model.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual No Churn", "Actual Churn"],
    columns=["Predicted No Churn", "Predicted Churn"],
)
print(con_mat)
print(classification_report(y_test, y_pred))

                 Predicted No Churn  Predicted Churn
Actual No Churn                 912              123
Actual Churn                    160              214
              precision    recall  f1-score   support

           0       0.85      0.88      0.87      1035
           1       0.64      0.57      0.60       374

    accuracy                           0.80      1409
   macro avg       0.74      0.73      0.73      1409
weighted avg       0.79      0.80      0.80      1409



<IPython.core.display.Javascript object>

# Random Forest Classifier

In [27]:
pipeline = Pipeline(
    [("scaler", StandardScaler()), ("pca", PCA()), ("rf", RandomForestClassifier()),]
)
pipeline = pipeline.fit(X_train, y_train)

<IPython.core.display.Javascript object>

In [28]:
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.9973375931842385
Test score: 0.801277501774308


<IPython.core.display.Javascript object>

In [34]:
y_pred = pipeline.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual No Churn", "Actual Churn"],
    columns=["Predicted No Churn", "Predicted Churn"],
)
print(con_mat)
print(classification_report(y_test, y_pred))

                 Predicted No Churn  Predicted Churn
Actual No Churn                 952               83
Actual Churn                    197              177
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1035
           1       0.68      0.47      0.56       374

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409



<IPython.core.display.Javascript object>

Add grid search to RF

In [37]:
grid = {
    "pca__n_components": [7, 9, 11, 13, 15, 17],
    "rf__max_depth": [3, 5, 7, 10, 15],
    "rf__min_samples_leaf": [1, 3, 5],
    "rf__criterion": ["gini", "entropy"],
}

model = GridSearchCV(pipeline, grid)
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_we

<IPython.core.display.Javascript object>

In [38]:
model.best_params_

{'pca__n_components': 17,
 'rf__criterion': 'entropy',
 'rf__max_depth': 10,
 'rf__min_samples_leaf': 5}

<IPython.core.display.Javascript object>

In [39]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.8627973020944267
Test score: 0.8055358410220014


<IPython.core.display.Javascript object>

In [40]:
y_pred = model.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual No Churn", "Actual Churn"],
    columns=["Predicted No Churn", "Predicted Churn"],
)
print(con_mat)
print(classification_report(y_test, y_pred))

                 Predicted No Churn  Predicted Churn
Actual No Churn                 948               87
Actual Churn                    187              187
              precision    recall  f1-score   support

           0       0.84      0.92      0.87      1035
           1       0.68      0.50      0.58       374

    accuracy                           0.81      1409
   macro avg       0.76      0.71      0.73      1409
weighted avg       0.79      0.81      0.80      1409



<IPython.core.display.Javascript object>

# Gradient Boosted Classifier

In [41]:
pipeline = Pipeline(
    [("scaler", StandardScaler()), ("pca", PCA()), ("xgb", XGBClassifier()),]
)
pipeline = pipeline.fit(X_train, y_train)

<IPython.core.display.Javascript object>

In [42]:
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.9483493077742279
Test score: 0.7778566359119943


<IPython.core.display.Javascript object>

In [43]:
y_pred = pipeline.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual No Churn", "Actual Churn"],
    columns=["Predicted No Churn", "Predicted Churn"],
)
print(con_mat)
print(classification_report(y_test, y_pred))

                 Predicted No Churn  Predicted Churn
Actual No Churn                 880              155
Actual Churn                    158              216
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1035
           1       0.58      0.58      0.58       374

    accuracy                           0.78      1409
   macro avg       0.71      0.71      0.71      1409
weighted avg       0.78      0.78      0.78      1409



<IPython.core.display.Javascript object>

Add grid search to XBG

In [44]:
grid = {
    "xgb__subsample": [0.5, 0.75, 1.0],
    "xgb__colsample_bytree": [0.5, 0.75, 1.0],
    "xgb__max_depth": [5, 7, 10],
}
model = GridSearchCV(pipeline, grid)
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('xgb',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
          

<IPython.core.display.Javascript object>

In [45]:
model.best_params_

{'xgb__colsample_bytree': 0.5, 'xgb__max_depth': 10, 'xgb__subsample': 0.75}

<IPython.core.display.Javascript object>

In [46]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.9831380901668442
Test score: 0.7877927608232789


<IPython.core.display.Javascript object>

In [47]:
y_pred = model.predict(X_test)

con_mat = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual No Churn", "Actual Churn"],
    columns=["Predicted No Churn", "Predicted Churn"],
)
print(con_mat)
print(classification_report(y_test, y_pred))

                 Predicted No Churn  Predicted Churn
Actual No Churn                 917              118
Actual Churn                    181              193
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1035
           1       0.62      0.52      0.56       374

    accuracy                           0.79      1409
   macro avg       0.73      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409



<IPython.core.display.Javascript object>