In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix
import graphviz
from graphviz import Graph
import prepare

In [2]:
# Prep Data
# prep_telco() lives in the project-local `prepare` module; whatever it returns is
# bound to `df`, which the next cells preview and then split.
df = prepare.prep_telco()

In [3]:
# Preview the first five rows of the prepared frame (head() defaults to 5).
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,streaming_movies_yes,paperless_billing_yes,churn_yes,contract_type_one year,contract_type_two year,internet_service_type_fiber optic,internet_service_type_none,payment_type_credit card (automatic),payment_type_electronic check,payment_type_mailed check
0,0016-QLJIS,Female,0,Yes,Yes,65,Yes,Yes,Yes,Yes,...,1,1,0,0,1,0,0,0,0,1
1,0017-DINOC,Male,0,No,No,54,No,No phone service,Yes,No,...,0,0,0,0,1,0,0,1,0,0
2,0019-GFNTW,Female,0,No,No,56,No,No phone service,Yes,Yes,...,0,0,0,0,1,0,0,0,0,0
3,0056-EPFBG,Male,0,Yes,Yes,20,No,No phone service,Yes,No,...,0,1,0,0,1,0,0,1,0,0
4,0078-XZMHT,Male,0,Yes,No,72,Yes,Yes,No,Yes,...,1,1,0,0,1,0,0,0,0,0


In [9]:
# List every column label in df as a plain Python list.
# NOTE(review): the label 'senior_citizen' appears twice in this listing, so any
# label-based selection or drop on it acts on both columns — confirm whether the
# duplicate produced upstream is intended.
df.columns.to_list()

['customer_id',
 'gender',
 'senior_citizen',
 'partner',
 'dependents',
 'tenure',
 'phone_service',
 'multiple_lines',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'paperless_billing',
 'monthly_charges',
 'total_charges',
 'churn',
 'contract_type',
 'internet_service_type',
 'payment_type',
 'senior_citizen',
 'gender_male',
 'partner_yes',
 'dependents_yes',
 'phone_service_yes',
 'multiple_lines_no phone service',
 'multiple_lines_yes',
 'online_security_no internet service',
 'online_security_yes',
 'online_backup_no internet service',
 'online_backup_yes',
 'device_protection_no internet service',
 'device_protection_yes',
 'tech_support_no internet service',
 'tech_support_yes',
 'streaming_tv_no internet service',
 'streaming_tv_yes',
 'streaming_movies_no internet service',
 'streaming_movies_yes',
 'paperless_billing_yes',
 'churn_yes',
 'contract_type_one year',
 'contract_type_two year',
 'internet_ser

In [6]:
# Split Data
# prepare.split_telco(df) yields three frames, unpacked here as train / validate / test;
# the last line displays their (rows, columns) shapes in that order.
train, validate, test = prepare.split_telco(df)
tuple(part.shape for part in (train, validate, test))

((3943, 49), (1691, 49), (1409, 49))

In [7]:
# Labels removed from every split before modeling (same entries, same order).
# NOTE(review): 'senior_citizen' and 'tenure' have no encoded counterpart among the
# dummy columns listed by df.columns, so dropping them removes those features from
# the models entirely — confirm that is intended.
cols_drop = [
    "customer_id", "gender", "senior_citizen", "partner", "dependents",
    "tenure", "phone_service", "multiple_lines",
    "online_security", "online_backup", "device_protection", "tech_support",
    "streaming_tv", "streaming_movies",
    "paperless_billing", "churn", "contract_type", "internet_service_type",
    "payment_type",
]

In [8]:
# Remove the cols_drop labels from all three splits, then display the new shapes.
# Each name is rebound to its trimmed frame, so running this cell a second time
# raises a KeyError (the labels are already gone) — re-run the split cell first.
train, validate, test = (
    split.drop(columns=cols_drop) for split in (train, validate, test)
)
train.shape, validate.shape, test.shape

((3943, 29), (1691, 29), (1409, 29))

In [10]:
# For each split, separate the features (X_*: every column except churn_yes)
# from the target (y_*: the churn_yes series).
X_train, y_train = train.drop(columns="churn_yes"), train["churn_yes"]
X_validate, y_validate = validate.drop(columns="churn_yes"), validate["churn_yes"]
X_test, y_test = test.drop(columns="churn_yes"), test["churn_yes"]

# <font color='red'>Use Decision Tree Modeling</font>

In [23]:
# Sweep max_depth from 1 through 49 and record accuracy on train (in-sample)
# versus validate (out-of-sample) for each tree.
metrics = []

for depth in range(1, 50):
    # Build the tree with a fixed seed and fit it on the training split only
    clf = DecisionTreeClassifier(max_depth=depth, random_state=123).fit(X_train, y_train)

    # Score train first, then validate, and keep both alongside the depth
    metrics.append({
        "max_depth": depth,
        "train_accuracy": clf.score(X_train, y_train),
        "validate_accuracy": clf.score(X_validate, y_validate),
    })

# One row per depth; `difference` is train accuracy minus validate accuracy.
# Note that this rebinds `df`, which held the prepared telco frame until now.
df = pd.DataFrame(metrics).assign(
    difference=lambda scores: scores.train_accuracy - scores.validate_accuracy
)
df.head()

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,1,0.73472,0.734477,0.000243
1,2,0.781385,0.785925,-0.004541
2,3,0.781385,0.785925,-0.004541
3,4,0.791529,0.780603,0.010926
4,5,0.800913,0.784743,0.01617


In [24]:
# Row label of the tree whose train and validate accuracy are closest together.
# The label is the 0-based row position, so the matching max_depth is label + 1
# (the sweep starts at depth 1).
df["difference"].abs().idxmin()

0

# <font color='red'>Use Random Forest Modeling</font>

In [27]:
# Compare in-sample to out-of-sample accuracy for a random forest.
# Each step raises min_samples_leaf by one while lowering max_depth by one,
# so the two settings always sum to 11.
metrics = []

for leaf_size in range(1, 11):
    depth = 11 - leaf_size

    # Fit on the training split only, with a fixed seed
    rf = RandomForestClassifier(
        max_depth=depth,
        random_state=123,
        min_samples_leaf=leaf_size,
    ).fit(X_train, y_train)

    # Score train first, then validate, and keep both with the settings used
    metrics.append({
        "max_depth": depth,
        "min_sample": leaf_size,
        "train_accuracy": rf.score(X_train, y_train),
        "validate_accuracy": rf.score(X_validate, y_validate),
    })

# One row per setting; `difference` is train accuracy minus validate accuracy.
df = pd.DataFrame(metrics).assign(
    difference=lambda scores: scores.train_accuracy - scores.validate_accuracy
)
df.head(10)

Unnamed: 0,max_depth,min_sample,train_accuracy,validate_accuracy,difference
0,10,1,0.884098,0.80071,0.083389
1,9,2,0.844281,0.799527,0.044754
2,8,3,0.826021,0.802484,0.023537
3,7,4,0.820188,0.804849,0.015338
4,6,5,0.807253,0.803075,0.004178
5,5,6,0.801674,0.801892,-0.000219
6,4,7,0.784682,0.787108,-0.002427
7,3,8,0.779609,0.776464,0.003146
8,2,9,0.73472,0.734477,0.000243
9,1,10,0.73472,0.734477,0.000243


In [28]:
# Row label (0-based position in the table) of the forest setting with the
# smallest absolute gap between train and validate accuracy.
df["difference"].abs().idxmin()

5

# <font color='red'>Use KNN Modeling</font>

In [13]:
# Compare in-sample to out-of-sample accuracy for k-nearest neighbors,
# trying every neighbor count from 1 through 49 with uniform weighting.
# NOTE(review): the features are passed in unscaled; if monthly_charges and
# total_charges are still among them they will dominate the distance — confirm.
metrics = []

for k in range(1, 50):
    # Fit on the training split only
    knn = KNeighborsClassifier(n_neighbors=k, weights='uniform').fit(X_train, y_train)

    # Score train first, then validate, and keep both with the neighbor count
    metrics.append({
        "K_Neighbor": k,
        "train_accuracy": knn.score(X_train, y_train),
        "validate_accuracy": knn.score(X_validate, y_validate),
    })

# One row per neighbor count; `difference` is train accuracy minus validate accuracy.
df = pd.DataFrame(metrics).assign(
    difference=lambda scores: scores.train_accuracy - scores.validate_accuracy
)
df

Unnamed: 0,K_Neighbor,train_accuracy,validate_accuracy,difference
0,1,0.997464,0.708457,0.289007
1,2,0.854426,0.74453,0.109896
2,3,0.850875,0.74039,0.110485
3,4,0.828557,0.765228,0.063329
4,5,0.832361,0.754583,0.077778
5,6,0.82526,0.771141,0.054119
6,7,0.82526,0.769959,0.055301
7,8,0.820949,0.776464,0.044485
8,9,0.820949,0.772915,0.048033
9,10,0.812579,0.784151,0.028428


In [14]:
# Row label with the smallest absolute train/validate gap, chosen the same way as
# in the decision-tree and random-forest cells. Without .abs() the most negative
# gap (validate above train) would win instead of the gap closest to zero.
# The label is the 0-based row position, so the matching K_Neighbor is label + 1.
df.difference.abs().idxmin()

22