In [1]:
!which python3

/Users/karthikmaddukuri/anaconda3/envs/churn-model/bin/python3


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Standard imports
import os
import pickle
import sys
sys.path.append('..')  #To let jupyter notebook to look one folder up

#Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.ensemble import ( AdaBoostClassifier, GradientBoostingClassifier, 
                              RandomForestClassifier
                             )
from sklearn.inspection import plot_partial_dependence
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor

#Local imports
from src.localpaths import *
from src.data.make_dataset import load_training_data
from src.models.train_model import *

In [4]:
#Configuration for notebooks

# Raise the pandas display limits so wide or long frames are not truncated in cell output.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

# Load Featurized Data 

In [5]:
# Load the training data through the project helper imported from src.data.make_dataset.
# NOTE(review): presumably X_train is the feature frame and y_train the target; what
# `final=True` selects is defined in that module — TODO confirm.
X_train, y_train = load_training_data(final=True)

# Feature Multicollinearity

In [6]:
# One variance inflation factor per feature column, in column order.
vifs = [
    variance_inflation_factor(X_train.values, col_idx)
    for col_idx in range(X_train.shape[1])
]

  vif = 1. / (1. - r_squared_i)


In [7]:
# Rank the (column, VIF) pairs from the highest VIF to the lowest.
sorted(
    zip(X_train.columns, vifs),
    key=lambda name_and_vif: name_and_vif[1],
    reverse=True,
)

[('InternetService_No', inf),
 ('OnlineSecurity_No internet service', inf),
 ('OnlineBackup_No internet service', inf),
 ('DeviceProtection_No internet service', inf),
 ('TechSupport_No internet service', inf),
 ('StreamingTV_No internet service', inf),
 ('StreamingMovies_No internet service', inf),
 ('PhoneService', 1795.2936129533036),
 ('MonthlyCharges', 879.4071653278096),
 ('InternetService_Fiber optic', 150.413015558894),
 ('MultipleLines_No phone service', 61.269312275148884),
 ('StreamingTV_Yes', 24.573123044650014),
 ('StreamingMovies_Yes', 24.400188765894438),
 ('TotalCharges', 10.871762430587555),
 ('tenure', 7.528269480250208),
 ('MultipleLines_Yes', 7.388728481143919),
 ('DeviceProtection_Yes', 6.990713871204519),
 ('OnlineBackup_Yes', 6.911783532524887),
 ('TechSupport_Yes', 6.498500323508462),
 ('OnlineSecurity_Yes', 6.478979774573932),
 ('Contract_Two year', 2.6312318158444175),
 ('PaymentMethod_Electronic check', 1.9381234819755606),
 ('PaymentMethod_Mailed check', 1.8

## Testing `drop_high_vif_features`

In [8]:
def drop_high_vif_features(X_train, threshold=10):
    """
    Iteratively drop the feature with the largest variance inflation factor
    (VIF) until no remaining feature has a VIF at or above ``threshold``.

    The VIFs are recomputed after every drop, because removing one column
    changes the VIF of the columns that are left.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame. It is not modified in place; each drop produces a
        new frame.
    threshold : float, default 10
        A column is dropped while the largest VIF is greater than or equal
        to this value.

    Returns
    -------
    pandas.DataFrame
        ``X_train`` without the dropped columns. A frame that has no
        columns is returned unchanged.
    """
    # Looping only while columns remain avoids looking up the "worst" column
    # of an empty frame, which previously raised IndexError.
    while len(X_train.columns) > 0:
        vifs = [
            variance_inflation_factor(X_train.values, i)
            for i in range(len(X_train.columns))
        ]
        # max() returns the first maximal pair, i.e. the same column that
        # element 0 of a stable descending sort would give on ties.
        high_vif_col, high_vif_value = max(
            zip(X_train.columns, vifs), key=lambda pair: pair[1]
        )
        if high_vif_value >= threshold:
            print(f'Dropping column {high_vif_col} as it has {high_vif_value:.1f} >={threshold:g}')
            X_train = X_train.drop(columns=[high_vif_col])
        else:
            break

    print('finished dropping columns')
    return X_train


In [9]:
# Rebind X_train to the frame returned by the VIF filter, so the cells that
# follow read the reduced set of columns rather than the frame loaded earlier.
X_train = drop_high_vif_features(X_train)

Dropping column InternetService_No as it has inf >=10
Dropping column OnlineSecurity_No internet service as it has inf >=10
Dropping column OnlineBackup_No internet service as it has inf >=10
Dropping column DeviceProtection_No internet service as it has inf >=10
Dropping column TechSupport_No internet service as it has inf >=10
Dropping column StreamingTV_No internet service as it has inf >=10
Dropping column PhoneService as it has 1795.3 >=10
Dropping column MonthlyCharges as it has 43.2 >=10
Dropping column TotalCharges as it has 20.0 >=10
finished dropping columns


In [10]:
X_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PaperlessBilling,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,OnlineSecurity_Yes,OnlineBackup_Yes,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,0,1,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0,0,0,7,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0,0,1,4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,0,29,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,0,0,0,3,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,0,0,1,0,1,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5278,1,0,1,1,23,1,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
5279,0,0,1,1,12,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5280,0,1,0,0,12,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Recompute the VIFs on the current X_train and rank them, highest first.
vifs = [
    variance_inflation_factor(X_train.values, col_idx)
    for col_idx in range(X_train.shape[1])
]
high_vifs = sorted(
    zip(X_train.columns, vifs),
    key=lambda name_and_vif: name_and_vif[1],
    reverse=True,
)

In [12]:
high_vifs

[('tenure', 7.4774441220077295),
 ('Contract_Two year', 3.4313527485856485),
 ('InternetService_Fiber optic', 3.1828069347558534),
 ('Partner', 2.8221516198532606),
 ('StreamingMovies_Yes', 2.695791666181273),
 ('PaperlessBilling', 2.6936287153615943),
 ('StreamingTV_Yes', 2.663586240957822),
 ('StreamingMovies_No internet service', 2.637857334474688),
 ('MultipleLines_Yes', 2.4895384911525227),
 ('DeviceProtection_Yes', 2.2453701155471633),
 ('PaymentMethod_Electronic check', 2.225300945646245),
 ('OnlineBackup_Yes', 2.112322094341988),
 ('Contract_One year', 2.0715371315045408),
 ('TechSupport_Yes', 2.020772666726921),
 ('Dependents', 1.9453155700993512),
 ('OnlineSecurity_Yes', 1.9039746521866974),
 ('gender', 1.8450275914324004),
 ('PaymentMethod_Mailed check', 1.7828477091189336),
 ('PaymentMethod_Credit card (automatic)', 1.7219223454312147),
 ('MultipleLines_No phone service', 1.3734870994565838),
 ('SeniorCitizen', 1.3630154116601583)]

# Test loading final data without high-VIF columns

In [13]:
# Reload through the project helper, replacing the X_train that was filtered in the cells above.
# NOTE(review): this is the same call as the first load in this notebook, yet the recorded
# output below no longer contains the high-VIF columns that the first load showed —
# presumably load_training_data was edited between runs (autoreload is enabled).
# Confirm with a fresh Restart & Run All.
X_train, y_train = load_training_data(final=True)

In [14]:
X_train.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PaperlessBilling,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,OnlineSecurity_Yes,OnlineBackup_Yes,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,0,1,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0,0,0,7,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0,0,1,4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,0,29,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,0,0,0,3,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
