Given the following dataset, can you create a decision tree to predict customer churn? For simplicity, you can set the maximum depth of the decision tree to 4. For the purpose of this exercise, you do not need to optimize the model.

Solution will create a model and decision tree using Python, then discuss interpreting the output.

In [56]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt

In [43]:
data_path = "https://raw.githubusercontent.com/erood/interviewqs.com_code_snippets/master/Datasets/teleco_user_data.csv"
data = pd.read_csv(data_path)

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

In [45]:
dummies = pd.get_dummies(data=data, columns=['gender', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'Churn'], drop_first=True)

In [46]:
dummies['TotalCharges'] = [x if x != " " else "0" for x in dummies['TotalCharges']]

In [47]:
dummies['TotalCharges'] = pd.to_numeric(dummies['TotalCharges'])

In [50]:
X = dummies.drop(["customerID","Churn_Yes"], axis=1)
y = dummies["Churn_Yes"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [51]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5986 entries, 1869 to 860
Data columns (total 30 columns):
SeniorCitizen                            5986 non-null int64
tenure                                   5986 non-null int64
MonthlyCharges                           5986 non-null float64
TotalCharges                             5986 non-null float64
gender_Male                              5986 non-null uint8
Partner_Yes                              5986 non-null uint8
Dependents_Yes                           5986 non-null uint8
PhoneService_Yes                         5986 non-null uint8
MultipleLines_No phone service           5986 non-null uint8
MultipleLines_Yes                        5986 non-null uint8
InternetService_Fiber optic              5986 non-null uint8
InternetService_No                       5986 non-null uint8
OnlineSecurity_No internet service       5986 non-null uint8
OnlineSecurity_Yes                       5986 non-null uint8
OnlineBackup_No internet service 

Without Total Charges

In [33]:
dtc = DecisionTreeClassifier(max_depth=4)
# dtc.fit(X_train, y_train)
cv_r = cross_val_score(dtc, X_train, y_train, cv=3, scoring='recall')
cv_p = cross_val_score(dtc, X_train, y_train, cv=3, scoring='precision')
print(cv_r)
print(cv_p)

[0.51984877 0.49149338 0.39697543]
[0.62785388 0.62052506 0.64615385]


With Total Charges

In [54]:
dtc = DecisionTreeClassifier(max_depth=4)
# dtc.fit(X_train, y_train)
cv_r = cross_val_score(dtc, X_train, y_train, cv=3, scoring='recall')
cv_p = cross_val_score(dtc, X_train, y_train, cv=3, scoring='precision')
print(cv_r)
print(cv_p)

[0.51984877 0.49149338 0.39697543]
[0.62785388 0.62052506 0.64615385]


In [58]:
dtc.fit(X_train, y_train)
predictions = dtc.predict(X_test)

In [63]:
fpr, tpr, thresh = roc_curve(y_test, predictions)
roc_auc = auc(fpr, tpr)

In [64]:
fpr

array([0.        , 0.09806452, 1.        ])