In [1]:
# Import Modules
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

## Loading Telco Churn CSV raw data 

In [2]:
#Loading data obtained from Kaggle
customer_churn_df = pd.read_csv(Path("Resources/WA_Fn-UseC_-Telco-Customer-Churn.csv"))

#printing the first 5 rows
customer_churn_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,DSL,No,...,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3,No
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,DSL,No,...,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4,No
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,Fiber optic,No,...,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85,Yes
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85,Yes
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,Fiber optic,No,...,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4,Yes


In [3]:
# Split target column from dataset
y = customer_churn_df['Churn']
X = customer_churn_df.drop(columns='Churn')

# Set Index
X = X.set_index('customerID')

In [4]:
# Print first 5 entries for target
y[:5]

0     No
1     No
2    Yes
3    Yes
4    Yes
Name: Churn, dtype: object

In [5]:
customer_churn_df = customer_churn_df.replace({'Partner': {'Yes': 1, 'No': 0}})
customer_churn_df = customer_churn_df.replace({'Dependents': {'Yes': 1, 'No': 0}})
customer_churn_df = customer_churn_df.replace({'PhoneService': {'Yes': 1, 'No': 0}})
customer_churn_df = customer_churn_df.replace({'MultipleLines': {'Yes': 1, 'No': 0, "No phone service" : 0}})
customer_churn_df = customer_churn_df.replace({'OnlineSecurity': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'OnlineBackup': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'DeviceProtection': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'TechSupport': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'StreamingTV': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'StreamingMovies': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'PaperlessBilling': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'Churn': {'Yes': 1, 'No': 0, "No internet service" : 0}})


customer_churn_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0002-ORFBO,Female,0,1,1,9,1,0,DSL,0,...,0,1,1,0,One year,1,Mailed check,65.60,593.30,0
1,0003-MKNFE,Male,0,0,0,9,1,1,DSL,0,...,0,0,0,1,Month-to-month,0,Mailed check,59.90,542.40,0
2,0004-TLHLJ,Male,0,0,0,4,1,0,Fiber optic,0,...,1,0,0,0,Month-to-month,1,Electronic check,73.90,280.85,1
3,0011-IGKFF,Male,1,1,0,13,1,0,Fiber optic,0,...,1,0,1,1,Month-to-month,1,Electronic check,98.00,1237.85,1
4,0013-EXCHZ,Female,1,1,0,3,1,0,Fiber optic,0,...,0,1,1,0,Month-to-month,1,Mailed check,83.90,267.40,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,9987-LUTYD,Female,0,0,0,13,1,0,DSL,1,...,0,1,0,0,One year,0,Mailed check,55.15,742.90,0
7039,9992-RRAMN,Male,0,1,0,22,1,1,Fiber optic,0,...,0,0,0,1,Month-to-month,1,Electronic check,85.10,1873.70,1
7040,9992-UJOEL,Male,0,0,0,2,1,0,DSL,0,...,0,0,0,0,Month-to-month,1,Mailed check,50.30,92.75,0
7041,9993-LHIEB,Male,0,1,1,67,1,0,DSL,1,...,1,1,0,1,Two year,0,Mailed check,67.85,4627.65,0


In [6]:
customer_churn_df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService      object
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

In [7]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

X.head()

Unnamed: 0_level_0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0002-ORFBO,0,9,65.6,593.3,1,0,0,1,0,1,...,0,0,1,0,0,1,0,0,0,1
0003-MKNFE,0,9,59.9,542.4,0,1,1,0,1,0,...,1,1,0,0,1,0,0,0,0,1
0004-TLHLJ,0,4,73.9,280.85,0,1,1,0,1,0,...,0,1,0,0,0,1,0,0,1,0
0011-IGKFF,1,13,98.0,1237.85,0,1,0,1,1,0,...,1,1,0,0,0,1,0,0,1,0
0013-EXCHZ,1,3,83.9,267.4,1,0,0,1,1,0,...,0,1,0,0,0,1,0,0,0,1


In [8]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)



In [9]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Decision Tree Model

In [10]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [11]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [12]:
tree_predictions = model.predict(X_test_scaled)

In [13]:
print(classification_report(y_test, tree_predictions))

              precision    recall  f1-score   support

          No       0.83      0.79      0.81      1305
         Yes       0.48      0.55      0.51       456

    accuracy                           0.73      1761
   macro avg       0.66      0.67      0.66      1761
weighted avg       0.74      0.73      0.73      1761



In [29]:
# Create DOT data
dot_data = tree.export_graphviz(
    model, out_file=None, feature_names=X.columns, class_names=["0", "1"], filled=True
)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph
Image(graph.create_png())

## DF adjustments from HG

In [15]:
#narrow down the amount of columns (Do we want to keep this?)
new_df=customer_churn_df[["customerID","MonthlyCharges", "TotalCharges", "Churn"]]
new_df

Unnamed: 0,customerID,MonthlyCharges,TotalCharges,Churn
0,0002-ORFBO,65.60,593.30,0
1,0003-MKNFE,59.90,542.40,0
2,0004-TLHLJ,73.90,280.85,1
3,0011-IGKFF,98.00,1237.85,1
4,0013-EXCHZ,83.90,267.40,1
...,...,...,...,...
7038,9987-LUTYD,55.15,742.90,0
7039,9992-RRAMN,85.10,1873.70,1
7040,9992-UJOEL,50.30,92.75,0
7041,9993-LHIEB,67.85,4627.65,0


In [16]:
# use get dummies to transform categorical data to binary
new_df=pd.get_dummies(new_df,columns=["Churn"],dtype=float).copy()
new_df.head()

Unnamed: 0,customerID,MonthlyCharges,TotalCharges,Churn_0,Churn_1
0,0002-ORFBO,65.6,593.3,1.0,0.0
1,0003-MKNFE,59.9,542.4,1.0,0.0
2,0004-TLHLJ,73.9,280.85,0.0,1.0
3,0011-IGKFF,98.0,1237.85,0.0,1.0
4,0013-EXCHZ,83.9,267.4,0.0,1.0


In [17]:
dummies_df_1 = pd.get_dummies(customer_churn_df["Dependents"])
dummies_df_1.head()

Unnamed: 0,0,1
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0


In [18]:
customer_churn_df_dummies = pd.get_dummies(customer_churn_df["PhoneService"])
customer_churn_df_dummies.head()

Unnamed: 0,0,1
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [19]:
customer_churn_df_dummies

Unnamed: 0,0,1
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
7038,0,1
7039,0,1
7040,0,1
7041,0,1


In [20]:
#standardize data for numerical values
from sklearn.preprocessing import StandardScaler

In [21]:
X=new_df[["MonthlyCharges", "TotalCharges"]]
X.head()

Unnamed: 0,MonthlyCharges,TotalCharges
0,65.6,593.3
1,59.9,542.4
2,73.9,280.85
3,98.0,1237.85
4,83.9,267.4


In [22]:
y=new_df["Churn"]

KeyError: 'Churn'

In [None]:
x_scaled = StandardScaler().fit_transform(new_df[["MonthlyCharges", "TotalCharges"]])

In [None]:
#run model, fit to model and backtest

In [None]:
customer_churn_df.dtypes

In [None]:
X=customer_churn_df_scaled[["MonthlyCharges", "TotalCharges"]]
X.head()

In [None]:
customer_churn_prediction = pd.concat([new_df, ], axis=1)