In [1]:
from math import sqrt
from scipy import stats

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data
import statistics
import acquire
import prepare

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = acquire.get_telco_data()

In [3]:
len(df)

7043

In [4]:
df = df.drop_duplicates()

In [5]:
len(df)

7043

In [6]:
df.head(3)

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,No,Yes,Yes,No,2,Yes,2,65.6,593.3,No
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,No,No,No,Yes,1,No,2,59.9,542.4,No
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,Yes,No,No,No,1,Yes,1,73.9,280.85,Yes


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               7043 non-null   object 
 1   gender                    7043 non-null   object 
 2   senior_citizen            7043 non-null   int64  
 3   partner                   7043 non-null   object 
 4   dependents                7043 non-null   object 
 5   tenure                    7043 non-null   int64  
 6   phone_service             7043 non-null   object 
 7   multiple_lines            7043 non-null   object 
 8   internet_service_type_id  7043 non-null   int64  
 9   online_security           7043 non-null   object 
 10  online_backup             7043 non-null   object 
 11  device_protection         7043 non-null   object 
 12  tech_support              7043 non-null   object 
 13  streaming_tv              7043 non-null   object 
 14  streamin

In [8]:
# Dropping customer_id
df = df.drop(columns = ['customer_id'])

In [9]:
df.isnull().sum()

gender                      0
senior_citizen              0
partner                     0
dependents                  0
tenure                      0
phone_service               0
multiple_lines              0
internet_service_type_id    0
online_security             0
online_backup               0
device_protection           0
tech_support                0
streaming_tv                0
streaming_movies            0
contract_type_id            0
paperless_billing           0
payment_type_id             0
monthly_charges             0
total_charges               0
churn                       0
dtype: int64

In [10]:
df.nunique()

gender                         2
senior_citizen                 2
partner                        2
dependents                     2
tenure                        73
phone_service                  2
multiple_lines                 3
internet_service_type_id       3
online_security                3
online_backup                  3
device_protection              3
tech_support                   3
streaming_tv                   3
streaming_movies               3
contract_type_id               3
paperless_billing              2
payment_type_id                4
monthly_charges             1585
total_charges               6531
churn                          2
dtype: int64

In [11]:
df.payment_type_id.value_counts()

1    2365
2    1612
3    1544
4    1522
Name: payment_type_id, dtype: int64

In [12]:
payment = df.payment_type_id.map({1: 'Electronic check', 2: 'Mailed check', 3:'Bank transder', 4:'Credit card'})

In [13]:
df = pd.concat([df, payment.rename("payment")], axis = 1)

1	DSL<br>
2	Fiber optic<br>
3	None<br>

In [14]:
internet = df.internet_service_type_id.map({1: 'DSL', 2: 'Fiber optic', 3:'None'})
df = pd.concat([df, internet.rename("internet_service")], axis = 1)

1	Month-to-month<br>
2	One year<br>
3	Two year<br>

In [15]:
contract = df.contract_type_id .map({1: 'Month-to-month', 2: 'One year', 3:'Two year'})
df = pd.concat([df, contract.rename("contract")], axis = 1)

In [16]:
df = df.drop(columns=['payment_type_id','contract_type_id', 'internet_service_type_id'])

In [17]:
df.internet_service.value_counts()

Fiber optic    3096
DSL            2421
None           1526
Name: internet_service, dtype: int64

In [18]:
boolean = df.nunique()[df.nunique() <= 2].index
boolean

Index(['gender', 'senior_citizen', 'partner', 'dependents', 'phone_service',
       'paperless_billing', 'churn'],
      dtype='object')

In [19]:
boolean_dummy = pd.get_dummies(df[['gender', 'senior_citizen', 'partner', 'dependents', 'phone_service',
       'paperless_billing', 'churn']], drop_first=[True, True, True, True, True, True, True])

In [20]:
boolean_dummy

Unnamed: 0,senior_citizen,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,paperless_billing_Yes,churn_Yes
0,0,0,1,1,1,1,0
1,0,1,0,0,1,0,0
2,0,1,0,0,1,1,1
3,1,1,1,0,1,1,1
4,1,0,1,0,1,1,1
...,...,...,...,...,...,...,...
7038,0,0,0,0,1,0,0
7039,0,1,1,0,1,1,1
7040,0,1,0,0,1,1,0
7041,0,1,1,1,1,0,0


In [21]:
categ = df.nunique()[(df.nunique() > 2) & (df.nunique() < 5)].index
categ

Index(['multiple_lines', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'payment', 'internet_service', 'contract'],
      dtype='object')

In [22]:
categ_dummy = pd.get_dummies(df[categ])

In [23]:
categ_dummy.head()

Unnamed: 0,multiple_lines_No,multiple_lines_No phone service,multiple_lines_Yes,online_security_No,online_security_No internet service,online_security_Yes,online_backup_No,online_backup_No internet service,online_backup_Yes,device_protection_No,...,payment_Bank transder,payment_Credit card,payment_Electronic check,payment_Mailed check,internet_service_DSL,internet_service_Fiber optic,internet_service_None,contract_Month-to-month,contract_One year,contract_Two year
0,1,0,0,1,0,0,0,0,1,1,...,0,0,0,1,1,0,0,0,1,0
1,0,0,1,1,0,0,1,0,0,1,...,0,0,0,1,1,0,0,1,0,0
2,1,0,0,1,0,0,1,0,0,0,...,0,0,1,0,0,1,0,1,0,0
3,1,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,1,0,1,0,0
4,1,0,0,1,0,0,1,0,0,1,...,0,0,0,1,0,1,0,1,0,0


In [24]:
#Purpose of this function is encode the telco data.  
def demo_clean_telco():
    df = acquire.get_telco_data() # grabbing the telco data
    df = df.drop_duplicates() # Dropping Duplicates
    df = df.drop(columns = ['customer_id']) # Don't need this column
    
    # If total charges was null, then remove the entire row 
    list_of_null_indexs = list(df[df.total_charges.str.contains(" ")].index)
    df = df.drop(list_of_null_indexs)
    
    # Convert total_charges from datatype object to float
    total_charges = df.total_charges.astype("float")
    df = df.drop(columns='total_charges')
    df = pd.concat([df, total_charges], axis = 1)
    
    # In the three lines below im mapping out the current values for what they represent.
    payment = df.payment_type_id.map({1: 'Electronic check', 2: 'Mailed check', 3:'Bank transder', 4:'Credit card'})
    internet = df.internet_service_type_id.map({1: 'DSL', 2: 'Fiber optic', 3:'None'})
    contract = df.contract_type_id .map({1: 'Month-to-month', 2: 'One year', 3:'Two year'})
    
    # In the three lines below im adding each series to my dataframe and renaming the columns
    df = pd.concat([df, payment.rename("payment")], axis = 1)
    df = pd.concat([df, internet.rename("internet_service")], axis = 1)
    df = pd.concat([df, contract.rename("contract")], axis = 1)
    
    df = df.drop(columns=['payment_type_id','contract_type_id', 'internet_service_type_id']) # Dropping old columns
    
    boolean = df.nunique()[df.nunique() <= 2].index # boolean is a list of columns who's values are either true/false or 1/0

    # In the line below, I am making dummies for all the boolean columns.  Dropping the first so I dont get two columns back
    boolean_dummy = pd.get_dummies(df[boolean], drop_first=[True, True, True, True, True, True, True])
    
    
    df = pd.concat([df, boolean_dummy], axis = 1) # Adding my encoded boolean_dummy DataFrame back to my original Data Frame
    df = df.drop(columns=boolean) # Dropping the none encoded columns
    
    # In the line below, I am grabbing all the categorical columns(that are greater than 2) and saving the values into categ as a list
    categ = df.nunique()[(df.nunique() > 2) & (df.nunique() < 5)].index
    categ_dummy = pd.get_dummies(df[categ]) # Grabbing dummies, this time dont drop the first columns.
    
    
    df = pd.concat([df, categ_dummy], axis = 1) # Adding my encoded categ_dummy DataFrame back to my original Data Frame
    df = df.drop(columns=categ)  # Dropping the none encoded columns
    
    df = df.rename(columns={'churn_Yes': 'churn'})
    return df

In [25]:
#df = demo_clean_telco()

In [26]:
#df.head(3)

In [27]:
#total_charges = df.total_charges.astype("float")
#df = df.drop(columns='total_charges')
#df = pd.concat([df, total_charges], axis = 1)

In [28]:
#list_of_null_indexs = list(df[df.total_charges.str.contains(" ")].index)

#df = df.drop(list_of_null_indexs)

In [29]:
#df[df.total_charges.str.contains(" ")]

In [30]:
# df = pd.concat([df, categ_dummy], axis = 1)

In [31]:
# def clean_data(df):
#     df = df.drop_duplicates()
#     df = df.drop(columns = ['customer_id'])
#     df_dummy = pd.get_dummies(df[['gender', 'partner', 'dependents', 'phone_service', 'multiple_lines', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing'  ]], drop_first=[True, True, True, True, True, True, True, True, True, True, True, True])
#     df= pd.concat([df, df_dummy], axis = 1)
#     return df

In [32]:
# train, validate, test = prepare.prep_telco_data(acquire.get_telco_data())
#categorical = df.nunique()[df.nunique() < 5].index
#for col in categorical:
#    print(f'{col}: {df[col].value_counts().index}' )

In [33]:
train, validate, test = prepare.prep_telco_data(acquire.get_telco_data())

In [34]:
X_train = train.drop(columns=['churn'])
y_train = train.churn

X_validate = validate.drop(columns=['churn'])
y_validate = validate.churn

X_test = test.drop(columns=['churn'])
y_test = test.churn

In [35]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [36]:
clf = clf.fit(X_train, y_train)

In [37]:
y_pred = clf.predict(X_train)

In [38]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.80


In [39]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.94      0.87      2891
           1       0.71      0.40      0.51      1046

    accuracy                           0.80      3937
   macro avg       0.76      0.67      0.69      3937
weighted avg       0.79      0.80      0.78      3937



In [40]:
train.head(3)

Unnamed: 0,tenure,monthly_charges,total_charges,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,paperless_billing_Yes,churn,multiple_lines_No,...,payment_Bank transder,payment_Credit card,payment_Electronic check,payment_Mailed check,internet_service_DSL,internet_service_Fiber optic,internet_service_None,contract_Month-to-month,contract_One year,contract_Two year
5919,58,71.1,4299.2,0,0,0,1,1,0,0,...,0,1,0,0,1,0,0,0,1,0
1915,71,85.45,6028.95,1,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,1,0
5054,35,25.75,882.55,0,1,1,1,1,0,0,...,0,0,1,0,0,0,1,1,0,0
