In [1]:
# standard modules
import seaborn as sns
import pandas as pd
import numpy as np
import os
#import math

# Modules for Displaying Figures
import matplotlib.pyplot as plt
import scipy.stats as stats


# Data Science Modules 
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# My modules
import src.acquire as ac
import src.prepare as pp
import src.helper as helper

# Turn off the red warnings
import warnings
warnings.filterwarnings("ignore")

The following datasets are available:
telco


In [2]:
telco = ac.get_telco_data()

In [3]:
telco.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [4]:
pp.helper()

Main functions:


train, validate, model = prep_telco_data(df)

x_train, y_train, x_validate, y_validate, x_test, y_test = model_telco_data(df)


In [5]:
x_train, y_train, x_validate, y_validate, x_test, y_test = pp.model_telco_data(telco)

In [7]:
df = ac.get_telco_data()

In [8]:
df.head(3)

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check


In [10]:
'''
Strips Data Frame down, adding dummy variables for several categories
of services. It additionally removes unneeded variables. 
'''
# Drop duplicate columns
df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'], inplace=True)

# Drop null values stored as whitespace    
df['total_charges'] = df['total_charges'].str.strip()
df = df[df.total_charges != '']

# Convert to correct datatype
df['total_charges'] = df.total_charges.astype(float)

# Convert binary categorical variables to numeric
df['is_female'] = df.gender.map({'Female': 1, 'Male': 0})
df['has_partner'] = df.partner.map({'Yes': 1, 'No': 0})
df['has_dependents'] = df.dependents.map({'Yes': 1, 'No': 0})
df['has_phone_service'] = df.phone_service.map({'Yes': 1, 'No': 0})
df['has_paperless_billing'] = df.paperless_billing.map({'Yes': 1, 'No': 0})
df['did_churn'] = df.churn.map({'Yes': 1, 'No': 0})

# Get dummies for non-binary categorical variables
    dummy_df = pd.get_dummies(df[['multiple_lines', \
                              'online_security', \
                              'online_backup', \
                              'device_protection', \
                              'tech_support', \
                              'streaming_tv', \
                              'streaming_movies', \
                              'contract_type', \
                              'internet_service_type', \
                              'payment_type']], dummy_na=False, \
                              drop_first=True)
    
df = df.drop(columns=['gender', 'partner', 'dependents', 'phone_service', 'paperless_billing']

In [11]:
df.head(3)

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,churn,contract_type,internet_service_type,payment_type,gender_encoded,partner_encoded,dependents_encoded,phone_service_encoded,paperless_billing_encoded,churn_encoded
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,...,No,One year,DSL,Mailed check,1,1,1,1,1,0
1,Male,0,No,No,9,Yes,Yes,No,No,No,...,No,Month-to-month,DSL,Mailed check,0,0,0,1,0,0
2,Male,0,No,No,4,Yes,No,No,No,Yes,...,Yes,Month-to-month,Fiber optic,Electronic check,0,0,0,1,1,1


In [11]:
x_train.shape, y_train.shape, x_validate.shape, y_validate.shape, x_test.shape, y_test.shape

((3937, 46), (3937,), (1688, 46), (1688,), (1407, 46), (1407,))

In [6]:
x_train.head(3)

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
5885,Male,0,No,No,9,Yes,No,No,No,No,...,0,0,0,0,0,1,0,0,1,0
2209,Male,0,No,No,5,Yes,No,No,No,Yes,...,1,0,1,1,0,0,0,0,0,1
4500,Male,1,Yes,No,44,Yes,Yes,No,Yes,No,...,0,0,1,1,0,1,0,0,0,1


In [15]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

ValueError: could not convert string to float: 'Male'