In [1]:
import src.prepare as pp
import src.acquire as ac
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

The following datasets are available:
telco


In [2]:
# Print the prepare module's overview of its main functions.
pp.helper()

Main functions:


train, validate, model = prep_telco_data(df)

x_train, y_train, x_validate, y_validate, x_test, y_test = model_telco_data(df)


In [8]:
# Load the telco customer data that the rest of the notebook works on.
df = ac.get_telco_data()

In [3]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check


In [9]:
# Prepare the telco frame for modelling:
#   1. drop the id columns,
#   2. remove rows whose total_charges is blank and convert the column to float,
#   3. encode the two-level categorical columns as 0/1,
#   4. one-hot encode the multi-level categorical columns,
#   5. drop the text columns that now have an encoded counterpart.

# Drop the id columns. Reassigning (instead of inplace=True) leaves the object
# returned by ac.get_telco_data() unmodified.
df = df.drop(columns=['payment_type_id', 'internet_service_type_id',
                      'contract_type_id', 'customer_id'])

# Missing total_charges values are stored as whitespace-only strings: strip
# them, then keep only the non-empty rows. The explicit .copy() makes the
# filtered result an independent frame, so the column assignments below do not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
df['total_charges'] = df['total_charges'].str.strip()
df = df[df['total_charges'] != ''].copy()

# Convert to the correct datatype now that only numeric strings remain.
df['total_charges'] = df['total_charges'].astype(float)

# Convert binary categorical variables to numeric 0/1 flags.
# Values outside a mapping become NaN (Series.map behaviour).
yes_no = {'Yes': 1, 'No': 0}
df['is_female'] = df['gender'].map({'Female': 1, 'Male': 0})
yes_no_columns = [
    ('has_partner', 'partner'),
    ('has_dependents', 'dependents'),
    ('has_phone_service', 'phone_service'),
    ('has_paperless_billing', 'paperless_billing'),
    ('did_churn', 'churn'),
]
for flag_name, source_name in yes_no_columns:
    df[flag_name] = df[source_name].map(yes_no)

# Get dummies for non-binary categorical variables. drop_first=True omits the
# first level of each variable. dtype is pinned to uint8 because the default
# differs between pandas versions (uint8 before 2.0, bool from 2.0 on).
multi_level_columns = [
    'multiple_lines',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'contract_type',
    'internet_service_type',
    'payment_type',
]
dummy_df = pd.get_dummies(df[multi_level_columns], dummy_na=False,
                          drop_first=True, dtype='uint8')

# Join the original dataframe with the newly constructed dummy columns.
df = pd.concat([df, dummy_df], axis=1)

# Drop the text columns that now have an encoded representation.
# NOTE(review): 'churn' is deliberately left in place next to did_churn, as in
# the original drop list - confirm whether it should be excluded before modelling.
df = df.drop(columns=['gender', 'partner', 'dependents', 'phone_service',
                      'paperless_billing'] + multi_level_columns)


In [10]:
# Preview the first five rows after the encoding step.
df.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,churn,is_female,has_partner,has_dependents,has_phone_service,has_paperless_billing,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,0,9,65.6,593.3,No,1,1,1,1,1,...,1,0,0,1,0,0,0,0,0,1
1,0,9,59.9,542.4,No,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
2,0,4,73.9,280.85,Yes,0,0,0,1,1,...,0,0,0,0,0,1,0,0,1,0
3,1,13,98.0,1237.85,Yes,0,1,0,1,1,...,1,0,1,0,0,1,0,0,1,0
4,1,3,83.9,267.4,Yes,1,1,0,1,1,...,1,0,0,0,0,1,0,0,0,1


In [23]:
# Summary statistics for monthly_charges.
df['monthly_charges'].describe()

count    7032.000000
mean       64.798208
std        30.085974
min        18.250000
25%        35.587500
50%        70.350000
75%        89.862500
max       118.750000
Name: monthly_charges, dtype: float64

In [47]:
# 25th percentile of monthly_charges.
df['monthly_charges'].quantile(.25)

35.5875

In [59]:
# Boolean flag: True where monthly_charges is at or below its 25th percentile.
df['charges_lower_quartile'] = df['monthly_charges'].le(df['monthly_charges'].quantile(0.25))

In [68]:
# Summarise the flag: count, number of distinct values, most frequent value and its frequency.
df['charges_lower_quartile'].describe()

count      7032
unique        2
top       False
freq       5274
Name: charges_lower_quartile, dtype: object

In [None]:
df['