In [1]:
import src.prepare as pp
import src.acquire as ac
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

The following datasets are available:
telco


In [2]:
# Load the telco dataset through the project's acquire module (src.acquire,
# imported above as `ac`). The frame is bound to `df`, which the preparation
# cell below reads and rebinds.
df = ac.get_telco_data()

In [3]:
# Strips the data frame down for modelling:
#   * drops redundant id columns,
#   * removes rows whose total_charges is blank,
#   * encodes binary categoricals as 0/1 and multi-level ones as dummies,
#   * flags which monthly-charge quartile band each customer falls in,
#   * drops the source columns that were encoded or are unused by the models.
# NOTE: this cell rebinds `df`, so it is meant to be run once on the freshly
# loaded frame; re-running it on its own output raises KeyError on the drops.

# Drop duplicate columns. Rebinding instead of inplace=True leaves the object
# returned by ac.get_telco_data() unmodified.
df = df.drop(columns=['payment_type_id', 'internet_service_type_id',
                      'contract_type_id', 'customer_id'])

# Drop null values stored as whitespace. The explicit .copy() gives an
# independent frame, so the column assignments below write to `df` itself
# rather than to a slice (avoids SettingWithCopyWarning).
df['total_charges'] = df['total_charges'].str.strip()
df = df.loc[df['total_charges'] != ''].copy()

# Convert to correct datatype
df['total_charges'] = df['total_charges'].astype(float)

# Convert binary categorical variables to numeric
yes_no_to_int = {'Yes': 1, 'No': 0}
df['is_female'] = df['gender'].map({'Female': 1, 'Male': 0})
df['has_partner'] = df['partner'].map(yes_no_to_int)
df['has_dependents'] = df['dependents'].map(yes_no_to_int)
df['has_phone_service'] = df['phone_service'].map(yes_no_to_int)
df['has_paperless_billing'] = df['paperless_billing'].map(yes_no_to_int)
df['did_churn'] = df['churn'].map(yes_no_to_int)

# Get dummies for non-binary categorical variables
multi_level_columns = [
    'multiple_lines',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'contract_type',
    'internet_service_type',
    'payment_type',
]
dummy_df = pd.get_dummies(df[multi_level_columns], dummy_na=False, drop_first=True)

# Join the original dataframe with the newly constructed dummy columns
df = pd.concat([df, dummy_df], axis=1)

# Flag customers by the quartile band their monthly charges fall in.
# The two cut points are computed once and reused.
monthly_charges = df['monthly_charges']
lower_cut = monthly_charges.quantile(.25)
upper_cut = monthly_charges.quantile(.75)

df['charges_lower_quartile'] = monthly_charges <= lower_cut
df['charges_higher_quartile'] = monthly_charges >= upper_cut

# Helper masks are kept on dummy_df (not df) because a later cell displays them.
dummy_df['mid_charge1'] = monthly_charges < upper_cut
dummy_df['mid_charge2'] = monthly_charges > lower_cut

# A customer is mid-charge only when BOTH masks hold. `&` is used rather than
# `==`, which would also be True when both masks are False (for example a
# missing monthly charge, where every comparison evaluates to False).
df['mid_charge'] = dummy_df['mid_charge1'] & dummy_df['mid_charge2']

# Drop variables which have had binary variables created to represent them
df = df.drop(columns=['gender', 'partner', 'dependents', 'phone_service',
                      'paperless_billing', *multi_level_columns, 'churn'])

# Drop variables unused for models
df = df.drop(columns=['monthly_charges', 'total_charges'])


In [4]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

Unnamed: 0,senior_citizen,tenure,is_female,has_partner,has_dependents,has_phone_service,has_paperless_billing,did_churn,multiple_lines_No phone service,multiple_lines_Yes,...,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check,charges_lower_quartile,charges_higher_quartile,mid_charge
0,0,9,1,1,1,1,1,0,0,0,...,1,0,0,0,0,0,1,False,False,True
1,0,9,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,False,False,True
2,0,4,0,0,0,1,1,1,0,0,...,0,0,1,0,0,1,0,False,False,True
3,1,13,0,1,0,1,1,1,0,0,...,0,0,1,0,0,1,0,False,True,False
4,1,3,1,1,0,1,1,1,0,0,...,0,0,1,0,0,0,1,False,False,True


In [5]:
# Preview the first five rows of `df` again.
# NOTE(review): the saved output for this cell still lists a `churn` column,
# which the preparation cell drops -- it looks like output from an earlier
# run. Re-run the notebook top to bottom to refresh it.
df.head(n=5)

Unnamed: 0,senior_citizen,tenure,churn,is_female,has_partner,has_dependents,has_phone_service,has_paperless_billing,did_churn,multiple_lines_No phone service,...,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check,charges_lower_quartile,charges_higher_quartile,mid_charge
0,0,9,No,1,1,1,1,1,0,0,...,1,0,0,0,0,0,1,False,False,True
1,0,9,No,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,False,False,True
2,0,4,Yes,0,0,0,1,1,1,0,...,0,0,1,0,0,1,0,False,False,True
3,1,13,Yes,0,1,0,1,1,1,0,...,0,0,1,0,0,1,0,False,True,False
4,1,3,Yes,1,1,0,1,1,1,0,...,0,0,1,0,0,0,1,False,False,True


In [23]:
# Summary statistics for the monthly charges column.
# NOTE(review): the preparation cell drops `monthly_charges`, so this only
# works against a frame that still has that column (out-of-order execution).
df.monthly_charges.describe()

count    7032.000000
mean       64.798208
std        30.085974
min        18.250000
25%        35.587500
50%        70.350000
75%        89.862500
max       118.750000
Name: monthly_charges, dtype: float64

In [47]:
# 25th percentile of monthly charges (the lower-quartile cut point).
# NOTE(review): requires `monthly_charges` to still be present on `df`; the
# preparation cell drops it.
df.monthly_charges.quantile(q=0.25)

35.5875

In [59]:
# Scratch re-computation of the monthly-charge quartile flags (same logic as
# in the preparation cell).
# NOTE(review): requires `monthly_charges` to still be present on `df`; the
# preparation cell drops it, so run this before that drop or not at all.
monthly_charges = df['monthly_charges']
lower_cut = monthly_charges.quantile(.25)
upper_cut = monthly_charges.quantile(.75)

df['charges_lower_quartile'] = monthly_charges <= lower_cut
df['charges_higher_quartile'] = monthly_charges >= upper_cut

# Helper masks: strictly below the upper cut / strictly above the lower cut.
dummy_df['mid_charge1'] = monthly_charges < upper_cut
dummy_df['mid_charge2'] = monthly_charges > lower_cut

# The original read dummy_df['value1'] / dummy_df['value2'], which this cell
# never creates (KeyError). The masks are the two columns built just above,
# and a customer is mid-charge only when both hold, hence `&` rather than `==`.
df['mid_charge'] = dummy_df['mid_charge1'] & dummy_df['mid_charge2']

In [100]:
# Build the mid-charge flag on dummy_df from its two helper masks.
# The original compared df['value1'] with df['value2']; no visible cell
# creates those columns, so it raised KeyError on a clean run. The helper
# masks are stored on dummy_df as mid_charge1 (< 75th percentile) and
# mid_charge2 (> 25th percentile); both must hold, hence `&` rather than `==`.
dummy_df['mid_charge'] = dummy_df['mid_charge1'] & dummy_df['mid_charge2']
dummy_df['mid_charge'].describe()

count     7032
unique       2
top       True
freq      3516
Name: mid_charge, dtype: object

In [101]:
# Count / unique / top / freq summary of the mid-charge flag on dummy_df.
dummy_df.mid_charge.describe()

count     7032
unique       2
top       True
freq      3516
Name: mid_charge, dtype: object

In [102]:
# Preview the first five rows of the dummy-variable frame, including the
# mid_charge helper columns added to it.
dummy_df.head(n=5)

Unnamed: 0,multiple_lines_No phone service,multiple_lines_Yes,online_security_No internet service,online_security_Yes,online_backup_No internet service,online_backup_Yes,device_protection_No internet service,device_protection_Yes,tech_support_No internet service,tech_support_Yes,...,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check,mid_charge1,mid_charge2,mid_charge
0,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,1,True,True,True
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,True,True,True
2,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,True,True,True
3,0,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,False,True,False
4,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,1,True,True,True
