In [24]:
from math import sqrt
from scipy import stats

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data
import statistics
import acquire
import prepare

In [2]:
# Load the Telco dataset through the project-local `acquire` module.
df = acquire.get_telco_data()

In [3]:
# Row count as loaded (baseline for the duplicate check that follows).
len(df)

7043

In [4]:
# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()

In [5]:
# Row count after de-duplication; an unchanged count means no duplicate rows existed.
len(df)

7043

In [6]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,No,Yes,Yes,No,2,Yes,2,65.6,593.3,No
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,No,No,No,Yes,1,No,2,59.9,542.4,No
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,Yes,No,No,No,1,Yes,1,73.9,280.85,Yes


In [7]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               7043 non-null   object 
 1   gender                    7043 non-null   object 
 2   senior_citizen            7043 non-null   int64  
 3   partner                   7043 non-null   object 
 4   dependents                7043 non-null   object 
 5   tenure                    7043 non-null   int64  
 6   phone_service             7043 non-null   object 
 7   multiple_lines            7043 non-null   object 
 8   internet_service_type_id  7043 non-null   int64  
 9   online_security           7043 non-null   object 
 10  online_backup             7043 non-null   object 
 11  device_protection         7043 non-null   object 
 12  tech_support              7043 non-null   object 
 13  streaming_tv              7043 non-null   object 
 14  streamin

In [8]:
# Drop customer_id (presumably a per-customer identifier rather than a feature — confirm).
# NOTE: re-running this cell raises KeyError because the column is already gone.
df = df.drop(columns = ['customer_id'])

In [9]:
# Count missing (NaN/None) values per column.
df.isnull().sum()

gender                      0
senior_citizen              0
partner                     0
dependents                  0
tenure                      0
phone_service               0
multiple_lines              0
internet_service_type_id    0
online_security             0
online_backup               0
device_protection           0
tech_support                0
streaming_tv                0
streaming_movies            0
contract_type_id            0
paperless_billing           0
payment_type_id             0
monthly_charges             0
total_charges               0
churn                       0
dtype: int64

In [10]:
# Number of distinct values per column; low counts indicate categorical columns.
df.nunique()

gender                         2
senior_citizen                 2
partner                        2
dependents                     2
tenure                        73
phone_service                  2
multiple_lines                 3
internet_service_type_id       3
online_security                3
online_backup                  3
device_protection              3
tech_support                   3
streaming_tv                   3
streaming_movies               3
contract_type_id               3
paperless_billing              2
payment_type_id                4
monthly_charges             1585
total_charges               6531
churn                          2
dtype: int64

In [11]:
# Names of the columns with fewer than 5 distinct values.
# The distinct counts are computed once and filtered through a callable
# indexer, instead of calling df.nunique() twice.
categorical = df.nunique().loc[lambda n_unique: n_unique < 5].index

In [12]:
# Print the observed levels of each low-cardinality column.
# value_counts() orders them by frequency, most common first.
for col in categorical:
    print(f'{col}: {df[col].value_counts().index}' )

gender: Index(['Male', 'Female'], dtype='object')
senior_citizen: Int64Index([0, 1], dtype='int64')
partner: Index(['No', 'Yes'], dtype='object')
dependents: Index(['No', 'Yes'], dtype='object')
phone_service: Index(['Yes', 'No'], dtype='object')
multiple_lines: Index(['No', 'Yes', 'No phone service'], dtype='object')
internet_service_type_id: Int64Index([2, 1, 3], dtype='int64')
online_security: Index(['No', 'Yes', 'No internet service'], dtype='object')
online_backup: Index(['No', 'Yes', 'No internet service'], dtype='object')
device_protection: Index(['No', 'Yes', 'No internet service'], dtype='object')
tech_support: Index(['No', 'Yes', 'No internet service'], dtype='object')
streaming_tv: Index(['No', 'Yes', 'No internet service'], dtype='object')
streaming_movies: Index(['No', 'Yes', 'No internet service'], dtype='object')
contract_type_id: Int64Index([1, 3, 2], dtype='int64')
paperless_billing: Index(['Yes', 'No'], dtype='object')
payment_type_id: Int64Index([1, 2, 3, 4], dtype='

In [13]:
# One-hot encode the string-valued categorical columns, dropping the first
# level of each to avoid a redundant column.
# `drop_first` takes a single bool that applies to every encoded column; the
# previous list of Trues only worked because a non-empty list is truthy.
df_dummy = pd.get_dummies(
    df[[
        'gender',
        'partner',
        'dependents',
        'phone_service',
        'multiple_lines',
        'online_security',
        'online_backup',
        'device_protection',
        'tech_support',
        'streaming_tv',
        'streaming_movies',
        'paperless_billing',
    ]],
    drop_first=True,
)

In [14]:
# Inspect the encoded frame (pandas truncates the middle rows in the display).
df_dummy

Unnamed: 0,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_No phone service,multiple_lines_Yes,online_security_No internet service,online_security_Yes,online_backup_No internet service,online_backup_Yes,device_protection_No internet service,device_protection_Yes,tech_support_No internet service,tech_support_Yes,streaming_tv_No internet service,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,paperless_billing_Yes
0,0,1,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1
1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,1,1,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,1,1
4,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
7039,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1
7040,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
7041,1,1,1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0


In [15]:
# Append the dummy columns to the right of the original columns (rows align on the index).
# NOTE: not idempotent — re-running this cell appends the dummy columns a second time.
df= pd.concat([df, df_dummy], axis = 1)
df.head(3)

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,...,online_backup_Yes,device_protection_No internet service,device_protection_Yes,tech_support_No internet service,tech_support_Yes,streaming_tv_No internet service,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,paperless_billing_Yes
0,Female,0,Yes,Yes,9,Yes,No,1,No,Yes,...,1,0,0,0,1,0,1,0,0,1
1,Male,0,No,No,9,Yes,Yes,1,No,No,...,0,0,0,0,0,0,0,0,1,0
2,Male,0,No,No,4,Yes,No,2,No,No,...,0,0,1,0,0,0,0,0,0,1


In [16]:
# Show the row at position 2 in full, original values alongside the appended dummy columns.
df.iloc[2]

gender                                     Male
senior_citizen                                0
partner                                      No
dependents                                   No
tenure                                        4
phone_service                               Yes
multiple_lines                               No
internet_service_type_id                      2
online_security                              No
online_backup                                No
device_protection                           Yes
tech_support                                 No
streaming_tv                                 No
streaming_movies                             No
contract_type_id                              1
paperless_billing                           Yes
payment_type_id                               1
monthly_charges                            73.9
total_charges                            280.85
churn                                       Yes
gender_Male                             

In [17]:
def clean_data(df, dummy_cols=None):
    """Return a cleaned copy of the telco frame with dummy columns appended.

    Steps, in order:
      1. Drop exact duplicate rows (``customer_id`` still takes part in the
         comparison at this point).
      2. Drop the ``customer_id`` column.
      3. One-hot encode ``dummy_cols``, dropping the first level of each
         column, and append the encoded columns to the right of the frame.
         The original string columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id`` and every column named in ``dummy_cols``.
    dummy_cols : sequence of str, optional
        Columns to encode. Defaults to the twelve string-valued categorical
        telco columns listed below. An empty sequence skips the encoding step.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If ``customer_id`` or any column in ``dummy_cols`` is missing.
    """
    if dummy_cols is None:
        dummy_cols = [
            'gender',
            'partner',
            'dependents',
            'phone_service',
            'multiple_lines',
            'online_security',
            'online_backup',
            'device_protection',
            'tech_support',
            'streaming_tv',
            'streaming_movies',
            'paperless_billing',
        ]
    to_encode = list(dummy_cols)

    cleaned = df.drop_duplicates().drop(columns=['customer_id'])
    if not to_encode:
        # Nothing to encode; return the cleaned frame as-is.
        return cleaned

    # `drop_first` is a single bool applied to every column. The previous
    # list of Trues only worked because a non-empty list is truthy.
    encoded = pd.get_dummies(cleaned[to_encode], drop_first=True)
    return pd.concat([cleaned, encoded], axis=1)

In [26]:
# Rebuild the frame through the project-local `prepare` module, then list the
# levels of every column with fewer than 5 distinct values.
df = prepare.prep_telco()
# Compute the distinct counts once and filter them, rather than calling
# df.nunique() twice.
categorical = df.nunique().loc[lambda n_unique: n_unique < 5].index
for col in categorical:
    print(f'{col}: {df[col].value_counts().index}' )

gender: Index(['Male', 'Female'], dtype='object')
senior_citizen: Int64Index([0, 1], dtype='int64')
partner: Index(['No', 'Yes'], dtype='object')
dependents: Index(['No', 'Yes'], dtype='object')
phone_service: Index(['Yes', 'No'], dtype='object')
multiple_lines: Index(['No', 'Yes', 'No phone service'], dtype='object')
internet_service_type_id: Int64Index([2, 1, 3], dtype='int64')
online_security: Index(['No', 'Yes', 'No internet service'], dtype='object')
online_backup: Index(['No', 'Yes', 'No internet service'], dtype='object')
device_protection: Index(['No', 'Yes', 'No internet service'], dtype='object')
tech_support: Index(['No', 'Yes', 'No internet service'], dtype='object')
streaming_tv: Index(['No', 'Yes', 'No internet service'], dtype='object')
streaming_movies: Index(['No', 'Yes', 'No internet service'], dtype='object')
contract_type_id: Int64Index([1, 3, 2], dtype='int64')
paperless_billing: Index(['Yes', 'No'], dtype='object')
payment_type_id: Int64Index([1, 2, 3, 4], dtype='