## Perform Exploratory Data Analysis - Univariate
- Examine each feature individually to determine its distribution
- Form initial hypotheses regarding each feature's level of influence

In [None]:
import os
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the cleaned workbook from the 'Data' folder under the current working
# directory; the first sheet row supplies the column names.
workbook_name = 'WA-Telco-Customer-Churn-EDA.xlsx'
data_dir = os.path.join(os.getcwd(), 'Data')
source_file = os.path.join(data_dir, workbook_name)
df = pd.read_excel(io=source_file, header=0)

In [None]:
# Inspect the DataFrame to determine the data type of each feature
# as it was read from the workbook, before any casting is applied.
df.dtypes

In [None]:
# Make appropriate data type assignments.
# Every discrete-valued feature, plus the Churn target, becomes a 'category'.
categorical_columns = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
    'Churn',
]
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype('category')

# TotalCharges is converted to a number; with errors='coerce' any value that
# cannot be parsed becomes NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Validate new data type assignments.
df.dtypes

In [None]:
# Get descriptive statistics for the numeric features.
# NOTE: with no arguments, describe() on a mixed-type DataFrame summarizes
# numeric columns only, so the columns cast to 'category' are not included.
df.describe()

In [None]:
# Preview the first 5 observations to sanity-check the loaded values.
df.head(n=5)

## Univariate Analysis of Categorical Variables

In [None]:
# Count the observations in each Churn category and chart the counts as bars.
churn_df = pd.crosstab(index=df["Churn"], columns="count")
churn_df.plot(kind="bar")

In [None]:
# Express each Churn category count as a proportion of the total count.
churn_df.div(churn_df.sum())

In [None]:
# Count the observations in each gender category and chart them as horizontal bars.
gender_df = pd.crosstab(index=df["gender"], columns="count")
gender_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each gender category count as a proportion of the total count.
gender_df.div(gender_df.sum())

In [None]:
# Count the observations in each SeniorCitizen category and chart them as horizontal bars.
senior_df = pd.crosstab(index=df["SeniorCitizen"], columns="count")
senior_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each SeniorCitizen category count as a proportion of the total count.
senior_df.div(senior_df.sum())

In [None]:
# Count the observations in each Partner category and chart them as horizontal bars.
partner_df = pd.crosstab(index=df["Partner"], columns="count")
partner_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each Partner category count as a proportion of the total count.
partner_df.div(partner_df.sum())

In [None]:
# Count the observations in each Dependents category and chart them as horizontal bars.
dependents_df = pd.crosstab(index=df["Dependents"], columns="count")
dependents_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each Dependents category count as a proportion of the total count.
dependents_df.div(dependents_df.sum())

In [None]:
# Count the observations in each Contract category and chart them as horizontal bars.
contract_df = pd.crosstab(index=df["Contract"], columns="count")
contract_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each Contract category count as a proportion of the total count.
contract_df.div(contract_df.sum())

In [None]:
# Count the observations in each PaperlessBilling category and chart them as horizontal bars.
billing_df = pd.crosstab(index=df["PaperlessBilling"], columns="count")
billing_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each PaperlessBilling category count as a proportion of the total count.
billing_df.div(billing_df.sum())

In [None]:
# Count the observations in each PaymentMethod category and chart them as horizontal bars.
paymethod_df = pd.crosstab(index=df["PaymentMethod"], columns="count")
paymethod_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each PaymentMethod category count as a proportion of the total count.
paymethod_df.div(paymethod_df.sum())

In [None]:
# Count the observations in each PhoneService category and chart them as horizontal bars.
phone_df = pd.crosstab(index=df["PhoneService"], columns="count")
phone_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each PhoneService category count as a proportion of the total count.
phone_df.div(phone_df.sum())

In [None]:
# Count the observations in each MultipleLines category and chart them as horizontal bars.
multiplelines_df = pd.crosstab(index=df["MultipleLines"], columns="count")
multiplelines_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each MultipleLines category count as a proportion of the total count.
multiplelines_df.div(multiplelines_df.sum())

In [None]:
# Count the observations in each InternetService category and chart them as horizontal bars.
internet_df = pd.crosstab(index=df["InternetService"], columns="count")
internet_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each InternetService category count as a proportion of the total count.
internet_df.div(internet_df.sum())

In [None]:
# Count the observations in each OnlineSecurity category and chart them as horizontal bars.
olsecurity_df = pd.crosstab(index=df["OnlineSecurity"], columns="count")
olsecurity_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each OnlineSecurity category count as a proportion of the total count.
olsecurity_df.div(olsecurity_df.sum())

In [None]:
# Count the observations in each OnlineBackup category and chart them as horizontal bars.
olbackup_df = pd.crosstab(index=df["OnlineBackup"], columns="count")
olbackup_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each OnlineBackup category count as a proportion of the total count.
olbackup_df.div(olbackup_df.sum())

In [None]:
# Count the observations in each DeviceProtection category and chart them as horizontal bars.
devprotect_df = pd.crosstab(index=df["DeviceProtection"], columns="count")
devprotect_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each DeviceProtection category count as a proportion of the total count.
devprotect_df.div(devprotect_df.sum())

In [None]:
# Count the observations in each TechSupport category and chart them as horizontal bars.
techsupport_df = pd.crosstab(index=df["TechSupport"], columns="count")
techsupport_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each TechSupport category count as a proportion of the total count.
techsupport_df.div(techsupport_df.sum())

In [None]:
# Count the observations in each StreamingMovies category and chart them as horizontal bars.
strmovies_df = pd.crosstab(index=df["StreamingMovies"], columns="count")
strmovies_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each StreamingMovies category count as a proportion of the total count.
strmovies_df.div(strmovies_df.sum())

In [None]:
# Count the observations in each StreamingTV category and chart them as horizontal bars.
strmtv_df = pd.crosstab(index=df["StreamingTV"], columns="count")
strmtv_df.plot.barh(figsize=(6, 4), stacked=False)

In [None]:
# Express each StreamingTV category count as a proportion of the total count.
strmtv_df.div(strmtv_df.sum())

## Univariate Analysis of Numerical Variables

In [None]:
# Frequency table: number of observations at each distinct tenure value.
tenure_df = pd.crosstab(index = df.tenure, columns="count")

# Histogram of the raw tenure values. Plotting tenure_df with kind="hist"
# would bin the per-value observation counts (a frequency of frequencies),
# which is not the distribution of the feature itself.
df.tenure.plot(kind="hist", figsize=(6,4), bins=6)

In [None]:
# Frequency table: number of observations at each distinct MonthlyCharges value.
monthly_df = pd.crosstab(index = df.MonthlyCharges, columns="count")

# Histogram of the raw MonthlyCharges values. Plotting monthly_df with
# kind="hist" would bin the per-value observation counts (a frequency of
# frequencies), which is not the distribution of the feature itself.
df.MonthlyCharges.plot(kind="hist", figsize=(6,4), bins=6)

In [None]:
# Frequency table: number of observations at each distinct TotalCharges value.
totals_df = pd.crosstab(index = df.TotalCharges, columns="count")

# Histogram of the raw TotalCharges values. Plotting totals_df with
# kind="hist" would bin the per-value observation counts (a frequency of
# frequencies), which is not the distribution of the feature itself.
df.TotalCharges.plot(kind="hist", figsize=(6,4), bins=6)

In [None]:
# Horizontal box plot of tenure, showing its spread and any outliers.
df.boxplot('tenure', vert=False, return_type='axes')

In [None]:
# Horizontal box plot of MonthlyCharges, showing its spread and any outliers.
df.boxplot('MonthlyCharges', vert=False, return_type='axes')

In [None]:
# Horizontal box plot of TotalCharges, showing its spread and any outliers.
df.boxplot('TotalCharges', vert=False, return_type='axes')

In [None]:
# Histogram of tenure with the default bin count.
# Select the column before plotting: DataFrame.plot.hist(x='tenure') makes
# 'tenure' the index and histograms every *other* numeric column, so the
# axis labelled 'tenure' would not actually show tenure values.
df['tenure'].plot.hist()
plt.xlabel('tenure')
plt.show()