## Perform Exploratory Data Analysis - Correlation
- Examine the level of correlation between each pair of features
- Examine their level of correlation to the outcome (response feature)

In [None]:
import os
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Locate the cleaned workbook inside the 'Data' folder of the current
# working directory.
data_dir = os.path.join(os.getcwd(), 'Data')
source_file = os.path.join(data_dir, 'WA-Telco-Customer-Churn-ML.xlsx')

# Load it (first row holds the column names) and discard the unique
# identifier column in one step.
df = pd.read_excel(source_file, header=0).drop(columns=['customerID'])

In [None]:
# Make appropriate data type assignments.
#
# Each column is passed through pd.to_numeric:
#   * errors='coerce' turns any value that cannot be parsed into NaN instead
#     of raising.
#   * downcast= shrinks the result to the smallest integer / float dtype that
#     can hold the data. A column that ends up containing NaN stays floating
#     point even when an integer downcast is requested.
#
# Columns are addressed with df[...] rather than attribute access
# (df.gender = ...): bracket indexing works for any column name and can never
# be confused with a DataFrame attribute or method.
integer_columns = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
    'Churn',
]
float_columns = ['MonthlyCharges', 'TotalCharges']

for column in integer_columns:
    df[column] = pd.to_numeric(df[column], downcast='integer', errors='coerce')

for column in float_columns:
    df[column] = pd.to_numeric(df[column], downcast='float', errors='coerce')

# Validate new data type assignments.
df.dtypes

### Examine Correlations Among the Entire Feature-Set

In [None]:
# Pearson correlation matrix across every column of the feature-set.
# 'pearson' is the default method; 'kendall' and 'spearman' (rank) are the
# alternatives.
df.corr(method='pearson')

In [None]:
# Draw the Pearson correlation matrix of the entire feature-set as a heatmap,
# with a colorbar showing the scale.
full_pearson = df.corr(method='pearson')

# One tick per column so every row / column of the image can be labelled.
tick_marks = list(range(len(df.columns)))

plt.figure(figsize=(8, 8))
plt.imshow(full_pearson, cmap='Blues', interpolation='nearest')
plt.xticks(tick_marks, df.columns, rotation='vertical')
plt.yticks(tick_marks, df.columns)
plt.colorbar()

### Examine Correlations Among Only the Continuous Numerical Features and Churn

In [None]:
# Restrict the data to the continuous numerical features plus the response
# (Churn), then compute their Pearson correlation matrix.
continuous_and_response = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']
num_df = df[continuous_and_response]
num_df.corr(method='pearson')

In [None]:
# Draw the Pearson correlation matrix of the continuous numerical features
# and the response (Churn) as a heatmap, with a colorbar showing the scale.
numeric_pearson = num_df.corr(method='pearson')

# One tick per column so every row / column of the image can be labelled.
tick_marks = list(range(len(num_df.columns)))

plt.imshow(numeric_pearson, cmap='Blues', interpolation='nearest')
plt.xticks(tick_marks, num_df.columns, rotation='vertical')
plt.yticks(tick_marks, num_df.columns)
plt.colorbar()

### Examine Correlations Among Only PhoneService, MultipleLines and Churn

In [None]:
# Create a Spearman Correlation Matrix of Only PhoneService, MultipleLines and Churn.
phoneline_df = df[['PhoneService', 'MultipleLines', 'Churn']]

# No explicit .rank() call is needed: corr(method='spearman') is the Spearman
# *rank* correlation and ranks the data itself. (A bare phoneline_df.rank()
# only returns a new frame, so calling it without using the result does
# nothing.)
phoneline_df.corr(method='spearman')

In [None]:
# Draw the Spearman correlation matrix of PhoneService, MultipleLines and
# Churn as a heatmap, with a colorbar showing the scale.
phoneline_spearman = phoneline_df.corr(method='spearman')

# One tick per column so every row / column of the image can be labelled.
tick_marks = list(range(len(phoneline_df.columns)))

plt.imshow(phoneline_spearman, cmap='Blues', interpolation='nearest')
plt.xticks(tick_marks, phoneline_df.columns, rotation='vertical')
plt.yticks(tick_marks, phoneline_df.columns)
plt.colorbar()

### Examine Correlations Between the Internet-Related Services and Churn

In [None]:
# Create a Spearman Correlation Matrix of Only the Internet-Related Services and Churn.
intsvcs_df = df[['InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Churn']]

# No explicit .rank() call is needed: corr(method='spearman') is the Spearman
# *rank* correlation and ranks the data itself. (A bare intsvcs_df.rank()
# only returns a new frame, so calling it without using the result does
# nothing.)
intsvcs_df.corr(method='spearman')

In [None]:
# Draw the Spearman correlation matrix of the internet-related services and
# Churn as a heatmap, with a colorbar showing the scale.
intsvcs_spearman = intsvcs_df.corr(method='spearman')

# One tick per column so every row / column of the image can be labelled.
tick_marks = list(range(len(intsvcs_df.columns)))

plt.figure(figsize=(6, 6))
plt.imshow(intsvcs_spearman, cmap='Blues', interpolation='nearest')
plt.xticks(tick_marks, intsvcs_df.columns, rotation='vertical')
plt.yticks(tick_marks, intsvcs_df.columns)
plt.colorbar()

### Examine Correlation Between the Remaining Independent Categorical Features and Churn

In [None]:
# Create a Spearman Correlation Matrix of Only the Remaining Independent Categorical Features and Churn.
ind_df = df[['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Contract',
             'PaperlessBilling', 'PaymentMethod', 'Churn']]

# No explicit .rank() call is needed: corr(method='spearman') is the Spearman
# *rank* correlation and ranks the data itself. (A bare ind_df.rank() only
# returns a new frame, so calling it without using the result does nothing.)
ind_df.corr(method='spearman')

In [None]:
# Draw the Spearman correlation matrix of the remaining independent
# categorical features and Churn as a heatmap, with a colorbar showing the
# scale.
ind_spearman = ind_df.corr(method='spearman')

# One tick per column so every row / column of the image can be labelled.
tick_marks = list(range(len(ind_df.columns)))

plt.figure(figsize=(6, 6))
plt.imshow(ind_spearman, cmap='Blues', interpolation='nearest')
plt.xticks(tick_marks, ind_df.columns, rotation='vertical')
plt.yticks(tick_marks, ind_df.columns)
plt.colorbar()