## Importing and Preparing Source Data
- Download data from the Internet
- Profile the data for initial understanding and to identify flaws
- Address any issues and prepare for Exploratory Data Analysis (EDA)
- Prepare data for Machine Learning model development

### Import Required Libraries

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder

### Load the Data

In [None]:
# Remote location of the Telco Customer Churn CSV and the local folder that
# receives every file this notebook saves.
# NOTE(review): confirm this URL is still being served before relying on it.
source_url = 'https://community.watsonanalytics.com/wp-content/uploads/2015/03/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data_dir = os.path.join(os.getcwd(), 'Data')

# Create the output folder up front; without it the save calls fail with a
# "no such file or directory" error on a fresh checkout.
os.makedirs(data_dir, exist_ok=True)

# Read the source from the Web into a Pandas DataFrame.
df = pd.read_csv(source_url, header=0)

# Save a local copy of the raw source data file.
raw_file = os.path.join(data_dir, 'WA-Telco-Customer-Churn-Raw.csv')
df.to_csv(raw_file)

### Profile the Data

In [None]:
# Preview the opening rows of the DataFrame (5 observations).
df.head(n=5)

In [None]:
# List every column label in the DataFrame so the feature names used in the
# cells below can be checked against what was actually loaded.
df.columns

In [None]:
# Inspect the unique values for each feature: looking for Nulls, NaNs, etc.
# Each entry pairs the label to print with the DataFrame column it describes,
# so every line is formatted the same way ("Label: [values]").
profiled_features = [
    ("Gender", "gender"),
    ("SeniorCitizen", "SeniorCitizen"),
    ("Partner", "Partner"),
    ("Dependents", "Dependents"),
    ("Tenure", "tenure"),
    ("PhoneService", "PhoneService"),
    ("MultipleLines", "MultipleLines"),
    ("InternetService", "InternetService"),
    ("OnlineSecurity", "OnlineSecurity"),
    ("OnlineBackup", "OnlineBackup"),
    ("DeviceProtection", "DeviceProtection"),
    ("TechSupport", "TechSupport"),
    ("StreamingTV", "StreamingTV"),
    ("StreamingMovies", "StreamingMovies"),
    ("Contract", "Contract"),
    ("PaperlessBilling", "PaperlessBilling"),
    ("PaymentMethod", "PaymentMethod"),
    ("MonthlyCharges", "MonthlyCharges"),
    ("TotalCharges", "TotalCharges"),
    ("Churn", "Churn"),
]

print("------------------------------------------------------------")
print("Number of Observations:", len(df))
print("Number of Unique Customers:", len(df['customerID'].unique()))
print("------------------------------------------------------------")
for label, column in profiled_features:
    print(f"{label}:", df[column].unique())

#### Check for Any Null or NaN Observations in the 2 Continuous Numeric Features

In [None]:
# Report whether either continuous feature has missing values.
# A plain isnull() only sees real NaN/None. Entries that are present but not
# numeric (blank strings, for example) would slip through, so each column is
# coerced to numeric first: anything unparseable becomes NaN and is counted.
print(pd.to_numeric(df['MonthlyCharges'], errors='coerce').isnull().any())
print(pd.to_numeric(df['TotalCharges'], errors='coerce').isnull().any())

### Prepare for Exploratory Data Analysis
- Textual Labels Produce More Readable Data Visualizations

In [None]:
# Convert SeniorCitizen to a textual feature.
# Series.replace builds a new column instead of writing strings into the
# existing integer column in place (a dtype-incompatible assignment that
# pandas has deprecated). Values that are already 'Yes'/'No' pass through
# unchanged, so the cell can be re-run safely.
df['SeniorCitizen'] = df['SeniorCitizen'].replace({1: 'Yes', 0: 'No'})

# Replace missing TotalCharges with zero.
# The column is coerced to numeric first so that non-numeric entries become
# NaN and are actually caught by fillna. The result is assigned back to the
# DataFrame because an in-place fillna on `df.TotalCharges` acts on an
# intermediate Series and is not guaranteed to update `df`.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

#### Save the Readable Feature-Set for EDA

In [None]:
# Write the human-readable feature set to an Excel workbook inside data_dir.
dest_file = os.path.join(
    data_dir,
    'WA-Telco-Customer-Churn-EDA.xlsx',
)
df.to_excel(excel_writer=dest_file)

### Prepare for Machine Learning
- Most Machine Learning Algorithms Cannot Operate Directly on Textual Data
- Use LabelEncoding to Convert Textual Features into Numeric Values

In [None]:
# Textual features to convert into integer codes, in the order they are encoded.
encoded_columns = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
]

# Fit one encoder per column and keep it, so the text <-> code mapping of every
# feature stays available afterwards (e.g. label_encoders['Contract'].classes_
# or .inverse_transform). Refitting a single shared encoder would discard all
# mappings except the last one.
label_encoders = {}
for column in encoded_columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Keep the original `le` name bound to the most recently fitted encoder (Churn)
# for any later cell that still refers to it.
le = label_encoders['Churn']

#### Make Appropriate Data Type Assignments.

In [None]:
# Features that hold a small, fixed set of codes are stored as pandas categoricals.
categorical_columns = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
]
for column in categorical_columns:
    df[column] = df[column].astype('category')

# Numeric features: values that cannot be parsed become NaN (errors='coerce'),
# and each column is downcast to the smallest integer/float type that fits.
df['tenure'] = pd.to_numeric(df['tenure'], downcast='integer', errors='coerce')
df['MonthlyCharges'] = pd.to_numeric(df['MonthlyCharges'], downcast='float', errors='coerce')

# Replace NaN with zero for those observations.
# The filled Series is assigned back to the DataFrame: an in-place fillna on
# `df.TotalCharges` is chained assignment, which pandas warns about and which
# does not update `df` when Copy-on-Write is enabled.
df['TotalCharges'] = pd.to_numeric(
    df['TotalCharges'], downcast='float', errors='coerce'
).fillna(0)

# Validate new data type assignments.
df.dtypes

#### Inspect Converted Feature-Set

In [None]:
# Show the closing rows of the converted DataFrame (5 observations).
df.tail(n=5)

#### Save Converted Features for Machine Learning Modeling

In [None]:
# Write the numerically encoded feature set to an Excel workbook inside data_dir.
dest_file = os.path.join(
    data_dir,
    'WA-Telco-Customer-Churn-ML.xlsx',
)
df.to_excel(excel_writer=dest_file)