## Feature Engineering Techniques
- Imputing Missing Values
- Encoding Categorical Features
- Scaling Numerical Features
- Credibilty Estimates
- DateTime Parsing: Generalization & Discritization (Binning)

### Import the Required Libraries

In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime, date
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler

### Define Global Functions

In [2]:
def set_datatypes(dframe):
    for col in dframe.columns:
        if dframe[col].dtypes.str.startswith('int'):
            dframe[col] = pd.to_numeric(dframe[col],
                                        downcast='integer',
                                        errors='coerce')
            
        elif dframe[col].dtypes.str.startswith('float'):
            dframe[col] = pd.to_numeric(dframe[col],
                                        downcast='float',
                                        errors='coerce')
            
        elif dframe[col].dtypes == 'object':
            dframe[col] = dframe[col].astype('category')
        
    return dframe

### Load the Sample DataSet

In [3]:
# Read the cleaned source file into a DataFrame.
data_dir = os.path.join(os.getcwd(), 'Data')
source_file = os.path.join(data_dir, 'WA-Telco-Customer-Churn-EDA.csv')
df = pd.read_csv(source_file, header=0, index_col=0)

# Drop the Unique Identifier
df = df.drop(labels=['Customerid'], axis=1)

# Make sure the index values are seed=1, increment=1
df.reset_index(drop=True, inplace=True)

In [4]:
# Make the data type conversions needed for transformation
df = set_datatypes(df)

# Inspect the dataframe to determine the datatype of each feature.
df.dtypes

Gender              category
SeniorCitizen       category
Partner             category
Dependents          category
Tenure                 int64
PhoneService        category
MultipleLines       category
InternetService     category
OnlineSecurity      category
OnlineBackup        category
DeviceProtection    category
TechSupport         category
StreamingTV         category
StreamingMovies     category
Contract            category
PaperlessBilling    category
PaymentMethod       category
MonthlyCharges       float64
TotalCharges         float64
Churn               category
dtype: object

In [5]:
# Inspect a few rows from the top
df.head()

Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Encoding Categorical Features:
- Label Encoding - Convert each unique value into a numerical equivilant
- One-Hot Encoding - Creates 'Dummy' Features
- Ordinal Encoding - Convert to numerical values for Ordinal Categorical features

#### Include Only the Categorical Features

In [6]:
# Include only the Categorical features.
df_categorical = df.drop(labels=['Tenure','MonthlyCharges','TotalCharges'], axis=1)

#### Label Encoding

In [7]:
label_encoder = LabelEncoder()

for col in df_categorical.columns:
    df_categorical[col] = label_encoder.fit_transform(df_categorical[col])
    
df_categorical.head()

Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,0,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,0
1,1,0,0,0,1,0,0,2,0,2,0,0,0,1,0,3,0
2,1,0,0,0,1,0,0,2,2,0,0,0,0,0,1,3,1
3,1,0,0,0,0,1,0,2,0,2,2,0,0,1,0,0,0
4,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,1


#### One-Hot Encoding

In [8]:
print(df.Gender.unique())
print('--------------------------------------')
print(df_categorical.Gender.unique())

[Female, Male]
Categories (2, object): [Female, Male]
--------------------------------------
[0 1]


In [9]:
onehot_encoder = OneHotEncoder(sparse=False)
df_gender = np.array(df_categorical.Gender).reshape(len(df_categorical.Gender), 1)

onehot_encoded = onehot_encoder.fit_transform(df_gender)
print(onehot_encoded)

[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [0. 1.]
 [0. 1.]]


#### Ordinal Encoding

In [10]:
# There is a logical order to the 'Contract' feature.
print(df.Contract.unique())

[Month-to-month, One year, Two year]
Categories (3, object): [Month-to-month, One year, Two year]


In [11]:
ordinal_encoder = OrdinalEncoder()
df_contract = np.array(df.Contract).reshape(len(df.Contract), 1)

ordinal_encoded = ordinal_encoder.fit_transform(df_contract)
print(ordinal_encoded)

[[0.]
 [1.]
 [0.]
 ...
 [0.]
 [0.]
 [2.]]


### Scaling Numerical Features:
- Standardization: Redistribute so Mean = 0 and Standard Deviation = 1
- Normalization: Redistribute so its Range is between Zero and One (0 - 1)

#### Include Only the Numerical Features

In [12]:
df_numerical = df[['Tenure','MonthlyCharges','TotalCharges']]

#### Standardize: Conforms to a Normal Distribution

In [13]:
standard_scaler = StandardScaler()
df_tenure = np.array(df_numerical.Tenure).reshape(len(df_numerical.Tenure), 1)
scaler = standard_scaler.fit(df_tenure)

In [14]:
print("Mean: ", scaler.mean_)
print("Scale: ", scaler.scale_)

Mean:  [32.37114866]
Scale:  [24.55773742]


In [15]:
numerical_scaled = scaler.transform(df_tenure)
print(numerical_scaled)

[[-1.27744458]
 [ 0.06632742]
 [-1.23672422]
 ...
 [-0.87024095]
 [-1.15528349]
 [ 1.36937906]]


In [16]:
print("Mean: ", numerical_scaled.mean(axis=0).round())
print("StDev: ", numerical_scaled.std(axis=0))

Mean:  [-0.]
StDev:  [1.]


#### Normalize: Fits into a Range (e.g., 0 to 1)

In [17]:
min_max_scaler = MinMaxScaler()
numerical_minmax = min_max_scaler.fit_transform(df_tenure)
numerical_minmax

array([[0.01388889],
       [0.47222222],
       [0.02777778],
       ...,
       [0.15277778],
       [0.05555556],
       [0.91666667]])

In [18]:
print(min_max_scaler.scale_)

[0.01388889]


In [19]:
print(min_max_scaler.min_)

[0.]


### Credibility Estimates
**count_k * (y_bar_k - y_bar_target)**  where y_bar is the sample mean of the distribution

In [20]:
# The values in the InternetService column are imbalanced
df.InternetService.value_counts()

Fiber optic    3096
DSL            2421
No             1526
Name: InternetService, dtype: int64

In [21]:
def credibility_estimate(X, k, y, y_true):
    count_k = X.value_counts(0)[k]
    y_bar_k = X.value_counts(1)[k]
    y_bar_target = y.value_counts(1)[y_true]
    
    y_cred_k = count_k * (y_bar_k - y_bar_target)
    
    return y_cred_k.round()      

In [22]:
X = df.InternetService
y = df.Churn

for k in df.InternetService.unique():
    print(credibility_estimate(X, k, y, 'Yes'))

190.0
539.0
-74.0


#### Create a Column Containing the Credibility Estimate

In [23]:
def get_credibilityEstimate(X, y, y_true):
    df = pd.DataFrame(X)
    y_cred_k = 0
    
    for k in X.unique():
        count_k = X.value_counts(0)[k]
        y_bar_k = X.value_counts(1)[k]
        y_bar_target = y.value_counts(1)[y_true]
        y_cred_k = count_k * (y_bar_k - y_bar_target)
    
        df.loc[X == k, ('Credibility')] = y_cred_k.round()
    
    return df.Credibility  

In [24]:
df_copy = df

col_name = 'InternetService'
pos = df.columns.get_loc(col_name) + 1
new_col_name = col_name + '_CredEst'

if new_col_name in df_copy.columns:
    df_copy.drop(labels=[new_col_name], axis=1, inplace=True)

df_copy.insert(loc=pos, column=new_col_name, value=get_credibilityEstimate(X, y, 'Yes'))
df_copy.head()

Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,InternetService_CredEst,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,190.0,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,190.0,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,190.0,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,190.0,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,539.0,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### DateTime Parsing:
- Derived Features
- Generalization
- Discretization (Binning)

In [25]:
# Read sample data into a DataFrame.
data_dir = os.path.join(os.getcwd(), 'Data')
source_file = os.path.join(data_dir, 'DateTime-Parsing-Demo.csv')
df_dates = pd.read_csv(source_file, header=0)

In [26]:
df_dates.head()

Unnamed: 0,Has_QSpecs,Weight,InspectionDate
0,False,10.66,2/16/2012
1,False,0.1,4/25/2012
2,False,26.0,11/28/2012
3,False,0.0,5/6/2012
4,False,0.4,3/23/2012


In [27]:
# We can see that the InspectionDate feature isn't typed as a DateTime
df_dates.dtypes

Has_QSpecs           bool
Weight            float64
InspectionDate     object
dtype: object

In [28]:
# So, first we need to convert the feature to the correct data type
df_dates.InspectionDate = pd.to_datetime(df_dates['InspectionDate'])
df_dates

Unnamed: 0,Has_QSpecs,Weight,InspectionDate
0,False,10.66,2012-02-16
1,False,0.10,2012-04-25
2,False,26.00,2012-11-28
3,False,0.00,2012-05-06
4,False,0.40,2012-03-23
...,...,...,...
9995,False,680.00,2012-07-26
9996,False,0.05,2012-01-17
9997,False,180.00,2012-04-13
9998,False,0.26,2012-01-13


In [29]:
df_dates.dtypes

Has_QSpecs                  bool
Weight                   float64
InspectionDate    datetime64[ns]
dtype: object

### Derived Features
- DateTime Values are Very Rarely Influencial in a ML Model:
- **Get the Month:** Events and Trends Frequently Occur Seasonaly
- **Get the Year:** Events and Trends Often Occur Annually

In [30]:
def get_monthAndYear(dateColumn):
    df = pd.DataFrame(dateColumn)
    df['Month'] = pd.DatetimeIndex(dateColumn).month
    df['MonthName'] = pd.DatetimeIndex(dateColumn).month_name()
    df['Year'] = pd.DatetimeIndex(dateColumn).year
    
    return df.Month, df.MonthName, df.Year

In [31]:
df_dates['InspectionMonth'], df_dates['InspectionMonthName'], df_dates['InspectionYear'] = get_monthAndYear(df_dates.InspectionDate)
df_dates

Unnamed: 0,Has_QSpecs,Weight,InspectionDate,InspectionMonth,InspectionMonthName,InspectionYear
0,False,10.66,2012-02-16,2,February,2012
1,False,0.10,2012-04-25,4,April,2012
2,False,26.00,2012-11-28,11,November,2012
3,False,0.00,2012-05-06,5,May,2012
4,False,0.40,2012-03-23,3,March,2012
...,...,...,...,...,...,...
9995,False,680.00,2012-07-26,7,July,2012
9996,False,0.05,2012-01-17,1,January,2012
9997,False,180.00,2012-04-13,4,April,2012
9998,False,0.26,2012-01-13,1,January,2012


### Generalization:
- Intervals of Time (e.g., Age) are Usually More Influential Than Dates

In [32]:
def get_age(dateColumn):
    df = pd.DataFrame(dateColumn)
    df['Today'] = pd.to_datetime(date.today(), errors='coerce')  
    df['Age'] = abs((dateColumn - df.Today)/np.timedelta64(1,'Y'))
    df.Age = pd.to_numeric(df.Age.round(0), downcast='integer', errors='coerce')
    
    return df.Age

In [33]:
df_dates['InspectionAge'] = get_age(df_dates.InspectionDate)
df_dates.head()

Unnamed: 0,Has_QSpecs,Weight,InspectionDate,InspectionMonth,InspectionMonthName,InspectionYear,InspectionAge
0,False,10.66,2012-02-16,2,February,2012,9
1,False,0.1,2012-04-25,4,April,2012,9
2,False,26.0,2012-11-28,11,November,2012,8
3,False,0.0,2012-05-06,5,May,2012,9
4,False,0.4,2012-03-23,3,March,2012,9


### Discretization (Binning):
- Age Groups Often Produce Better Results than the Age.

In [34]:
def get_ageGroup(dateOfBirthColumn):
    df = pd.DataFrame(dateOfBirthColumn)
    df['Today'] = pd.to_datetime(date.today(), errors='coerce')  
    df['Age'] = abs((dateOfBirthColumn - df.Today)/np.timedelta64(1,'Y'))
    df.Age = pd.to_numeric(df.Age.round(0), downcast='integer', errors='coerce')
    
    bins = [0,13,20,30,40,50,60,110]
    labels = ['Ones','Teens','20s','30s','40s','50s','60+']
    df['AgeGroup'] = pd.cut(df.Age, bins=bins, labels=labels, right=False)
    df.AgeGroup = df.AgeGroup.cat.add_categories('unknown').fillna('unknown')
    df.AgeGroup = df.AgeGroup.astype('object')
    
    return df.AgeGroup

In [35]:
df_dates['AgeGroup'] = get_ageGroup(df_dates.InspectionDate)
df_dates.head()

Unnamed: 0,Has_QSpecs,Weight,InspectionDate,InspectionMonth,InspectionMonthName,InspectionYear,InspectionAge,AgeGroup
0,False,10.66,2012-02-16,2,February,2012,9,Ones
1,False,0.1,2012-04-25,4,April,2012,9,Ones
2,False,26.0,2012-11-28,11,November,2012,8,Ones
3,False,0.0,2012-05-06,5,May,2012,9,Ones
4,False,0.4,2012-03-23,3,March,2012,9,Ones
