In [7]:
# prepare a data

import numpy as np
import pandas as pd

df = pd.DataFrame({'temp': ['Hot','Cold','Very Hot','Warm','Hot','Warm','Warm','Hot','Hot','Cold'],
                   'color': ['Red','Yellow','Blue','Blue','Red','Yellow','Red','Yellow','Yellow','Yellow'],
                   'target': [1,1,1,0,1,0,1,0,1,1]
                  }, columns = ['temp','color','target'])
df

Unnamed: 0,temp,color,target
0,Hot,Red,1
1,Cold,Yellow,1
2,Very Hot,Blue,1
3,Warm,Blue,0
4,Hot,Red,1
5,Warm,Yellow,0
6,Warm,Red,1
7,Hot,Yellow,0
8,Hot,Yellow,1
9,Cold,Yellow,1


In [6]:
# one-hot encoding

onehot = pd.get_dummies(df, prefix=['temp'], columns=['temp'])
onehot

Unnamed: 0,color,target,temp_Cold,temp_Hot,temp_Very Hot,temp_Warm
0,Red,1,0,1,0,0
1,Yellow,1,1,0,0,0
2,Blue,1,0,0,1,0
3,Blue,0,0,0,0,1
4,Red,1,0,1,0,0
5,Yellow,0,0,0,0,1
6,Red,1,0,0,0,1
7,Yellow,0,0,1,0,0
8,Yellow,1,0,1,0,0
9,Yellow,1,1,0,0,0


In [9]:
# label encoding
# each category is assigned a value from 1 through n where n is the number of categories for the feature

from sklearn.preprocessing import LabelEncoder
df['temp_new'] = LabelEncoder().fit_transform(df.temp)
df

Unnamed: 0,temp,color,target,temp_new
0,Hot,Red,1,1
1,Cold,Yellow,1,0
2,Very Hot,Blue,1,2
3,Warm,Blue,0,3
4,Hot,Red,1,1
5,Warm,Yellow,0,3
6,Warm,Red,1,3
7,Hot,Yellow,0,1
8,Hot,Yellow,1,1
9,Cold,Yellow,1,0


In [11]:
# ordinal encoding
# we do Ordinal encoding to ensure the encoding of variables retains the ordinal nature of the variable
# this is reasonable only for ordinal variables

temp_dict = {'Cold': 0,
             'Warm': 1,
             'Hot': 2,
             'Very Hot': 3}
df['temp_ordered'] = df.temp.map(temp_dict)
df

Unnamed: 0,temp,color,target,temp_new,temp_ordered
0,Hot,Red,1,1,2
1,Cold,Yellow,1,0,0
2,Very Hot,Blue,1,2,3
3,Warm,Blue,0,3,1
4,Hot,Red,1,1,2
5,Warm,Yellow,0,3,1
6,Warm,Red,1,3,1
7,Hot,Yellow,0,1,2
8,Hot,Yellow,1,1,2
9,Cold,Yellow,1,0,0


In [13]:
# helmert encoding
# the mean of the dependent variable for a level is compared to the mean of the dependent variable over all previous levels

import category_encoders as ce
encoder = ce.HelmertEncoder(cols=['temp'], drop_invariant=True)
temp_helmert = encoder.fit_transform(df['temp'])
df = pd.concat([df, temp_helmert], axis=1)
df

Unnamed: 0,temp,color,target,temp_new,temp_ordered,temp_0,temp_1,temp_2
0,Hot,Red,1,1,2,-1.0,-1.0,-1.0
1,Cold,Yellow,1,0,0,1.0,-1.0,-1.0
2,Very Hot,Blue,1,2,3,0.0,2.0,-1.0
3,Warm,Blue,0,3,1,0.0,0.0,3.0
4,Hot,Red,1,1,2,-1.0,-1.0,-1.0
5,Warm,Yellow,0,3,1,0.0,0.0,3.0
6,Warm,Red,1,3,1,0.0,0.0,3.0
7,Hot,Yellow,0,1,2,-1.0,-1.0,-1.0
8,Hot,Yellow,1,1,2,-1.0,-1.0,-1.0
9,Cold,Yellow,1,0,0,1.0,-1.0,-1.0


In [15]:
# binary encoding
# binary encoding converts a category into binary digits. each binary digit creates one feature column.
# compared to One Hot Encoding, this will require fewer feature columns

# steps:
    # the categories are first converted to numeric order starting from 1 (no ordinal nature)
    # then those integers are converted into binary code. 3 becomes 011, 4 becomes 100
    # the digits of the binary number form separate columns
    
import category_encoders as ce
encoder = ce.BinaryEncoder(cols=['temp'])
temp_bin = encoder.fit_transform(df['temp'])
df = pd.concat([df,temp_bin], axis=1)
df

Unnamed: 0,temp,color,target,temp_new,temp_ordered,temp_0,temp_1,temp_2,temp_0.1,temp_1.1,temp_2.1
0,Hot,Red,1,1,2,-1.0,-1.0,-1.0,0,0,1
1,Cold,Yellow,1,0,0,1.0,-1.0,-1.0,0,1,0
2,Very Hot,Blue,1,2,3,0.0,2.0,-1.0,0,1,1
3,Warm,Blue,0,3,1,0.0,0.0,3.0,1,0,0
4,Hot,Red,1,1,2,-1.0,-1.0,-1.0,0,0,1
5,Warm,Yellow,0,3,1,0.0,0.0,3.0,1,0,0
6,Warm,Red,1,3,1,0.0,0.0,3.0,1,0,0
7,Hot,Yellow,0,1,2,-1.0,-1.0,-1.0,0,0,1
8,Hot,Yellow,1,1,2,-1.0,-1.0,-1.0,0,0,1
9,Cold,Yellow,1,0,0,1.0,-1.0,-1.0,0,1,0


In [21]:
# frequency encoding
# it is a way to utilize the frequency of labels

freq = df.groupby('temp').size()/len(df)
df['temp_freq'] = df.temp.map(freq)
df

Unnamed: 0,temp,color,target,temp_new,temp_ordered,temp_0,temp_1,temp_2,temp_0.1,temp_1.1,temp_2.1,temp_freq
0,Hot,Red,1,1,2,-1.0,-1.0,-1.0,0,0,1,0.4
1,Cold,Yellow,1,0,0,1.0,-1.0,-1.0,0,1,0,0.2
2,Very Hot,Blue,1,2,3,0.0,2.0,-1.0,0,1,1,0.1
3,Warm,Blue,0,3,1,0.0,0.0,3.0,1,0,0,0.3
4,Hot,Red,1,1,2,-1.0,-1.0,-1.0,0,0,1,0.4
5,Warm,Yellow,0,3,1,0.0,0.0,3.0,1,0,0,0.3
6,Warm,Red,1,3,1,0.0,0.0,3.0,1,0,0,0.3
7,Hot,Yellow,0,1,2,-1.0,-1.0,-1.0,0,0,1,0.4
8,Hot,Yellow,1,1,2,-1.0,-1.0,-1.0,0,0,1,0.4
9,Cold,Yellow,1,0,0,1.0,-1.0,-1.0,0,1,0,0.2


In [23]:
# mean encoding
# mean encoding is similar to label encoding, except here labels are correlated directly with the target
# for example, in mean target encoding for each category in the feature label is decided with the mean value of the 
# target variable on a training data

# steps:
    # select a categorical variable 
    # groupby that categorical variable and get the aggregated sum over the target
    # groupby that categorical variable and get the aggregated count over the target
    # divide step2 by step3
    
mean_encode = df.groupby('temp')['target'].mean()
df['temp_mean'] = df.temp.map(mean_encode)
df

Unnamed: 0,temp,color,target,temp_new,temp_ordered,temp_0,temp_1,temp_2,temp_0.1,temp_1.1,temp_2.1,temp_freq,temp_mean
0,Hot,Red,1,1,2,-1.0,-1.0,-1.0,0,0,1,0.4,0.75
1,Cold,Yellow,1,0,0,1.0,-1.0,-1.0,0,1,0,0.2,1.0
2,Very Hot,Blue,1,2,3,0.0,2.0,-1.0,0,1,1,0.1,1.0
3,Warm,Blue,0,3,1,0.0,0.0,3.0,1,0,0,0.3,0.333333
4,Hot,Red,1,1,2,-1.0,-1.0,-1.0,0,0,1,0.4,0.75
5,Warm,Yellow,0,3,1,0.0,0.0,3.0,1,0,0,0.3,0.333333
6,Warm,Red,1,3,1,0.0,0.0,3.0,1,0,0,0.3,0.333333
7,Hot,Yellow,0,1,2,-1.0,-1.0,-1.0,0,0,1,0.4,0.75
8,Hot,Yellow,1,1,2,-1.0,-1.0,-1.0,0,0,1,0.4,0.75
9,Cold,Yellow,1,0,0,1.0,-1.0,-1.0,0,1,0,0.2,1.0


In [24]:
# weight of evidence encoding
# Weight of Evidence (WoE) is a measure of the “strength” of a grouping technique to separate good and bad.
# Weight of evidence (WOE) is a measure of how much the evidence supports or undermines a hypothesis.

woe = df.groupby('temp')['target'].mean()
woe = pd.DataFrame(woe)
woe = woe.rename(columns = {'target':'good'})
woe['bad'] = 1-woe
woe['bad'] = np.where(woe['bad'] == 0, 0.000001, woe['bad'])
woe['woe'] = np.log(woe.good / woe.bad)
df['temp_woe'] = df.temp.map(woe['woe'])
df