# One hot encoding and beyond

> We will use the car data to see how ordinal/categorical attributes can be encoded as numerical features.

In [25]:
# Import libraries
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.count import CountEncoder
import pandas as pd

In [45]:
# Load the car data from its CSV file into a DataFrame
car_csv_path = './data/car.data.csv'
df = pd.read_csv(car_csv_path)

# Bare last expression so the notebook renders the frame
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


## Label encoder: assign a number to each category

In [46]:
# Label encoding
feature_list = list(df.columns)
LE_encoder = OrdinalEncoder(feature_list)
Xt = LE_encoder.fit_transform(df)

In [47]:
# Display the ordinal-encoded frame assigned to Xt by the label-encoding cell
Xt

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,1,1,1,1,1,1,1
1,1,1,1,1,1,2,1
2,1,1,1,1,1,3,1
3,1,1,1,1,2,1,1
4,1,1,1,1,2,2,1
...,...,...,...,...,...,...,...
1723,4,4,4,3,2,2,4
1724,4,4,4,3,2,3,3
1725,4,4,4,3,3,1,1
1726,4,4,4,3,3,2,4


## One-hot encoder: expand category values as features

In [48]:
# One-hot encoding
OHE_encoder = OneHotEncoder(feature_list)
Xt = OHE_encoder.fit_transform(df)

In [49]:
# Display the one-hot-encoded frame assigned to Xt by the one-hot-encoding cell
Xt

Unnamed: 0,buying_1,buying_2,buying_3,buying_4,maint_1,maint_2,maint_3,maint_4,doors_1,doors_2,...,lug_boot_1,lug_boot_2,lug_boot_3,safety_1,safety_2,safety_3,class_1,class_2,class_3,class_4
0,1,0,0,0,1,0,0,0,1,0,...,1,0,0,1,0,0,1,0,0,0
1,1,0,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,1,0,0,0
2,1,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,1,1,0,0,0
3,1,0,0,0,1,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,0
4,1,0,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,1
1724,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,1,0
1725,0,0,0,1,0,0,0,1,0,0,...,0,0,1,1,0,0,1,0,0,0
1726,0,0,0,1,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,1


## Count encoder: replace each categorical value with the number of times it appears in the dataset

In [51]:
# Count encoding
CNT_encoder = CountEncoder(feature_list)
Xt = CNT_encoder.fit_transform(df)

In [52]:
# Display the count-encoded frame assigned to Xt by the count-encoding cell
Xt

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,432,432,432,576,576,576,1210
1,432,432,432,576,576,576,1210
2,432,432,432,576,576,576,1210
3,432,432,432,576,576,576,1210
4,432,432,432,576,576,576,1210
...,...,...,...,...,...,...,...
1723,432,432,432,576,576,576,69
1724,432,432,432,576,576,576,65
1725,432,432,432,576,576,576,1210
1726,432,432,432,576,576,576,69
