In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
   
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer # melakukan transformasi (fit transform = transformer)
import category_encoders as ce

import warnings

# Silence every warning for the rest of the notebook.
warnings.filterwarnings("ignore")

# `sklearn.utils.testing` was deprecated in scikit-learn 0.22 and removed in 0.24;
# the helper now lives in `sklearn.utils._testing`. Try the current location first
# and fall back to the legacy one so the cell runs on both old and new releases.
try:
    from sklearn.utils._testing import ignore_warnings
except ImportError:  # scikit-learn < 0.22
    from sklearn.utils.testing import ignore_warnings

# <center>Encoding</center>

Terdapat 3 jenis encoding yang digunakan, yaitu :
1. One Hot Encoding
2. Ordinal Encoding
3. Binary Encoding

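Encoding perlu dilakukan karena sebagian besar model machine learning hanya menerima input numerik, sehingga model tidak dapat dilatih maupun melakukan prediksi apabila data masih berbentuk kategorikal.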

## One Hot Encoding

In [2]:
# Load seaborn's example "tips" dataset and show it.
tips = sns.load_dataset("tips")
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# Each ColumnTransformer step is a tuple of (step name, transformer, feature names).
# Here a single step one-hot encodes the four categorical columns.
transformer = ColumnTransformer(
    transformers=[
        ("encoder", OneHotEncoder(), ["sex", "smoker", "day", "time"]),
    ]
)

In [4]:
# Show the configured transformer (it has not been fitted yet at this point).
transformer

ColumnTransformer(transformers=[('encoder', OneHotEncoder(),
                                 ['sex', 'smoker', 'day', 'time'])])

In [5]:
# Fit the transformer on the dataset and encode the selected columns in one step.
tips_encoded = transformer.fit_transform(X=tips)
tips_encoded

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [6]:
# The transformer returns a plain array, so the result carries no column names yet.
# Wrap it in a DataFrame (cast to int, since the indicators are only 0/1) and reuse
# the row labels of `tips` so a later column-wise concat lines up row for row.
tips_encoded = pd.DataFrame(tips_encoded, index=tips.index).astype(int)
tips_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,0,1,0,0,0,1,0,1,0
1,0,1,1,0,0,0,1,0,1,0
2,0,1,1,0,0,0,1,0,1,0
3,0,1,1,0,0,0,1,0,1,0
4,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
239,0,1,1,0,0,1,0,0,1,0
240,1,0,0,1,0,1,0,0,1,0
241,0,1,0,1,0,1,0,0,1,0
242,0,1,1,0,0,1,0,0,1,0


In [7]:
# List the names of the encoded output columns.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out()`; use the new method when it exists and fall
# back to the old one on earlier releases.
get_names = getattr(transformer, "get_feature_names_out", None) or transformer.get_feature_names
list(get_names())

['encoder__x0_Female',
 'encoder__x0_Male',
 'encoder__x1_No',
 'encoder__x1_Yes',
 'encoder__x2_Fri',
 'encoder__x2_Sat',
 'encoder__x2_Sun',
 'encoder__x2_Thur',
 'encoder__x3_Dinner',
 'encoder__x3_Lunch']

In [8]:
# Attach the encoder's output names as column labels.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` when available and fall back on older releases.
get_names = getattr(transformer, "get_feature_names_out", None) or transformer.get_feature_names
tips_encoded.columns = list(get_names())
tips_encoded

Unnamed: 0,encoder__x0_Female,encoder__x0_Male,encoder__x1_No,encoder__x1_Yes,encoder__x2_Fri,encoder__x2_Sat,encoder__x2_Sun,encoder__x2_Thur,encoder__x3_Dinner,encoder__x3_Lunch
0,1,0,1,0,0,0,1,0,1,0
1,0,1,1,0,0,0,1,0,1,0
2,0,1,1,0,0,0,1,0,1,0
3,0,1,1,0,0,0,1,0,1,0
4,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
239,0,1,1,0,0,1,0,0,1,0
240,1,0,0,1,0,1,0,0,1,0
241,0,1,0,1,0,1,0,0,1,0
242,0,1,1,0,0,1,0,0,1,0


In [9]:
# Put the untouched numeric features in front of the one-hot columns.
# `tips_encoded` is rebuilt from itself, so drop any numeric columns a previous run
# of this cell already added; otherwise re-running would duplicate them.
passthrough_cols = ['total_bill', 'tip', 'size']
tips_encoded = pd.concat(
    [tips[passthrough_cols], tips_encoded.drop(columns=passthrough_cols, errors='ignore')],
    axis=1,
)
tips_encoded

Unnamed: 0,total_bill,tip,size,encoder__x0_Female,encoder__x0_Male,encoder__x1_No,encoder__x1_Yes,encoder__x2_Fri,encoder__x2_Sat,encoder__x2_Sun,encoder__x2_Thur,encoder__x3_Dinner,encoder__x3_Lunch
0,16.99,1.01,2,1,0,1,0,0,0,1,0,1,0
1,10.34,1.66,3,0,1,1,0,0,0,1,0,1,0
2,21.01,3.50,3,0,1,1,0,0,0,1,0,1,0
3,23.68,3.31,2,0,1,1,0,0,0,1,0,1,0
4,24.59,3.61,4,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,0,1,1,0,0,1,0,0,1,0
240,27.18,2.00,2,1,0,0,1,0,1,0,0,1,0
241,22.67,2.00,2,0,1,0,1,0,1,0,0,1,0
242,17.82,1.75,2,0,1,1,0,0,1,0,0,1,0


## Ordinal Encoding

In [10]:
# Work on an independent copy so the original `tips` frame stays unchanged.
tips_ordinal = tips.copy(deep=True)

In [11]:
# Count how many rows fall on each day.
tips_ordinal.loc[:, 'day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [12]:
# Replace each day label with its rank (Thur=1 ... Sun=4).
# Map from the untouched `tips` frame rather than from `tips_ordinal` itself: mapping
# the already-encoded column a second time would find no matching keys and turn every
# value into NaN, so the original cell could not be re-run safely.
# `day` is a categorical column, and mapping a categorical yields a categorical again,
# so cast to int to get a genuinely numeric feature.
tips_ordinal['day'] = tips['day'].map({'Thur': 1, 'Fri': 2, 'Sat': 3, 'Sun': 4}).astype(int)

In [13]:
# Inspect the frame after `day` has been replaced by its numeric rank.
tips_ordinal

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,4,Dinner,2
1,10.34,1.66,Male,No,4,Dinner,3
2,21.01,3.50,Male,No,4,Dinner,3
3,23.68,3.31,Male,No,4,Dinner,2
4,24.59,3.61,Female,No,4,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,3,Dinner,3
240,27.18,2.00,Female,Yes,3,Dinner,2
241,22.67,2.00,Male,Yes,3,Dinner,2
242,17.82,1.75,Male,No,3,Dinner,2


In [14]:
# The encoder takes a list of dicts, one per column: 'col' names the column and
# 'mapping' gives the number assigned to each value. None (missing) is encoded as 0.
ordinal_mapping = [
    dict(
        col='day',
        mapping={None: 0, 'Fri': 2, 'Thur': 1, 'Sat': 3, 'Sun': 4},
    ),
]

In [15]:
# Build the encoder from the explicit mapping, then fit and transform the `day` column.
ordinal_encoder = ce.OrdinalEncoder(mapping=ordinal_mapping)
df_ord = ordinal_encoder.fit_transform(tips.loc[:, 'day'])
df_ord

Unnamed: 0,day
0,4
1,4
2,4
3,4
4,4
...,...
239,3
240,3
241,3
242,3


Jika ingin melakukan ordinal encoding pada lebih dari 1 kolom, hal tersebut dapat dilakukan dengan cara berikut:

In [16]:
# One mapping dict per column (list-of-dicts format); None (missing) is encoded as 0.
ordinal_mapping = [
    dict(col=column, mapping=ranks)
    for column, ranks in (
        ('day', {None: 0, 'Fri': 2, 'Thur': 1, 'Sat': 3, 'Sun': 4}),
        ('time', {None: 0, 'Lunch': 1, 'Dinner': 2}),
    )
]

In [17]:
# Encode both mapped columns in a single pass.
ordinal_encoder = ce.OrdinalEncoder(mapping=ordinal_mapping)
df_ord = ordinal_encoder.fit_transform(tips.loc[:, ['day', 'time']])

In [18]:
# Preview the first rows of the encoded `day` and `time` columns.
df_ord.head()

Unnamed: 0,day,time
0,4,2
1,4,2
2,4,2
3,4,2
4,4,2


In [19]:
# Keep every column that was not encoded and append the encoded `day`/`time`
# columns on the right.
tips_ordinal_encoded = pd.concat(
    [tips.drop(columns=['day', 'time']), df_ord],
    axis='columns',
)
tips_ordinal_encoded

Unnamed: 0,total_bill,tip,sex,smoker,size,day,time
0,16.99,1.01,Female,No,2,4,2
1,10.34,1.66,Male,No,3,4,2
2,21.01,3.50,Male,No,3,4,2
3,23.68,3.31,Male,No,2,4,2
4,24.59,3.61,Female,No,4,4,2
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,3,3,2
240,27.18,2.00,Female,Yes,2,3,2
241,22.67,2.00,Male,Yes,2,3,2
242,17.82,1.75,Male,No,2,3,2


Nilai pada feature day dan time telah berubah dari label kategori menjadi angka ordinal (numerik).

## Binary Encoding

In [20]:
# Look at the raw `day` column before binary-encoding it.
tips['day']

0       Sun
1       Sun
2       Sun
3       Sun
4       Sun
       ... 
239     Sat
240     Sat
241     Sat
242     Sat
243    Thur
Name: day, Length: 244, dtype: category
Categories (4, object): [Thur, Fri, Sat, Sun]

In [21]:
# Binary-encode `day`: the result is a set of 0/1 columns named day_0, day_1, ...
binary_encoder = ce.BinaryEncoder(cols=['day'])
df_bin = binary_encoder.fit_transform(tips.loc[:, 'day'])
df_bin

Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0
