# Encoding

## One Hot Encoding : Pandas Get Dummies

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load seaborn's bundled "tips" example dataset and display it.
tips = sns.load_dataset('tips')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
# Print how often each category occurs, one categorical column at a time.
categorical_columns = ['sex', 'smoker', 'day', 'time']
for column_name in categorical_columns:
    print(tips[column_name].value_counts())

Male      157
Female     87
Name: sex, dtype: int64
No     151
Yes     93
Name: smoker, dtype: int64
Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64
Dinner    176
Lunch      68
Name: time, dtype: int64


In [5]:
# One-hot encode the four categorical columns with pandas; the columns that
# are not listed are carried over as they are.
tips_dummy = pd.get_dummies(
    data=tips,
    columns=['sex', 'smoker', 'day', 'time'],
)
tips_dummy

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,2,0,1,0,1,0,0,0,1,0,1
1,10.34,1.66,3,1,0,0,1,0,0,0,1,0,1
2,21.01,3.50,3,1,0,0,1,0,0,0,1,0,1
3,23.68,3.31,2,1,0,0,1,0,0,0,1,0,1
4,24.59,3.61,4,0,1,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,1,0,0,1,0,0,1,0,0,1
240,27.18,2.00,2,0,1,1,0,0,0,1,0,0,1
241,22.67,2.00,2,1,0,1,0,0,0,1,0,0,1
242,17.82,1.75,2,1,0,0,1,0,0,1,0,0,1


## One Hot Encoding : Scikit Learn (OneHotEncoder)

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [7]:
# Column-wise preprocessing with a single step: one-hot encode the four
# categorical columns. No `remainder` is given, so the other columns are not
# part of the transformer's output.
categorical_onehot = ('encoder', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
tranformer = ColumnTransformer(transformers=[categorical_onehot])

In [8]:
# Fit the encoder on the full table, wrap the resulting array in a DataFrame
# (columns are still unnamed at this point) and cast the indicators to int.
encoded_array = tranformer.fit_transform(tips)
tips_encoded = pd.DataFrame(encoded_array).astype(int)
tips_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,0,1,0,0,0,1,0,1,0
1,0,1,1,0,0,0,1,0,1,0
2,0,1,1,0,0,0,1,0,1,0
3,0,1,1,0,0,0,1,0,1,0
4,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
239,0,1,1,0,0,1,0,0,1,0
240,1,0,0,1,0,1,0,0,1,0
241,0,1,0,1,0,1,0,0,1,0
242,0,1,1,0,0,1,0,0,1,0


### Notes :
This method does not name the output columns automatically; you have to retrieve the feature names separately. However, later in the course you will see that the preprocessing tools from scikit-learn make the whole modeling workflow easier, in particular for cross-validation and hyperparameter tuning.

In [9]:
# List the names generated for the encoded columns.
# NOTE(review): newer scikit-learn releases replace `get_feature_names` with
# `get_feature_names_out` — confirm against the installed version.
tranformer.get_feature_names()

['encoder__x0_Female',
 'encoder__x0_Male',
 'encoder__x1_No',
 'encoder__x1_Yes',
 'encoder__x2_Fri',
 'encoder__x2_Sat',
 'encoder__x2_Sun',
 'encoder__x2_Thur',
 'encoder__x3_Dinner',
 'encoder__x3_Lunch']

In [10]:
# Rebuild the encoded indicator columns from the already-fitted transformer
# instead of renaming `tips_encoded` in place. The original cell overwrote
# `tips_encoded` with the 13-column result, so running it a second time failed
# when assigning 10 names to 13 columns; this version can be re-run safely.
encoded_columns = pd.DataFrame(
    tranformer.transform(tips),
    columns=tranformer.get_feature_names(),
    index=tips.index,
).astype(int)

# Untouched numeric columns first, then the one-hot indicators.
tips_encoded = pd.concat([tips[['total_bill', 'tip', 'size']], encoded_columns], axis=1)
tips_encoded

Unnamed: 0,total_bill,tip,size,encoder__x0_Female,encoder__x0_Male,encoder__x1_No,encoder__x1_Yes,encoder__x2_Fri,encoder__x2_Sat,encoder__x2_Sun,encoder__x2_Thur,encoder__x3_Dinner,encoder__x3_Lunch
0,16.99,1.01,2,1,0,1,0,0,0,1,0,1,0
1,10.34,1.66,3,0,1,1,0,0,0,1,0,1,0
2,21.01,3.50,3,0,1,1,0,0,0,1,0,1,0
3,23.68,3.31,2,0,1,1,0,0,0,1,0,1,0
4,24.59,3.61,4,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,0,1,1,0,0,1,0,0,1,0
240,27.18,2.00,2,1,0,0,1,0,1,0,0,1,0
241,22.67,2.00,2,0,1,0,1,0,1,0,0,1,0
242,17.82,1.75,2,0,1,1,0,0,1,0,0,1,0


## Ordinal Encoding : Using .map Function

In [11]:
# Work on a copy so the original `tips` frame is not modified, then check the
# category counts of `day` before encoding it.
tips_ordinal_encoded = tips.copy()
tips_ordinal_encoded['day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [12]:
# Ordinal codes following the order Thur < Fri < Sat < Sun.
day_order = {'Thur' : 1, 'Fri' : 2, 'Sat' : 3, 'Sun' : 4}

# Map from the untouched `tips['day']` labels rather than from the column that
# is being overwritten: re-running the original cell mapped the already-encoded
# values, which are not keys of the mapping, and turned the column into NaN.
tips_ordinal_encoded['day'] = tips['day'].map(day_order)
tips_ordinal_encoded

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,4,Dinner,2
1,10.34,1.66,Male,No,4,Dinner,3
2,21.01,3.50,Male,No,4,Dinner,3
3,23.68,3.31,Male,No,4,Dinner,2
4,24.59,3.61,Female,No,4,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,3,Dinner,3
240,27.18,2.00,Female,Yes,3,Dinner,2
241,22.67,2.00,Male,Yes,3,Dinner,2
242,17.82,1.75,Male,No,3,Dinner,2


## Ordinal Encoding : Using category_encoders Library

In [13]:
import category_encoders as ce

In [14]:
# Explicit label -> code mapping for `day`; None (a missing value) is coded 0.
day_codes = {None : 0, 'Thur' : 1, 'Fri' : 2, 'Sat' : 3, 'Sun' : 4}
ordinal_mapping = [dict(col='day', mapping=day_codes)]

ordinal_mapping

[{'col': 'day', 'mapping': {None: 0, 'Thur': 1, 'Fri': 2, 'Sat': 3, 'Sun': 4}}]

In [15]:
# Encode `day` with category_encoders, using the explicit `ordinal_mapping`
# instead of letting the encoder choose the codes.
ordinal_encoder = ce.OrdinalEncoder(cols = 'day', mapping = ordinal_mapping)
dford = ordinal_encoder.fit_transform(tips['day'])
dford

Unnamed: 0,day
0,4
1,4
2,4
3,4
4,4
...,...
239,3
240,3
241,3
242,3


In [16]:
# Swap the original `day` column for its ordinal codes: keep every other
# column in its existing order and append the encoded `day` on the right.
columns_without_day = tips.drop(columns='day')
tips_ordinal_encoded = pd.concat([columns_without_day, dford], axis=1)
tips_ordinal_encoded

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day
0,16.99,1.01,Female,No,Dinner,2,4
1,10.34,1.66,Male,No,Dinner,3,4
2,21.01,3.50,Male,No,Dinner,3,4
3,23.68,3.31,Male,No,Dinner,2,4
4,24.59,3.61,Female,No,Dinner,4,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Dinner,3,3
240,27.18,2.00,Female,Yes,Dinner,2,3
241,22.67,2.00,Male,Yes,Dinner,2,3
242,17.82,1.75,Male,No,Dinner,2,3


In [17]:
# Label -> code mappings per column; None (a missing value) is coded 0 in both.
day_codes = {None : 0, 'Thur' : 1, 'Fri' : 2, 'Sat' : 3, 'Sun' : 4}
time_codes = {None : 0, 'Lunch' : 1, 'Dinner' : 2}
ordinal_mapping = [
    dict(col='day', mapping=day_codes),
    dict(col='time', mapping=time_codes),
]

# Encode both columns with one encoder.
ordinal_encoder = ce.OrdinalEncoder(
    cols=['day', 'time'],
    mapping=ordinal_mapping,
)
dford = ordinal_encoder.fit_transform(tips[['day', 'time']])
dford

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,day,time
0,4,2
1,4,2
2,4,2
3,4,2
4,4,2
...,...,...
239,3,2
240,3,2
241,3,2
242,3,2


In [18]:
# Count the encoded `time` values to compare against the original labels.
dford['time'].value_counts()

2    176
1     68
Name: time, dtype: int64

In [19]:
# Counts of the original `time` labels, for comparison with the encoded counts.
tips['time'].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [20]:
# Replace `day` and `time` with their ordinal codes: keep the remaining
# columns in their existing order and append both encoded columns on the right.
columns_without_ordinals = tips.drop(columns=['day', 'time'])
tips_ordinal_encoded = pd.concat([columns_without_ordinals, dford], axis=1)
tips_ordinal_encoded

Unnamed: 0,total_bill,tip,sex,smoker,size,day,time
0,16.99,1.01,Female,No,2,4,2
1,10.34,1.66,Male,No,3,4,2
2,21.01,3.50,Male,No,3,4,2
3,23.68,3.31,Male,No,2,4,2
4,24.59,3.61,Female,No,4,4,2
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,3,3,2
240,27.18,2.00,Female,Yes,2,3,2
241,22.67,2.00,Male,Yes,2,3,2
242,17.82,1.75,Male,No,2,3,2


## Binary Encoding

In [21]:
# Binary-encode `day`; as the displayed result shows, the categories are
# represented by three 0/1 columns (day_0, day_1, day_2).
binary_encoder = ce.BinaryEncoder(cols=['day'])
dfbin = binary_encoder.fit_transform(tips['day'])
dfbin

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0


In [22]:
# Show the original `day` column for comparison with its binary encoding.
tips['day']

0       Sun
1       Sun
2       Sun
3       Sun
4       Sun
       ... 
239     Sat
240     Sat
241     Sat
242     Sat
243    Thur
Name: day, Length: 244, dtype: category
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [23]:
# Replace `day` with its binary-encoded columns: keep every other column in
# its existing order and append day_0..day_2 on the right.
columns_without_day = tips.drop(columns='day')
tips_binary_encoded = pd.concat([columns_without_day, dfbin], axis=1)
tips_binary_encoded

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_0,day_1,day_2
0,16.99,1.01,Female,No,Dinner,2,0,0,1
1,10.34,1.66,Male,No,Dinner,3,0,0,1
2,21.01,3.50,Male,No,Dinner,3,0,0,1
3,23.68,3.31,Male,No,Dinner,2,0,0,1
4,24.59,3.61,Female,No,Dinner,4,0,0,1
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Dinner,3,0,1,0
240,27.18,2.00,Female,Yes,Dinner,2,0,1,0
241,22.67,2.00,Male,Yes,Dinner,2,0,1,0
242,17.82,1.75,Male,No,Dinner,2,0,1,0


# Latihan 1 : Case 1A -> Ridge Regression

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import RobustScaler

In [25]:
# Display the raw dataset again before defining the preprocessing scheme.
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


## Preprocessing  Scheme :

1. One Hot Encoding : Sex, Smoker, Time
2. Binary Encoding : day
3. Robust Scaler : total_bill
4. No treatment : size

In [26]:
# Step 1 : Build the transformer scheme

# One (name, transformer, columns) entry per preprocessing rule.
preprocessing_steps = [
    ('One Hot Encoder', OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
    ('Binary Encoder', ce.BinaryEncoder(), ['day']),
    ('Robust Scaller', RobustScaler(), ['total_bill']),
]

# Columns that are not listed in any step are passed through unchanged.
transformer2 = ColumnTransformer(preprocessing_steps, remainder='passthrough')

In [27]:
# Step 2 : Data Split

# The tip amount is the target; every other column is used as a feature.
target_column = 'tip'
X = tips.drop(columns=target_column)
Y = tips[target_column]

In [28]:
# Hold out a test set using the default split size; the fixed random_state
# keeps the split the same on every run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, random_state=10)

In [29]:
# Step 3 : Data Transform

# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the test split without refitting.
X_train_pre = transformer2.fit_transform(Xtrain)
X_test_pre = transformer2.transform(Xtest)

  elif pd.api.types.is_categorical(cols):


In [30]:
# Wrap both transformed arrays in DataFrames; the columns get default integer
# labels until names are assigned.
X_train_pre, X_test_pre = (
    pd.DataFrame(split) for split in (X_train_pre, X_test_pre)
)
X_train_pre

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,1.0,0.0,0.0,0.0,1.0,-0.566396,2.0
1,1.0,0.0,0.0,0.0,1.0,0.0,-0.647696,3.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.316170,3.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.245709,2.0
4,1.0,1.0,0.0,0.0,1.0,0.0,2.081301,2.0
...,...,...,...,...,...,...,...,...
178,1.0,0.0,0.0,0.0,0.0,1.0,0.007227,3.0
179,1.0,0.0,0.0,0.0,1.0,0.0,0.367660,2.0
180,1.0,0.0,0.0,0.0,0.0,1.0,-0.382114,2.0
181,0.0,0.0,1.0,0.0,1.0,1.0,1.110208,6.0


In [31]:
# Inspect the fitted steps, including the 'remainder' passthrough entry.
transformer2.transformers_

[('One Hot Encoder', OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
 ('Binary Encoder', BinaryEncoder(), ['day']),
 ('Robust Scaller', RobustScaler(), ['total_bill']),
 ('remainder', 'passthrough', [5])]

In [32]:
# Column names produced by the first step (the one-hot encoder).
transformer2.transformers_[0][1].get_feature_names()

array(['x0_Male', 'x1_Yes', 'x2_Lunch'], dtype=object)

In [33]:
# Column names produced by the second step (the binary encoder).
transformer2.transformers_[1][1].get_feature_names()


['day_0', 'day_1', 'day_2']

In [34]:
# Step 4 : Name the transformed columns

# Names follow the order of the transformer's output: one-hot columns, the
# binary-encoded `day` columns, the scaled bill, then the passthrough `size`.
onehot_names = list(transformer2.transformers_[0][1].get_feature_names())
binary_names = list(transformer2.transformers_[1][1].get_feature_names())
features = onehot_names + binary_names + ['total_bill scalled'] + ['size']

# Label BOTH splits. The original assigned the names to X_train_pre twice and
# left X_test_pre with integer labels, so the model was fitted and evaluated
# on frames whose column names did not match.
X_train_pre.columns = features
X_test_pre.columns = features
features

['x0_Male',
 'x1_Yes',
 'x2_Lunch',
 'day_0',
 'day_1',
 'day_2',
 'total_bill scalled',
 'size']

In [35]:
# Show the final list of feature names.
features

['x0_Male',
 'x1_Yes',
 'x2_Lunch',
 'day_0',
 'day_1',
 'day_2',
 'total_bill scalled',
 'size']

In [36]:
# Display the preprocessed training features with their assigned column names.
X_train_pre

Unnamed: 0,x0_Male,x1_Yes,x2_Lunch,day_0,day_1,day_2,total_bill scalled,size
0,1.0,1.0,0.0,0.0,0.0,1.0,-0.566396,2.0
1,1.0,0.0,0.0,0.0,1.0,0.0,-0.647696,3.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.316170,3.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.245709,2.0
4,1.0,1.0,0.0,0.0,1.0,0.0,2.081301,2.0
...,...,...,...,...,...,...,...,...
178,1.0,0.0,0.0,0.0,0.0,1.0,0.007227,3.0
179,1.0,0.0,0.0,0.0,1.0,0.0,0.367660,2.0
180,1.0,0.0,0.0,0.0,0.0,1.0,-0.382114,2.0
181,0.0,0.0,1.0,0.0,1.0,1.0,1.110208,6.0


In [37]:
# Step 5 : Modelling

# Fit a Ridge regression with alpha=0.01 on the preprocessed training features.
model = Ridge(alpha=0.01)
model.fit(X_train_pre, Ytrain)

Ridge(alpha=0.01)

In [38]:
# Predict tips for the preprocessed test features.
y_pred = model.predict(X_test_pre)

In [39]:
# Report the mean absolute percentage error on the test split.
# NOTE(review): the value appears to be a fraction (about 0.27), not a
# 0-100 percentage — confirm against the scikit-learn documentation.
print('MAPE : ' , mean_absolute_percentage_error(Ytest, y_pred))

MAPE :  0.2707398673827411


In [40]:
# Display the raw (un-preprocessed) training features for reference.
Xtrain

Unnamed: 0,total_bill,sex,smoker,day,time,size
58,11.24,Male,Yes,Sat,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
68,20.23,Male,No,Sat,Dinner,2
184,40.55,Male,Yes,Sun,Dinner,2
...,...,...,...,...,...,...
64,17.59,Male,No,Sat,Dinner,3
15,21.58,Male,No,Sun,Dinner,2
228,13.28,Male,No,Sat,Dinner,2
125,29.80,Female,No,Thur,Lunch,6
