Target Guided Ordinal Encoding

Target guided ordinal encoding is a technique for encoding a categorical variable based on its relationship with the target variable. It is useful when a categorical variable has a large number of unique categories and we want to use that variable as a feature in our machine learning model.

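In target guided ordinal encoding, we replace each category of the categorical variable with a numerical value derived from the mean or median of the target variable for that category. For example, if the mean price is 135 for New York and 600 for Houston, every 'New York' row is encoded as 135 and every 'Houston' row as 600. Because categories with a higher average target receive higher values, the encoded feature is monotonically related to the per-category target average, which can improve the predictive power of our model.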

In [17]:
import pandas as pd

# Toy dataset: 'City' is the categorical feature to encode and 'Price' is the
# numeric target. New York and Chicago each appear twice, so their per-city
# means are averages over two rows.
listings = [
    ('New York', 130),
    ('Los Angeles', 500),
    ('Chicago', 200),
    ('Houston', 600),
    ('New York', 140),
    ('Chicago', 456),
]
df = pd.DataFrame(listings, columns=['City', 'Price'])

In [18]:
# Display the toy DataFrame before any encoding is applied.
df

Unnamed: 0,City,Price
0,New York,130
1,Los Angeles,500
2,Chicago,200
3,Houston,600
4,New York,140
5,Chicago,456


In [20]:
# Build the encoding table: a {city: mean price} dict with one entry per
# distinct city in df.
# NOTE: the means are computed over every row of df; when training a model,
# compute them on the training split only to avoid leaking the target.
price_by_city = df.groupby('City')['Price']
mean_price = price_by_city.mean().to_dict()

In [21]:
# Display the {city: mean price} lookup that will be used as the encoding.
mean_price

{'Chicago': 328.0, 'Houston': 600.0, 'Los Angeles': 500.0, 'New York': 135.0}

In [22]:
# Encode each row's city as that city's mean price by looking it up in
# mean_price. A city with no entry in the dict would map to NaN.
df['City_encoded'] = df['City'].map(mean_price)

In [23]:
# Display the DataFrame again, now including the City_encoded column.
df

Unnamed: 0,City,Price,City_encoded
0,New York,130,135.0
1,Los Angeles,500,500.0
2,Chicago,200,328.0
3,Houston,600,600.0
4,New York,140,135.0
5,Chicago,456,328.0


In [25]:
# Put each row's actual price next to the mean-price encoding of its city.
df.loc[:, ['Price', 'City_encoded']]

Unnamed: 0,Price,City_encoded
0,130,135.0
1,500,500.0
2,200,328.0
3,600,600.0
4,140,135.0
5,456,328.0


In [28]:
import seaborn as sns
# Rebind df to seaborn's 'tips' example dataset; the toy City/Price
# DataFrame is no longer reachable through this name from here on.
df = sns.load_dataset('tips')

In [29]:
# Display the tips DataFrame before encoding.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [30]:
# Build the encoding table: a {time: mean total_bill} dict.
# 'time' appears to be a categorical column (the resulting dict is ordered
# Lunch, Dinner rather than alphabetically). pandas emits a FutureWarning when
# grouping by a categorical column without an explicit `observed` argument;
# observed=False keeps the existing result and silences the warning.
mean_bill = df.groupby('time', observed=False)['total_bill'].mean().to_dict()


In [31]:
# Display the {time: mean total_bill} lookup that will be used as the encoding.
mean_bill

{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}

In [32]:
# Encode each row's meal time as the mean total bill for that time by looking
# it up in mean_bill.
df['time_encoded'] = df['time'].map(mean_bill)

In [33]:
# Display the tips DataFrame again, now including the time_encoded column.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,time_encoded
0,16.99,1.01,Female,No,Sun,Dinner,2,20.797159
1,10.34,1.66,Male,No,Sun,Dinner,3,20.797159
2,21.01,3.50,Male,No,Sun,Dinner,3,20.797159
3,23.68,3.31,Male,No,Sun,Dinner,2,20.797159
4,24.59,3.61,Female,No,Sun,Dinner,4,20.797159
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,20.797159
240,27.18,2.00,Female,Yes,Sat,Dinner,2,20.797159
241,22.67,2.00,Male,Yes,Sat,Dinner,2,20.797159
242,17.82,1.75,Male,No,Sat,Dinner,2,20.797159


In [34]:
# Put each row's actual total bill next to the encoding of its meal time.
df.loc[:, ['total_bill', 'time_encoded']]

Unnamed: 0,total_bill,time_encoded
0,16.99,20.797159
1,10.34,20.797159
2,21.01,20.797159
3,23.68,20.797159
4,24.59,20.797159
...,...,...
239,29.03,20.797159
240,27.18,20.797159
241,22.67,20.797159
242,17.82,20.797159


In [38]:
# Show only the Lunch rows so their time_encoded value can be inspected.
is_lunch = df['time'].eq('Lunch')
df[is_lunch]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,time_encoded
77,27.20,4.00,Male,No,Thur,Lunch,4,17.168676
78,22.76,3.00,Male,No,Thur,Lunch,2,17.168676
79,17.29,2.71,Male,No,Thur,Lunch,2,17.168676
80,19.44,3.00,Male,Yes,Thur,Lunch,2,17.168676
81,16.66,3.40,Male,No,Thur,Lunch,2,17.168676
...,...,...,...,...,...,...,...,...
222,8.58,1.92,Male,Yes,Fri,Lunch,1,17.168676
223,15.98,3.00,Female,No,Fri,Lunch,3,17.168676
224,13.42,1.58,Male,Yes,Fri,Lunch,2,17.168676
225,16.27,2.50,Female,Yes,Fri,Lunch,2,17.168676
