# Data Encoding

1. Nominal/ OHE Encoding
2. Label and Ordinal Encoding
3. Target Guided Ordinal Encoding

## 1. Nominal/OHE Encoding

Red:   [0,0,1]

Green: [0,1,0]

Blue:  [1,0,0]

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
## Create a simple dataframe
df = pd.DataFrame({
    "color": ["red", "blue", "green", "red", "blue"]
})

In [3]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,red
4,blue


In [None]:
## Create an instance of OneHotEncoder
encoder = OneHotEncoder()
## Perform fit and transform
encoded=encoder.fit_transform(df[['color']]).toarray()
encoded_df=pd.DataFrame(encoded,columns=encoder.get_feature_names_out())
df=pd.concat([df, encoded_df], axis=1)
df.head()

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,red,0.0,0.0,1.0
4,blue,1.0,0.0,0.0


In [10]:
encoder.transform([['blue']]).toarray()



array([[1., 0., 0.]])

In [13]:
import seaborn as sns
tips=sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
sex_encoder=OneHotEncoder()
sex_encoded=sex_encoder.fit_transform(tips[['sex']]).toarray()
sex_encoded_df=pd.DataFrame(sex_encoded,columns=sex_encoder.get_feature_names_out())
tips=pd.concat([tips,sex_encoded_df], axis=1)
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0.0,1.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0


In [40]:
smoker_encoder=OneHotEncoder()
smoker_encoded=smoker_encoder.fit_transform(tips[['smoker']]).toarray()
smoker_encoded_df=pd.DataFrame(smoker_encoded,columns=smoker_encoder.get_feature_names_out())
tips=pd.concat([tips,smoker_encoded_df],axis=1)
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male,smoker_No,smoker_Yes
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0


In [44]:
tips.drop(['sex', 'smoker'], axis=1, inplace=True)
tips.head()

Unnamed: 0,total_bill,tip,day,time,size,sex_Female,sex_Male,smoker_No,smoker_Yes
0,16.99,1.01,Sun,Dinner,2,1.0,0.0,1.0,0.0
1,10.34,1.66,Sun,Dinner,3,0.0,1.0,1.0,0.0
2,21.01,3.5,Sun,Dinner,3,0.0,1.0,1.0,0.0
3,23.68,3.31,Sun,Dinner,2,0.0,1.0,1.0,0.0
4,24.59,3.61,Sun,Dinner,4,1.0,0.0,1.0,0.0


In [48]:
day_encoder=OneHotEncoder()
day_encoded=day_encoder.fit_transform(tips[['day']]).toarray()
day_encoded_df=pd.DataFrame(day_encoded,columns=day_encoder.get_feature_names_out())
tips=pd.concat([tips,day_encoded_df],axis=1)
tips.drop('day',axis=1,inplace=True)
tips.head()

Unnamed: 0,total_bill,tip,time,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur
0,16.99,1.01,Dinner,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,10.34,1.66,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,21.01,3.5,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,23.68,3.31,Dinner,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,24.59,3.61,Dinner,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [49]:
time_encoder=OneHotEncoder()
time_encoded=time_encoder.fit_transform(tips[['time']]).toarray()
time_encoded_df=pd.DataFrame(time_encoded,columns=time_encoder.get_feature_names_out())
tips=pd.concat([tips,time_encoded_df],axis=1)
tips.drop('time',axis=1,inplace=True)
tips.head()

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,10.34,1.66,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,21.01,3.5,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,23.68,3.31,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,24.59,3.61,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


## 2. Label Encoding
Blue:0

Green:1

Red:2



In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.DataFrame({
    "color": ["red", "blue", "green", "red", "blue"]
})

In [3]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,red
4,blue


In [4]:
lbl_encoder=LabelEncoder()

In [7]:
lbl_encoder.fit_transform(df[['color']])

  y = column_or_1d(y, warn=True)


array([2, 0, 1, 2, 0])

## 3. Ordinal Encoder

It is used to encode categorical data that have an intrinsic order or ranking. In this technique each category is assigned a numerical value based on its position in the order.
1. High School: 1
2. College: 2
3. Graduate: 3
4. Post-graduate: 4

In [9]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

In [10]:
df=pd.DataFrame({
    'size':['small','medium','large','medium','small','large']
})
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [11]:
## Create an instance of ordinalencoder and then fit_transform
encoder=OrdinalEncoder(categories=[['small','medium','large']])

In [14]:
df['size']=encoder.fit_transform(df[['size']])
df

Unnamed: 0,size
0,0.0
1,1.0
2,2.0
3,1.0
4,0.0
5,2.0


## 3. Target Guided Ordinal Encoding

In [15]:
import pandas as pd
df=pd.DataFrame({
    'city':['New York','London','Paris','Tokyo','New York','Paris'],
    'price':[200,150,300,250,180,320]
})
df.head()

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180


In [17]:
mean_price=df.groupby('city')['price'].mean().to_dict()
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [19]:
df['city_encoded']=df['city'].map(mean_price)
df[['city_encoded','price']]

Unnamed: 0,city_encoded,price
0,190.0,200
1,150.0,150
2,310.0,300
3,250.0,250
4,190.0,180
5,310.0,320


In [20]:
import seaborn as sns
tips=sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [25]:
mean_time=tips.groupby('time')['total_bill'].mean().to_dict()
mean_time

  mean_time=tips.groupby('time')['total_bill'].mean().to_dict()


{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}

In [27]:
tips['time_encoded']=tips['time'].map(mean_time)
tips.time_encoded.unique()

[20.797159, 17.168676]
Categories (2, float64): [17.168676, 20.797159]