# Nominal (One-Hot) Encoding

In [1]:
import pandas as pd

In [2]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
# Toy dataset: a single nominal (unordered) categorical column.
df = pd.DataFrame(
    ['cold', 'hot', 'warm', 'warm', 'cold', 'hot'],
    columns=['weather'],
)
df.head()

Unnamed: 0,weather
0,cold
1,hot
2,warm
3,warm
4,cold


In [16]:
# Create the one-hot encoder with its default settings (no arguments passed).
encoder = OneHotEncoder()

In [17]:
# Display the encoder's repr to confirm how it is configured.
encoder

OneHotEncoder()

In [18]:
# Double brackets select a one-column DataFrame (2-D input) rather than a Series.
# fit_transform learns the categories and encodes the column in a single call.
encoded = encoder.fit_transform(df[['weather']])

In [19]:
# Convert the encoded result to a dense array for display:
# one row per sample, one column per category.
encoded.toarray()

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [23]:
# Wrap the dense one-hot matrix in a DataFrame, using the encoder's generated
# feature names (e.g. weather_cold) as the column labels.
# Reuse df's index so that a later column-wise concat lines up row-for-row
# even when df does not carry the default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded.toarray(),
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

In [25]:
# Show the original column and its one-hot columns side by side.
pd.concat([df, encoded_df], axis='columns')

Unnamed: 0,weather,weather_cold,weather_hot,weather_warm
0,cold,1.0,0.0,0.0
1,hot,0.0,1.0,0.0
2,warm,0.0,0.0,1.0
3,warm,0.0,0.0,1.0
4,cold,1.0,0.0,0.0
5,hot,0.0,1.0,0.0


# Label Encoding

In [26]:
from sklearn.preprocessing import LabelEncoder 

In [27]:
# Sample labels to encode; 'green' appears twice.
data = ['red','green','blue','green']

In [28]:
# Create the label encoder (no configuration passed).
label_encoder = LabelEncoder()

In [29]:
# Display the encoder's repr.
label_encoder

LabelEncoder()

In [30]:
# Learn the distinct labels and map each one to an integer code in a single call.
# In the output shown for this data: blue -> 0, green -> 1, red -> 2.
encoded_data = label_encoder.fit_transform(data)

In [32]:
# Show the input labels next to their integer codes.
print(f"Orginal_data {data}")
print(f"Encoded_data {encoded_data}")

Orginal_data ['red', 'green', 'blue', 'green']
Encoded_data [2 1 0 1]


In [31]:
# Display the encoded array itself (its repr also shows the dtype).
encoded_data

array([2, 1, 0, 1], dtype=int64)

# Ordinal Encoding

In [33]:
from sklearn.preprocessing import OrdinalEncoder

In [35]:
# Toy dataset: one categorical column whose values have a natural order.
df = pd.DataFrame(
    ['small', 'medium', 'large', 'medium', 'small', 'large'],
    columns=['size'],
)

In [36]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small


In [37]:
# Pass the category order explicitly so the codes follow
# small < medium < large (see the 0/1/2 output of fit_transform below).
encoder =  OrdinalEncoder(categories=[['small','medium','large']])

In [38]:
# Display the encoder's repr, including the explicit category order.
encoder

OrdinalEncoder(categories=[['small', 'medium', 'large']])

In [39]:
# Fit on the 'size' column (selected as a one-column DataFrame) and
# return its ordinal codes.
encoder.fit_transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

# Target Guided Ordinal Encoding

- Target encoding (also known as mean encoding or probability encoding) is a technique used in machine learning for encoding a categorical variable: each category is replaced by the mean of the target variable for that category. In the example below, each city is replaced by the mean `price` of its rows.

In [40]:
# Toy dataset: a categorical feature (city) and a numeric target (price).
df = pd.DataFrame(dict(
    city=['Delhi', 'Pune', 'Noida', 'Bangalore', 'Delhi', 'Noida'],
    price=[200, 150, 300, 250, 180, 320],
))

In [43]:
# Display the full (six-row) frame.
df

Unnamed: 0,city,price
0,Delhi,200
1,Pune,150
2,Noida,300
3,Bangalore,250
4,Delhi,180
5,Noida,320


In [46]:
# Mean of the target (price) for each city, as a plain {city: mean} dict.
mean_price = (
    df.groupby('city')['price']
    .agg('mean')
    .to_dict()
)
mean_price

{'Bangalore': 250.0, 'Delhi': 190.0, 'Noida': 310.0, 'Pune': 150.0}

In [47]:
# Replace each city with the mean price computed for it in mean_price.
# NOTE(review): the means come from the same rows they are mapped onto; for real
# modelling, compute them on the training split only to avoid target leakage.
df['encoded_city']=df['city'].map(mean_price)

In [48]:
# Display the frame again, now including the encoded_city column.
df

Unnamed: 0,city,price,encoded_city
0,Delhi,200,190.0
1,Pune,150,150.0
2,Noida,300,310.0
3,Bangalore,250,250.0
4,Delhi,180,190.0
5,Noida,320,310.0
