<a href="https://colab.research.google.com/github/johnjoel2001/Machine_Learning/blob/main/Data_Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## One Hot Encoding

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [3]:
df=pd.DataFrame({
    'color':['red','blue','green','green','red','blue']
})

In [4]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [5]:
## Create an instance of OneHotEncoder
encoder=OneHotEncoder()

In [8]:
encoded=encoder.fit_transform(df[['color']]).toarray()  # A spare matrix is created which is converted into an array

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [9]:
encoder_df=pd.DataFrame(encoded,columns=encoder.get_feature_names_out())

In [10]:
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


## We see that categorical variables are converted into numeric values

In [13]:
pd.concat([df,encoder_df],axis=1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


## Label Encoding

In [14]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
lbl_encoder=LabelEncoder()

In [17]:
lbl_encoder.fit_transform(df[['color']])

  y = column_or_1d(y, warn=True)


array([2, 0, 1, 1, 2, 0])

## We can see labels assigned to each color

In [18]:
lbl_encoder.transform([['red']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

In [19]:
lbl_encoder.transform([['blue']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([0])

In [20]:
lbl_encoder.transform([['green']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([1])

### Ordinal Encoding ( Used for Ranking Variables )

In [21]:
from sklearn.preprocessing import OrdinalEncoder

In [22]:
df=pd.DataFrame({
    'size':['small','medium','large','medium','small','large']
})

In [23]:
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [24]:
##Create an Instance of Ordinal Encoder and then fit_transform
encoder=OrdinalEncoder(categories=[['small','medium','large']]) # Assign Rankings

In [25]:
encoder.fit_transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

## We see small has been assigned rank as 0, medium as 1 and large as 2

In [26]:
encoder.transform([['small']])



array([[0.]])

In [27]:
encoder.transform([['large']])



array([[2.]])

## Target Guided Ordinal Encoding

## We replace each category in the categorical variable with a numerical value based on the mean or median of the target variable of that category

In [28]:
df=pd.DataFrame({
    'city':['New York','London','Paris','Tokyo','New York','Paris'],
    'price':[200,150,300,250,150,320]
})

In [29]:
df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,150
5,Paris,320


In [32]:
mean_price=df.groupby('city')['price'].mean().to_dict()
mean_price

{'London': 150.0, 'New York': 175.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [33]:
df['city_encoded']=df.city.map(mean_price)

In [34]:
df

Unnamed: 0,city,price,city_encoded
0,New York,200,175.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,150,175.0
5,Paris,320,310.0


In [35]:
df[['price','city_encoded']] # Only these are fed into ML Model

Unnamed: 0,price,city_encoded
0,200,175.0
1,150,150.0
2,300,310.0
3,250,250.0
4,150,175.0
5,320,310.0
