### Data Encoding

1. Nominal / One-Hot Encoding (OHE)
2. Label and Ordinal Encoding
3. Target Guided Ordinal Encoding

#### Nominal / One-Hot Encoding (OHE)

1. Red [1, 0, 0]
2. Green [0, 1, 0]
3. Blue [0, 0, 1]

In [17]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Toy frame with a single nominal feature: 'pink' appears once and each of
# the other colours twice.
df = pd.DataFrame(
    ['pink', 'red', 'blue', 'green', 'green', 'red', 'blue'],
    columns=['color'],
)

df

Unnamed: 0,color
0,pink
1,red
2,blue
3,green
4,green
5,red
6,blue


In [18]:
## Create a OneHotEncoder with its default settings; it is fitted on the
## 'color' column in the following cell.

encoder = OneHotEncoder()

In [23]:
## Fit the encoder on the 'color' column and then transform that same column.
## .toarray() converts the encoder's output into a dense NumPy array so the
## indicator values can be displayed directly.
color_column = df[['color']]
encoded = encoder.fit_transform(color_column).toarray()
encoded

array([[0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]])

In [22]:
import pandas as pd
# Wrap the dense one-hot array in a DataFrame, labelling every indicator
# column with the feature name produced by the fitted encoder.
feature_names = encoder.get_feature_names_out()
encoder_df = pd.DataFrame(data=encoded, columns=feature_names)
encoder_df

Unnamed: 0,color_blue,color_green,color_pink,color_red
0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0
5,0.0,0.0,0.0,1.0
6,1.0,0.0,0.0,0.0


In [27]:
# Place the one-hot indicator columns side by side with the original 'color'
# column so each row shows the label next to its encoding.
pd.concat([df, encoder_df], axis='columns')

Unnamed: 0,color,color_blue,color_green,color_pink,color_red
0,pink,0.0,0.0,1.0,0.0
1,red,0.0,0.0,0.0,1.0
2,blue,1.0,0.0,0.0,0.0
3,green,0.0,1.0,0.0,0.0
4,green,0.0,1.0,0.0,0.0
5,red,0.0,0.0,0.0,1.0
6,blue,1.0,0.0,0.0,0.0


In [32]:
## Assignment
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

# Load seaborn's 'tips' example dataset. Note that this rebinds `df`, so the
# colour frame used in the cells above is no longer reachable under that name.
df=sns.load_dataset('tips')


In [33]:
# Display the loaded tips data (the rendered output is truncated in the middle).
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [40]:
# One-hot encode the 'sex' column of the tips data and append the resulting
# indicator columns to the original frame.
encoder = OneHotEncoder()

# .toarray() densifies the encoder's output so it can back a DataFrame.
encoded = encoder.fit_transform(df[['sex']]).toarray()

# Reuse df's own index for the encoded frame. pd.concat(axis=1) pairs rows by
# index label, so a freshly numbered 0..n-1 index would silently misalign
# (and introduce NaNs) whenever df is not indexed 0..n-1, e.g. after filtering.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

pd.concat([df, encoded_df], axis=1)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0
2,21.01,3.50,Male,No,Sun,Dinner,3,0.0,1.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.0,1.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,1.0,0.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.0,1.0
242,17.82,1.75,Male,No,Sat,Dinner,2,0.0,1.0


#### Label Encoding

1. Red 1
2. Green 2
3. Blue 3

In [41]:
import pandas as pd

# Rebuild the small colour frame as the input for the label-encoding example.
df = pd.DataFrame(
    ['pink', 'red', 'blue', 'green', 'green', 'red', 'blue'],
    columns=['color'],
)

df

Unnamed: 0,color
0,pink
1,red
2,blue
3,green
4,green
5,red
6,blue


In [42]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder maps each distinct label to an integer code.
lbl_encoder = LabelEncoder()

In [44]:
# LabelEncoder works on a 1-D sequence of labels, so pass the 'color' Series
# (df['color']) rather than the one-column DataFrame df[['color']]. The 2-D
# form only works through an implicit flatten that emits a
# DataConversionWarning; the encoded integers are the same either way.
encoded = lbl_encoder.fit_transform(df['color'])
encoded

  y = column_or_1d(y, warn=True)


array([2, 3, 0, 1, 1, 3, 0])

#### Ordinal Encoding

1. High school: 1
2. College: 2
3. Graduate: 3
4. Post-Graduate: 4

In [45]:
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# Small frame with one ordered categorical feature (small < medium < large)
# for the ordinal-encoding example.
df = pd.DataFrame(
    ['small', 'medium', 'large', 'medium', 'small', 'large'],
    columns=['size'],
)

df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [46]:
# List the categories from smallest to largest so the assigned codes follow
# that order (small -> 0, medium -> 1, large -> 2).
size_order = ['small', 'medium', 'large']
ord_encoder = OrdinalEncoder(categories=[size_order])

In [58]:
# Fit on the 'size' column and encode it. The double brackets keep the input
# as a one-column 2-D frame, and the result comes back as a single column of
# float codes.
encoded = ord_encoder.fit_transform(df[['size']])
encoded

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

#### Target Guided Ordinal Encoding

In [60]:
import pandas as pd

# Example data for target-guided encoding: one (city, price) record per row,
# where 'price' is the target used to derive the city encoding.
df = pd.DataFrame(
    [
        ('New York', 200),
        ('London', 150),
        ('Paris', 300),
        ('Tokyo', 250),
        ('New York', 180),
        ('Paris', 320),
    ],
    columns=['city', 'price'],
)

df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [62]:
# Average price per city, collected into a plain {city: mean price} dictionary.
mean_price = df['price'].groupby(df['city']).mean().to_dict()

In [63]:
# Inspect the city -> mean price lookup.
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [66]:
# Replace each city with the mean price observed for it. The means are computed
# from this same frame, and the mapped values are the raw means themselves
# (they are not converted into rank codes).
df['city_encoded']=df['city'].map(mean_price)
# Narrower view showing only the two related columns: df[['city', 'city_encoded']]
df

Unnamed: 0,city,price,city_encoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0
5,Paris,320,310.0
