# One Hot Encoding

In [43]:
# import libraries
import pandas as pd
import numpy as np

In [44]:
# read the car listings CSV into the working dataframe
df = pd.read_csv(filepath_or_buffer='../Dataset/cars.csv')

In [45]:
# preview the top five rows to see the columns and their raw values
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [46]:
# how many distinct (non-null) brands appear in the 'brand' column?
df['brand'].nunique(dropna=True)

32

In [47]:
# frequency of each fuel type, most common first
df.loc[:, 'fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [48]:
# frequency of each ownership category, most common first
df.loc[:, 'owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

# One Hot Encoding using pandas

In [49]:
# Expand 'fuel' and 'owner' into one 0/1 indicator column per category;
# the other columns are passed through unchanged.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    dtype=int,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


# K-1 One Hot Encoding (drop one category per feature)

In [50]:
# Same encoding, but drop the first category of each encoded column so that
# K categories are represented by K-1 indicator columns.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
    dtype=int,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


# One Hot Encoding using Sklearn

In [51]:
# train test split
from sklearn.model_selection import train_test_split

# Select the features and the target by column name instead of by position
# (previously df.iloc[:, 0:4] and df.iloc[:, -1]). The selected data is the
# same for the current CSV layout, but a reordered or extended file now fails
# loudly with a KeyError instead of silently splitting the wrong columns.
# test_size=0.2 holds out 20% of the rows; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[['brand', 'km_driven', 'fuel', 'owner']],
    df['selling_price'],
    test_size=0.2,
    random_state=0,
)

In [52]:
# show the first five rows of the original dataframe again for reference
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [53]:
# import the OneHotEncoder class from scikit-learn's preprocessing module
from sklearn.preprocessing import OneHotEncoder

In [54]:
# Tally how many rows each brand has, most frequent first.
# NOTE(review): `counts` is not referenced again before the
# "top categories" section recomputes it.
counts = df.loc[:, 'brand'].value_counts()

In [55]:
# Create the encoder instance.
#   drop='first'            -> K-1 columns per feature, avoiding the dummy variable trap
#   sparse_output=False     -> return a dense NumPy array instead of a sparse matrix
#   dtype=int               -> emit integer 0/1 indicators
#   handle_unknown='ignore' -> a category that was absent when the encoder was fitted
#                              (e.g. a rare level that only landed in the test split)
#                              no longer makes transform() raise. scikit-learn warns and
#                              encodes it as all zeros, which with drop='first' is the
#                              same code as the dropped baseline category.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=int,
    handle_unknown='ignore',
)

In [56]:
# Learn the category levels of 'fuel' and 'owner' from the training split,
# then encode that same split with the fitted encoder.
X_train_new = ohe.fit(X_train[['fuel', 'owner']]).transform(X_train[['fuel', 'owner']])

In [57]:
# Encode the test split with the encoder fitted on the training data
# (transform only, so the test rows are never used for fitting).
X_test_new = ohe.transform(X_test.loc[:, ['fuel', 'owner']])

In [58]:
# inspect the encoded training array produced by the encoder
X_train_new

array([[0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [59]:
# How often does each (brand, km_driven) pair occur in the training split?
X_train.value_counts(subset=['brand', 'km_driven'])

brand   km_driven
Maruti  70000        133
        50000        113
        60000        109
        80000         99
        90000         96
                    ... 
Audi    25000          1
        23600          1
        20000          1
        15000          1
        11500          1
Name: count, Length: 1714, dtype: int64

In [60]:
# Put the untouched 'brand'/'km_driven' columns side by side with the encoded
# columns by joining the two 2-D arrays along the column axis.
np.concatenate(
    (X_train[['brand', 'km_driven']].to_numpy(), X_train_new),
    axis=1,
)

array([['Hyundai', 60000, 0, ..., 0, 0, 0],
       ['Tata', 150000, 1, ..., 0, 0, 1],
       ['Hyundai', 110000, 1, ..., 1, 0, 0],
       ...,
       ['Hyundai', 90000, 0, ..., 1, 0, 0],
       ['Volkswagen', 90000, 1, ..., 0, 0, 0],
       ['Hyundai', 110000, 0, ..., 0, 0, 0]], dtype=object)

# One Hot Encoding with top categories

In [61]:
# number of rows per brand, in descending order of frequency
counts = df['brand'].value_counts(sort=True, ascending=False)

In [62]:
# Get the total count of unique brands.
df['brand'].nunique()
# Set a minimum frequency limit (e.g., for filtering).
threshold = 100

In [63]:
# labels of the brands whose frequency is at or below the threshold
repl = counts.loc[counts.le(threshold)].index

In [64]:
# Relabel every rare brand (those listed in `repl`) as 'uncommon', then
# one-hot encode the result: one column per frequent brand plus 'uncommon'.
pd.get_dummies(
    df['brand'].mask(df['brand'].isin(repl), 'uncommon'),
    dtype=int,
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0
