In [35]:
# Implementation of Encoding Categorical Data - One-Hot Encoding
# Author: Muhammad Humayun Khan

import numpy as np
import pandas as pd

# Path to the cars CSV file
dataset = '/content/drive/MyDrive/datasets/cars.csv'
# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=dataset)
# Preview five randomly chosen rows
df.sample(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
192,Maruti,120600,Petrol,Second Owner,215000
4976,Maruti,120000,Petrol,Second Owner,110000
5999,Tata,148000,Diesel,Second Owner,135000
5237,Honda,60000,Petrol,Third Owner,200000
5425,Maruti,98000,Diesel,First Owner,270000


In [36]:
# Count the cars per brand: brands with only a few cars are candidates for
# being merged, which reduces the number of categories to encode.
df.brand.value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [37]:
# Number of distinct brands (missing values are not counted)
df.brand.nunique(dropna=True)

32

In [38]:
# Count the cars per ownership category
df.owner.value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

**1. OneHotEncoding using Pandas**

In [39]:
# Treat fuel and owner as the categorical features for now.
# get_dummies one-hot encodes only the columns listed in `columns`:
#   - fuel  -> 4 indicator columns (CNG, Diesel, LPG, Petrol)
#   - owner -> 5 indicator columns (First Owner, Second Owner, ...)
# Result: 12 columns = 4 (fuel) + 5 (owner) + the 3 columns left untouched.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


**2. n-1 OneHotEncoding - avoiding the dummy variable trap**

In [40]:
# drop_first=True removes the first category's column of each encoded feature
# (fuel_CNG and owner_First Owner here), leaving n-1 columns per feature and
# so avoiding the dummy variable trap.
pd.get_dummies(data=df, drop_first=True, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


**3. OneHotEncoding using Sklearn**

In [41]:
# pandas.get_dummies is convenient for data preparation, but it keeps no record
# of which categories (and hence which columns) it produced, so encoding new
# data later can give a different column layout.
# scikit-learn's OneHotEncoder learns the categories in fit() and reuses them in
# transform(), so prefer it over pandas when preparing data for a model.

from sklearn.model_selection import train_test_split

# Features are the first four columns; the target is the last column.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    random_state=2,
    test_size=0.2,
)
X_train.head(5)

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [42]:
# One-hot encode the categorical columns with scikit-learn's OneHotEncoder.

from sklearn.preprocessing import OneHotEncoder

# drop='first'        -> drop the first category of every feature (n-1 encoding)
# sparse_output=False -> return a dense NumPy array instead of a sparse matrix
#                        (this parameter was named `sparse` before scikit-learn
#                        1.2; the old name was removed in 1.4)
# dtype=np.int32      -> emit 0/1 integers
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)

# With the first category of each feature dropped, this returns 7 columns:
# 3 for fuel (4 - 1) and 4 for owner (5 - 1).
ohe.fit_transform(X_train[['fuel', 'owner']])




array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [43]:
# Fit the encoder on the training split and keep the encoded result (a NumPy
# array), then encode the test split with the same fitted encoder.
X_train_new = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']])
X_test_new = ohe.transform(X_test.loc[:, ['fuel', 'owner']])
X_train_new.shape



(6502, 7)

In [44]:
# Place the untouched columns (brand, km_driven) side by side with the encoded
# fuel/owner columns.
np.concatenate((X_train[['brand', 'km_driven']].to_numpy(), X_train_new), axis=1)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

**4. OneHotEncoding with Top Categories**

In [45]:
# Number of cars per brand
counts = df['brand'].value_counts()
# A brand needs more than this many cars to keep its own column
threshold = 100

# Brands with `threshold` cars or fewer
repl = counts[counts <= threshold].index

# Collapse those brands into a single 'uncommon' category, one-hot encode the
# result and preview five random rows.
pd.get_dummies(df['brand'].replace(repl, 'uncommon')).sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
7957,False,False,False,True,False,False,False,False,False,False,False,False,False
4667,False,False,True,False,False,False,False,False,False,False,False,False,False
6068,False,False,False,False,False,False,True,False,False,False,False,False,False
1492,False,False,False,False,False,False,True,False,False,False,False,False,False
5420,False,False,False,False,False,True,False,False,False,False,False,False,False
