## Categorical Data Type 2 - Nominal Data

## Import libraries

In [3]:
import numpy as np
import pandas as pd

## Import Datasets

In [4]:
# Read the car listings CSV; the relative path is resolved against the
# notebook's working directory.
csv_path = 'cars.csv'
df = pd.read_csv(csv_path)

In [5]:
# Peek at five random rows; the fixed seed keeps the preview identical
# across kernel restarts and re-runs.
df.sample(5, random_state=42)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
3386,Maruti,15000,Petrol,First Owner,630000
5855,Hyundai,100000,Petrol,Second Owner,265000
553,Hyundai,25000,Petrol,First Owner,570000
7134,Chevrolet,70000,Petrol,Third Owner,148000
6119,Datsun,40000,Petrol,First Owner,335000


In [6]:
# Frequency of each ownership category.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [8]:
# Number of distinct brands, i.e. how many indicator columns a one-hot
# encoding of `brand` would add.
df['brand'].nunique()

32

In [9]:
# Frequency of each fuel type.
df['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

## 1. One hot encoding with pandas

In [11]:
# Expand each nominal column into one indicator column per category;
# the remaining columns are passed through unchanged.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


## 2. K-1 One-hot Encoding

In [12]:
# Same encoding, but drop the first category of each encoded column so
# that K categories yield K-1 indicator columns.
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


## 3. One-hot encoding with scikit-learn

In [14]:
from sklearn.model_selection import train_test_split

# Features: the first four columns (brand, km_driven, fuel, owner).
# Target: the last column (selling_price). 20% of the rows are held out.
# random_state pins the shuffle so the split, and everything derived
# from it, is the same on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

In [16]:
# First rows of the full frame, for comparison with the split pieces.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [17]:
# First rows of the training features.
x_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6492,Maruti,72000,Diesel,First Owner
7352,Hyundai,40000,Diesel,Second Owner
930,Maruti,100000,Diesel,Third Owner
6536,Hyundai,74770,CNG,Third Owner
7178,Mercedes-Benz,70000,Petrol,First Owner


In [18]:
# First rows of the held-out test features.
x_test.head()

Unnamed: 0,brand,km_driven,fuel,owner
2316,Maruti,69000,Diesel,Second Owner
2660,Mahindra,35000,Diesel,First Owner
5465,Tata,90000,Diesel,First Owner
4250,Honda,7032,Petrol,First Owner
138,BMW,27000,Diesel,First Owner


## Import OneHotEncoder from scikit-learn

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
# drop='first' omits the first category of each feature (K-1 encoding),
# sparse_output=False makes the encoder return a dense NumPy array,
# and dtype=np.int32 stores the indicators as 32-bit integers.
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)

In [24]:
# Learn the categories from the training rows and encode them in one step.
nominal_train = x_train[['fuel', 'owner']]
x_train_new = ohe.fit_transform(nominal_train)

In [25]:
# Inspect the encoded training matrix.
x_train_new

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 0, 0]], dtype=int32)

In [27]:
# (rows, encoded indicator columns) of the training matrix.
x_train_new.shape

(6502, 7)

In [28]:
# Place the columns that were not encoded next to the indicator columns,
# joining the two 2-D arrays column-wise.
passthrough = x_train[['brand', 'km_driven']].to_numpy()
np.concatenate((passthrough, x_train_new), axis=1)

array([['Maruti', 72000, 1, ..., 0, 0, 0],
       ['Hyundai', 40000, 1, ..., 1, 0, 0],
       ['Maruti', 100000, 1, ..., 0, 0, 1],
       ...,
       ['BMW', 7500, 1, ..., 0, 0, 0],
       ['Maruti', 220000, 0, ..., 1, 0, 0],
       ['Maruti', 40000, 0, ..., 1, 0, 0]], dtype=object)