In [20]:
import numpy as np
import pandas as pd

In [21]:
# Read the car listings from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="cars.csv")

In [22]:
# Preview five randomly drawn rows of the raw data.
df.sample(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
6721,Renault,129000,Diesel,First Owner,650000
6749,Nissan,110000,Diesel,Third Owner,220000
2640,Maruti,5621,Petrol,First Owner,650000
876,Tata,100000,Diesel,Second Owner,120000
3034,Hyundai,15000,Petrol,First Owner,1000000


## One Hot Encoding by Pandas
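One-hot encoding with pandas (`pd.get_dummies`) is not recommended for machine-learning workflows, because pandas does not remember which categories (and therefore which columns) it created. If it is later applied to new data, such as a test set, the resulting columns may differ in number or order, which is why encoding with pandas is not considered good practice.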

In [23]:
# Expand the two categorical columns into one indicator column per category.
pd.get_dummies(data=df, columns=["fuel", "owner"])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


In [24]:
# Encode 'fuel' and 'owner' again, this time dropping the first indicator
# column of each feature so that n categories yield n-1 columns.
pd.get_dummies(data=df, columns=["fuel", "owner"], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


It is recommended to drop the first dummy column after one-hot encoding a feature. If all n dummy columns are kept, they are perfectly collinear (each row sums to 1), which leads to the dummy variable trap. Because only n-1 columns remain for a feature with n categories, this is called n-1 encoding.

## Train_test_split

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],   # feature columns: the first four
    df.iloc[:, -1],   # target column: the last one
    random_state=2,
    test_size=0.2,
)

In [26]:
# Show the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


# One Hot Encoding using scikit-learn

In [27]:
from sklearn.preprocessing import OneHotEncoder

# Dense int32 output; the first category of every feature is dropped.
ohen = OneHotEncoder(dtype=np.int32, sparse_output=False, drop='first')

# Learn the categories from the training split only, then encode it.
ohen.fit(X_train.loc[:, ['fuel', 'owner']])
X_train_new = ohen.transform(X_train.loc[:, ['fuel', 'owner']])

# Encode the test split with the categories learned from the training split.
X_test_new = ohen.transform(X_test.loc[:, ['fuel', 'owner']])

In [28]:
# (rows, encoded columns) of the transformed training data.
np.shape(X_train_new)

(6502, 7)

### With this approach we have to separate the categorical columns from the original dataset, perform OHE on them, and then combine the newly encoded attributes with the remaining columns.

In [29]:
# Put the untouched 'brand' and 'km_driven' columns side by side with the
# dummy variables produced by the encoder (column-wise concatenation).
np.concatenate((X_train[['brand', 'km_driven']].values, X_train_new), axis=1)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

# OHE by using most frequent categories

In [30]:
# Number of listings per brand.
counts = df['brand'].value_counts()

# Number of distinct brands before pooling. The value is discarded here,
# because only the last expression of a cell is displayed.
df['brand'].nunique()

# Brands with at least `threshold` listings keep their own dummy column;
# every rarer brand is pooled into a single 'uncommon' category.
threshold = 100

# Strict `<` so that a brand with exactly `threshold` listings is kept,
# matching the "minimum count of 100" rule above (`<=` would pool it).
repl = counts[counts < threshold].index

# One-hot encode the pooled brand column and preview five random rows.
pd.get_dummies(df['brand'].replace(repl, 'uncommon')).sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
7505,False,False,False,False,False,True,False,False,False,False,False,False,False
6045,False,False,False,False,False,True,False,False,False,False,False,False,False
1263,False,False,False,False,True,False,False,False,False,False,False,False,False
94,False,False,False,False,False,False,True,False,False,False,False,False,False
4051,False,False,False,False,True,False,False,False,False,False,False,False,False
