In [4]:
#one hot encoding is done on nominal categorical variables

In [49]:
import pandas as pd
import numpy as np

import warnings
# Silences every warning for the rest of the session, including deprecation
# warnings raised by libraries; remove this line while debugging.
warnings.filterwarnings('ignore')

In [50]:
# Load the used-cars dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='cars.csv')

In [51]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [52]:
# Frequency of each brand, most common first.
df['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [53]:
# Frequency of each fuel type, most common first.
df['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [54]:
# Frequency of each owner category, most common first.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [55]:
#brand has a large number of unique values (32),
#so we will first focus on the fuel and owner columns

In [56]:
#we will apply one hot encoding on fuel and owner columns

In [57]:
#one hot encoding using pandas
#dtype=int is used to get 0 and 1; by default the dummy columns are bool

In [58]:
# One-hot encode fuel and owner as 0/1 integer columns; the other columns pass through unchanged.
pd.get_dummies(data=df, columns=['fuel', 'owner'], dtype=int)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


In [59]:
# drop_first=True drops the first dummy of each encoded column (fuel_CNG and
# owner_First Owner here) to avoid multicollinearity among the dummies.
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True, dtype=int)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


In [60]:
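#However, pandas get_dummies is rarely used in machine learning projects: it does not remember the categories seen during training, so train and test data can end up with different columns
#one hot encoding using sklearn's OneHotEncoder is preferred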

In [61]:
#one hot encoding using sklearn

In [62]:
#splitting the data into train and test

In [63]:
# Look at the original frame again before splitting it.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [64]:
from sklearn.model_selection import train_test_split

In [65]:
# Features are every column except the last; the target is the last column
# (selling_price). Using -1 for both keeps the two selections in sync.
X=df.iloc[:,:-1]
y=df.iloc[:,-1]
# random_state pins the shuffle so the 80/20 split, and every result derived
# from it, is reproducible after Restart & Run All.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [66]:
# First five rows of the training features.
X_train.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
5928,Hyundai,125000,Diesel,Third Owner
4580,Toyota,120000,Diesel,Second Owner
6505,Mahindra,70000,Diesel,First Owner
1609,Ford,19100,Petrol,First Owner
7361,Maruti,72179,Diesel,First Owner


In [67]:
# First five values of the training target.
y_train.head(n=5)

5928    475000
4580    650000
6505    800000
1609    700000
7361    352000
Name: selling_price, dtype: int64

In [81]:
#importing onehotencoder from sklearn
#we need to first perform one hot encoding on the required columns, then merge them back with the remaining columns of the data

In [69]:
from sklearn.preprocessing import OneHotEncoder

In [70]:
#drop='first' drops the first dummy variable column to overcome the problem of multicollinearity
#sparse=False (renamed sparse_output=False in scikit-learn 1.2+) returns a dense array instead of a sparse matrix, so no extra code is needed to convert the result to an array
#dtype=np.int32 converts values to int type, by default values are float type

In [71]:
# drop='first' removes one dummy per feature to avoid multicollinearity.
# sparse_output=False returns a dense ndarray; the former `sparse` keyword was
# renamed in scikit-learn 1.2 and removed in 1.4, so it fails on current versions.
# dtype=np.int32 stores 0/1 integers instead of the default floats.
ohe=OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)

In [74]:
#fitting ohe on the fuel and owner columns of X_train and transforming them

In [75]:
# Learn the fuel/owner categories from the training rows and encode them in one step.
X_train_new = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']])

In [76]:
#transforming the fuel and owner columns of X_test with the already-fitted ohe

In [77]:
# Encode the test rows with the categories learned from the training rows (no re-fitting).
X_test_new = ohe.transform(X_test.loc[:, ['fuel', 'owner']])

In [80]:
#we take X_train_new and merge it with the brand and km_driven columns
#we use np.hstack to stack them horizontally i.e. one next to the other

In [87]:
# Array returned by ohe.fit_transform for the fuel and owner columns of X_train.
X_train_new

array([[1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [88]:
# The two columns that were not encoded, as a plain NumPy array.
X_train.loc[:, ['brand', 'km_driven']].to_numpy()

array([['Hyundai', 125000],
       ['Toyota', 120000],
       ['Mahindra', 70000],
       ...,
       ['Maruti', 50000],
       ['Hyundai', 10000],
       ['Maruti', 36000]], dtype=object)

In [89]:
#now we merge the above two values

In [91]:
# Place the untouched columns and the encoded columns side by side (column-wise join).
np.concatenate(
    [X_train[['brand', 'km_driven']].to_numpy(), X_train_new],
    axis=1,
)

array([['Hyundai', 125000, 1, ..., 0, 0, 1],
       ['Toyota', 120000, 1, ..., 1, 0, 0],
       ['Mahindra', 70000, 1, ..., 0, 0, 0],
       ...,
       ['Maruti', 50000, 0, ..., 0, 0, 0],
       ['Hyundai', 10000, 0, ..., 0, 0, 0],
       ['Maruti', 36000, 1, ..., 0, 0, 0]], dtype=object)

In [92]:
#now we will one hot encode the 'brand' column
#it has a large number of unique values, so we will encode the values with the highest frequency and put the rest of the values in an 'uncommon' column

In [95]:
# Number of distinct brands present in the training rows.
X_train['brand'].nunique()

31

In [96]:
#brand has 31 unique values in the training set

In [98]:
# Frequency of each brand within the training rows, most common first.
X_train['brand'].value_counts()

brand
Maruti           1972
Hyundai          1134
Mahindra          616
Tata              589
Toyota            407
Honda             358
Ford              302
Renault           190
Chevrolet         183
Volkswagen        148
BMW                95
Skoda              85
Nissan             62
Jaguar             59
Volvo              52
Datsun             51
Mercedes-Benz      42
Fiat               36
Audi               31
Lexus              28
Jeep               24
Mitsubishi         11
Isuzu               5
Land                5
Ambassador          4
Force               4
Kia                 3
MG                  2
Daewoo              2
Peugeot             1
Opel                1
Name: count, dtype: int64

In [99]:
#we will set a variable threshold=100
#all the brands that occur fewer than 100 times in the training set will be grouped into the 'uncommon' column

In [100]:
# Cutoff count used to decide which brands are rare enough to be grouped as 'uncommon'.
threshold=100

In [104]:
# Per-brand row counts in the training set, indexed by brand name.
counts = X_train['brand'].value_counts()

In [106]:
# Brands seen fewer than `threshold` times in the training set. Reuses the
# variable instead of repeating the literal 100, and uses a strict `<` so the
# code matches the stated rule (count < 100 -> uncommon).
repl=counts[counts<threshold].index

In [107]:
#replacing all the brands with count<100 with uncommon

In [110]:
# Collapse the rare brands into a single 'uncommon' label, then one-hot encode as 0/1 integers.
pd.get_dummies(
    X_train['brand'].replace(to_replace=repl, value='uncommon'),
    dtype=int,
)

Unnamed: 0,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Tata,Toyota,Volkswagen,uncommon
5928,0,0,0,1,0,0,0,0,0,0,0
4580,0,0,0,0,0,0,0,0,1,0,0
6505,0,0,0,0,1,0,0,0,0,0,0
1609,0,1,0,0,0,0,0,0,0,0,0
7361,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6706,0,0,0,0,0,0,0,0,0,0,1
3809,0,0,0,1,0,0,0,0,0,0,0
2296,0,0,0,0,0,1,0,0,0,0,0
731,0,0,0,1,0,0,0,0,0,0,0
