In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw car listings from the CSV file next to the notebook.
df = pd.read_csv('cars.csv')

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


### First, we'll one-hot encode the `fuel` and `owner` columns

## Train test split

In [5]:
# Target: the selling price; features: every remaining column.
y = df['selling_price']
X = df.drop(columns=['selling_price'])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
3042,Hyundai,60000,LPG,First Owner
1520,Tata,150000,Diesel,Third Owner
2611,Hyundai,110000,Diesel,Second Owner
3544,Mahindra,28000,Diesel,Second Owner
4138,Maruti,15000,Petrol,First Owner


### OneHotEncoder on fuel and owner

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Encoder settings:
#   drop='first'        -> drop the first dummy column of each feature to avoid multicollinearity
#   sparse_output=False -> return a dense numpy array instead of a sparse matrix
#   dtype=np.int32      -> emit integer 0/1 values
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)

In [12]:
# Fit the encoder on the training rows only, then apply the fitted encoder to the test rows.
X_train_new = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']])
X_test_new = ohe.transform(X_test.loc[:, ['fuel', 'owner']])

In [13]:
# Compare the shape of the original training features with the encoded array.
(X_train.shape, X_train_new.shape)

((6502, 4), (6502, 7))

### Next, horizontally stack `X_train_new` with the `brand` and `km_driven` columns of `X_train`

In [15]:
# Join the untouched columns and the encoded columns side by side (column-wise).
np.concatenate(
    (X_train[['brand', 'km_driven']].values, X_train_new),
    axis=1,
)

array([['Hyundai', 60000, 0, ..., 0, 0, 0],
       ['Tata', 150000, 1, ..., 0, 0, 1],
       ['Hyundai', 110000, 1, ..., 1, 0, 0],
       ...,
       ['Hyundai', 90000, 0, ..., 1, 0, 0],
       ['Volkswagen', 90000, 1, ..., 0, 0, 0],
       ['Hyundai', 110000, 0, ..., 0, 0, 0]], dtype=object)

## One Hot Encoding with top categories in the brand column

In [16]:
# Number of rows per brand.
counts = df['brand'].value_counts()

In [17]:
# How many distinct brands are present?
df['brand'].nunique()

32

### Any brand with 100 or fewer cars will be grouped into a single `uncommon` column

In [18]:
# Row-count cutoff used to decide which brands are treated as rare.
threshold = 100

In [19]:
# Brands whose row count is at or below the threshold (the comparison is inclusive).
repl = counts.loc[counts <= threshold].index

In [21]:
# Relabel the rare brands as 'uncommon', one-hot encode the result,
# and show five randomly sampled rows (unseeded, so rows vary between runs).
(
    df['brand']
    .replace(repl, 'uncommon')
    .pipe(pd.get_dummies, dtype=int)
    .sample(5)
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
2425,0,0,0,0,1,0,0,0,0,0,0,0,0
6815,0,0,0,0,0,0,0,0,0,0,0,1,0
7068,0,0,0,0,0,0,0,0,0,0,0,0,1
3525,0,0,0,0,0,0,0,0,0,0,1,0,0
4048,0,0,0,0,0,0,0,0,0,1,0,0,0
