In [17]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


### oneHotEncoding using pandas

In [4]:
df_encoded = pd.get_dummies(df,columns=['fuel','owner'])
df_encoded

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


### k-1 oneHotEncoding

In [6]:
df_encoded_k_1 = pd.get_dummies(df,columns=['fuel','owner'],drop_first=True)
df_encoded_k_1

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


### OneHotEncoding using sklearn

In [10]:
from sklearn.model_selection import train_test_split
x = df.drop(['selling_price'],axis=1)
y = df[['selling_price']]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=1)

x_train.shape , x_test.shape

((5689, 4), (2439, 4))

In [12]:
x_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
2246,Chevrolet,50000,Petrol,First Owner
2095,Chevrolet,120000,Diesel,Second Owner
3290,Hyundai,60000,Petrol,First Owner
6868,Maruti,5000,Petrol,First Owner
6895,Mahindra,40000,Diesel,First Owner


In [11]:
from sklearn.preprocessing import OneHotEncoder

In [29]:
encoder = OneHotEncoder(dtype='int',sparse_output=False,drop='first')

x_train_encoded =  encoder.fit_transform(x_train[['fuel','owner']])
x_test_encoded = encoder.transform(x_test[['fuel','owner']])
x_train_encoded

array([[0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [30]:
remain = x_train[['brand','km_driven']].values
remain

array([['Chevrolet', 50000],
       ['Chevrolet', 120000],
       ['Hyundai', 60000],
       ...,
       ['Hyundai', 70000],
       ['Maruti', 110000],
       ['Maruti', 65755]], dtype=object)

In [31]:
np.hstack((remain,x_train_encoded))

array([['Chevrolet', 50000, 0, ..., 0, 0, 0],
       ['Chevrolet', 120000, 1, ..., 1, 0, 0],
       ['Hyundai', 60000, 0, ..., 0, 0, 0],
       ...,
       ['Hyundai', 70000, 0, ..., 0, 0, 0],
       ['Maruti', 110000, 1, ..., 1, 0, 0],
       ['Maruti', 65755, 0, ..., 0, 0, 0]], dtype=object)

### OneHotEncoder with top categories

In [33]:
threshold = 100
counts = df[['brand']].value_counts()
repl = counts[counts<=threshold].index
repl

MultiIndex([(       'Nissan',),
            (       'Jaguar',),
            (        'Volvo',),
            (       'Datsun',),
            ('Mercedes-Benz',),
            (         'Fiat',),
            (         'Audi',),
            (        'Lexus',),
            (         'Jeep',),
            (   'Mitsubishi',),
            (         'Land',),
            (        'Force',),
            (        'Isuzu',),
            (          'Kia',),
            (   'Ambassador',),
            (           'MG',),
            (       'Daewoo',),
            (         'Opel',),
            (      'Peugeot',),
            (        'Ashok',)],
           names=['brand'])

In [35]:
pd.get_dummies(df.replace(repl,"other")).head()

Unnamed: 0,km_driven,selling_price,brand_BMW,brand_Chevrolet,brand_Ford,brand_Honda,brand_Hyundai,brand_Mahindra,brand_Maruti,brand_Renault,...,brand_other,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,145500,450000,False,False,False,False,False,False,True,False,...,False,False,True,False,False,True,False,False,False,False
1,120000,370000,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
2,140000,158000,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
3,127000,225000,False,False,False,False,True,False,False,False,...,False,False,True,False,False,True,False,False,False,False
4,120000,130000,False,False,False,False,False,False,True,False,...,False,False,False,False,True,True,False,False,False,False
