## One Hot Encoding - variables with many categories

In [3]:
# Source: https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/data
# Only the six categorical columns X1..X6 are loaded for this demonstration.
import pandas as pd
data = pd.read_csv(
    "mercedesbenz.csv",
    usecols=["X1", "X2", "X3", "X4", "X5", "X6"],
)
data.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,n,f,d,t,a
1,b,ai,a,d,b,g
2,v,as,f,d,a,j
3,l,n,f,d,z,l
4,s,as,c,d,y,i


In [4]:
# Report the number of distinct labels per column to spot the high-cardinality variables.
for column_name in data.columns:
    distinct_labels = data[column_name].unique()
    print(column_name, ":", len(distinct_labels), "labels")

X1 : 27 labels
X2 : 45 labels
X3 : 7 labels
X4 : 4 labels
X5 : 32 labels
X6 : 12 labels


In [5]:
# Size of a full one-hot encoding: with drop_first=True each variable contributes
# (number of labels - 1) columns, so the shape shows how wide the frame would become.
all_dummies = pd.get_dummies(data, drop_first=True)
all_dummies.shape

(4209, 121)

In [7]:
#Let's look at the 20 most frequent categories of the variable X2 and how often each occurs
data.X2.value_counts().head(20)

as    1658
ae     478
ai     462
m      348
ak     260
r      155
n      113
s      100
f       85
e       84
ay      78
aq      72
a       44
b       38
t       25
k       25
ag      23
ac      20
ao      19
i       15
Name: X2, dtype: int64

In [10]:
# value_counts() sorts by frequency, so the first 10 index entries are the
# 10 most frequent X2 labels; keep them as a plain Python list.
top_10 = list(data['X2'].value_counts().index[:10])
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [16]:
# Add one 0/1 indicator column per top-10 X2 label; each column is named after its label.
import numpy as np
for category in top_10:
    matches_category = data['X2'].eq(category)
    data[category] = np.where(matches_category, 1, 0)

# Show X2 next to its new indicator columns to verify the encoding.
data[['X2', *top_10]].head(10)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,n,0,0,0,0,0,0,1,0,0,0
1,ai,0,0,1,0,0,0,0,0,0,0
2,as,1,0,0,0,0,0,0,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,as,1,0,0,0,0,0,0,0,0,0
5,ai,0,0,1,0,0,0,0,0,0,0
6,ae,0,1,0,0,0,0,0,0,0,0
7,ae,0,1,0,0,0,0,0,0,0,0
8,s,0,0,0,0,0,0,0,1,0,0
9,as,1,0,0,0,0,0,0,0,0,0


In [18]:
#get whole set of dummy variables, for all the categorical variables

def one_hot_top_x(df, variable, top_x_labels):
    """Add a 0/1 indicator column to ``df`` for each label in ``top_x_labels``.

    For every ``label`` a new column named ``<variable>_<label>`` is written
    into ``df``: 1 where ``df[variable] == label`` and 0 elsewhere. Rows whose
    value is not among ``top_x_labels`` get 0 in all of the new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    variable : str
        Name of the categorical column in ``df`` to encode.
    top_x_labels : iterable
        Labels to create indicator columns for (e.g. the most frequent ones).
        Vary its length to change how many labels are encoded.

    Returns
    -------
    None
    """
    for label in top_x_labels:
        # Compare against the frame that was passed in (not a notebook global),
        # so the function works for any DataFrame. The f-string also lets
        # non-string labels be used in the column name.
        df[f'{variable}_{label}'] = np.where(df[variable] == label, 1, 0)
# Reload the six raw columns so the indicator columns added earlier are discarded.
data = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

# Add X2_<label> indicators; top_10 is expected to still hold the X2 labels computed earlier.
one_hot_top_x(df=data, variable='X2', top_x_labels=top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,n,f,d,t,a,0,0,0,0,0,0,1,0,0,0
1,b,ai,a,d,b,g,0,0,1,0,0,0,0,0,0,0
2,v,as,f,d,a,j,1,0,0,0,0,0,0,0,0,0
3,l,n,f,d,z,l,0,0,0,0,0,0,1,0,0,0
4,s,as,c,d,y,i,1,0,0,0,0,0,0,0,0,0


In [19]:
# X1: take its 10 most frequent labels and add an X1_<label> indicator for each.
top_10 = list(data['X1'].value_counts().head(10).index)
one_hot_top_x(data, 'X1', top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X1_aa,X1_s,X1_l,X1_b,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,v,n,f,d,t,a,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,b,ai,a,d,b,g,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,v,as,f,d,a,j,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,l,n,f,d,z,l,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,s,as,c,d,y,i,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [20]:
# X3: asks for the 10 most frequent labels; X3 has fewer than 10 distinct labels,
# so head(10) simply returns all of them and every label gets an indicator.
top_10 = list(data['X3'].value_counts().head(10).index)
one_hot_top_x(data, 'X3', top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X1_a,X1_c,X1_o,X3_c,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b
0,v,n,f,d,t,a,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,b,ai,a,d,b,g,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,v,as,f,d,a,j,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,l,n,f,d,z,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,s,as,c,d,y,i,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [21]:
# X4: same procedure; with fewer than 10 distinct labels, all of them are encoded.
top_10 = list(data['X4'].value_counts().head(10).index)
one_hot_top_x(data, 'X4', top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b,X4_d,X4_b,X4_c,X4_a
0,v,n,f,d,t,a,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,b,ai,a,d,b,g,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
2,v,as,f,d,a,j,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,l,n,f,d,z,l,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,s,as,c,d,y,i,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [22]:
# X5: keep the 10 most frequent labels; rows with any other label get 0 in every X5_* column.
top_10 = list(data['X5'].value_counts().head(10).index)
one_hot_top_x(data, 'X5', top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X5_v,X5_r,X5_p,X5_w,X5_af,X5_ad,X5_ac,X5_n,X5_l,X5_s
0,v,n,f,d,t,a,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,b,ai,a,d,b,g,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,v,as,f,d,a,j,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,l,n,f,d,z,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,s,as,c,d,y,i,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# X6: keep the 10 most frequent labels and add an X6_<label> indicator for each.
top_10 = list(data['X6'].value_counts().head(10).index)
one_hot_top_x(data, 'X6', top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_h,X6_a,X6_k,X6_c,X6_f
0,v,n,f,d,t,a,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,b,ai,a,d,b,g,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,v,as,f,d,a,j,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,l,n,f,d,z,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,s,as,c,d,y,i,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [24]:
# Keep only the indicator columns. `drop` returns a new frame and leaves `data`
# untouched, so the result is bound to its own name instead of being discarded
# after display. `columns=` already targets the column axis; no `axis` is needed.
data_encoded = data.drop(columns=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'])
data_encoded

Unnamed: 0,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_h,X6_a,X6_k,X6_c,X6_f
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
# `data` still contains the six original X1..X6 columns next to the 51 indicator
# columns (6 + 51 = 57): `DataFrame.drop` returns a new frame and does not modify `data`.
data.shape

(4209, 57)