In [1]:
# Mount Google Drive at /content/drive in this Colab runtime; a later cell reads
# the dataset CSV from a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# One Hot Encoding

One hot encoding consists of replacing a categorical variable with a set of boolean variables, each of which takes the value 0 or 1 to indicate whether a given category (label) of the variable is present for that observation.

In [None]:
import pandas as pd
import numpy as np

# Read only the six categorical columns (X1-X6) of the Mercedes-Benz CSV from
# the mounted Drive folder, then preview the first rows.
data = pd.read_csv(
    "/content/drive/MyDrive/Feature Engineering/mercedesbenz.csv",
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [None]:
# Report how many distinct labels each column holds; this is the number of
# dummy columns a full one hot encoding of that column would create.
for col in data.columns:
    n_labels = len(data[col].unique())
    print(col, ':', n_labels, ' labels')

X1 : 27  labels
X2 : 44  labels
X3 : 7  labels
X4 : 4  labels
X5 : 29  labels
X6 : 12  labels


In [None]:
# Build the dummy columns first, then place them next to the original columns
# and preview the first rows of the combined frame.
dummies = pd.get_dummies(data)
pd.concat([data, dummies], axis=1).head()

Unnamed: 0,Sex,Sex_female,Sex_male
0,male,0,1
1,male,0,1
2,male,0,1
3,male,0,1
4,male,0,1


In [None]:
# One hot encoding into k-1 dummies per variable: drop_first=True drops the
# first level of every variable. Display the (rows, columns) shape of the result.
encoded = pd.get_dummies(data, drop_first=True)
encoded.shape

(4209, 117)

We can see that, starting from just 6 categorical variables, we end up with 117 new variables.

These numbers are still not huge, and in practice we could work with them relatively easily.


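When a variable has many labels, one solution is to limit one hot encoding to the 10 most frequent labels of the variable; observations with any other label then get 0 in all of the new columns.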

In [None]:
# Count how often each X2 label occurs and show the 10 most frequent ones.
x2_counts = data['X2'].value_counts()
x2_counts.sort_values(ascending=False).head(10)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
Name: X2, dtype: int64

In [None]:
# Keep only the labels (the index of the counts) of the 10 most frequent
# X2 categories, as a plain Python list.
x2_counts_desc = data['X2'].value_counts().sort_values(ascending=False)
top_10 = list(x2_counts_desc.head(10).index)
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [None]:
# Add one 0/1 indicator column per top-10 label: 1 where X2 equals the label,
# 0 otherwise. Rows whose X2 value is outside the top 10 stay all-zero.
for label in top_10:
    is_label = data['X2'] == label
    data[label] = np.where(is_label, 1, 0)

# Compare the original X2 values with the new indicator columns.
data[['X2', *top_10]].head(10)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [None]:
def one_hot_top_x(df, variable, top_x_labels):
    """Add one 0/1 indicator column per label in ``top_x_labels`` to ``df``.

    For each label, a column named after the label is written into ``df``
    in place. It holds 1 on rows where ``df[variable]`` equals that label and
    0 elsewhere, so rows whose value is not among ``top_x_labels`` get 0 in
    every new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place; must contain the column ``variable``.
    variable : str
        Name of the categorical column to encode.
    top_x_labels : iterable
        Labels of ``variable`` for which indicator columns are created.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    KeyError
        If ``variable`` is not a column of ``df``.
    """
    # Use the arguments rather than the notebook-level `top_10` / `data`
    # globals, so the helper works for any frame, column and label list.
    for label in top_x_labels:
        df[label] = np.where(df[variable] == label, 1, 0)

In [None]:
# Reload a fresh copy of the six categorical columns so the helper is applied
# to a frame without the indicator columns added earlier. The file is read from
# the same mounted Drive location as the initial load, so the notebook runs top
# to bottom in a single environment.
data = pd.read_csv(
    "/content/drive/MyDrive/Feature Engineering/mercedesbenz.csv",
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [None]:
# Encode the 10 most frequent X2 labels on the freshly loaded frame (the helper
# writes the new columns into the frame it is given), then preview the result.
one_hot_top_x(df=data, variable='X2', top_x_labels=top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,as,ae,ai,m,ak,r,n,s,f,e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0
