# Encoding Categorical Variables

## prepare the dataset

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
sns.set()

In [5]:
# Load Visadataset.csv (path relative to the working directory) into the
# `visa` DataFrame that the rest of the notebook operates on.
visa = pd.read_csv("Visadataset.csv")

In [6]:
# Treat the '?' placeholder as a missing value.
# DataFrame.replace / DataFrame.fillna return NEW objects and leave the
# original untouched, so every result has to be assigned back to take effect.
visa = visa.replace('?', np.nan)

# Split the columns by dtype: object columns are treated as categorical,
# everything else as numeric.
cat_cols = [col for col in visa.columns if visa[col].dtype == 'O']
num_cols = [col for col in visa.columns if visa[col].dtype != 'O']

# Fill the gaps: a sentinel label for categorical columns, 0 for numeric ones.
visa[cat_cols] = visa[cat_cols].fillna("missing")
visa[num_cols] = visa[num_cols].fillna(0)

# Show the (now filled) numeric columns as the cell output.
visa[num_cols]

Unnamed: 0,no_of_employees,yr_of_estab,prevailing_wage
0,14513,2007,592.2029
1,2412,2002,83425.6500
2,44444,2008,122996.8600
3,98,1897,83434.0300
4,1082,2005,149907.3900
...,...,...,...
25475,2601,2008,77092.5700
25476,3274,2006,279174.7900
25477,1121,1910,146298.8500
25478,1918,1887,86154.7700


In [7]:
# Preview the first five rows of the dataset.
visa.head(n=5)

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


## Ordinal Encoding

In [5]:
# List the distinct education levels present in the data.
visa['education_of_employee'].unique()

array(['High School', "Master's", "Bachelor's", 'Doctorate'], dtype=object)

In [6]:
# education_of_employee is an ordinal category, so encode it with integers
# that follow the order of the levels (1 = High School ... 4 = Doctorate).
temp_dict = {
    level: rank
    for rank, level in enumerate(
        ["High School", "Bachelor's", "Master's", "Doctorate"], start=1
    )
}
visa['education_of_employee_encoded'] = visa['education_of_employee'].map(temp_dict)
visa.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,education_of_employee_encoded
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,1
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified,3
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied,2
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied,2
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified,3


## One Hot Encoding


In [None]:
# Some articles distinguish between the two terms:
#   one-hot encoding -> N features for N categories in a feature
#   dummy encoding   -> N-1 features for N categories in a feature

In [28]:
# List the distinct continents present in the data.
visa['continent'].unique()

array(['Asia', 'Africa', 'North America', 'Europe', 'South America',
       'Oceania'], dtype=object)

In [27]:
# List the distinct employment regions present in the data.
visa['region_of_employment'].unique()

array(['West', 'Northeast', 'South', 'Midwest', 'Island'], dtype=object)

In [7]:
# List the distinct values of the has_job_experience flag.
visa['has_job_experience'].unique()

array(['N', 'Y'], dtype=object)

In [12]:
# Dummy-encode has_job_experience with pandas.get_dummies.
# drop_first=True drops the first level, leaving N-1 indicator columns
# (here a single job_exp_Y column). The result is only displayed, not stored.
pd.get_dummies(
    data=visa,
    columns=['has_job_experience'],
    prefix="job_exp",
    prefix_sep="_",
    drop_first=True,
).head()

Unnamed: 0,case_id,continent,education_of_employee,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,education_of_employee_encoded,job_exp_Y
0,EZYV01,Asia,High School,N,14513,2007,West,592.2029,Hour,Y,Denied,1,0
1,EZYV02,Asia,Master's,N,2412,2002,Northeast,83425.65,Year,Y,Certified,3,1
2,EZYV03,Asia,Bachelor's,Y,44444,2008,West,122996.86,Year,Y,Denied,2,0
3,EZYV04,Asia,Bachelor's,N,98,1897,West,83434.03,Year,Y,Denied,2,0
4,EZYV05,Africa,Master's,N,1082,2005,South,149907.39,Year,Y,Certified,3,1


In [24]:
# One-hot encode the same column with an encoder object.
# NOTE: the encoder instantiated below is category_encoders' OneHotEncoder
# (ce.OneHotEncoder); the sklearn class imported on the next line shares the
# name but is not used in this cell.
from sklearn.preprocessing import OneHotEncoder
encoder = ce.OneHotEncoder(
    cols='has_job_experience',
    use_cat_names=True,
    return_df=True,
    handle_unknown='return_nan',
)

# Fit the encoder on the data and transform it in one step.
data_encoded = encoder.fit_transform(visa)
data_encoded.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience_N,has_job_experience_Y,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,education_of_employee_encoded
0,EZYV01,Asia,High School,1.0,0.0,N,14513,2007,West,592.2029,Hour,Y,Denied,1
1,EZYV02,Asia,Master's,0.0,1.0,N,2412,2002,Northeast,83425.65,Year,Y,Certified,3
2,EZYV03,Asia,Bachelor's,1.0,0.0,Y,44444,2008,West,122996.86,Year,Y,Denied,2
3,EZYV04,Asia,Bachelor's,1.0,0.0,N,98,1897,West,83434.03,Year,Y,Denied,2
4,EZYV05,Africa,Master's,0.0,1.0,N,1082,2005,South,149907.39,Year,Y,Certified,3


## Label Encoding

In [34]:
from sklearn.preprocessing import LabelEncoder

# Replace each region name with an integer code produced by LabelEncoder.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels rather than input features — acceptable here for demonstration.
visa['region_of_employment_encoded'] = LabelEncoder().fit_transform(
    visa['region_of_employment']
)
visa.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,education_of_employee_encoded,region_of_employment_encoded
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,1,4
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified,3,2
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied,2,4
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied,2,4
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified,3,3


In [42]:
# Remove the encoded column again so the next cell can rebuild it with a
# different method.
visa.drop(columns='region_of_employment_encoded', inplace=True)

In [44]:
# pandas.factorize also maps every category to an integer code. Unlike
# LabelEncoder (which numbers the sorted classes), it numbers categories in
# order of first appearance, so the concrete codes differ.
# factorize returns a (codes, uniques) pair; `codes` is already a 1-D array
# aligned with the rows, so it is assigned directly — no reshape to a 2-D
# (n, 1) array is needed for a single column.
visa['region_of_employment_encoded'] = pd.factorize(visa['region_of_employment'])[0]
visa.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,education_of_employee_encoded,region_of_employment_encoded
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,1,0
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified,3,1
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied,2,0
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied,2,0
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified,3,2


## Helmert Encoding

In [52]:
# Helmert-encode requires_job_training with category_encoders.
# drop_invariant=True asks the encoder to drop columns with zero variance.
encoder = ce.HelmertEncoder(
    drop_invariant=True,
    cols=['requires_job_training'],
)
dfh = encoder.fit_transform(visa['requires_job_training'])
dfh.head()

Unnamed: 0,requires_job_training_0
0,-1.0
1,-1.0
2,1.0
3,-1.0
4,-1.0


## Binary Encoding


![image.png](attachment:image.png)

In [50]:
# Binary-encode the continent feature: each category is represented by the
# bits of an integer code, spread over the continent_<i> columns.
encoder = ce.BinaryEncoder(
    cols=['continent'],
)
df = encoder.fit_transform(visa['continent'])
df.head()

Unnamed: 0,continent_0,continent_1,continent_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,1,0


## Frequency Encoding


In [59]:
# Frequency encoding: replace each continent with the share of rows that
# belong to it (occurrences of the category / total number of rows).
fe = visa['continent'].value_counts() / len(visa)
visa['continent_encoded'] = visa['continent'].map(fe)
visa.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,education_of_employee_encoded,region_of_employment_encoded,continent_encoded
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,1,0,0.661735
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified,3,1,0.661735
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied,2,0,0.661735
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied,2,0,0.661735
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified,3,2,0.021625


In [61]:
# Remove the frequency-encoded column again.
visa.drop(columns='continent_encoded', inplace=True)

## Mean Encoding / Target Encoding

* It is similar to label encoding, but in this method the labels are correlated with the target variable.
  In mean target encoding, the value assigned to each category of the feature is the mean value of the target variable for that category on the training data.
* connections are bounded within the categories and target itself.
* it does not affect the volume of the data and helps in faster learning

**steps -**
1) select feature
2) Find sum of target (1 or 0) for each category in feature
3) Find count of category for each category in feature
4) Divide step 2 by step 3; the result is the mean
5) map the mean of each category

* This method can cause overfitting (solution: smoothing)

* when to use - 
1) High-cardinality features
2) Domain-motivated features

* ref - https://www.kaggle.com/code/ryanholbrook/target-encoding

![image.png](attachment:image.png)

In [70]:
# Mean (target) encoding of the continent feature.

# Build a binary target from case_status: Denied -> 0, Certified -> 1.
temp = dict(Denied=0, Certified=1)
visa['target'] = visa['case_status'].map(temp)

# The per-continent mean of the 0/1 target is sum / count, i.e. steps 2-4.
mean_encode = visa.groupby('continent')['target'].mean()
# Step 5: map every row's continent to that continent's mean.
visa['continent_mean_encoded'] = visa['continent'].map(mean_encode)
visa.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,education_of_employee_encoded,region_of_employment_encoded,target,continent_mean_encoded
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,1,0,0,0.653105
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified,3,1,1,0.653105
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied,2,0,0,0.653105
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied,2,0,0,0.653105
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified,3,2,1,0.720508


## Effect Encoding

* This encoding technique is also known as Deviation Encoding or Sum Encoding. Effect encoding is almost similar to dummy encoding, with a little difference. In dummy coding, we use 0 and 1 to represent the data but in effect encoding, we use three values i.e. 1,0, and -1.

In [8]:
# Preview the first five rows before applying effect encoding.
visa.head(n=5)

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [28]:
# Apply effect (sum / deviation) encoding to the continent column.
# SumEncoder is reached through the package-level name ce.SumEncoder, which
# refers to the same class as ce.sum_coding.SumEncoder.
encoder = ce.SumEncoder(
    cols=['continent'],
    verbose=False,
)
encoder.fit_transform(visa).head()

Unnamed: 0,intercept,case_id,continent_0,continent_1,continent_2,continent_3,continent_4,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,1,EZYV01,1.0,0.0,0.0,0.0,0.0,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,1,EZYV02,1.0,0.0,0.0,0.0,0.0,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,1,EZYV03,1.0,0.0,0.0,0.0,0.0,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,1,EZYV04,1.0,0.0,0.0,0.0,0.0,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,1,EZYV05,0.0,1.0,0.0,0.0,0.0,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


## Hash Encoder

* In this method the N categories of a feature are hashed (using a hashing algorithm, MD5 by default) into a fixed number of new features (we can choose any number of features)

* if we have 100 categories in feature we can encode it into any number of new features with this method

* This method can create a problem of collision and loss of information

In [26]:
# Try the hashing encoder on the continent column.
# n_components=2 hashes the categories into two output columns, named
# col_0 and col_1, which replace the original continent column.
encoder = ce.HashingEncoder(
    cols='continent',
    n_components=2,
)
encoder.fit_transform(visa).head()

Unnamed: 0,col_0,col_1,case_id,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,0,1,EZYV01,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,0,1,EZYV02,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,0,1,EZYV03,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,0,1,EZYV04,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,1,0,EZYV05,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


## Base N encoding

* when we have large number of categories which can not be handled by binary encoding (base 2) then we can use base N encoding (base 4, base 8, etc)

In [37]:
# Try base-N encoding on the continent feature.
# base=2 is used here; a larger base (4, 8, ...) can be passed for features
# with many categories.
encoder = ce.BaseNEncoder(
    cols='continent',
    base=2,
)
encoder.fit_transform(visa)

Unnamed: 0,case_id,continent_0,continent_1,continent_2,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,0,0,1,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,0,0,1,Master's,Y,N,2412,2002,Northeast,83425.6500,Year,Y,Certified
2,EZYV03,0,0,1,Bachelor's,N,Y,44444,2008,West,122996.8600,Year,Y,Denied
3,EZYV04,0,0,1,Bachelor's,N,N,98,1897,West,83434.0300,Year,Y,Denied
4,EZYV05,0,1,0,Master's,Y,N,1082,2005,South,149907.3900,Year,Y,Certified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25475,EZYV25476,0,0,1,Bachelor's,Y,Y,2601,2008,South,77092.5700,Year,Y,Certified
25476,EZYV25477,0,0,1,High School,Y,N,3274,2006,Northeast,279174.7900,Year,Y,Certified
25477,EZYV25478,0,0,1,Master's,Y,N,1121,1910,South,146298.8500,Year,N,Certified
25478,EZYV25479,0,0,1,Master's,Y,Y,1918,1887,West,86154.7700,Year,Y,Certified
