# Encoding

* The features in our data sometimes come in the form of labels or categories
* Most ML models cannot work directly with data in this form
* These values must be converted to numbers in some way

In [1]:
import pandas as pd
import numpy as np

# Toy dataset with three categorical features, declared one column at a time.
dataset = pd.DataFrame({
    "Country": ["Mexico", "Colombia", "Guinea Equatorial", "Mexico", "Colombia"],
    "Marital status": ["Married", "Single", "Divorced", "Single", "Single"],
    "Education": ["High school", "Undergraduate", "College", "Primary", "Primary"],
})

# Last expression of the cell: display the frame.
dataset

Unnamed: 0,Country,Marital status,Education
0,Mexico,Married,High school
1,Colombia,Single,Undergraduate
2,Guinea Equatorial,Divorced,College
3,Mexico,Single,Primary
4,Colombia,Single,Primary


# One-Hot encoding

* Converts a categorical variable into a matrix of zeros and ones
* Each column represents one value of the category
* Each observation is a row
* If the observation has that value for the category, the entry is 1; otherwise it is 0

In [2]:
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Create a one-hot encoder with no arguments, i.e. scikit-learn's default settings.
encoder = OneHotEncoder()

In [4]:
# Fit on the Country column only. The double brackets select it as a
# one-column DataFrame rather than a Series.
encoder.fit(dataset[['Country']])

In [9]:
# Encode every country as a one-hot row and keep the result for the cells below.
country_encoded = encoder.transform(dataset[['Country']])
# Convert to a dense matrix so the zeros and ones can be displayed.
country_encoded.todense()

matrix([[0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.]])

In [10]:
# Categories learned during fit. Column i of the encoded matrix corresponds to
# the i-th entry of this array (e.g. 'Mexico' is the third column).
encoder.categories_

[array(['Colombia', 'Guinea Equatorial', 'Mexico'], dtype=object)]

In [11]:
# Map the one-hot rows back to the original country labels.
encoder.inverse_transform(country_encoded)

array([['Mexico'],
       ['Colombia'],
       ['Guinea Equatorial'],
       ['Mexico'],
       ['Colombia']], dtype=object)

### What if the encoder receives a country it was not fitted on?

In [12]:
# Two encoders that differ only in how they treat a category not seen during fit:
# 'error' raises a ValueError, 'ignore' encodes the row as all zeros.
error_encoder = OneHotEncoder(handle_unknown='error')
ignore_encoder = OneHotEncoder(handle_unknown='ignore')

In [13]:
# Fit both encoders on the same Country column so they know the same categories.
error_encoder.fit(dataset[['Country']])
ignore_encoder.fit(dataset[['Country']])

In [14]:
# A single observation whose country does not appear in `dataset`.
new_data = pd.DataFrame({'Country': ['Costa Rica']})
new_data

Unnamed: 0,Country
0,Costa Rica


In [15]:
# The country in new_data was not seen during fit, so this call raises ValueError
# (the exception is the point of this cell).
error_encoder.transform(new_data).todense()

ValueError: Found unknown categories ['Costa Rica'] in column 0 during transform

In [16]:
# Same unseen country: no known category matches, so every column is 0.
ignore_encoder.transform(new_data).todense()

matrix([[0., 0., 0.]])

# Ordinal Encoding

* Preserves the hierarchy between categories
* Use it with caution: define a hierarchy between categories only if that order really exists in real life

In [17]:
from sklearn.preprocessing import OrdinalEncoder

In [18]:
# Declare the education levels in the intended order; each label is encoded
# as its position in this list.
ordinal_encoder = OrdinalEncoder(
    categories=[
        ["Primary", "Secondary", "High school", "Undergraduate", "College"],
    ],
)

In [19]:
# Fit on the Education column, selected as a one-column DataFrame.
ordinal_encoder.fit(dataset[['Education']])

In [20]:
# Replace each label by its index in the declared category list
# (Primary -> 0, ..., College -> 4).
ordinal_encoder.transform(dataset[['Education']])

array([[2.],
       [3.],
       [4.],
       [0.],
       [0.]])

In [22]:
# Show the original labels for comparison with the encoded values.
dataset[['Education']]

Unnamed: 0,Education
0,High school
1,Undergraduate
2,College
3,Primary
4,Primary


### What if the encoder receives a value outside the declared categories?

In [25]:
error_encoder_encoder = OrdinalEncoder(categories=[[
    "Primary", "Secondary", "High school", "Undergraduate","College"
]], handle_unknown='error')

In [26]:
error_encoder.fit(dataset[['Education']])

In [27]:
error_encoder.transform([["Kindergarten"]])



ValueError: Found unknown categories ['Kindergarten'] in column 0 during transform

In [30]:
# Ordinal encoder that maps any value outside the declared levels to NaN
# (handle_unknown='use_encoded_value' together with unknown_value).
default_encoder = OrdinalEncoder(
    categories=[
        ["Primary", "Secondary", "High school", "Undergraduate", "College"],
    ],
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)

In [31]:
# Fit on the Education column, selected as a one-column DataFrame.
default_encoder.fit(dataset[['Education']])

In [32]:
# "Kindergarten" is not a declared level, so it is encoded as the configured
# unknown_value (NaN).
default_encoder.transform([["Kindergarten"]])



array([[nan]])