# Robustly encoding categorical features

The categorical features in the Tabular Playground Series are stored as strings, so they must be encoded before most machine learning models can use them. I would like to build an encoding that is robust to new categories that were not seen in the training data.

# Load data

In [180]:
import pandas as pd

In [181]:
# Read the training and test tables from CSV files in the working directory,
# then preview the first rows of the training table.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,A,B,A,A,B,D,A,E,C,...,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,B,A,A,A,B,B,A,E,A,...,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,A,A,A,C,B,D,A,B,C,...,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,A,A,A,C,B,D,A,E,G,...,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,A,B,A,A,B,B,A,E,C,...,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


In [182]:
# The categorical feature columns are the ones whose name starts with 'cat'.
cat_cols = list(filter(lambda name: name.startswith('cat'), train.columns))
cat_cols

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9']

# Test sample with new categories to validate encodings

In [183]:
# Build a single synthetic test row in which every categorical feature holds
# 'Z', a label used here to stand in for a category the encoders have not seen.
# The row is assembled per column name, so it does not depend on the order of
# the columns or on how many continuous features the test table has.
new_row = {col: ('Z' if col in cat_cols else 0) for col in test.columns}
new_row['id'] = 1
test_new_cats = pd.DataFrame([new_row], columns=test.columns)
test_new_cats

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,1,Z,Z,Z,Z,Z,Z,Z,Z,Z,...,0,0,0,0,0,0,0,0,0,0


# Using the LabelEncoder
The sklearn LabelEncoder can encode strings to some integer label.

In [187]:
from sklearn.preprocessing import LabelEncoder

In [188]:
# Fit a label encoder on one categorical training column ('cat8').
le = LabelEncoder()
le.fit(train['cat8'])

LabelEncoder()

In [189]:
le.transform(['A'])

array([0])

The LabelEncoder is meant to encode the target variable, not the features. It cannot deal with previously unseen categories.

In [190]:
# 'Z' was not among the labels seen during fit, so this raises a ValueError.
le.transform(['Z'])

ValueError: y contains previously unseen labels: ['Z']

# Using pandas CategoricalDtype

The CategoricalDtype from pandas can be used in many machine learning models, including LightGBM and CatBoost. I created a simple transformer that 'learns' the available categories from the training data and encodes strings to categories.

In [221]:
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.api.types import CategoricalDtype

class CategoricalTransform(BaseEstimator, TransformerMixin):
    """Cast columns to pandas categoricals learned from the training data.

    ``fit`` records, for every column named in ``cat_cols``, a
    ``CategoricalDtype`` built from the distinct non-missing values of that
    column. ``transform`` casts the same columns of a copy of the input frame
    to those dtypes, so a value that was not present during ``fit`` becomes
    NaN instead of raising an error.

    Parameters
    ----------
    cat_cols : list of str
        Names of the columns to encode.
    """

    def __init__(self, cat_cols):
        self.cat_cols = cat_cols

    def _transform_column(self, col, col_name):
        """Cast one column to the dtype learned for ``col_name``.

        This is the per-column hook used by ``transform``; a subclass can
        override it to change how a single column is encoded.
        """
        return col.astype(self.cat_type[col_name])

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` with every column in ``cat_cols`` encoded.

        The input frame is not modified. ``fit`` must have been called first,
        because the learned dtypes are read from ``self.cat_type``.
        """
        df_cat = df.copy()
        for col in self.cat_cols:
            df_cat[col] = self._transform_column(df_cat[col], col)
        return df_cat

    def fit(self, X, y=None, **fit_params):
        """Learn the set of categories of each column in ``cat_cols``.

        Categories keep their order of first appearance in ``X``. Missing
        values are dropped before the dtype is built: ``CategoricalDtype``
        rejects null categories, and a missing value is represented as NaN
        after the cast anyway.

        Returns
        -------
        self
        """
        self.cat_type = {
            col: CategoricalDtype(X[col].dropna().unique())
            for col in self.cat_cols
        }
        return self

In [222]:
ct = CategoricalTransform(cat_cols)

In [223]:
# Learn the categories from the training data and encode it in one step;
# info() then lists the resulting dtype of every column.
t = ct.fit_transform(train)
t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 26 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   id      300000 non-null  int64   
 1   cat0    300000 non-null  category
 2   cat1    300000 non-null  category
 3   cat2    300000 non-null  category
 4   cat3    300000 non-null  category
 5   cat4    300000 non-null  category
 6   cat5    300000 non-null  category
 7   cat6    300000 non-null  category
 8   cat7    300000 non-null  category
 9   cat8    300000 non-null  category
 10  cat9    300000 non-null  category
 11  cont0   300000 non-null  float64 
 12  cont1   300000 non-null  float64 
 13  cont2   300000 non-null  float64 
 14  cont3   300000 non-null  float64 
 15  cont4   300000 non-null  float64 
 16  cont5   300000 non-null  float64 
 17  cont6   300000 non-null  float64 
 18  cont7   300000 non-null  float64 
 19  cont8   300000 non-null  float64 
 20  cont9   300000 non-null  f

## Previously unseen categories are encoded as NaN

In [224]:
test_new_cats

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,1,Z,Z,Z,Z,Z,Z,Z,Z,Z,...,0,0,0,0,0,0,0,0,0,0


In [225]:
ct.transform(test_new_cats)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,1,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0


## The transformer can be embedded in a sklearn Pipeline

In [226]:
from sklearn.pipeline import Pipeline
from lightgbm.sklearn import LGBMRegressor
# Encode the categorical columns first, then pass the frame on to LightGBM.
p = Pipeline(steps=[
    ('cat_trans', CategoricalTransform(cat_cols)),
    ('lgbm', LGBMRegressor(n_jobs=-2)),
])

In [227]:
# Separate the regression target from the features; the 'id' column is
# dropped from the features as well. Then fit the whole pipeline.
y_train = train['target']
x_train = train.drop(columns=['target', 'id'])
p.fit(x_train, y_train)

Pipeline(steps=[('cat_trans',
                 CategoricalTransform(cat_cols=['cat0', 'cat1', 'cat2', 'cat3',
                                                'cat4', 'cat5', 'cat6', 'cat7',
                                                'cat8', 'cat9'])),
                ('lgbm', LGBMRegressor(n_jobs=-2))])

In [228]:
p.predict(test.drop(columns=['id']))

array([7.62396118, 7.79491627, 7.61545603, ..., 7.47019117, 7.5067693 ,
       7.40374487])

In [229]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation of the pipeline, scored with negated RMSE and
# keeping the training scores alongside the test scores.
scores = cross_validate(
    p,
    x_train,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
    return_train_score=True,
)
scores

{'fit_time': array([1.3807838 , 1.39194703, 1.39730239, 1.37761903, 1.40416694]),
 'score_time': array([0.13198853, 0.12257838, 0.1241498 , 0.12441874, 0.12701011]),
 'test_score': array([-0.8458857 , -0.84599698, -0.84685842, -0.84714281, -0.84521346]),
 'train_score': array([-0.83652531, -0.83677879, -0.83625818, -0.83637756, -0.83685619])}

In [230]:
scores['test_score'].mean()

-0.8462194722947265

Prediction with new categories:

In [231]:
p.predict(test_new_cats.drop(columns=['id']))

array([7.34958224])

# Transform to integer value
If you need integer values, for example to train an embedding in TensorFlow, you can use the codes of the categorical feature instead. Below is a small transformer that does so.

In [243]:
class IntegerCategoricalTransform(CategoricalTransform):
    """Variant of ``CategoricalTransform`` that yields integer category codes.

    Each column is first cast to its learned categorical dtype by the parent
    class; the integer codes of that categorical are returned in place of the
    categorical values themselves.
    """

    def _transform_column(self, col, col_name):
        """Return the integer codes of ``col`` under the learned categories."""
        as_category = super()._transform_column(col, col_name)
        categorical = as_category.values
        return categorical.codes

In [244]:
ct = IntegerCategoricalTransform(cat_cols)

In [245]:
# Learn the categories from the training data and encode it as integer codes;
# info() then lists the resulting dtype of every column.
t = ct.fit_transform(train)
t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 26 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      300000 non-null  int64  
 1   cat0    300000 non-null  int8   
 2   cat1    300000 non-null  int8   
 3   cat2    300000 non-null  int8   
 4   cat3    300000 non-null  int8   
 5   cat4    300000 non-null  int8   
 6   cat5    300000 non-null  int8   
 7   cat6    300000 non-null  int8   
 8   cat7    300000 non-null  int8   
 9   cat8    300000 non-null  int8   
 10  cat9    300000 non-null  int8   
 11  cont0   300000 non-null  float64
 12  cont1   300000 non-null  float64
 13  cont2   300000 non-null  float64
 14  cont3   300000 non-null  float64
 15  cont4   300000 non-null  float64
 16  cont5   300000 non-null  float64
 17  cont6   300000 non-null  float64
 18  cont7   300000 non-null  float64
 19  cont8   300000 non-null  float64
 20  cont9   300000 non-null  float64
 21  cont10  30

## Unseen categories (like missing values) are encoded as -1

In [246]:
ct.transform(test_new_cats)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,0,0,0,0,0,0,0,0,0,0


# Using this in a sklearn Pipeline
Here I'm using CatBoost, as it is not trivial to use integer-encoded features in the sklearn API of LightGBM.

In [269]:
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
# Integer-encode the categorical columns, then train CatBoost, which is told
# through cat_features which columns are categorical.
p = Pipeline(steps=[
    ('cat_trans', IntegerCategoricalTransform(cat_cols)),
    ('cb', CatBoostRegressor(iterations=50, thread_count=3,
                             cat_features=cat_cols)),
])

In [270]:
# Separate the regression target from the features (the 'id' column is
# dropped too) and fit the CatBoost pipeline on the full training set.
y_train = train['target']
x_train = train.drop(columns=['target', 'id'])
p.fit(x_train, y_train)

Learning rate set to 0.5
0:	learn: 0.8752914	total: 65ms	remaining: 3.19s
1:	learn: 0.8695146	total: 131ms	remaining: 3.14s
2:	learn: 0.8653475	total: 186ms	remaining: 2.92s
3:	learn: 0.8615959	total: 248ms	remaining: 2.85s
4:	learn: 0.8593585	total: 305ms	remaining: 2.74s
5:	learn: 0.8575339	total: 368ms	remaining: 2.7s
6:	learn: 0.8564736	total: 419ms	remaining: 2.57s
7:	learn: 0.8556105	total: 473ms	remaining: 2.48s
8:	learn: 0.8546407	total: 526ms	remaining: 2.4s
9:	learn: 0.8539448	total: 579ms	remaining: 2.31s
10:	learn: 0.8531275	total: 632ms	remaining: 2.24s
11:	learn: 0.8524850	total: 690ms	remaining: 2.18s
12:	learn: 0.8519798	total: 743ms	remaining: 2.11s
13:	learn: 0.8514680	total: 797ms	remaining: 2.05s
14:	learn: 0.8510933	total: 852ms	remaining: 1.99s
15:	learn: 0.8505936	total: 902ms	remaining: 1.92s
16:	learn: 0.8502564	total: 954ms	remaining: 1.85s
17:	learn: 0.8498441	total: 1.01s	remaining: 1.79s
18:	learn: 0.8495924	total: 1.06s	remaining: 1.73s
19:	learn: 0.849304

Pipeline(steps=[('cat_trans',
                 IntegerCategoricalTransform(cat_cols=['cat0', 'cat1', 'cat2',
                                                       'cat3', 'cat4', 'cat5',
                                                       'cat6', 'cat7', 'cat8',
                                                       'cat9'])),
                ('cb',
                 <catboost.core.CatBoostRegressor object at 0x7f62683bdca0>)])

In [271]:
p.predict(test.drop(columns=['id']))

array([7.56977581, 7.84586832, 7.6672673 , ..., 7.52127828, 7.50150109,
       7.38816294])

In [272]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation of the CatBoost pipeline, scored with negated RMSE
# and keeping the training scores alongside the test scores.
scores = cross_validate(
    p,
    x_train,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
    return_train_score=True,
)
scores

Learning rate set to 0.5
0:	learn: 0.8749972	total: 49.4ms	remaining: 2.42s
1:	learn: 0.8695887	total: 100ms	remaining: 2.41s
2:	learn: 0.8658750	total: 148ms	remaining: 2.31s
3:	learn: 0.8621802	total: 194ms	remaining: 2.23s
4:	learn: 0.8603197	total: 238ms	remaining: 2.14s
5:	learn: 0.8584933	total: 286ms	remaining: 2.1s
6:	learn: 0.8572564	total: 329ms	remaining: 2.02s
7:	learn: 0.8561371	total: 371ms	remaining: 1.95s
8:	learn: 0.8555310	total: 412ms	remaining: 1.87s
9:	learn: 0.8546609	total: 454ms	remaining: 1.81s
10:	learn: 0.8540354	total: 496ms	remaining: 1.76s
11:	learn: 0.8535526	total: 538ms	remaining: 1.71s
12:	learn: 0.8528470	total: 578ms	remaining: 1.65s
13:	learn: 0.8522906	total: 621ms	remaining: 1.6s
14:	learn: 0.8517395	total: 664ms	remaining: 1.55s
15:	learn: 0.8511970	total: 708ms	remaining: 1.5s
16:	learn: 0.8506601	total: 750ms	remaining: 1.46s
17:	learn: 0.8502955	total: 792ms	remaining: 1.41s
18:	learn: 0.8498646	total: 835ms	remaining: 1.36s
19:	learn: 0.84958

12:	learn: 0.8514679	total: 598ms	remaining: 1.7s
13:	learn: 0.8510980	total: 639ms	remaining: 1.64s
14:	learn: 0.8507506	total: 681ms	remaining: 1.59s
15:	learn: 0.8503690	total: 723ms	remaining: 1.54s
16:	learn: 0.8500481	total: 764ms	remaining: 1.48s
17:	learn: 0.8495696	total: 807ms	remaining: 1.43s
18:	learn: 0.8491125	total: 852ms	remaining: 1.39s
19:	learn: 0.8490056	total: 883ms	remaining: 1.32s
20:	learn: 0.8487024	total: 926ms	remaining: 1.28s
21:	learn: 0.8484041	total: 969ms	remaining: 1.23s
22:	learn: 0.8480921	total: 1.01s	remaining: 1.19s
23:	learn: 0.8476652	total: 1.05s	remaining: 1.14s
24:	learn: 0.8472241	total: 1.09s	remaining: 1.09s
25:	learn: 0.8468791	total: 1.14s	remaining: 1.05s
26:	learn: 0.8465537	total: 1.18s	remaining: 1s
27:	learn: 0.8462464	total: 1.22s	remaining: 959ms
28:	learn: 0.8459922	total: 1.26s	remaining: 915ms
29:	learn: 0.8457167	total: 1.31s	remaining: 872ms
30:	learn: 0.8454435	total: 1.35s	remaining: 828ms
31:	learn: 0.8451320	total: 1.39s	r

{'fit_time': array([6.54078436, 6.66119599, 6.72755527, 6.57151961, 6.61343169]),
 'score_time': array([1.0065434 , 1.01369572, 1.02134514, 1.01544619, 1.01414657]),
 'test_score': array([-0.84647447, -0.84604765, -0.84793436, -0.84749195, -0.84542941]),
 'train_score': array([-0.84114496, -0.84112825, -0.84081544, -0.84076416, -0.84156822])}

In [273]:
scores['test_score'].mean()

-0.8466755696380319

In [274]:
p.predict(test_new_cats.drop(columns=['id']))

array([7.07252767])

# One hot encoding
Robust one-hot encoding can be achieved by chaining the categorical transformer with the OneHotTransform below.

In [304]:
from sklearn.base import BaseEstimator, TransformerMixin

class OneHotTransform(BaseEstimator, TransformerMixin):
    """Stateless transformer that one-hot encodes a frame via ``pd.get_dummies``.

    It is meant to run after a step that has already cast the categorical
    columns to a fixed categorical dtype, so that the set of indicator
    columns comes from that dtype rather than from the values in the frame.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, df, **transform_params):
        """Return the result of ``pd.get_dummies`` applied to ``df``."""
        dummies = pd.get_dummies(df)
        return dummies

In [305]:
# Cast to learned categoricals first, then expand them into indicator columns.
oh_pipe = Pipeline(steps=[
    ('cat_trans', CategoricalTransform(cat_cols)),
    ('oh_trans', OneHotTransform()),
])

In [306]:
# Fit the one-hot pipeline on the training frame and encode it. The frame is
# `train`: the name `train_cat` used here before is not defined anywhere in
# this notebook.
train_oh = oh_pipe.fit_transform(train)
train_oh.head()

Unnamed: 0,id,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,...,cat9_L,cat9_G,cat9_H,cat9_O,cat9_A,cat9_J,cat9_M,cat9_C,cat9_D,cat9_E
0,1,0.923191,0.684968,0.124454,0.217886,0.281421,0.881122,0.42165,0.741413,0.895799,...,0,0,0,0,0,0,0,0,0,0
1,2,0.437627,0.014213,0.357438,0.846127,0.282354,0.440011,0.34623,0.278495,0.593413,...,0,0,0,0,0,0,0,0,0,0
2,3,0.732209,0.760122,0.454644,0.81299,0.293756,0.914155,0.369602,0.832564,0.86562,...,0,0,0,0,0,0,0,0,0,0
3,4,0.705142,0.771678,0.153735,0.732893,0.769785,0.934138,0.57893,0.407313,0.868099,...,0,0,0,0,0,0,0,0,0,0
4,6,0.486063,0.639349,0.496212,0.354186,0.279105,0.3826,0.70594,0.325193,0.440967,...,0,0,0,0,0,0,0,0,0,0


In [300]:
# Indicator columns generated for the 'cat5' feature.
cat5_cols = list(filter(lambda name: name.startswith('cat5'), train_oh.columns))
cat5_cols

['cat5_D', 'cat5_B', 'cat5_A', 'cat5_C']

In [301]:
train_oh[cat5_cols]

Unnamed: 0,cat5_D,cat5_B,cat5_A,cat5_C
0,1,0,0,0
1,0,1,0,0
2,1,0,0,0
3,1,0,0,0
4,0,1,0,0
...,...,...,...,...
299995,0,1,0,0
299996,0,1,0,0
299997,0,1,0,0
299998,0,1,0,0


## New categories are encoded as zeros in every indicator column of the feature

In [303]:
oh_pipe.transform(test_new_cats)[cat5_cols]

Unnamed: 0,cat5_D,cat5_B,cat5_A,cat5_C
0,0,0,0,0


# Using this in a sklearn Pipeline
The sklearn RandomForestRegressor does not support categorical variables, so in this example I use one-hot encoding for the categorical features. Note that the one-hot pipeline defined before can be used as an element in the new pipeline.

In [308]:
from sklearn.ensemble import RandomForestRegressor
# Reuse the one-hot pipeline as the first step and feed its output to a
# random forest.
p = Pipeline(steps=[
    ('oh_trans', oh_pipe),
    ('rf', RandomForestRegressor(n_jobs=-2)),
])

In [309]:
# Separate the regression target from the features (the 'id' column is
# dropped too) and fit the pipeline on the first 10,000 rows only.
y_train = train['target']
x_train = train.drop(columns=['target', 'id'])
p.fit(x_train.head(10000), y_train.head(10000))

Pipeline(steps=[('oh_trans',
                 Pipeline(steps=[('cat_trans',
                                  CategoricalTransform(cat_cols=['cat0', 'cat1',
                                                                 'cat2', 'cat3',
                                                                 'cat4', 'cat5',
                                                                 'cat6', 'cat7',
                                                                 'cat8',
                                                                 'cat9'])),
                                 ('oh_trans', OneHotTransform())])),
                ('rf', RandomForestRegressor(n_jobs=-2))])

In [310]:
p.predict(test.drop(columns=['id']))

array([7.29935054, 7.77576226, 7.68424659, ..., 7.51558685, 7.40574406,
       7.52332792])

In [311]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation on the first 10,000 rows, scored with negated RMSE
# and keeping the training scores alongside the test scores.
scores = cross_validate(
    p,
    x_train.head(10000),
    y_train.head(10000),
    scoring='neg_root_mean_squared_error',
    cv=5,
    return_train_score=True,
)
scores

{'fit_time': array([3.27622175, 2.31571436, 2.31708813, 2.31778646, 2.32605338]),
 'score_time': array([0.05249524, 0.04676676, 0.04955649, 0.05177331, 0.0508647 ]),
 'test_score': array([-0.86127785, -0.87747438, -0.86341516, -0.86767243, -0.86399085]),
 'train_score': array([-0.3248739 , -0.32360226, -0.32486811, -0.32363932, -0.32612375])}

In [312]:
scores['test_score'].mean()

-0.8667661326947869

In [313]:
p.predict(test_new_cats.drop(columns=['id']))

array([7.30321088])