[Target Encoder — Category Encoders latest documentation : target-encoder](https://contrib.scikit-learn.org/categorical-encoding/targetencoder.html#target-encoder)

[scikit-learn-contrib/categorical-encoding: A library of sklearn compatible categorical variable encoders : examples](https://github.com/scikit-learn-contrib/categorical-encoding#examples)

In [1]:
from category_encoders import *
import pandas as pd
from sklearn.datasets import load_boston


# Load the Boston housing data and split it by position: the first 250 rows
# form the training set, rows 250-505 the test set. Each split is wrapped in
# its own DataFrame, so both frames start from a fresh 0-based index.
# NOTE(review): load_boston is no longer shipped with recent scikit-learn
# releases (removed in 1.2) - pin an older version or swap the loader.
bunch = load_boston()
train_rows, test_rows = slice(0, 250), slice(250, 506)
X_train, X_test = (
    pd.DataFrame(bunch.data[rows], columns=bunch.feature_names)
    for rows in (train_rows, test_rows)
)
y_train, y_test = bunch.target[train_rows], bunch.target[test_rows]

In [2]:
# Preview the first five training rows to check the column layout.
X_train.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [3]:
# How many training rows fall in each CHAS level (dropna=False would also
# list a NaN bucket if any values were missing).
X_train.loc[:, 'CHAS'].value_counts(dropna=False)

0.0    230
1.0     20
Name: CHAS, dtype: int64

In [4]:
# How many training rows fall in each RAD level (dropna=False would also
# list a NaN bucket if any values were missing).
X_train.loc[:, 'RAD'].value_counts(dropna=False)

5.0    77
4.0    71
3.0    30
8.0    24
2.0    24
6.0    15
7.0     6
1.0     3
Name: RAD, dtype: int64

In [5]:
# Mean training target per CHAS level - the raw (unsmoothed) statistic that
# target encoding is built from.
# The target is attached positionally with assign() and the 'target' column is
# selected *before* aggregating, so only one column is averaged instead of
# averaging every feature and then discarding all but one.
X_train.assign(target=y_train).groupby('CHAS')['target'].mean()

CHAS
0.0    24.248261
1.0    24.695000
Name: target, dtype: float64

In [6]:
# Mean training target per RAD level - the raw (unsmoothed) statistic that
# target encoding is built from.
# The target is attached positionally with assign() and the 'target' column is
# selected *before* aggregating, so only one column is averaged instead of
# averaging every feature and then discarding all but one.
X_train.assign(target=y_train).groupby('RAD')['target'].mean()

RAD
1.0    28.066667
2.0    26.833333
3.0    28.533333
4.0    20.826761
5.0    23.840260
6.0    20.813333
7.0    21.933333
8.0    30.358333
Name: target, dtype: float64

In [35]:
# Target-encode the two columns treated as categorical (CHAS and RAD).
# The encoder is configured first and then fitted on the training split only,
# so the test split never contributes to the learned encodings.
target_encoder = TargetEncoder(cols=['CHAS', 'RAD'], smoothing=0.3)
enc = target_encoder.fit(X_train, y_train)

In [36]:
# Apply the fitted encoder to both splits.
# NOTE(review): the training call passes y_train while the test call does
# not - confirm against the installed category_encoders version whether
# supplying y changes the encoded values.
training_numeric_dataset, testing_numeric_dataset = (
    enc.transform(X_train, y_train),
    enc.transform(X_test),
)

In [37]:
# Inspect the first five encoded training rows; compare the CHAS and RAD
# columns with the raw preview of X_train.
training_numeric_dataset.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,24.248261,0.538,6.575,65.2,4.09,28.061859,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,24.248261,0.469,6.421,78.9,4.9671,26.833333,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,24.248261,0.469,7.185,61.1,4.9671,26.833333,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,24.248261,0.458,6.998,45.8,6.0622,28.533333,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,24.248261,0.458,7.147,54.2,6.0622,28.533333,222.0,18.7,396.9,5.33
