<a href="https://colab.research.google.com/github/lilianabs/kaggle-notebooks/blob/main/Custom_encoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Toy dataset used throughout the notebook: two categorical columns
# (cat1, cat2) and one numeric column (num1), 11 rows each.
data = {
    'cat1': ['A', 'B', 'A', 'C', 'A', 'B', 'C', 'C', 'A', 'A', 'C'],
    'cat2': ['X', 'Y', 'X', 'Z', 'X', 'Y', 'Z', 'Z', 'X', 'Z', 'Y'],
    'num1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
}

# Materialize the dict as a DataFrame; later cells read from `df`.
df = pd.DataFrame(data)

In [3]:
# Preview the first rows of the toy frame.
df.head()

Unnamed: 0,cat1,cat2,num1
0,A,X,1
1,B,Y,2
2,A,X,3
3,C,Z,4
4,A,X,5


# Frequency encoder

In [11]:
class FrequencyEncoder(BaseEstimator, TransformerMixin):
  """Encode each category by how often it occurs in the fitted data.

  Parameters
  ----------
  normalize : bool, default=True
    Forwarded to ``Series.value_counts``. When True each category maps to
    its relative frequency; when False it maps to its raw count.

  Attributes
  ----------
  freq_maps : dict
    Column name -> Series of per-category frequencies learned by ``fit``.
    Empty until ``fit`` has been called.
  """

  def __init__(self, normalize=True):
    self.freq_maps = {}
    self.normalize = normalize

  def fit(self, X, y=None, normalize=False):
    """Learn the per-category frequencies of every column of ``X``.

    Parameters
    ----------
    X : DataFrame
      Columns to encode.
    y : ignored
      Present only for scikit-learn API compatibility.
    normalize : ignored
      Kept so existing calls that pass it keep working. The value is not
      used; the ``normalize`` given to the constructor decides whether
      counts or relative frequencies are stored.

    Returns
    -------
    self
    """
    # Build a fresh mapping on every call so that refitting on a different
    # set of columns does not keep stale entries from an earlier fit.
    self.freq_maps = {
        col: X[col].value_counts(normalize=self.normalize)
        for col in X.columns
    }

    return self

  def transform(self, X):
    """Return a copy of ``X`` with categories replaced by their frequencies.

    Values that were not seen during ``fit`` become NaN. A column that was
    not present during ``fit`` raises ``KeyError``.
    """
    # Work on a copy so the caller's frame is left untouched.
    X_transformed = X.copy()
    for col in X.columns:
      X_transformed[col] = X[col].map(self.freq_maps[col])

    return X_transformed

In [12]:
# Fit the encoder on the two categorical columns and encode them in one step;
# normalize=True yields relative frequencies rather than raw counts.
fe = FrequencyEncoder(normalize=True)
df_encoded = fe.fit_transform(df[["cat1", "cat2"]])

In [13]:
# Preview the frequency-encoded columns.
df_encoded.head()

Unnamed: 0,cat1,cat2
0,0.454545,0.363636
1,0.181818,0.272727
2,0.454545,0.363636
3,0.363636,0.363636
4,0.454545,0.363636


# Target encoder

In [14]:
def target_encode(df, cat_feature, target):
    """Map every row's category to the mean target of that category.

    Parameters
    ----------
    df : DataFrame
        Frame holding both the categorical column and the target column.
    cat_feature : str
        Name of the categorical column to encode.
    target : str
        Name of the column whose per-category mean is used as the encoding.

    Returns
    -------
    Series
        One value per row of ``df``, aligned on ``df``'s index.
    """
    grouped_target = df.groupby(cat_feature)[target]
    category_means = grouped_target.agg("mean")
    categories = df[cat_feature]
    return categories.map(category_means)

In [15]:
def smoothed_target_encoding(df, cat_feature, target, smoothing_param=10):
    """Target-encode a column, shrinking category means toward the global mean.

    Each category's encoding is
    ``(count * mean + smoothing_param * global_mean) / (count + smoothing_param)``,
    so categories with few rows are pulled more strongly toward the overall
    target mean.

    Parameters
    ----------
    df : DataFrame
        Frame holding both the categorical column and the target column.
    cat_feature : str
        Name of the categorical column to encode.
    target : str
        Name of the target column.
    smoothing_param : number, default=10
        Weight given to the global mean.

    Returns
    -------
    Series
        One value per row of ``df``, aligned on ``df``'s index.
    """
    overall_mean = df[target].mean()
    per_category = df.groupby(cat_feature)[target].agg(['count', 'mean'])
    n_obs, category_mean = per_category['count'], per_category['mean']

    # Weighted blend of the category mean and the global mean.
    weighted_sum = n_obs * category_mean + smoothing_param * overall_mean
    total_weight = n_obs + smoothing_param
    return df[cat_feature].map(weighted_sum / total_weight)

In [16]:
# Plain target encoding of cat1: each row receives the mean of num1 over the
# rows that share its cat1 value. The result is a Series, not a DataFrame.
df_2 = target_encode(df, 'cat1', 'num1')

df_2.head()

Unnamed: 0,cat1
0,5.6
1,4.0
2,5.6
3,7.5
4,5.6


In [17]:
# Smoothed variant with the default smoothing_param=10: category means are
# blended with the overall num1 mean.
df_3 = smoothed_target_encoding(df, 'cat1', 'num1')
df_3.head()

Unnamed: 0,cat1
0,5.866667
1,5.666667
2,5.866667
3,6.428571
4,5.866667


In [None]:
class TargetEncoder(BaseEstimator, TransformerMixin):
  """Replace each category with the mean target value observed for it.

  Parameters
  ----------
  smoothing : bool, default=False
    When True, category means are blended with the global target mean as
    ``(count * mean + smoothing_param * global_mean) / (count + smoothing_param)``.
    When False, the plain per-category mean is used.
  smoothing_param : number, default=10
    Weight given to the global mean when ``smoothing`` is True.

  Attributes
  ----------
  global_mean_ : float
    Mean of the target seen during ``fit``.
  target_maps_ : dict
    Column name -> Series mapping each category to its encoding.
  """

  def __init__(self, smoothing=False, smoothing_param=10):
    self.smoothing = smoothing
    self.smoothing_param = smoothing_param

  def fit(self, X, y=None):
    """Learn the per-category target encoding for every column of ``X``.

    Parameters
    ----------
    X : DataFrame
      Categorical columns to encode.
    y : array-like
      Target values, one per row of ``X``. Required despite the ``None``
      default, which exists only to match the scikit-learn signature.

    Returns
    -------
    self

    Raises
    ------
    ValueError
      If ``y`` is None.
    """
    if y is None:
      raise ValueError("TargetEncoder.fit requires target values in `y`.")

    # Put the target on X's index so grouping lines up row by row, even when
    # y is a list/array or a Series carrying a different index.
    target = pd.Series(pd.Series(y).to_numpy(), index=X.index)

    self.global_mean_ = target.mean()
    self.target_maps_ = {}
    for col in X.columns:
      stats = target.groupby(X[col]).agg(['count', 'mean'])
      if self.smoothing:
        encoding = (
            stats['count'] * stats['mean']
            + self.smoothing_param * self.global_mean_
        ) / (stats['count'] + self.smoothing_param)
      else:
        encoding = stats['mean']
      self.target_maps_[col] = encoding

    # Returning self is what lets TransformerMixin.fit_transform chain
    # fit(...).transform(...).
    return self

  def transform(self, X):
    """Return a copy of ``X`` with categories replaced by their encoding.

    Values without a learned encoding (unseen categories, missing values)
    fall back to the global target mean. A column that was not present
    during ``fit`` raises ``KeyError``.
    """
    X_transformed = X.copy()
    for col in X.columns:
      encoded = X[col].map(self.target_maps_[col])
      X_transformed[col] = encoded.fillna(self.global_mean_)

    return X_transformed

  def transforms(self, X):
    """Alias of ``transform`` kept for callers using the original name."""
    return self.transform(X)