In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import ColumnTransformer

In [2]:
class RareCategoryEncoder(BaseEstimator, TransformerMixin):
    """Collapse infrequent values of each column into one placeholder label.

    During ``fit`` the relative frequency of every distinct value is computed
    per column; values whose frequency is strictly below ``threshold`` are
    remembered as "rare". During ``transform`` those values are replaced by
    ``rare_label``.

    Frequencies come from ``value_counts(normalize=True)``, which skips
    missing values: NaN is never counted, never flagged as rare and is left
    untouched by ``transform``.

    Parameters
    ----------
    threshold : float, default=0.05
        Values with a relative frequency strictly lower than this are rare.
    rare_label : str, default='rare_category'
        Replacement written in place of every rare value.

    Attributes (set by ``fit``)
    ---------------------------
    feature_names : pandas.Index
        Column labels of the frame seen during ``fit``.
    rare_cat_list : list of list
        One list of rare values per input column, in column order.
    """

    def __init__(self, threshold: float = 0.05, rare_label: str = 'rare_category'):
        # Only store the arguments: BaseEstimator.get_params/clone rely on
        # every __init__ parameter being kept verbatim under the same name.
        self.threshold = threshold
        self.rare_label = rare_label

    def _find_rare_categories(self, column):
        """Return the values of one column whose frequency is below the threshold."""
        freqs = pd.Series(column).value_counts(normalize=True)
        return list(freqs[freqs < self.threshold].index)

    def fit(self, X, y=None):
        """Learn the rare values of every column of ``X``.

        ``X`` is anything ``pd.DataFrame`` accepts; ``y`` is ignored.
        Returns ``self``.
        """
        X = pd.DataFrame(X)
        self.feature_names = X.columns
        # Explicit positional loop instead of DataFrame.apply with a
        # side-effecting callback: apply does not promise exactly one call per
        # column (a zero-row frame is probed with a single empty Series), which
        # would leave this list misaligned with the columns.
        self.rare_cat_list = [
            self._find_rare_categories(X.iloc[:, i]) for i in range(X.shape[1])
        ]
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame with rare values replaced by ``rare_label``.

        Columns are matched to the fitted columns by position. The input is
        not modified.

        Raises
        ------
        ValueError
            If ``X`` does not have as many columns as the frame seen in ``fit``.
        """
        X = pd.DataFrame(X)
        n_fitted = len(self.rare_cat_list)
        if X.shape[1] != n_fitted:
            raise ValueError(
                f"X has {X.shape[1]} columns, but RareCategoryEncoder was "
                f"fitted on {n_fitted} columns."
            )
        if n_fitted == 0:
            # Nothing to encode; pd.concat would reject an empty list.
            return X.copy()

        encoded = []
        for i, rare_cats in enumerate(self.rare_cat_list):
            col = X.iloc[:, i]
            hits = col.isin(rare_cats)
            if hits.any():
                # A numeric or categorical column cannot hold the label
                # as-is, so move it to object dtype before replacing.
                if not pd.api.types.is_string_dtype(col.dtype):
                    col = col.astype(object)
                col = col.mask(hits, self.rare_label)
            encoded.append(col)

        # Reassemble by position so duplicate column labels stay intact.
        result = pd.concat(encoded, axis=1)
        result.columns = X.columns
        return result

    def get_rare_cats(self):
        """Return the per-column lists of rare values learned by ``fit``."""
        return self.rare_cat_list

    def get_feature_names_out(self, input_features=None):
        """Return the column labels seen during ``fit``.

        ``input_features`` is accepted for API compatibility and ignored; the
        encoder never adds, drops or renames columns.
        """
        return self.feature_names

In [3]:
# Toy input: two categorical columns ('x' has one missing value) and one numeric column.
X = pd.DataFrame(
    {
        'x': ['b', 'b', 'a', np.nan],
        'y': ['b', 'a', 'b', 'c'],
        'z': [1, 2, 3, 1],
    }
)

In [4]:
# Display the toy frame as constructed, for comparison with the encoded result.
X

Unnamed: 0,x,y,z
0,b,b,1
1,b,a,2
2,a,b,3
3,,c,1


In [5]:
# Apply the rare-category encoder to the columns at positions 0 and 1;
# every other column is passed through, and output names get no prefix.
ct_rare = ColumnTransformer(
    transformers=[
        ['rare', RareCategoryEncoder(threshold=0.5), [0, 1]],
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [6]:
# Fit on X, transform it, and wrap the result in a DataFrame labelled with
# the transformer's output feature names (requested after fitting).
X1 = pd.DataFrame(
    data=ct_rare.fit_transform(X),
    columns=ct_rare.get_feature_names_out(),
)
X1

Unnamed: 0,x,y,z
0,b,b,1
1,b,rare_category,2
2,rare_category,b,3
3,,rare_category,1
