# In [4]:
# Example of Transformer to learn and store parameters

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

class MeanImputer(BaseEstimator, TransformerMixin):
    """Numerical missing value imputer.

    ``fit`` computes the mean of every column listed in ``variables`` and
    stores the result; ``transform`` fills missing values in those columns
    with the stored means.

    Parameters
    ----------
    variables : list
        Names of the columns to impute.
    mappings : object, optional
        Not used by the imputer. It is accepted (and stored unchanged) only
        so that existing callers that pass it keep working.

    Attributes
    ----------
    imputer_dict_ : dict
        Column name -> mean learned in ``fit``.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables, mappings=None):
        if not isinstance(variables, list):
            raise ValueError('variables should be a list')
        self.variables = variables
        # Every __init__ parameter has to exist as an attribute of the same
        # name, otherwise BaseEstimator.get_params() (and therefore clone()
        # and repr()) cannot read it back.
        self.mappings = mappings

    def fit(self, X, y=None):
        """Learn the per-column means from ``X``; ``y`` is ignored."""
        # persist mean values in a dictionary
        self.imputer_dict_ = X[self.variables].mean().to_dict()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values replaced by the means."""
        X = X.copy()
        for feature in self.variables:
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the selection: the in-place form acts
            # on an intermediate object and is not guaranteed to update X
            # (it does nothing under pandas copy-on-write).
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])

        return X
    
class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
    """Collapse infrequent categories into the single label "Rare".

    ``fit`` records, for each configured column, the labels whose relative
    frequency is at least ``tol``. ``transform`` keeps those labels and
    replaces every other value in the column with the string "Rare".
    """

    def __init__(self, tol=0.05, variables=None):
        # Anything that is not a list is rejected, including the None default.
        if not isinstance(variables, list):
            raise ValueError('variables should be a list')

        self.variables = variables
        self.tol = tol

    def fit(self, X, y=None):
        """Learn the frequent labels of each column; ``y`` is ignored."""
        self.encoder_dict_ = {}
        learned = self.encoder_dict_

        for column in self.variables:
            # share of rows taken by each label of this column
            shares = X[column].value_counts(normalize=True)
            is_frequent = shares >= self.tol
            learned[column] = list(shares[is_frequent].index)

        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-frequent labels set to "Rare"."""
        X = X.copy()

        for column in self.variables:
            keep_mask = X[column].isin(self.encoder_dict_[column])
            X[column] = np.where(keep_mask, X[column], "Rare")

        return X

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode category labels as integers ordered by mean target value.

    ``fit`` ranks the labels of each configured column by the mean of the
    target within that label (lowest mean gets 0) and stores the resulting
    label -> integer mapping. ``transform`` applies the stored mappings.
    """

    def __init__(self, variables):
        if not isinstance(variables, list):
            raise ValueError('variables should be a list')
        self.variables = variables

    def fit(self, X, y):
        """Learn the label -> integer mapping for every configured column."""
        # Put the target next to the features under a fixed column name.
        combined = pd.concat([X, y], axis=1)
        combined.columns = list(X.columns) + ["target"]

        # persist transforming dictionary
        self.encoder_dict_ = {}

        for column in self.variables:
            target_means = combined.groupby([column])["target"].mean()
            ordered_labels = target_means.sort_values(ascending=True).index
            self.encoder_dict_[column] = {
                label: position
                for position, label in enumerate(ordered_labels)
            }

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns encoded.

        Values without an entry in the learned mapping are not handled
        specially; they get whatever ``Series.map`` yields for a missing key.
        """
        X = X.copy()
        for column in self.variables:
            lookup = self.encoder_dict_[column]
            X[column] = X[column].map(lookup)

        return X