# Create scikit-learn compatible transformers that learn parameters
- In the previous notebook 06b, we created transformers that don't really have fit methods (the fit methods just return self) because the transformations (e.g., recoding categorical variables) do not need to learn any parameters.
- We will now show examples of transformers that DO require parameter learning and see how we can code the fit method.
- We WON'T be using the transformers that we code in this notebook. Instead, we will use the transformers from feature-engine and scikit-learn. The point of this notebook is to give you an idea of how transformers that require parameter learning are created.

## Tip: to learn more about how different transformers were coded, you can go look at the source codes of transformers in scikit-learn and feature-engine

In [31]:
import numpy as np
import pandas as pd

'''sklearn'''
from sklearn.base import BaseEstimator, TransformerMixin

## Create our own sklearn compatible transformer that does mean imputation (for missing values in continuous variables)

In [12]:
class MeanImputer(BaseEstimator, TransformerMixin):
    '''Mean imputer for continuous variables with missing values.

    ``fit`` learns the mean of every column listed in ``variables`` and stores
    them in ``imputer_dict_``; ``transform`` fills the missing values of those
    columns with the learned means.
    '''

    def __init__(self, variables):
        '''
        Constructor

        Args:
            variables (List[str]): a list of variables with missing values to be imputed

        Raises:
            ValueError: if variables is not a list
        '''
        # Error handling: check to see if variables is a list
        if not isinstance(variables, list):
            raise ValueError('variables should be a list')

        # Set the variables attribute
        self.variables = variables

    def fit(self, X, y=None):  # need to have y as argument to make class compatible with sklearn pipeline
        """ Fit

        Args:
            X (DataFrame): an input dataframe of features to train the transformer
            y (Series): the response variable (optional, not used by this transformer)

        Returns:
            self
        """
        # Compute the column means of the variables and persist them in a
        # dictionary {variable name: mean} called "imputer_dict_"
        self.imputer_dict_ = X[self.variables].mean().to_dict()

        return self

    def transform(self, X):
        """ Transform

        Args:
            X (DataFrame): an input dataframe of features to be transformed

        Returns:
            X (DataFrame): a copy of the input with the missing values of each
                variable replaced by the mean learned in fit
        """
        # Make a copy of the input Dataframe of features to be transformed
        # so we won't overwrite the original Dataframe that was passed as argument
        X = X.copy()

        # Perform mean imputation of the missing values in X[var].
        # Assign the result back instead of calling fillna(inplace=True) on
        # X[var]: the in-place call acts on an intermediate object, which is
        # deprecated in pandas 2.x and leaves X untouched under Copy-on-Write.
        for var in self.variables:
            X[var] = X[var].fillna(self.imputer_dict_[var])

        return X

## RareLabelCategoricalEncoder
- A custom sklearn compatible transformer that groups all infrequent levels of a variable of interest into a single level called "Rare"

In [13]:
class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
    '''Groups infrequent categories into a single level "Rare".

    ``fit`` learns, for every variable, the levels whose share of observations
    is at least ``tol`` and stores them in ``encoder_dict_``; ``transform``
    replaces every other value of those variables with the string "Rare".
    '''

    # "variables" needs a default value because it follows "tol", which has one.
    # The default (None) is only a placeholder: a list must always be passed.
    def __init__(self, tol=0.05, variables=None):
        '''
        Constructor

        Args:
            tol (float): the min threshold of the proportion of observations that a level must have in order
                         to be considered to be "frequent" (not Rare)
            variables (List[str]): a list of variables to be transformed

        Raises:
            ValueError: if variables is not a list (including the default None)
        '''
        # Error handling: check to ensure variables is a list
        if not isinstance(variables, list):
            raise ValueError('variables should be a list')

        # set attributes
        self.tol = tol
        self.variables = variables

    def fit(self, X, y=None):  # need to have y as argument to make class compatible with sklearn pipeline
        '''
        fit

        Args:
            X (DataFrame): an input dataframe of features to train the transformer
            y (Series): the response variable (optional, not used by this transformer)

        Returns:
            self
        '''
        # Persist the frequent levels of each variable in a dictionary called
        # "encoder_dict_": the key is the variable name, the value is the list
        # of frequent levels
        self.encoder_dict_ = {}

        for var in self.variables:
            # Proportion of observations for each level of the variable
            # (value_counts already returns a Series)
            freq = X[var].value_counts(normalize=True)

            # Keep the levels whose proportion of observations is >= threshold
            self.encoder_dict_[var] = list(freq[freq >= self.tol].index)

        return self

    def transform(self, X):
        """ Transform

        Args:
            X (DataFrame): an input dataframe of features to be transformed

        Returns:
            X (DataFrame): a copy of the input in which every value that is not
                a frequent level learned in fit is replaced by "Rare"
        """
        # Make a copy of the input Dataframe of features to be transformed
        # so we won't overwrite the original Dataframe that was passed as argument
        X = X.copy()

        for var in self.variables:
            # Keep the value when it is one of the frequent levels stored in
            # encoder_dict_, otherwise set the observation to "Rare"
            is_frequent = X[var].isin(self.encoder_dict_[var])
            X[var] = np.where(is_frequent, X[var], "Rare")

        return X

## CategoricalEncoder
- A custom sklearn compatible transformer that takes a categorical variable and recodes its levels as numbers such that the higher the number, the higher the mean SalePrice of that level. Levels that were not seen during fit become NaN after the transformation.
- Converts a categorical variable to an ordinal variable by recoding it.

In [14]:
class CategoricalEncodder(BaseEstimator, TransformerMixin):
    '''Strings to number categorical encoder (categorical->ordinal).

    ``fit`` ranks the levels of every variable by the mean of the response
    within each level (lowest mean -> 0) and stores the mapping in
    ``encoder_dict_``; ``transform`` replaces each level by its rank.
    '''

    def __init__(self, variables):
        """
        Constructor

        Args:
            variables (List[str]): a list of variables to be transformed

        Raises:
            ValueError: if variables is not a list
        """
        if not isinstance(variables, list):
            raise ValueError('variables should be a list')

        # set attributes
        self.variables = variables

    def fit(self, X, y):
        """ Fit

        Args:
            X (DataFrame): an input dataframe of features to train the transformer
            y (Series): the response variable used to order the levels

        Returns:
            self
        """
        # create a temp dataframe with both the features and response variable
        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ['target']

        # persist the transforming dictionary
        self.encoder_dict_ = {}

        for var in self.variables:
            # Levels of var sorted in ascending order of their mean response
            # (i.e. the mean SalePrice of each level)
            ordered_levels = (
                temp.groupby(var)['target'].mean().sort_values(ascending=True).index
            )

            # The key of encoder_dict_ is the variable name; the value is
            # another dictionary mapping each level to its numeric encoding
            self.encoder_dict_[var] = {
                level: rank for rank, level in enumerate(ordered_levels, start=0)
            }

        # Show the learned mappings once every variable has been processed
        print(self.encoder_dict_)

        # Return only after the loop so that ALL variables are learned
        # (returning inside the loop would stop after the first variable)
        return self

    def transform(self, X):
        """ Transform

        Args:
            X (DataFrame): an input dataframe of features to be transformed

        Returns:
            X (DataFrame): a copy of the input with the levels of each variable
                replaced by their numeric encoding; levels that were not seen
                in fit have no entry in the mapping and become NaN
        """
        # Make a copy of the input Dataframe of features to be transformed
        # so we won't overwrite the original Dataframe that was passed as argument
        X = X.copy()

        # encode labels
        for var in self.variables:
            X[var] = X[var].map(self.encoder_dict_[var])

        return X

## Import data

In [15]:
# Load the training data from a path relative to this notebook
data = pd.read_csv('../data/train.csv')

In [16]:
# Features: every column except the response variable SalePrice
X=data.drop('SalePrice', axis=1)

In [17]:
X

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,8,2007,WD,Normal
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal


######################################################################################################################

In [18]:
# Response variable
y=data['SalePrice']

In [19]:
# Step-by-step walk-through of what CategoricalEncodder.fit does:
# put the features and the response side by side in one dataframe
temp =pd.concat([X,y],axis=1)

In [20]:
list(X.columns)+['SalePrice'];

In [21]:
# Name the columns explicitly: the feature names followed by the response name
temp.columns = list(X.columns)+['SalePrice']

In [22]:
temp

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [23]:
temp['MSZoning'].value_counts(normalize=True)

RL         0.788356
RM         0.149315
FV         0.044521
RH         0.010959
C (all)    0.006849
Name: MSZoning, dtype: float64

In [24]:
temp.groupby(['MSZoning'])['SalePrice'].mean()

MSZoning
C (all)     74528.000000
FV         214014.061538
RH         131558.375000
RL         191004.994787
RM         126316.830275
Name: SalePrice, dtype: float64

In [25]:
temp.groupby('MSZoning')['SalePrice'].mean().sort_values(ascending=True)

MSZoning
C (all)     74528.000000
RM         126316.830275
RH         131558.375000
RL         191004.994787
FV         214014.061538
Name: SalePrice, dtype: float64

In [26]:
# Levels of MSZoning ordered from the lowest to the highest mean SalePrice
t=temp.groupby('MSZoning')['SalePrice'].mean().sort_values(ascending=True).index
t

Index(['C (all)', 'RM', 'RH', 'RL', 'FV'], dtype='object', name='MSZoning')

In [27]:
# Map each level to its position in the ordering (0 = lowest mean SalePrice)
{k:i for i,k in enumerate(t,start=0)}

{'C (all)': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4}

######################################################################################################################

## Test the CategoricalEncoder class (defined above as `CategoricalEncodder`)

In [28]:
# Instantiate the encoder for a single variable (variables must be passed as a list)
cat_encoder=CategoricalEncodder(['MSZoning'])

In [29]:
# Learn the level -> number mapping; fit also prints the learned dictionary
cat_encoder.fit(X,y)

{'MSZoning': {'C (all)': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4}}


CategoricalEncodder(variables=['MSZoning'])

In [30]:
# Apply the learned mapping and inspect 10 randomly sampled encoded values
cat_encoder.transform(X)['MSZoning'].sample(10)

1200    3
494     1
108     1
815     3
530     3
26      3
5       3
623     4
733     3
847     3
Name: MSZoning, dtype: int64