## Implement a mean imputation class that is compatible with scikit-learn

In [1]:
import pandas as pd

## Constructor

In [2]:
class MeanImputer:
    """First step of the mean imputer: only records which columns to work on."""

    def __init__(self, variables):
        """Keep the user-supplied column names on the instance.

        Parameters
        ----------
        variables : list of str
            Names of the columns to impute (e.g. ``['age', 'weight']``).
        """
        # Stored as given, under the same name as the constructor argument.
        self.variables = variables

In [3]:
# Instantiate the imputer with two example column names.
obj1 = MeanImputer(['age','weight'])

In [4]:
# Display the column names stored on the instance by the constructor.
obj1.variables

['age', 'weight']

## Fit method

In [5]:
class MeanImputer:
    """Mean imputer, second step: the constructor plus a ``fit`` method."""

    def __init__(self, variables):
        """Record the names of the columns whose means will be learned."""
        self.variables = variables

    def fit(self, X, y=None):
        """Learn the mean of every column listed in ``self.variables``.

        The signature follows the scikit-learn convention: ``X`` is the
        input feature data and ``y`` is an optional response variable,
        which this method does not use.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the columns named in ``self.variables``.
        y : optional
            Ignored.

        Returns
        -------
        self
            The fitted instance; the learned means are stored in
            ``imputer_dict_`` as ``{column name: column mean}``.
        """
        selected_columns = X[self.variables]
        column_means = selected_columns.mean()

        # One entry per requested column: name -> mean.
        self.imputer_dict_ = column_means.to_dict()

        # Returning self allows calls to be chained.
        return self

## Import data

In [6]:
# Load the data set from a CSV file; the path is relative to the
# notebook's working directory.
data = pd.read_csv('../data/train.csv')

In [7]:
# Create an imputer for the two columns whose means will be learned.
obj2 = MeanImputer(variables = ['LotFrontage','MasVnrArea'])

In [8]:
# Display the column names stored on the instance.
obj2.variables

['LotFrontage', 'MasVnrArea']

In [9]:
# Feature matrix: every column except the target 'SalePrice'.
# Note: `data[data != 'SalePrice']` would compare each *cell value* with the
# string and mask the frame, leaving the 'SalePrice' column in place; dropping
# the column by name is what actually separates features from the target.
X = data.drop(columns=['SalePrice'])

In [10]:
# Response variable: the 'SalePrice' column of the loaded data.
y= data['SalePrice']

## Apply fit method

In [11]:
# Learn the column means from X; the cell displays whatever fit returns.
obj2.fit(X)

<__main__.MeanImputer at 0x1172ba370>

In [12]:
# Inspect the learned parameters (column name -> column mean).
obj2.imputer_dict_

{'LotFrontage': 70.04995836802665, 'MasVnrArea': 103.68526170798899}

## Transform method

In [13]:
class MeanImputer:
    """Fill missing values in selected DataFrame columns with the column mean.

    The class follows the scikit-learn ``fit`` / ``transform`` calling
    convention: ``fit(X, y=None)`` learns the parameters and
    ``transform(X)`` applies them.

    NOTE(review): the class defines neither ``get_params``/``set_params`` nor
    ``fit_transform``; scikit-learn utilities that rely on those methods will
    need them added (e.g. via ``BaseEstimator`` / ``TransformerMixin``).

    Parameters
    ----------
    variables : list of str
        Names of the columns to impute.

    Attributes
    ----------
    imputer_dict_ : dict
        Set by ``fit``; maps each column name to the mean learned for it.
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        """Learn the mean of each column in ``self.variables``.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the columns named in ``self.variables``.
        y : optional
            Ignored; accepted only to match the ``fit(X, y)`` convention.

        Returns
        -------
        self
        """
        self.imputer_dict_ = X[self.variables].mean().to_dict()
        return self

    # To be compatible with sklearn, the transform method only takes self and
    # X (the input data) as arguments
    def transform(self, X):
        """Return a copy of ``X`` with missing values replaced by the learned means.

        The input DataFrame is left untouched: imputation is performed on a
        copy, so calling ``transform`` never changes the caller's data.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the columns named in ``self.variables``.

        Returns
        -------
        pandas.DataFrame
            A new DataFrame in which the ``variables`` columns have their
            missing values filled; all other columns are unchanged.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called (``imputer_dict_`` does not exist).
        """
        # Work on a copy so the caller's DataFrame is not mutated.
        X = X.copy()

        for var in self.variables:
            # Fill the missing values in var with the mean learned in fit.
            X[var] = X[var].fillna(self.imputer_dict_[var])

        return X

In [14]:
# Create an imputer from the class version that also provides transform.
obj3 = MeanImputer(['LotFrontage','MasVnrArea'])

In [15]:
# Display the column names stored on the instance.
obj3.variables

['LotFrontage', 'MasVnrArea']

In [16]:
# Learn the column means from X; the cell displays whatever fit returns.
obj3.fit(X)

<__main__.MeanImputer at 0x1172e0070>

In [17]:
# Inspect the learned parameters (column name -> column mean).
obj3.imputer_dict_

{'LotFrontage': 70.04995836802665, 'MasVnrArea': 103.68526170798899}

In [18]:
# Fill the missing values of the selected columns with the means learned in fit.
X_transform = obj3.transform(X)

In [19]:
# Count the missing values left in the two imputed columns
# (both counts should be 0 after the transform).
X_transform[['LotFrontage','MasVnrArea']].isnull().sum()

LotFrontage    0
MasVnrArea     0
dtype: int64