In [1]:
import pandas as pd

In [2]:
from sklearn.base import TransformerMixin
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline

In [3]:
# Toy dataset with one column per feature type (nominal, binary, ordinal,
# quantitative). 'city', 'boolean' and 'quantitative_column' each contain
# exactly one missing value.
X = pd.DataFrame(
    {
        'city': ['tokyo', None, 'london', 'seattle', 'sanfrancisco', 'tokyo'],
        'boolean': ['yes', 'no', None, 'no', 'no', 'yes'],
        'ordinal_column': [
            'somewhat like', 'like', 'somewhat like',
            'like', 'somewhat like', 'dislike',
        ],
        'quantitative_column': [1, 11, -0.5, 10, None, 20],
    }
)

In [4]:
# Display the raw frame, missing values included.
X

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,,like,11.0
2,,london,somewhat like,-0.5
3,no,seattle,like,10.0
4,no,sanfrancisco,somewhat like,
5,yes,tokyo,dislike,20.0


In [5]:
# value_counts() orders labels by descending frequency, so the first index
# label is the most common city.
mostFrequentCity = X.loc[:, 'city'].value_counts().index[0]

In [6]:
# Preview the imputed column; fillna returns a new Series here, so X itself
# is not modified.
X.loc[:, 'city'].fillna(mostFrequentCity)

0           tokyo
1           tokyo
2          london
3         seattle
4    sanfrancisco
5           tokyo
Name: city, dtype: object

In [7]:
class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in categorical columns with the most frequent value.

    The fill value of each column is learned in ``fit`` so that data passed to
    ``transform`` later (e.g. a test set) is imputed with the statistics of the
    training data. If ``transform`` is called on an instance that was never
    fitted with data, the fill value is computed from the frame being
    transformed instead.

    Parameters
    ----------
    cols : list of column labels, optional
        Columns to impute. ``None`` (the default) means no column is imputed.
    """

    def __init__(self, cols = None):
        self.cols = cols

    def _columns(self):
        """Return the configured columns, treating ``None`` as an empty list."""
        return [] if self.cols is None else self.cols

    @staticmethod
    def _most_frequent(series):
        """Return the most frequent non-null value, or None if there is none."""
        counts = series.value_counts()
        return counts.index[0] if len(counts) else None

    def fit(self, X = None, *_):
        """Learn the most frequent value of every configured column.

        Parameters
        ----------
        X : pandas.DataFrame, optional
            Training data. When omitted nothing is learned and ``transform``
            computes the fill values from its own input.

        Returns
        -------
        self
        """
        self.fill_values_ = {}
        if X is not None:
            for col in self._columns():
                self.fill_values_[col] = self._most_frequent(X[col])
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the configured columns imputed.

        Columns that have no non-null value to impute with are left unchanged.
        """
        X = df.copy()
        learned = getattr(self, 'fill_values_', {})
        for col in self._columns():
            if col in learned:
                fill = learned[col]
            else:
                fill = self._most_frequent(X[col])
            if fill is not None:
                # Assign the result back: calling fillna(inplace=True) on the
                # temporary object returned by .loc does not update X when
                # pandas Copy-on-Write is active.
                X[col] = X[col].fillna(fill)
        return X

In [8]:
# Category imputer restricted to the two nominal columns.
cci = CustomCategoryImputer(cols = ['city', 'boolean'])

In [9]:
# Impute 'city' and 'boolean'; the remaining columns pass through unchanged.
cci.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,tokyo,like,11.0
2,no,london,somewhat like,-0.5
3,no,seattle,like,10.0
4,no,sanfrancisco,somewhat like,
5,yes,tokyo,dislike,20.0


In [10]:
class CustomQuantitativeImputer(TransformerMixin):
    """Fill missing values in numeric columns with a per-column statistic.

    The statistic is computed with pandas, so the class does not depend on
    ``sklearn.preprocessing.Imputer`` (removed from scikit-learn in 0.22).
    Statistics are learned in ``fit`` so that later calls to ``transform``
    reuse the training-set values. If ``transform`` is called on an instance
    that was never fitted with data, the statistic is computed from the frame
    being transformed instead.

    Parameters
    ----------
    cols : list of column labels, optional
        Columns to impute. ``None`` (the default) means no column is imputed.
    strategy : {'mean', 'median', 'most_frequent'}, default 'mean'
        Statistic used as the fill value.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names (raised when the
        statistic is first computed).
    """

    def __init__(self, cols = None, strategy = 'mean'):

        self.cols = cols
        self.strategy = strategy

    def _columns(self):
        """Return the configured columns, treating ``None`` as an empty list."""
        return [] if self.cols is None else self.cols

    def _statistic(self, series):
        """Compute the fill value for ``series`` according to ``self.strategy``."""
        if self.strategy == 'mean':
            return series.mean()
        if self.strategy == 'median':
            return series.median()
        if self.strategy == 'most_frequent':
            # mode() is sorted, so ties resolve to the smallest value.
            modes = series.mode()
            return modes.iloc[0] if len(modes) else float('nan')
        raise ValueError(
            "Can only use these strategies: ['mean', 'median', 'most_frequent'] "
            "got strategy={0}".format(self.strategy)
        )

    def fit(self, X = None, *_):
        """Learn the fill statistic of every configured column.

        Parameters
        ----------
        X : pandas.DataFrame, optional
            Training data. When omitted nothing is learned and ``transform``
            computes the statistics from its own input.

        Returns
        -------
        self
        """
        self.statistics_ = {}
        if X is not None:
            for col in self._columns():
                self.statistics_[col] = self._statistic(X[col])
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the configured columns imputed.

        A column whose statistic is NaN (no observed values) is left as is.
        """
        X = df.copy()
        learned = getattr(self, 'statistics_', {})

        for col in self._columns():
            if col in learned:
                fill = learned[col]
            else:
                fill = self._statistic(X[col])
            X[col] = X[col].fillna(fill)

        return X
        

In [11]:
# Quantitative imputer using the default 'mean' strategy.
cqi = CustomQuantitativeImputer(cols = ['quantitative_column'])

In [12]:
# Fill the missing quantitative value with the column mean (8.3 for this data).
cqi.fit_transform(X)



Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,,like,11.0
2,,london,somewhat like,-0.5
3,no,seattle,like,10.0
4,no,sanfrancisco,somewhat like,8.3
5,yes,tokyo,dislike,20.0


In [13]:
# Rebuild the toy dataset from scratch so the pipeline below starts from the
# raw values, with one missing entry in 'city', 'boolean' and
# 'quantitative_column'.
X = pd.DataFrame(
    {
        'city': ['tokyo', None, 'london', 'seattle', 'sanfrancisco', 'tokyo'],
        'boolean': ['yes', 'no', None, 'no', 'no', 'yes'],
        'ordinal_column': [
            'somewhat like', 'like', 'somewhat like',
            'like', 'somewhat like', 'dislike',
        ],
        'quantitative_column': [1, 11, -0.5, 10, None, 20],
    }
)

In [14]:
# Confirm the rebuilt frame contains the original missing values.
X

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,,like,11.0
2,,london,somewhat like,-0.5
3,no,seattle,like,10.0
4,no,sanfrancisco,somewhat like,
5,yes,tokyo,dislike,20.0


In [15]:
# Chain both imputers: numeric imputation first, then categorical.
imputer = Pipeline([('quant', cqi), ('category', cci)])

In [16]:
# Run both imputation steps; 'city', 'boolean' and 'quantitative_column'
# come back without missing values.
imputer.fit_transform(X)



Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,tokyo,like,11.0
2,no,london,somewhat like,-0.5
3,no,seattle,like,10.0
4,no,sanfrancisco,somewhat like,8.3
5,yes,tokyo,dislike,20.0


In [34]:
class CustomDummifier(TransformerMixin):
    """One-hot encode the given columns with ``pandas.get_dummies``.

    ``get_dummies`` derives the dummy columns from the categories present in
    the frame it is given, so two frames with different categories would get
    different output columns. To keep the layout stable, ``fit`` records the
    columns produced for the training data and ``transform`` aligns its output
    to them: dummy columns missing from the new data are added filled with 0
    and columns for categories unseen during ``fit`` are dropped. If the
    instance was never fitted with data, the raw ``get_dummies`` result is
    returned.

    Parameters
    ----------
    cols : list of column labels, optional
        Columns to encode. ``None`` lets ``get_dummies`` pick every
        object/category column.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, X=None, *_):
        """Record the output column layout produced for the training data.

        Returns
        -------
        self
        """
        self.columns_ = None
        if X is not None:
            self.columns_ = pd.get_dummies(X, columns=self.cols).columns
        return self

    def transform(self, X):
        """Return ``X`` with the configured columns replaced by dummy columns."""
        dummies = pd.get_dummies(X, columns=self.cols)
        learned = getattr(self, 'columns_', None)
        if learned is None:
            return dummies
        # Align to the layout seen in fit so downstream steps always receive
        # the same columns in the same order.
        return dummies.reindex(columns=learned, fill_value=0)

In [35]:
# Dummifier for the two nominal columns.
cd = CustomDummifier(cols=['boolean', 'city'])

In [18]:
# Direct get_dummies call on the un-imputed frame, with a '__' separator
# between column name and category; rows whose value is missing get 0 in
# every dummy column of that feature.
pd.get_dummies(X, columns = ['city', 'boolean'], prefix_sep='__')

Unnamed: 0,ordinal_column,quantitative_column,city__london,city__sanfrancisco,city__seattle,city__tokyo,boolean__no,boolean__yes
0,somewhat like,1.0,0,0,0,1,0,1
1,like,11.0,0,0,0,0,1,0
2,somewhat like,-0.5,1,0,0,0,0,0
3,like,10.0,0,0,1,0,1,0
4,somewhat like,,0,1,0,0,1,0
5,dislike,20.0,0,0,0,1,0,1


In [20]:
class CustomEncoding(TransformerMixin):
    """Replace the labels of an ordinal column with their integer rank.

    Parameters
    ----------
    col : column label
        Column to encode.
    ordering : list, optional
        Category labels from lowest to highest; a label's position in the list
        is its encoded value. Must be provided before ``transform`` is called.
    """

    def __init__(self, col ,ordering = None):
        self.ordering = ordering
        self.col = col

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` mapped to rank integers.

        Missing values are passed through as NaN instead of raising.

        Raises
        ------
        ValueError
            If ``ordering`` was not provided, or if the column contains a
            non-missing label that is absent from ``ordering``.
        """
        if self.ordering is None:
            raise ValueError(
                "CustomEncoding needs an `ordering` list to encode %r" % (self.col,)
            )

        # Build the lookup once; setdefault keeps the first position of a
        # repeated label, which is what list.index would return.
        rank = {}
        for position, category in enumerate(self.ordering):
            rank.setdefault(category, position)

        X = df.copy()
        values = X[self.col]

        unknown = values[values.notna() & ~values.isin(list(rank))]
        if len(unknown):
            raise ValueError(
                "%r contains labels that are not in `ordering`: %r"
                % (self.col, list(unknown.unique()))
            )

        # Replace the whole column so it takes the numeric dtype of the ranks.
        X[self.col] = values.map(rank)
        return X

    def fit(self, *_):
        """Nothing to learn; present for the scikit-learn transformer API."""
        return self

In [21]:
# Encoder for the ordinal column; list position defines the rank
# (dislike = 0, somewhat like = 1, like = 2).
ce = CustomEncoding(col = 'ordinal_column', ordering = ['dislike', 'somewhat like', 'like'])

In [22]:
# Replace each ordinal label with its integer rank.
ce.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,1,1.0
1,no,,2,11.0
2,,london,1,-0.5
3,no,seattle,2,10.0
4,no,sanfrancisco,1,
5,yes,tokyo,0,20.0


In [24]:
# Split the column's observed range into 3 equal-width bins; the missing
# value stays NaN.
pd.cut(X.loc[:, 'quantitative_column'], bins = 3)

0     (-0.52, 6.333]
1    (6.333, 13.167]
2     (-0.52, 6.333]
3    (6.333, 13.167]
4                NaN
5     (13.167, 20.0]
Name: quantitative_column, dtype: category
Categories (3, interval[float64]): [(-0.52, 6.333] < (6.333, 13.167] < (13.167, 20.0]]

In [25]:
# Same binning, but labels = False returns the integer bin index instead of
# the interval.
pd.cut(X.loc[:, 'quantitative_column'], bins = 3, labels = False)

0    0.0
1    1.0
2    0.0
3    1.0
4    NaN
5    2.0
Name: quantitative_column, dtype: float64

In [31]:
class CustomCutter(TransformerMixin):
    """Discretise a numeric column into bins with ``pandas.cut``.

    When ``bins`` is a bin count, the bin edges are learned from the data
    given to ``fit`` and reused by ``transform``, so a value always lands in
    the same bin whichever frame it arrives in. Values outside the learned
    range become NaN. If the instance was never fitted with data, the edges
    are derived from the frame being transformed.

    Parameters
    ----------
    col : column label
        Column to discretise.
    bins : int or sequence of edges
        Passed to ``pandas.cut``; explicit edges are used as given.
    labels : list or False, default False
        Passed to ``pandas.cut``; ``False`` yields integer bin indices.
    """

    def __init__(self, col, bins, labels = False):

        self.labels = labels
        self.col = col
        self.bins = bins

    def fit(self, X = None, *_):
        """Learn the bin edges when ``bins`` is a bin count.

        Returns
        -------
        self
        """
        self.bin_edges_ = None
        if X is not None and not pd.api.types.is_list_like(self.bins):
            # retbins=True also returns the edges pandas computed, including
            # the slightly lowered first edge that keeps the minimum inside
            # the first bin.
            _, self.bin_edges_ = pd.cut(
                X[self.col], bins = self.bins, labels = False, retbins = True
            )
        return self

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` replaced by its bin label."""
        X = df.copy()
        learned = getattr(self, 'bin_edges_', None)
        edges = self.bins if learned is None else learned
        # Replace the whole column so it takes the dtype of the bin labels.
        X[self.col] = pd.cut(X[self.col], bins = edges, labels = self.labels)
        return X

In [32]:
# Cutter that splits the quantitative column into 3 bins; labels defaults to
# False, so bin indices are produced.
cc = CustomCutter(col = 'quantitative_column', bins = 3)

In [33]:
# Bin the quantitative column of the un-imputed frame; its missing value
# remains missing.
cc.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,0.0
1,no,,like,1.0
2,,london,somewhat like,0.0
3,no,seattle,like,1.0
4,no,sanfrancisco,somewhat like,
5,yes,tokyo,dislike,2.0


In [36]:
# Full preprocessing pipeline: impute -> one-hot encode the nominal columns
# -> rank-encode the ordinal column -> bin the quantitative column.
pipe = Pipeline([("imputer", imputer), ('dummify', cd), ('encode', ce), ('cut', cc)])

In [38]:
# Fit every step of the pipeline, in order, on X.
pipe.fit(X)



Pipeline(memory=None,
     steps=[('imputer', Pipeline(memory=None,
     steps=[('quant', <__main__.CustomQuantitativeImputer object at 0x7f78bb120828>), ('category', <__main__.CustomCategoryImputer object at 0x7f78bb1549e8>)])), ('dummify', <__main__.CustomDummifier object at 0x7f78bb062630>), ('encode', <__main__.CustomEncoding object at 0x7f78bb0bb278>), ('cut', <__main__.CustomCutter object at 0x7f78bb054b70>)])

In [39]:
# Apply the fitted pipeline; the result is fully numeric with no missing values.
pipe.transform(X)



Unnamed: 0,ordinal_column,quantitative_column,boolean_no,boolean_yes,city_london,city_sanfrancisco,city_seattle,city_tokyo
0,1,0,0,1,0,0,0,1
1,2,1,1,0,0,0,0,1
2,1,0,1,0,1,0,0,0
3,2,1,1,0,0,0,1,0
4,1,1,1,0,0,1,0,0
5,0,2,0,1,0,0,0,1
