## Pipeline example

In [22]:
import pandas as pd

In [23]:
# Toy frame mixing nominal, ordinal and numeric columns; 'city', 'boolean'
# and 'quantitative_column' each contain exactly one missing value.
X = pd.DataFrame(data={
    'city': ['tokyo', None, 'london', 'seatle', 'san francisco', 'tokyo'],
    'boolean': ['yes', 'no', None, 'no', 'no', 'yes'],
    'ordinal_column': [
        'somewhat like', 'like', 'somewhat like',
        'like', 'somewhat like', 'dislike',
    ],
    'quantitative_column': [1, 11, -0.5, 10, None, 20],
})

In [24]:
# number of missing values in each column
X.isna().sum()

boolean                1
city                   1
ordinal_column         0
quantitative_column    1
dtype: int64

In [25]:
# most common city: value_counts() sorts by frequency (descending), so its
# first index entry is the value we will use to fill the missing city
X['city'].value_counts().index[0]

'tokyo'

### Custom category imputer

In [26]:
from sklearn.base import TransformerMixin

class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in categorical columns with the column's most frequent value.

    The transformer is stateless: ``fit`` learns nothing, and the fill value
    for each column is computed from the frame handed to ``transform``.

    Parameters
    ----------
    cols : iterable of column labels, optional
        Columns to impute. ``None`` (the default) imputes no column.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def transform(self, df):
        """Return a copy of ``df`` with the selected columns imputed.

        Parameters
        ----------
        df : pandas.DataFrame
            Input frame; it is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` in which missing entries of every column in
            ``self.cols`` are replaced by that column's most frequent value.
            A column with no observed values is left unchanged.

        Raises
        ------
        KeyError
            If a label in ``self.cols`` is not a column of ``df``.
        """
        X = df.copy()
        for col in ([] if self.cols is None else self.cols):
            # value_counts() ignores missing entries and sorts by frequency,
            # so the first index label is the most frequent observed value.
            counts = X[col].value_counts()
            if counts.empty:
                # Entirely missing column: there is nothing to impute with.
                continue
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the X[col] selection: that form warns on
            # pandas 2.x and leaves X untouched under Copy-on-Write.
            X[col] = X[col].fillna(counts.index[0])
        return X

    def fit(self, *_):
        """Do nothing and return ``self`` (present for the scikit-learn API)."""
        return self

In [27]:
# Instantiate our custom categorical imputer for the two categorical columns.
cci = CustomCategoryImputer(cols=['city', 'boolean'])

In [28]:
# fit() does nothing here, so this returns a copy of X with 'city' and 'boolean' imputed
cci.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,tokyo,like,11.0
2,no,london,somewhat like,-0.5
3,no,seatle,like,10.0
4,no,san francisco,somewhat like,
5,yes,tokyo,dislike,20.0


### Custom quantitative imputer

In [29]:
# Let's make an imputer that can apply a strategy to select columns by name

# `sklearn.preprocessing.Imputer` was removed from scikit-learn; its replacement
# is `sklearn.impute.SimpleImputer`. Fall back to the legacy class only on
# installations that predate `sklearn.impute`.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer


class CustomQuantitativeImputer(TransformerMixin):
    """Impute missing values in selected numeric columns of a DataFrame.

    The transformer is stateless: ``fit`` learns nothing, and the imputation
    statistic is computed from the frame handed to ``transform``.

    Parameters
    ----------
    cols : iterable of column labels, optional
        Columns to impute. ``None`` (the default) imputes no column.
    strategy : str, default 'mean'
        Imputation strategy forwarded to the scikit-learn imputer; it lets us
        specify exactly how missing quantitative values are filled.
    """

    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy

    def transform(self, df):
        """Return a copy of ``df`` with the selected columns imputed.

        Parameters
        ----------
        df : pandas.DataFrame
            Input frame; it is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` where each column in ``self.cols`` has been passed
            through the imputer on its own.
        """
        X = df.copy()
        impute = SimpleImputer(strategy=self.strategy)
        for col in ([] if self.cols is None else self.cols):
            # The imputer expects 2-D input, hence the single-column frame
            # X[[col]]; it is re-fitted on every column separately.
            X[col] = impute.fit_transform(X[[col]])
        return X

    def fit(self, *_):
        """Do nothing and return ``self`` (present for the scikit-learn API)."""
        return self

In [31]:
# fill the missing entry of the numeric column with the column mean
cqi = CustomQuantitativeImputer(cols=['quantitative_column'], strategy='mean')
cqi.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,,like,11.0
2,,london,somewhat like,-0.5
3,no,seatle,like,10.0
4,no,san francisco,somewhat like,8.3
5,yes,tokyo,dislike,20.0


### Pipeline to transform data in one go

In [32]:
from sklearn.pipeline import Pipeline

In [34]:
# chain both custom imputers: the quantitative one runs first, then the categorical one
imputer = Pipeline(steps=[
    ('quant', cqi),
    ('category', cci),
])
imputer.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,tokyo,like,11.0
2,no,london,somewhat like,-0.5
3,no,seatle,like,10.0
4,no,san francisco,somewhat like,8.3
5,yes,tokyo,dislike,20.0


### Encoding at the nominal level

In [36]:
# prefix_sep is the separator placed between the prefix (column name) and the
# cell value when the new dummy column names are built
pd.get_dummies(
    X,
    columns=['city', 'boolean'],
    prefix_sep='__',
)

Unnamed: 0,ordinal_column,quantitative_column,city__london,city__san francisco,city__seatle,city__tokyo,boolean__no,boolean__yes
0,somewhat like,1.0,0,0,0,1,0,1
1,like,11.0,0,0,0,0,1,0
2,somewhat like,-0.5,1,0,0,0,0,0
3,like,10.0,0,0,1,0,1,0
4,somewhat like,,0,1,0,0,1,0
5,dislike,20.0,0,0,0,1,0,1


### Custom dummifier

In [37]:
class CustomDummifier(TransformerMixin):
    """Dummy-encode selected columns of a whole DataFrame.

    Our own dummifier: it mimics scikit-learn's one-hot encoding, with the
    added advantage of working on the entire DataFrame, by delegating to
    ``pandas.get_dummies``.

    Parameters
    ----------
    cols : list of column labels, optional
        Passed straight through as the ``columns`` argument of
        ``pandas.get_dummies``.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, *_):
        """Do nothing and return ``self`` (present for the scikit-learn API)."""
        return self

    def transform(self, X):
        """Return ``X`` with the configured columns replaced by dummy columns."""
        dummified = pd.get_dummies(X, columns=self.cols)
        return dummified

In [64]:
# dummify the two nominal columns with our custom transformer
cd = CustomDummifier(cols=['city', 'boolean'])
cd.fit_transform(X)

Unnamed: 0,ordinal_column,quantitative_column,city_london,city_san francisco,city_seatle,city_tokyo,boolean_no,boolean_yes
0,somewhat like,1.0,0,0,0,1,0,1
1,like,11.0,0,0,0,0,1,0
2,somewhat like,-0.5,1,0,0,0,0,0
3,like,10.0,0,0,1,0,1,0
4,somewhat like,,0,1,0,0,1,0
5,dislike,20.0,0,0,0,1,0,1


### Encoding at the ordinal level

In [38]:
# set up a list of our ordinal labels in rank order; each label's list index is its code
ordering = ['dislike', 'somewhat like', 'like'] # 0 for dislike, 1 for somewhat like, and 2 for like
# before we map our ordering onto our ordinal column, let's take a look at the column

X['ordinal_column']

0    somewhat like
1             like
2    somewhat like
3             like
4    somewhat like
5          dislike
Name: ordinal_column, dtype: object

In [50]:
# position of each label inside the ordering list
tuple(ordering.index(label) for label in ('dislike', 'like'))

(0, 2)

In [48]:
# lambda x: ordering.index(x)
# this specific code creates a function that looks up, for each element,
# its index in our list called ordering.
# now map our ordering to our ordinal column:
X['ordinal_column'].map(lambda x:ordering.index(x))

0    1
1    2
2    1
3    2
4    1
5    0
Name: ordinal_column, dtype: int64

### Custom label encoder

In [51]:
class CustomEncoder(TransformerMixin):
    """Label-encode one ordinal column according to an explicit ordering.

    Parameters
    ----------
    col : column label
        The column to encode.
    ordering : list, optional
        The key parameter: labels listed in rank order. Each value of the
        column is replaced by its position in this list, which determines the
        numerical value every label is encoded into.
    """

    def __init__(self, col, ordering=None):
        self.col = col
        self.ordering = ordering

    def fit(self, *_):
        """Do nothing and return ``self`` (present for the scikit-learn API)."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` whose ``col`` holds positions in ``ordering``.

        Every value in the column must be present in ``ordering``;
        ``list.index`` raises ``ValueError`` for a value it cannot find.
        """
        def position_of(label):
            return self.ordering.index(label)

        encoded = df.copy()
        encoded[self.col] = encoded[self.col].map(position_of)
        return encoded

In [52]:
# encode the ordinal column: dislike -> 0, somewhat like -> 1, like -> 2
ce = CustomEncoder(col='ordinal_column', ordering=['dislike', 'somewhat like', 'like'])
ce.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,1,1.0
1,no,,2,11.0
2,,london,1,-0.5
3,no,seatle,2,10.0
4,no,san francisco,1,
5,yes,tokyo,0,20.0


Our ordinal column is now labeled.

Up to this point, we have transformed the following columns accordingly:

-  boolean, city: dummy encoding
-  ordinal_column: label encoding

### Bucketing continuous features into categories

In [53]:
# cut the column into 3 bins; by default each value is labelled with the interval (bin) it falls into
pd.cut(X['quantitative_column'], bins=3)

0     (-0.52, 6.333]
1    (6.333, 13.167]
2     (-0.52, 6.333]
3    (6.333, 13.167]
4                NaN
5     (13.167, 20.0]
Name: quantitative_column, dtype: category
Categories (3, interval[float64]): [(-0.52, 6.333] < (6.333, 13.167] < (13.167, 20.0]]

In [55]:
# using no labels: labels=False returns the integer index of the bin instead of the interval
pd.cut(X['quantitative_column'], bins=3, labels=False)

0    0.0
1    1.0
2    0.0
3    1.0
4    NaN
5    2.0
Name: quantitative_column, dtype: float64

### Custom cutter

In [61]:
class CustomCutter(TransformerMixin):
    """Bucket one continuous column into bins with ``pandas.cut``.

    Parameters
    ----------
    col : column label
        The column to bin.
    bins : int or sequence
        Forwarded to ``pandas.cut`` as ``bins``.
    labels : default False
        Forwarded to ``pandas.cut`` as ``labels``.
    """

    def __init__(self, col, bins, labels=False):
        self.col = col
        self.bins = bins
        self.labels = labels

    def fit(self, *_):
        """Do nothing and return ``self`` (present for the scikit-learn API)."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` in which ``col`` is replaced by its binned values."""
        binned = pd.cut(df[self.col], bins=self.bins, labels=self.labels)
        result = df.copy()
        result[self.col] = binned
        return result

In [62]:
# bucket the quantitative column into 3 bins; labels defaults to False in CustomCutter
cc = CustomCutter(col='quantitative_column', bins=3)
cc.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,0.0
1,no,,like,1.0
2,,london,somewhat like,0.0
3,no,seatle,like,1.0
4,no,san francisco,somewhat like,
5,yes,tokyo,dislike,2.0


Note that our quantitative_column is now ordinal, and so there is no need to dummify the data. 

## Creating our pipeline

In [63]:
# let's put everything together in a pipeline
from sklearn.pipeline import Pipeline

In [65]:
pipe = Pipeline([('imputer', imputer), ('dummify', cd), ('encode', ce), ('cut', cc)])
# the steps run in the order listed:
# first our initial imputer pipeline fills the missing values
# then the nominal columns are dummified
# then the ordinal column is encoded
# then the quantitative column is bucketed (binned)

In [66]:
# the original DataFrame, before any transformation
X

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,,like,11.0
2,,london,somewhat like,-0.5
3,no,seatle,like,10.0
4,no,san francisco,somewhat like,
5,yes,tokyo,dislike,20.0


In [67]:
# now fit our pipeline
pipe.fit(X)

Pipeline(memory=None,
     steps=[('imputer', Pipeline(memory=None,
     steps=[('quant', <__main__.CustomQuantitativeImputer object at 0x0000029F78091C50>), ('category', <__main__.CustomCategoryImputer object at 0x0000029F779CA278>)])), ('dummify', <__main__.CustomDummifier object at 0x0000029F77A47AC8>), ('encode', <__main__.CustomEncoder object at 0x0000029F781A4550>), ('cut', <__main__.CustomCutter object at 0x0000029F77A61908>)])

In [68]:
# we've created our pipeline object, let's transform our DataFrame
pipe.transform(X)

Unnamed: 0,ordinal_column,quantitative_column,city_london,city_san francisco,city_seatle,city_tokyo,boolean_no,boolean_yes
0,1,0,0,0,0,1,0,1
1,2,1,0,0,0,1,1,0
2,1,0,1,0,0,0,1,0
3,2,1,0,0,1,0,1,0
4,1,1,0,1,0,0,1,0
5,0,2,0,0,0,1,0,1


In [None]:
# References and credits:
# "Feature Engineering Made Easy" by Sinan Ozdemir and Divya Susarla (Packt Publishing)