In [None]:
# Last amended: 26th June, 2020
# Myfolder: D:\data\OneDrive\Documents\bank_loan_status
# Ref: https://stackoverflow.com/a/54648023/3282777
# Objectives:
#             i) Demonstrate nested Pipelining
#            ii) Demonstrate CustomTransformer
#           iii) Get feature name after OneHotEncode In ColumnTransformer

In [1]:
# 1.0 Call libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

In [5]:
# 2.0 Our demo dataframe
df = pd.DataFrame({'brand': ['aaaa', 'asdfasdf', 'sadfds', 'NaN', 'NaN'],
                   'category': ['asdf', 'asfa', 'asdfas', 'as', np.nan],
                   'num1': [1, 1, 0, np.nan, 1],
                   'num2' : [np.nan, 2,1,2,np.nan],
                   'target': [0.2, 0.11, 1.34, 1.123, 1.414]})
df

Unnamed: 0,brand,category,num1,num2,target
0,aaaa,asdf,1.0,,0.2
1,asdfasdf,asfa,1.0,2.0,0.11
2,sadfds,asdfas,0.0,1.0,1.34
3,,as,,2.0,1.123
4,,,1.0,,1.414


In [6]:
# 3.0
pipe1 = Pipeline([('si', SimpleImputer(strategy = 'median')), ('ss', StandardScaler())         ])

In [8]:
# 3.1
pipe2 = Pipeline([('si', SimpleImputer(strategy = "most_frequent")),('ohe', OneHotEncoder()) ])

In [22]:
# 3.2
categorical_features = ['brand', 'category']
ct = ColumnTransformer([('pipe1', pipe1, ['num1', 'num2']), ('pipe2', pipe2, categorical_features)])

In [23]:
# 4.0
lr = LinearRegression()

In [24]:
# 4.1
pipe3 = Pipeline([('ct', ct), ('lr', lr)])

In [25]:
# 4.2
X = df[['brand', 'category', 'num1', 'num2']]
X
y = df['target']

In [26]:
# 5.0
pipe3.fit(X,y)

Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipe1',
                                                  Pipeline(memory=None,
                                                           steps=[('si',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                                          

In [27]:
# 5.1
pipe3.predict(X)

array([0.2   , 0.11  , 1.34  , 1.2685, 1.2685])

In [28]:
# 5.2
pipe3.named_steps['ct'].transformers_[1][1]\
   .named_steps['ohe'].get_feature_names(categorical_features)

array(['brand_NaN', 'brand_aaaa', 'brand_asdfasdf', 'brand_sadfds',
       'category_as', 'category_asdf', 'category_asdfas', 'category_asfa'],
      dtype=object)

In [None]:
##############################