In [1]:
# References
# 1) https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/
# 2) https://zablo.net/blog/post/pandas-dataframe-in-scikit-learn-feature-union/
# 3) https://github.com/shaypal5/pdpipe
# 4) https://medium.com/dunder-data/from-pandas-to-scikit-learn-a-new-exciting-workflow-e88e2271ef62 <- really good one

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer

In [2]:
# Read the dataset from heart.csv; the path is relative, so the file must sit in the notebook's working directory.
data = pd.read_csv('heart.csv')

## Single example

In [15]:
data['cp'].value_counts()  # 'cp' takes only a handful of distinct values, so try OneHotEncoder on this column

0    143
2     87
1     50
3     23
Name: cp, dtype: int64

In [42]:
# Most scikit-learn transformers require 2D input, whereas data['cp'] is a 1D Series.
# Two ways to obtain a 2D, single-column version of 'cp' (kept under separate names so
# that the first one is not silently overwritten by the second):
cp_array = data['cp'].values.reshape(-1, 1)  # NumPy array with one column
cp = data[['cp']]  # single-column DataFrame; this is the form used by the cells below

In [43]:
# Show the 2D single-column frame built from 'cp'.
cp

Unnamed: 0,cp
0,3
1,2
2,1
3,1
4,0
...,...
298,0
299,3
300,0
301,0


In [44]:
# sparse=False makes transform() return a dense NumPy array; with sparse=True it returns a SciPy sparse matrix instead.
# NOTE(review): newer scikit-learn releases rename this argument to `sparse_output` - confirm against the installed version.
ohe = OneHotEncoder(categories = 'auto', sparse = False)

In [45]:
# Learn the distinct categories present in 'cp'.
ohe.fit(cp)

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=False)

In [46]:
ohe.transform(cp)  # one-hot encode 'cp': one output column per category learned during fit

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]])

In [41]:
# Names of the generated columns, in output order; the 'x0' prefix stands for the first (here: only) input feature.
# NOTE(review): newer scikit-learn releases replace this method with get_feature_names_out() - confirm against the installed version.
ohe.get_feature_names()

array(['x0_0', 'x0_1', 'x0_2', 'x0_3'], dtype=object)

## From a column transformer

In [3]:
# Numeric branch: fill NaNs with the column median, imputing in place where
# possible (copy=False), and request missing-value indicator columns
# (add_indicator=True).
pipe_numeric = Pipeline([
    (
        'impute_num',
        SimpleImputer(missing_values=np.nan, strategy='median', copy=False, add_indicator=True),
    ),
])

# Categorical branch: replace NaNs with the sentinel value 99999 so that
# missing entries become a category of their own, then one-hot encode.
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not seen during fit.
pipe_categorical = Pipeline([
    (
        'impute_cat',
        SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=99999, copy=False),
    ),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

In [7]:
# Route each listed column to its branch; columns that are not listed are
# forwarded unchanged (remainder='passthrough') rather than dropped.
transformer_union = ColumnTransformer(
    transformers=[
        ('feat_numeric', pipe_numeric, ['age']),
        ('feat_categorical', pipe_categorical, ['cp']),
    ],
    remainder='passthrough',
)

In [51]:
# Fit on just the two routed columns, so there are no remainder columns in this run.
# NOTE(review): the saved output of this cell reports remainder='drop', which does not match the
# remainder='passthrough' definition of transformer_union - the output looks stale; restart the kernel and re-run.
transformer_union.fit(data[['age', 'cp']])

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('feat_numeric',
                                 Pipeline(memory=None,
                                          steps=[('impute_num',
                                                  SimpleImputer(add_indicator=True,
                                                                copy=False,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0))],
                                          verbose=False),
                                 ['age']),
                                ('feat_categorical',
                                 Pipeline(memory=None,
                      

In [52]:
# Result columns: the imputed 'age' first, then the one-hot columns for 'cp'.
transformer_union.transform(data[['age', 'cp']])

array([[63.,  0.,  0.,  0.,  1.],
       [37.,  0.,  0.,  1.,  0.],
       [41.,  0.,  1.,  0.,  0.],
       ...,
       [68.,  1.,  0.,  0.,  0.],
       [57.,  1.,  0.,  0.,  0.],
       [57.,  0.,  1.,  0.,  0.]])

In [80]:
# Unfortunately there is no single method for extracting feature names from a ColumnTransformer / FeatureUnion,
# so drill down to the fitted OneHotEncoder inside the categorical pipeline.
# NOTE(review): newer scikit-learn releases add get_feature_names_out() on the ColumnTransformer itself - confirm against the installed version.
transformer_union.named_transformers_['feat_categorical'].named_steps['one_hot'].get_feature_names()

array(['x0_0', 'x0_1', 'x0_2', 'x0_3'], dtype=object)

In [8]:
# Refit on the full frame to see what the output looks like when the input columns are not restricted:
# every column other than 'age' and 'cp' is now handled by the remainder setting.
transformer_union.fit(data)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('feat_numeric',
                                 Pipeline(memory=None,
                                          steps=[('impute_num',
                                                  SimpleImputer(add_indicator=True,
                                                                copy=False,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0))],
                                          verbose=False),
                                 ['age']),
                                ('feat_categorical',
                                 Pipeline(memory=None,
               

In [9]:
# Transform the full frame; the remainder columns come after the 'age' and 'cp' outputs.
transformer_union.transform(data)

array([[63.,  0.,  0., ...,  0.,  1.,  1.],
       [37.,  0.,  0., ...,  0.,  2.,  1.],
       [41.,  0.,  1., ...,  0.,  2.,  1.],
       ...,
       [68.,  1.,  0., ...,  2.,  3.,  0.],
       [57.,  1.,  0., ...,  1.,  3.,  0.],
       [57.,  0.,  1., ...,  1.,  2.,  0.]])

In [11]:
# Width check: 1 'age' column + 4 one-hot 'cp' columns + the passed-through remainder columns.
transformer_union.transform(data).shape

(303, 17)

In [21]:
# Keep a handle on the fitted sub-transformers (keyed by the names given to the ColumnTransformer)
# so the long lookup chain does not have to be repeated in later cells.
transformers = transformer_union.named_transformers_
print(transformers)

{'feat_numeric': Pipeline(memory=None,
         steps=[('impute_num',
                 SimpleImputer(add_indicator=True, copy=False, fill_value=None,
                               missing_values=nan, strategy='median',
                               verbose=0))],
         verbose=False), 'feat_categorical': Pipeline(memory=None,
         steps=[('impute_cat',
                 SimpleImputer(add_indicator=False, copy=False,
                               fill_value=99999, missing_values=nan,
                               strategy='constant', verbose=0)),
                ('one_hot',
                 OneHotEncoder(categorical_features=None, categories=None,
                               drop=None, dtype=<class 'numpy.float64'>,
                               handle_unknown='ignore', n_values=None,
                               sparse=True))],
         verbose=False), 'remainder': 'passthrough'}


In [25]:
# Feature names produced by the fitted one-hot step of the categorical pipeline.
transformers['feat_categorical'].named_steps['one_hot'].get_feature_names()
# Attribute-style spelling of the same lookup (presumably equivalent - verify before relying on it):
# transformers.feat_categorical.named_steps.one_hot.get_feature_names()

array(['x0_0', 'x0_1', 'x0_2', 'x0_3'], dtype=object)

In [37]:
# Inspect the fitted imputer step of the numeric pipeline.
transformers['feat_numeric'].named_steps['impute_num']
# Attribute-style spelling of the same lookup (presumably equivalent - verify before relying on it):
# transformers.feat_numeric.named_steps.impute_num

SimpleImputer(add_indicator=True, copy=False, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

SimpleImputer(add_indicator=True, copy=False, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)