In [2]:
#Its not advisable to repeate the column names in multiple column transformer steps. In each step the column name should be unique
#Instead use pipeline before column transformer for chained transformations on same set of columns

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.tree import DecisionTreeClassifier

In [5]:
df = pd.read_csv('C:\\Users\\koriv\\Desktop\\MachineLearning_DataScience\\Hands_On_Machine_Learning\\Coding_ColumnTransformer_Pipeline_Deployment\\income_evaluation.csv', na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df.columns = df.columns.str.strip()
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('income', 1), df.income, test_size=0.2,
                                                   random_state=40)

  X_train, X_test, y_train, y_test = train_test_split(df.drop('income', 1), df.income, test_size=0.2,


In [8]:
num_cols = [col for col in X_train.columns if X_train[col].dtypes!='O']
num_cols

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [9]:
cat_cols = [col for col in X_train.columns if (X_train[col].dtypes=='O') & (col!='education')]
cat_cols

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [10]:
#null value check
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [11]:
# Specify chained transformations using Pipeline
pp_num = Pipeline([
    ('num_imp', SimpleImputer(strategy='median', add_indicator=False)),
    ('rob_num', RobustScaler())
])

pp_cat = Pipeline([
    ('cat_imp', SimpleImputer(strategy='constant', add_indicator=False, fill_value='missing')),
    ('ohe_cat', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

In [12]:
from sklearn.impute import MissingIndicator
ct = ColumnTransformer([
    ('mi', MissingIndicator(), X_train.columns),
    ('pp_num', pp_num, num_cols),
    ('pp_cat', pp_cat, cat_cols)
])

In [13]:
xt = ct.fit_transform(X_train)
xt

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
pd.DataFrame(xt).isna().sum().sum()

0

In [15]:
pd.DataFrame(xt).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,0.0,0.0,0.0,-0.25,0.054502,0.666667,0.0,1887.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,-0.85,-0.121984,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.45,1.588461,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,-0.15,-0.1189,-1.333333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,1.75,-0.582629,1.666667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [16]:
X_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
26470,40,Private,188291,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States
1104,34,Self-emp-not-inc,196791,Assoc-acdm,12,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,25,United-States
85,53,Private,346253,HS-grad,9,Divorced,Sales,Own-child,White,Female,0,0,35,United-States
16639,39,Private,435638,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,40,United-States
28097,22,Private,324922,HS-grad,9,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,50,United-States


In [17]:
ct.transform(X_test)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# Final pipeline for model training
pipe_final = Pipeline([
    ('ct_step', ct),
    ('model', DecisionTreeClassifier())
])
pipe_final.fit(X_train, y_train)

Pipeline(steps=[('ct_step',
                 ColumnTransformer(transformers=[('mi', MissingIndicator(),
                                                  Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')),
                                                 ('pp_num',
                                                  Pipeline(steps=[('num_imp',
                                                                   SimpleImputer(strategy='med...
                                                   'education-num',
                                                   'capital-gain',
                                                   'capital-loss',
                                                   'hours-per-week']),
                                                 ('pp_cat',
                                        

In [21]:
pipe_final.predict(X_test)

array([' >50K', ' >50K', ' <=50K', ..., ' <=50K', ' <=50K', ' <=50K'],
      dtype=object)

In [22]:
pipe_final.score(X_test, y_test)


0.8129894058037771

In [23]:
pipe_final.named_steps

{'ct_step': ColumnTransformer(transformers=[('mi', MissingIndicator(),
                                  Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
       dtype='object')),
                                 ('pp_num',
                                  Pipeline(steps=[('num_imp',
                                                   SimpleImputer(strategy='median')),
                                                  ('rob_num', RobustScaler())]),
                                  ['age', 'fnlwgt', 'education-num',
                                   'capital-gain', 'capital-loss',
                                   'hours-per-week']),
                                 ('pp_cat',
                                  Pipeline(steps=[('cat_imp',
                                                   SimpleImputer(fill_value='missing'

In [24]:
pipe_final.named_steps['ct_step']


ColumnTransformer(transformers=[('mi', MissingIndicator(),
                                 Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')),
                                ('pp_num',
                                 Pipeline(steps=[('num_imp',
                                                  SimpleImputer(strategy='median')),
                                                 ('rob_num', RobustScaler())]),
                                 ['age', 'fnlwgt', 'education-num',
                                  'capital-gain', 'capital-loss',
                                  'hours-per-week']),
                                ('pp_cat',
                                 Pipeline(steps=[('cat_imp',
                                                  SimpleImputer(fill_value='missing',
                        

In [25]:
pipe_final.named_steps['ct_step'].named_transformers_


{'mi': MissingIndicator(),
 'pp_num': Pipeline(steps=[('num_imp', SimpleImputer(strategy='median')),
                 ('rob_num', RobustScaler())]),
 'pp_cat': Pipeline(steps=[('cat_imp',
                  SimpleImputer(fill_value='missing', strategy='constant')),
                 ('ohe_cat',
                  OneHotEncoder(handle_unknown='ignore', sparse=False))])}

In [26]:
pipe_final.named_steps['ct_step'].named_transformers_['pp_cat']


Pipeline(steps=[('cat_imp',
                 SimpleImputer(fill_value='missing', strategy='constant')),
                ('ohe_cat',
                 OneHotEncoder(handle_unknown='ignore', sparse=False))])

In [28]:
pipe_final.named_steps['ct_step'].named_transformers_['pp_cat'].named_steps['ohe_cat'].get_feature_names()

array(['x0_ Federal-gov', 'x0_ Local-gov', 'x0_ Never-worked',
       'x0_ Private', 'x0_ Self-emp-inc', 'x0_ Self-emp-not-inc',
       'x0_ State-gov', 'x0_ Without-pay', 'x0_missing', 'x1_ Divorced',
       'x1_ Married-AF-spouse', 'x1_ Married-civ-spouse',
       'x1_ Married-spouse-absent', 'x1_ Never-married', 'x1_ Separated',
       'x1_ Widowed', 'x2_ Adm-clerical', 'x2_ Armed-Forces',
       'x2_ Craft-repair', 'x2_ Exec-managerial', 'x2_ Farming-fishing',
       'x2_ Handlers-cleaners', 'x2_ Machine-op-inspct',
       'x2_ Other-service', 'x2_ Priv-house-serv', 'x2_ Prof-specialty',
       'x2_ Protective-serv', 'x2_ Sales', 'x2_ Tech-support',
       'x2_ Transport-moving', 'x2_missing', 'x3_ Husband',
       'x3_ Not-in-family', 'x3_ Other-relative', 'x3_ Own-child',
       'x3_ Unmarried', 'x3_ Wife', 'x4_ Amer-Indian-Eskimo',
       'x4_ Asian-Pac-Islander', 'x4_ Black', 'x4_ Other', 'x4_ White',
       'x5_ Female', 'x5_ Male', 'x6_ Cambodia', 'x6_ Canada',
       'x6_ Ch