In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv('C:\\Users\\koriv\\Desktop\\MachineLearning_DataScience\\Hands_On_Machine_Learning\\Coding_ColumnTransformer_Pipeline_Deployment\\income_evaluation.csv', na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [5]:
df.shape

(32561, 15)

In [6]:
df.dropna(inplace=True)
df.shape

(30162, 15)

In [7]:
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [8]:
df.columns = df.columns.str.strip()
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), df.income,
                                                   test_size=0.2, random_state=0)

In [10]:
ct = ColumnTransformer([
    ('step1', RobustScaler(), ['age', 'fnlwgt', 'hours-per-week']),
    ('step2', StandardScaler(), ['capital-gain', 'capital-loss', 'education-num']),
    ('step3', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['workclass', 
                                                                     'marital-status', 'occupation',
                                                                     'relationship', 'race', 
                                                                     'sex', 'native-country'])
], remainder='drop')

# pipeline use case 1 - with an 'estimator' as final step

In [11]:
p = Pipeline([
    ('coltf_step', ct),
    ('model', DecisionTreeClassifier()),
])

In [12]:

p.fit(X_train, y_train)

Pipeline(steps=[('coltf_step',
                 ColumnTransformer(transformers=[('step1', RobustScaler(),
                                                  ['age', 'fnlwgt',
                                                   'hours-per-week']),
                                                 ('step2', StandardScaler(),
                                                  ['capital-gain',
                                                   'capital-loss',
                                                   'education-num']),
                                                 ('step3',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  ['workclass',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                 

In [13]:
p.predict(X_test)


array([' <=50K', ' <=50K', ' <=50K', ..., ' >50K', ' <=50K', ' <=50K'],
      dtype=object)

In [14]:
p.score(X_test, y_test)


0.8059008785015747

In [15]:
p.named_steps

{'coltf_step': ColumnTransformer(transformers=[('step1', RobustScaler(),
                                  ['age', 'fnlwgt', 'hours-per-week']),
                                 ('step2', StandardScaler(),
                                  ['capital-gain', 'capital-loss',
                                   'education-num']),
                                 ('step3',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  ['workclass', 'marital-status', 'occupation',
                                   'relationship', 'race', 'sex',
                                   'native-country'])]),
 'model': DecisionTreeClassifier()}

In [16]:
p.named_steps['coltf_step'].transformers_


[('step1', RobustScaler(), ['age', 'fnlwgt', 'hours-per-week']),
 ('step2',
  StandardScaler(),
  ['capital-gain', 'capital-loss', 'education-num']),
 ('step3',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['workclass',
   'marital-status',
   'occupation',
   'relationship',
   'race',
   'sex',
   'native-country']),
 ('remainder', 'drop', [3])]

In [17]:
p.named_steps['coltf_step'].transformers_[2][1].get_feature_names()


array(['x0_ Federal-gov', 'x0_ Local-gov', 'x0_ Private',
       'x0_ Self-emp-inc', 'x0_ Self-emp-not-inc', 'x0_ State-gov',
       'x0_ Without-pay', 'x1_ Divorced', 'x1_ Married-AF-spouse',
       'x1_ Married-civ-spouse', 'x1_ Married-spouse-absent',
       'x1_ Never-married', 'x1_ Separated', 'x1_ Widowed',
       'x2_ Adm-clerical', 'x2_ Armed-Forces', 'x2_ Craft-repair',
       'x2_ Exec-managerial', 'x2_ Farming-fishing',
       'x2_ Handlers-cleaners', 'x2_ Machine-op-inspct',
       'x2_ Other-service', 'x2_ Priv-house-serv', 'x2_ Prof-specialty',
       'x2_ Protective-serv', 'x2_ Sales', 'x2_ Tech-support',
       'x2_ Transport-moving', 'x3_ Husband', 'x3_ Not-in-family',
       'x3_ Other-relative', 'x3_ Own-child', 'x3_ Unmarried', 'x3_ Wife',
       'x4_ Amer-Indian-Eskimo', 'x4_ Asian-Pac-Islander', 'x4_ Black',
       'x4_ Other', 'x4_ White', 'x5_ Female', 'x5_ Male', 'x6_ Cambodia',
       'x6_ Canada', 'x6_ China', 'x6_ Columbia', 'x6_ Cuba',
       'x6_ Dominican

In [1]:
# Operation available in a Pipeline depends on the final step available. 
# If final step is model then methods like fit and predict will be avialble.
# If final step is scaler then methods like fit and transform will be available.