In [1]:
import pandas as pd

from feature_engine.imputation import (AddMissingIndicator,
                                      CategoricalImputer,
                                      MeanMedianImputer)
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# Read the credit-approval CSV (path is relative to the notebook's working
# directory) and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer='../data/preprocessing/credit_approval_uci.csv',
)
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# Split predictors (every column except "target") and the "target" column
# into train and test sets: 30% of the rows are held out for testing, and
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"),
    data["target"],
    test_size=0.3,
    random_state=0,
)

In [4]:
# Object-dtype columns of the training predictors are treated as the
# categorical variables.
categorical_vars = list(X_train.select_dtypes(include="object").columns)
categorical_vars

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [5]:
# Candidate missing-indicator column names: one "<variable>_na" entry per
# categorical variable, in the same order as categorical_vars.
categorical_indicator_vars = list(map('{}_na'.format, categorical_vars))
categorical_indicator_vars

['A1_na',
 'A4_na',
 'A5_na',
 'A6_na',
 'A7_na',
 'A9_na',
 'A10_na',
 'A12_na',
 'A13_na']

In [6]:
# Every non-object column of the training predictors is treated as a
# numerical variable.
numerical_vars = list(X_train.select_dtypes(exclude="object").columns)
numerical_vars

['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

In [7]:
# Candidate missing-indicator column names: one "<variable>_na" entry per
# numerical variable, in the same order as numerical_vars.
numerical_indicator_vars = list(map('{}_na'.format, numerical_vars))
numerical_indicator_vars

['A2_na', 'A3_na', 'A8_na', 'A11_na', 'A14_na', 'A15_na']

In [8]:
# One imputer per variable type, each applied only to its own columns:
#   * categorical variables -> replace missing values with the most frequent one
#   * numerical variables   -> replace missing values with the median
# add_indicator=True makes each imputer append binary missing-indicator
# columns after its imputed values.
pipe = ColumnTransformer(
    transformers=[
        ('categorical_imputer',
         SimpleImputer(add_indicator=True, strategy='most_frequent'),
         categorical_vars),
        ('numerical_imputer',
         SimpleImputer(add_indicator=True, strategy='median'),
         numerical_vars),
    ],
)

In [9]:
X_train = pipe.fit_transform(X_train)
X_test = pipe.fit_transform(X_test)

In [10]:
X_train.shape

(483, 26)

In [11]:
X_train = pd.DataFrame(X_train, columns=categorical_vars +
                      categorical_indicator_vars + 
                      numerical_vars + 
                      numerical_indicator_vars)
X_train.head()

ValueError: Shape of passed values is (483, 26), indices imply (483, 30)