In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [2]:
# Read the credit-approval CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(
    filepath_or_buffer='../data/preprocessing/credit_approval_uci.csv',
)
data.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# Split into train and test sets: every column except "target" is a feature,
# 30% of the rows are held out, and the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"),
    data["target"],
    test_size=0.3,
    random_state=0,
)

In [4]:
# Collect the names of the object-dtype columns of the training features;
# these are the variables imputed in the cells below.
categorical_vars = (
    X_train
    .select_dtypes(include="O")
    .columns
    .to_list()
)
categorical_vars

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [5]:
# Most frequent category of each categorical column, computed on the train set.
# mode() can return more than one row when a column has tied modes, so only
# the first row is kept and turned into a {column: category} mapping.
frequent_values = (
    X_train[categorical_vars]
    .mode()
    .iloc[0]
    .to_dict()
)
print(frequent_values)

{'A1': 'b', 'A4': 'u', 'A5': 'g', 'A6': 'c', 'A7': 'v', 'A9': 't', 'A10': 'f', 'A12': 'f', 'A13': 'g'}


In [6]:
# Frequent-category imputation: fill the gaps in each categorical column with
# the value stored for it in `frequent_values`. The same mapping is applied to
# both the train and the test set.
X_train_mode = X_train[categorical_vars].fillna(value=frequent_values)
X_test_mode = X_test[categorical_vars].fillna(value=frequent_values)

In [7]:
# Arbitrary-string imputation: map every categorical column to the same
# placeholder string that will replace its missing values.
imputation_dict = dict.fromkeys(categorical_vars, "no_data")
imputation_dict

{'A1': 'no_data',
 'A4': 'no_data',
 'A5': 'no_data',
 'A6': 'no_data',
 'A7': 'no_data',
 'A9': 'no_data',
 'A10': 'no_data',
 'A12': 'no_data',
 'A13': 'no_data'}

In [8]:
# Fill the missing entries of each categorical column with the placeholder
# defined for it in `imputation_dict`, for both the train and the test set.
X_train_arbitrary = X_train[categorical_vars].fillna(value=imputation_dict)
X_test_arbitrary = X_test[categorical_vars].fillna(value=imputation_dict)