### **Automatically select imputer parameters**

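This notebook uses grid search (`GridSearchCV`) to automatically select the imputation strategies for the numerical and categorical features, together with the regularization strength `C` of the logistic-regression classifier.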

In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split,GridSearchCV

In [35]:
# Load the Titanic dataset through seaborn's dataset loader.
titanic = sns.load_dataset('titanic')

# Preview the first rows to see which columns are available.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [36]:
# Restrict the frame to the four predictor columns plus the target column.
titanic = titanic[[
    'sex',
    'age',
    'fare',
    'embarked',
    'survived',
]]

In [37]:
# Separate the target column from the predictor columns.
Y = titanic['survived']
X = titanic.drop(columns=['survived'])


In [38]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
)


In [39]:
# Column groups; each group is handled by its own preprocessing pipeline.
numerical_features = ['age', 'fare']
categorical_features = ['sex', 'embarked']

# Numeric columns: impute missing values (median by default), then standardize.
# The step names are referenced by the grid-search parameter paths
# (e.g. 'imputer1'), so they must stay in sync with the parameter grid.
numerical_transformer = Pipeline(
    steps=[
        ('imputer1', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: impute missing values (most frequent by default),
# then one-hot encode. handle_unknown='ignore' is passed so the encoder does
# not raise on categories that were absent when it was fitted.
categorical_transformer = Pipeline(
    steps=[
        ('imputer2', SimpleImputer(strategy='most_frequent')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [40]:
# Route each column group through its own pipeline. The transformer names
# ('num', 'cat') form part of the grid-search parameter paths.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

In [41]:
# Full model: preprocessing followed by a logistic-regression classifier.
# Wrapping both in one Pipeline means the preprocessing is re-fitted together
# with the classifier every time the pipeline is fitted.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

In [42]:
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams when displayed.
set_config(display='diagram')

# Display the assembled pipeline.
clf

In [43]:
# Search space for the grid search. Keys follow scikit-learn's nested
# parameter syntax: <pipeline step>__<transformer name>__<step>__<parameter>.
param_grid = {
    # Imputation strategy for the numeric columns.
    'preprocessor__num__imputer1__strategy': [
        'mean',
        'median',
    ],
    # Imputation strategy for the categorical columns.
    'preprocessor__cat__imputer2__strategy': [
        'most_frequent',
        'constant',
    ],
    # Candidate values for LogisticRegression's C parameter.
    'classifier__C': [
        0.1,
        1.0,
        10,
        100,
    ],
}

In [44]:

# Exhaustive search over every combination in param_grid, with cv=10
# cross-validation folds per candidate.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=10,
)

In [45]:
# Run the cross-validated search on the training split only, so the held-out
# test split plays no part in parameter selection.
grid_search.fit(X_train, Y_train)

# Report the winning parameter combination. The label is a plain string:
# it has no placeholders, so an f-string prefix would be redundant.
print("Best Params:", grid_search.best_params_)

Best Params: {'classifier__C': 1.0, 'preprocessor__cat__imputer2__strategy': 'most_frequent', 'preprocessor__num__imputer1__strategy': 'mean'}


In [47]:
# Tabulate the per-candidate cross-validation results, ordered so the
# highest mean test score comes first.
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values("mean_test_score", ascending=False)
)


In [48]:
# Show the grid-search results table, sorted by mean test score (descending).
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_preprocessor__cat__imputer2__strategy,param_preprocessor__num__imputer1__strategy,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
7,0.009318,0.005865,0.002314,0.002669,1.0,constant,median,"{'classifier__C': 1.0, 'preprocessor__cat__imp...",0.791667,0.708333,...,0.816901,0.802817,0.71831,0.732394,0.788732,0.802817,0.788732,0.773944,0.036891,1
6,0.007126,0.007386,0.004707,0.007191,1.0,constant,mean,"{'classifier__C': 1.0, 'preprocessor__cat__imp...",0.791667,0.708333,...,0.816901,0.802817,0.71831,0.732394,0.788732,0.802817,0.788732,0.773944,0.036891,1
5,0.006721,0.007007,0.005404,0.006786,1.0,most_frequent,median,"{'classifier__C': 1.0, 'preprocessor__cat__imp...",0.791667,0.708333,...,0.816901,0.802817,0.71831,0.732394,0.788732,0.802817,0.788732,0.773944,0.036891,1
4,0.005097,0.003044,0.003965,0.004153,1.0,most_frequent,mean,"{'classifier__C': 1.0, 'preprocessor__cat__imp...",0.791667,0.708333,...,0.816901,0.802817,0.71831,0.732394,0.788732,0.802817,0.788732,0.773944,0.036891,1
11,0.006763,0.007169,0.003163,0.006107,10.0,constant,median,"{'classifier__C': 10, 'preprocessor__cat__impu...",0.791667,0.708333,...,0.816901,0.802817,0.71831,0.732394,0.788732,0.802817,0.788732,0.773944,0.036891,1
10,0.00879,0.008423,0.001507,0.004522,10.0,constant,mean,"{'classifier__C': 10, 'preprocessor__cat__impu...",0.791667,0.708333,...,0.816901,0.802817,0.71831,0.732394,0.788732,0.802817,0.788732,0.773944,0.036891,1
9,0.00548,0.008077,0.005537,0.005871,10.0,most_frequent,median,"{'classifier__C': 10, 'preprocessor__cat__impu...",0.791667,0.708333,...,0.816901,0.802817,0.71831,0.732394,0.788732,0.802817,0.788732,0.773944,0.036891,1
8,0.006846,0.002547,0.003914,0.004435,10.0,most_frequent,mean,"{'classifier__C': 10, 'preprocessor__cat__impu...",0.791667,0.708333,...,0.816901,0.802817,0.71831,0.732394,0.788732,0.802817,0.788732,0.773944,0.036891,1
12,0.007258,0.005644,0.002121,0.003241,100.0,most_frequent,mean,"{'classifier__C': 100, 'preprocessor__cat__imp...",0.791667,0.708333,...,0.816901,0.802817,0.71831,0.732394,0.788732,0.802817,0.788732,0.773944,0.036891,1
13,0.008074,0.006304,0.002893,0.004995,100.0,most_frequent,median,"{'classifier__C': 100, 'preprocessor__cat__imp...",0.791667,0.708333,...,0.816901,0.802817,0.71831,0.732394,0.788732,0.802817,0.788732,0.773944,0.036891,1
