# Automatically Select Imputer

In [16]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [17]:
# Load seaborn's example 'titanic' dataset; later cells treat the result as a pandas DataFrame
df = sns.load_dataset('titanic')

In [18]:
# Preview the first five rows to see the available columns and where values are missing
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [19]:
# Drop columns that duplicate information already carried by other columns
# (e.g. 'class' mirrors 'pclass', 'alive' mirrors the 'survived' target) or that are not
# used by this example.
# Reassigning instead of mutating in place, together with errors='ignore', keeps this cell
# idempotent: re-running it after the columns are already gone no longer raises KeyError.
df = df.drop(
    columns=['class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'],
    errors='ignore',
)

In [20]:
# Re-inspect the frame to confirm only the target and the candidate feature columns remain
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [21]:
# Separate the label ('survived') from the predictor columns
y = df['survived']
X = df.drop(columns=[y.name])

In [22]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [23]:
# Peek at the training features; the row labels are the original dataframe indices,
# now in the shuffled order produced by the split
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
30,1,male,40.0,0,0,27.7208,C
10,3,female,4.0,1,1,16.7,S
873,3,male,47.0,0,0,9.0,S
182,3,male,9.0,4,2,31.3875,S
876,3,male,20.0,0,0,9.8458,S


In [24]:
# Column groups handled by each preprocessing branch
numerical_features = ['age', 'fare']
categorical_features = ['embarked', 'sex']

# Numeric branch: fill missing values with the column median, then standardise.
# The step name 'imputer' is referenced by the grid-search parameter keys, so keep it stable.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then one-hot
# encode; handle_unknown='ignore' tells the encoder not to raise on categories it did not
# see during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

In [25]:
# Route each column group through its own branch and concatenate the outputs.
# NOTE(review): pclass, sibsp and parch appear in neither group; with ColumnTransformer's
# documented default remainder='drop' they never reach the classifier — confirm this is intended.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [26]:
# Chain preprocessing and the model into a single estimator so that imputation, scaling and
# encoding are re-fitted on the training portion of every cross-validation fold
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [27]:
# Ask scikit-learn to render estimators as HTML diagrams, then display the assembled
# pipeline to check its structure (set_config is imported with the other libraries at the top)
set_config(display='diagram')
clf

In [28]:
# Define a parameter grid for GridSearchCV to tune preprocessing and classifier hyperparameters.
# Each key spells out the path through the nested estimators using double underscores:
#   <pipeline step>__<column-transformer branch>__<inner step>__<parameter>
# Single underscores do not resolve to a nested parameter and make the search fail when fitted.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],             # Numeric imputer strategies
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],  # Categorical imputer strategies
    'classifier__C': [0.1, 1.0, 10, 100],                                   # Inverse regularization strength (smaller = stronger penalty)
}

# Initialize GridSearchCV with the classifier and parameter grid using 10-fold cross-validation
grid_search = GridSearchCV(clf, param_grid, cv=10)


In [29]:
# Search space keyed by the double-underscore path to each nested parameter
# (<pipeline step>__<branch>__<inner step>__<parameter>)
param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    preprocessor__cat__imputer__strategy=['most_frequent', 'constant'],
    classifier__C=[0.1, 1.0, 10, 100],
)

# Exhaustive search over the grid with 10-fold cross-validation, fitted on the training split only
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Report the winning combination
print('Best Params:', grid_search.best_params_, sep='\n')

Best Params:
{'classifier__C': 1.0, 'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__num__imputer__strategy': 'mean'}


In [30]:
# Mean cross-validated score of the best parameter combination; it is computed from the
# training split only (the search was fitted on X_train / y_train), not from the held-out test set
print(f'Internal cv score: {grid_search.best_score_:.3f}')

Internal cv score: 0.788


In [32]:
# Tabulate every grid point, best mean validation score first
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
)

# Show only the tuned parameters next to the score each combination achieved
cv_results.loc[:, [
    'param_classifier__C',
    'param_preprocessor__cat__imputer__strategy',
    'param_preprocessor__num__imputer__strategy',
    'mean_test_score',
]]

Unnamed: 0,param_classifier__C,param_preprocessor__cat__imputer__strategy,param_preprocessor__num__imputer__strategy,mean_test_score
7,1.0,constant,median,0.787852
6,1.0,constant,mean,0.787852
5,1.0,most_frequent,median,0.787852
4,1.0,most_frequent,mean,0.787852
11,10.0,constant,median,0.787852
10,10.0,constant,mean,0.787852
9,10.0,most_frequent,median,0.787852
8,10.0,most_frequent,mean,0.787852
12,100.0,most_frequent,mean,0.787852
13,100.0,most_frequent,median,0.787852
