In [16]:
import numpy as np
import pandas as pd

In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [18]:
df = pd.read_csv('../train.csv')
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'])
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [19]:
df.isnull().mean() *100

Survived     0.000000
Pclass       0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Fare         0.000000
Embarked     0.224467
dtype: float64

In [20]:
X = df.drop(columns=['Survived'])
y = df['Survived']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [22]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
30,1,male,40.0,0,0,27.7208,C
10,3,female,4.0,1,1,16.7000,S
873,3,male,47.0,0,0,9.0000,S
182,3,male,9.0,4,2,31.3875,S
876,3,male,20.0,0,0,9.8458,S
...,...,...,...,...,...,...,...
534,3,female,30.0,0,0,8.6625,S
584,3,male,,0,0,8.7125,C
493,1,male,71.0,0,0,49.5042,C
527,1,male,,0,0,221.7792,S


# Make a pipeline

In [23]:
numerical_features = ['Age','Fare']
numerical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

categorical_features = ['Embarked','Sex']
categorical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most-frequent')),
    ('ohe',OneHotEncoder(handle_unknown='ignore')),
])

# Using transformer

In [24]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',numerical_transformer,numerical_features),
        ('cat',categorical_transformer,categorical_features),
    ]
)

In [25]:
clf = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classification',LogisticRegression())
])

In [26]:
from sklearn import set_config

set_config(display='diagram')
clf

# Automic selection of missing value fill (all possible way of strategy)

In [27]:
param_grid = {
    'preprocessor__num__imputer__strategy':['mean','median'],
    'preprocessor__cat__imputer__strategy':['most-frequent','constant'],
    'classification__C':[0.1,1.0,10,100]
}
grid_search = GridSearchCV(clf,param_grid,cv=10)

In [28]:
grid_search.fit(X_train,y_train)

grid_search.best_params_

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP TECHNOLOGY\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP TECHNOLOGY\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP TECHNOLOGY\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\U

{'classification__C': 1.0,
 'preprocessor__cat__imputer__strategy': 'constant',
 'preprocessor__num__imputer__strategy': 'mean'}

In [32]:
cv_results =  pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values("mean_test_score",ascending=False)
cv_results[['param_classification__C','param_preprocessor__cat__imputer__strategy','param_preprocessor__num__imputer__strategy','mean_test_score']]

Unnamed: 0,param_classification__C,param_preprocessor__cat__imputer__strategy,param_preprocessor__num__imputer__strategy,mean_test_score
6,1.0,constant,mean,0.787852
7,1.0,constant,median,0.787852
10,10.0,constant,mean,0.787852
11,10.0,constant,median,0.787852
14,100.0,constant,mean,0.787852
15,100.0,constant,median,0.787852
2,0.1,constant,mean,0.786444
3,0.1,constant,median,0.786444
0,0.1,most-frequent,mean,
1,0.1,most-frequent,median,
