In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Transformers 
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Location of the Titanic passenger CSV (served from GitHub's raw-content host).
TITANIC_URL = (
    'https://raw.githubusercontent.com/amueller/'
    'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv'
)

# Download and parse the CSV into a DataFrame in one step.
df = pd.read_csv(filepath_or_buffer=TITANIC_URL)

In [None]:
# Split dataset: hold out 20% of the rows for evaluation.
#
# A fixed random_state makes the split -- and therefore the score reported at
# the end of the notebook -- reproducible under "Restart Kernel -> Run All".
# (An unseeded random mask would yield a different split, of a different size,
# on every run.)
SPLIT_SEED = 42
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Get features & label: 'survived' is the target, every other column is a
# candidate feature (the preprocessing step selects which ones are used).
X_train = train.drop('survived', axis=1)
X_test = test.drop('survived', axis=1)

y_train = train['survived']
y_test = test['survived']

Pipelines to the rescue:
https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

In [None]:
# Preprocessing pipeline
#
# NOTE: 'boat' (lifeboat the passenger boarded) and 'body' (identification
# number of a recovered body) are intentionally NOT used as features. Both are
# recorded after the outcome and almost directly reveal 'survived', so
# training on them is target leakage and yields a misleadingly high score.

# Categorical columns: replace missing values with the literal category
# 'missing', then one-hot encode. handle_unknown='ignore' lets transform()
# accept categories that were not seen during fit instead of raising.
cat_features = ['sex', 'embarked', 'cabin', 'ticket', 'home.dest']
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Numeric columns: fill missing values with the column median, then
# standardize.
num_features = ['pclass', 'age', 'sibsp', 'fare']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Route each column group through its own transformer; columns not listed
# above are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_features),
        ('num', num_transformer, num_features)])

In [None]:
# Prediction pipeline: chain the preprocessing step with the classifier so a
# single fit/score call runs both in order.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='liblinear')),
    ]
)

In [None]:
# Train and evaluate our pipeline
clf.fit(X_train, y_train)

# Pipeline.score delegates to the final estimator's score on the held-out set.
test_score = clf.score(X_test, y_test)
print(f"Model score: {test_score:.3f}")