In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Transformers 
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Location of the Titanic passenger CSV (titanic3.csv) on GitHub.
# The path contains `091d371`, which looks like a pinned commit hash, so the
# file contents should not drift between runs -- TODO confirm.
TITANIC_URL = (
    'https://raw.githubusercontent.com/amueller/'
    'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv'
)

# Download and parse the CSV into a single DataFrame (needs network access).
df = pd.read_csv(filepath_or_buffer=TITANIC_URL)

In [None]:
# Randomly assign roughly 80% of the rows to the training set and the
# remainder to the test set.
#
# The draw comes from a dedicated, seeded generator instead of the global
# unseeded `np.random` state, so a fresh "Restart Kernel -> Run All" always
# produces the same split and therefore the same model score.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# One uniform draw in [0, 1) per row; True marks a training row.
mask = split_rng.random(len(df)) < 0.8
raw_train = df[mask]
raw_test = df[~mask]

In [None]:
# Column groups: categorical and numerical features are preprocessed
# differently in the following cells, so they are selected separately here.
cat_features = ['sex', 'embarked', 'cabin', 'boat', 'ticket', 'home.dest']
num_features = ['pclass', 'age', 'sibsp', 'fare', 'body']
# NOTE(review): 'boat' and 'body' presumably describe the passenger's outcome
# (lifeboat / recovered body) and may leak the target -- confirm before
# trusting the reported score.

# Categorical view of each split.
raw_train_cat, raw_test_cat = (
    frame[cat_features] for frame in (raw_train, raw_test)
)

# Numerical view of each split.
raw_train_num, raw_test_num = (
    frame[num_features] for frame in (raw_train, raw_test)
)

In [None]:
# Impute missing values. Both imputers are fitted on the training split only
# and then applied unchanged to the test split.

# Categorical columns: replace missing entries with the constant 'missing'.
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
cat_imputer = cat_imputer.fit(raw_train_cat)

filled_train_cat = cat_imputer.transform(raw_train_cat)
filled_test_cat = cat_imputer.transform(raw_test_cat)

# Numerical columns: replace missing entries using the 'median' strategy.
num_imputer = SimpleImputer(strategy='median')
num_imputer = num_imputer.fit(raw_train_num)

filled_train_num = num_imputer.transform(raw_train_num)
filled_test_num = num_imputer.transform(raw_test_num)

In [None]:
# One-hot encode the (already imputed) categorical columns.
#
# `handle_unknown='ignore'` lets `transform` accept categories that were not
# present in the training split instead of raising.
#
# A dense array is requested so the result can be stacked with the scaled
# numerical features via NumPy. The keyword for this was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was later
# removed, so try the current spelling first and fall back for old versions.
try:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only knows the `sparse` keyword
    onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Learn the category vocabulary from the training split only, then reuse it
# for the test split.
encoded_train_cat = onehot.fit_transform(filled_train_cat)
encoded_test_cat = onehot.transform(filled_test_cat)

In [None]:
# Standardize the imputed numerical columns. The scaler is fitted on the
# training split only and then applied to both splits.
scaler = StandardScaler().fit(filled_train_num)

scaled_train_num = scaler.transform(filled_train_num)
scaled_test_num = scaler.transform(filled_test_num)

In [None]:
# Assemble the design matrices: encoded categorical columns first, scaled
# numerical columns after them, joined column-wise.
X_train = np.concatenate([encoded_train_cat, scaled_train_num], axis=1)
X_test = np.concatenate([encoded_test_cat, scaled_test_num], axis=1)

# Target column, taken from the same row splits as the features.
y_train, y_test = raw_train['survived'], raw_test['survived']

In [None]:
# Fit a plain logistic regression on the training matrix and report its
# score on the held-out test matrix.
clf = LogisticRegression(solver='liblinear')
clf = clf.fit(X_train, y_train)

test_score = clf.score(X_test, y_test)
print(f"Model score: {test_score:.3f}")

What if I need to add a new step, e.g. feature engineering?