In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

Read data from the Titanic dataset and have a look at it

In [None]:
# Location of the Titanic passenger CSV; adjacent string literals are
# concatenated into a single URL.
TITANIC_URL = (
    'https://raw.githubusercontent.com/amueller/'
    'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv'
)

# Download and parse the CSV into the untouched "raw" frame that later
# cells derive from.
raw_df = pd.read_csv(filepath_or_buffer=TITANIC_URL)

In [None]:
# Summarise each column's dtype and non-null count to spot missing values.
raw_df.info()

In [None]:
# Preview the first rows of the raw data.
raw_df.head()

In [None]:
# Handle missing data

# Replace every missing value, in every column, with 0. fillna returns a new
# frame by default, so raw_df itself is left unmodified.
processed_df = raw_df.fillna(value=0)

In [None]:
# Pre-process categorical features

categorical_features = ['sex', 'embarked', 'cabin', 'boat', 'ticket', 'home.dest']

# One-hot encode each categorical column separately.
# Each dummy column is prefixed with its source feature (e.g. 'embarked_C',
# 'boat_C'). Without the prefix, columns are named by the raw value only, so
# equal values in different features -- including the 0 placeholder written by
# the missing-data step -- produce duplicate column names, and the integer 0
# label mixes int and str column names in the final feature matrix.
dfs = [
    pd.get_dummies(processed_df[feature], prefix=feature)
    for feature in categorical_features
]

# Place the per-feature indicator blocks side by side (same row index).
cat_features_df = pd.concat(dfs, axis=1)

In [None]:
# Scale numerical features

numerical_features = ['pclass', 'age', 'sibsp', 'fare', 'body']

# Min-max scaling, column by column: subtract the column minimum, then divide
# by the maximum of the shifted column. The min and max are taken over all
# rows of processed_df.
num_features_df = (
    processed_df[numerical_features]
    .pipe(lambda frame: frame - frame.min())
    .pipe(lambda frame: frame / frame.max())
)

In [None]:
# Split data

# Feature matrix: one-hot categorical columns followed by the scaled
# numerical columns; target: the 'survived' column.
X = pd.concat([cat_features_df, num_features_df], axis=1)
y = processed_df['survived']

# Fix the shuffle seed so the train/test partition -- and therefore the score
# reported below -- is identical on every Restart & Run All.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE
)

In [None]:
# Train and evaluate a simple model

# fit() returns the estimator itself, so construction and training can be
# chained into a single expression.
clf = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Report the classifier's score on the held-out test split.
print(f"Model score: {clf.score(X_test, y_test):.3f}")

How would this model work in production? When should I split the data?
```
X_train, X_test, y_train, y_test = train_test_split(X, y)
```
Am I really training on just training data?