# Titanic

#### References

- [Scikit-learn Pipelines with Titanic](https://jaketae.github.io/study/sklearn-pipeline/)

In [712]:
!pip install feature_engine



In [713]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml

In [714]:
from feature_engine.selection import DropFeatures
from sklearn.impute import KNNImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.pipeline import Pipeline
from feature_engine.encoding import OrdinalEncoder
from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

# load data

In [715]:
# load dataset
dataset = fetch_openml("titanic", version=1, as_frame=True, return_X_y=False)
# collect information
data = dataset.frame
col_x = dataset.feature_names
col_y = dataset.target_names

## data engineering

In [716]:
## DATA FORMAT

data['survived'] = data['survived'].apply(lambda x: True if x == '1' else False)
data['survived'] = data['survived'].astype(bool)
data['pclass'] = data['pclass'].astype(int)
data['pclass'] = data['pclass'].astype(str)
data['sex'] = data['sex'].astype(str)
data['embarked'].fillna('C', inplace=True)



## NEW FEATURES CREATION

# family size
data['family_size'] = data['parch'] + data['sibsp']
data['family_size'] = data['family_size'].astype(int)
# is alone according to family size
data['is_alone'] = data['family_size'].apply(lambda x: "0" if x > 1 else "1")
# title
data['title'] =  data['name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
d_title_converter = {
    'Miss':'Mrs',
    'Mr':'Mr',
    'Mrs':'Mrs'
}
data['title'] = data['title'].apply(lambda x: d_title_converter[x] if x in list(d_title_converter.keys()) else 'rare')
# new feature columns list
col_x = [c for c in data.columns if not c in col_y]

In [717]:
# set pipeline with common transformations
pl_common = Pipeline(steps=[
    ('drop_features', DropFeatures(features_to_drop=['boat', 'body', 'home.dest', 'cabin', 'ticket', 'parch', 'sibsp', 'name'])),
    ('imputer_numerical', SklearnTransformerWrapper(transformer = KNNImputer(n_neighbors=5), variables = ['age', 'fare'])),
    #('scaler', SklearnTransformerWrapper(transformer = RobustScaler(), variables = ['age', 'fare', 'family_size']))
])


# set pipeline with transformations for tree based algorithms (first option)
pl_tree_1 = Pipeline(steps=[
    ('encoding_categorical', OrdinalEncoder(encoding_method='ordered', variables=['sex', 'embarked', 'title', 'is_alone']))
                           ])

# set pipeline with transformations for tree based algorithms (second option)
pl_tree_2 = Pipeline(steps=[
    ('encoding_categorical', OneHotEncoder(drop_last_binary = True, variables=['embarked', 'sex', 'pclass', 'title', 'is_alone'])),
    #('pca', PCA(n_components=5))
                           ])


# evaluation

In [718]:
# collect data
X, y = data[col_x], data[col_y].values.ravel()

In [719]:
# final pipeline
pl1 = Pipeline(steps = [('pipeline_common', pl_common), 
                       ('pipeline_tree', pl_tree_1), 
                       ('classifier', RandomForestClassifier())
                      ])

# evaluation
scores = cross_val_score(pl1, X, y, cv=10, scoring="accuracy")
# display
np.mean(scores), np.std(scores)

(0.7425190839694656, 0.06816417759726304)

In [720]:
# final pipeline
pl2 = Pipeline(steps = [('pipeline_common', pl_common), 
                       ('pipeline_tree', pl_tree_2), 
                       ('classifier', RandomForestClassifier())
                      ])

# evaluation
scores = cross_val_score(pl2, X, y, cv=10, scoring="accuracy")
# display
np.mean(scores), np.std(scores)

(0.7440458015267176, 0.07416300631934006)

#### Conclusiones

- No existe mucha diferencia entre *OneHotEncoder* y *OrdinalEncoder* para algoritmos basados en arboles en el caso de que la cardinalidad no sea alta.
- En el caso de que haya alta cardinalidad en variables categoricas si que el *OneHotEncoder* afectaria negativamente a algoritmos basados en arboles de decision.
- Por tanto, **para algoritmos basados en arboles de decision lo mejor es usar el *OrdinalEncoder***.