## Import packages and read the data



Import the packages needed for the analysis.



In [None]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
plt.style.use('ggplot')

from typing import Tuple, List
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.pipeline import FeatureUnion, make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.base import TransformerMixin

import xgboost as xgb

In [None]:
# Load the Kaggle Titanic training and test splits.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

## Do data exploration



Using the processes presented by Hadley:

[https://r4ds.had.co.nz/exploratory-data-analysis.html](https://r4ds.had.co.nz/exploratory-data-analysis.html)

Two types of questions

-   What type of variation occurs within my variables?
-   What type of covariation occurs between my variables?

Let's visualize the categorical variables!



In [None]:
# Summary statistics for all columns of the training frame (include='all'
# covers the non-numeric columns as well as the numeric ones).
train.describe(include='all')

We see, looking at describe and the data description on kaggle that,

-   Survived (which is our dependent variable)
-   Pclass,
-   Sex,
-   Cabin,
-   Embarked

are our categorical variables.

Let's visualize the proportion of survived to not survived



In [None]:
# Count passengers per outcome. The labels are attached by value
# (0 -> "No", 1 -> "Yes") rather than by position, so they stay correct
# whatever order value_counts() returns the outcomes in.
s = train.Survived.value_counts().rename({0: "No", 1: "Yes"})
s

In [None]:
# Bar chart of the survival counts, drawn on an explicit Axes so the figure
# carries its own title and axis labels.
fig, ax = plt.subplots()
ax.bar(
    x=s.index,
    height=s,
    color=['darkred', 'darkblue']
)
# Trailing semicolon keeps the return value out of the cell output.
ax.set(
    title="Passengers by survival outcome",
    xlabel="Survived",
    ylabel="Number of passengers"
);

Find proportions of the survived against the non-survived



In [None]:
# Share of each outcome among all counted passengers
s / s.sum()

We can visualize how many survived amongst a subset of our categorical variables.



In [None]:
# Count plots of each categorical variable, one row of panels per outcome
# (top row: Survived == 0, bottom row: Survived == 1).
cat_variables = [
    'Pclass',
    'Sex',
    'Embarked'
]
fig, ax = plt.subplots(2, len(cat_variables), figsize=(10, 10))
for row in (0, 1):
    # Filter once per outcome instead of once per panel.
    d = train.loc[train.Survived == row]
    for col, cat_var in enumerate(cat_variables):
        sns.countplot(
            x=cat_var,
            data=d,
            # Use the same category order in both rows so the panels of a
            # column can be compared bar by bar.
            order=sorted(train[cat_var].dropna().unique()),
            ax=ax[row, col]
        )
        ax[row, col].set_title("Survived == " + str(row))

From these plots we recognize that Pclass and Embarked have the potential to be strong predictors.

For other categorical variables we have to do some data wrangling. One example is Cabin, which can list multiple cabins for a single passenger. We are also able to extract the deck from the cabin numbers, which might give us an indication of how good a predictor it is.

Intuitively, the deck should be a rather strong predictor, since the lower the deck, the more the passenger had to climb to get to the top deck.



In [None]:
# Split the space-separated Cabin string into one column per listed cabin,
# then break every cabin code into its deck letters and room number.
cabins = train["Cabin"].str.split(" ", expand=True).fillna(np.nan)


def f(col):
    """Return a two-column frame (letters, digits) extracted from cabin codes."""
    return col.str.extract(r'([a-zA-Z]+)(\d+)')


per_cabin_parts = [f(cabins[col_label]) for col_label in cabins.columns]
cabins_split = pd.concat(per_cabin_parts, axis=1)
ls = [
    "Deck_0", "Room_0",
    "Deck_1", "Room_1",
    "Deck_2", "Room_2",
    "Deck_3", "Room_3"
]
cabins_split.columns = ls
cabins_split

For the missing values, we are imputing with "Missing", which will be handled by the one-hot encoding



In [None]:
# Replace every missing deck/room value with the explicit label "Missing"
cabins_split = cabins_split.fillna("Missing")

Merge the datasets



In [None]:
# Swap the raw Cabin column for the Deck_*/Room_* columns derived from it
train_without_cabin = train.drop(columns=["Cabin"])
t = pd.concat([train_without_cabin, cabins_split], axis=1)

In [None]:
# Preview the merged frame (training columns plus the Deck_*/Room_* columns)
t.head()

In [None]:
# Sort rows by the first cabin's deck, then count passengers per deck with
# one panel per value of Survived. Rows without a first-cabin deck are left out.
t_sorted = t.sort_values(by="Deck_0")
deck_0_known = t_sorted["Deck_0"] != "Missing"
sns.catplot(
    data=t_sorted.loc[deck_0_known],
    kind="count",
    x="Deck_0",
    col="Survived",
)

In [None]:
# Same counts as grouped bars: one bar per Survived value within each deck
deck_0_rows = t_sorted[t_sorted["Deck_0"] != "Missing"]
sns.countplot(data=deck_0_rows, x="Deck_0", hue="Survived")

The second plot tells us a bit more about the deck variable. Maybe our model will be able to pick up on this information,
  especially for those that were on the B, D, E, and F decks, since these show large discrepancies between those who survived and those who didn't.

Just to check, let's also visualize the deck of the second listed cabin (`Deck_1`).



In [None]:
# Grouped survival counts by the deck of the second listed cabin
deck_1_rows = t_sorted[t_sorted["Deck_1"] != "Missing"]
sns.countplot(data=deck_1_rows, x="Deck_1", hue="Survived")

I suppose that multiple cabins for one person suggest that they travelled as part of a group. That is already
  captured in other features.



In [None]:
# One panel per cabin slot (Deck_0 .. Deck_3), "Missing" values included
fig, ax = plt.subplots(ncols=4, figsize=(13, 5))
for i, panel in enumerate(ax):
    sns.countplot(data=t_sorted, x=f"Deck_{i}", hue="Survived", ax=panel)

We can also visualize our numeric variables:

-   Age,
-   SibSp
-   Parch



In [None]:
# Histograms of the numeric variables, split by survival outcome
con_variables = ["Age", "SibSp", "Parch"]
f, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 10))
for k, var in enumerate(con_variables):
    panel = axs[k]
    sns.histplot(data=train, x=var, hue="Survived", ax=panel)

## Do data transformation and drop variables, e.g. transform categorical variables to dummy variables.



Use a data preparation function, to do all prepping on both the training and test sample



In [None]:
def prepare_sample(
    df: pd.DataFrame,
    y_label: str='Survived',
    n_cabins: int=4
) -> Tuple[pd.DataFrame, pd.Series]:
    """Build the feature frame (and optionally the target) from a raw Titanic frame.

    The space-separated ``Cabin`` string is split into its individual cabin
    codes, and each code is broken into deck letters and room digits. The
    result always contains exactly ``n_cabins`` pairs of columns
    (``Deck_0``/``Room_0`` .. ``Deck_{n_cabins-1}``/``Room_{n_cabins-1}``),
    however many cabins the passengers in ``df`` actually list, so frames
    prepared separately share the same columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least ``PassengerId``, ``Ticket``, ``Name`` and
        ``Cabin`` (plus ``y_label`` when it is not None).
    y_label : str or None
        Name of the target column. Pass None when ``df`` has no target;
        ``y`` is then returned as None.
    n_cabins : int
        Number of cabin slots to expose. Cabins listed beyond this count are
        ignored; slots a passenger does not use are left missing.

    Returns
    -------
    (X, y)
        ``X`` is ``df`` without ``PassengerId``, ``Ticket``, ``Name``,
        ``Cabin`` and ``y_label``, followed by the Deck/Room columns.
        ``y`` is ``df[y_label]``, or None when ``y_label`` is None.

    Raises
    ------
    KeyError
        If one of the columns to be dropped is absent from ``df``.
    """
    drop_labels = ['PassengerId', 'Ticket', 'Name', 'Cabin']
    if y_label is not None:
        drop_labels.append(y_label)

    # One list of cabin codes per passenger; a missing Cabin becomes an empty list.
    cabin_tokens = df['Cabin'].map(
        lambda cabin: cabin.split(" ") if isinstance(cabin, str) else []
    )

    parts = [df.drop(labels=drop_labels, axis=1)]
    for i in range(n_cabins):
        # Cast to object so the .str accessor also works when no passenger
        # has an i-th cabin (the column would otherwise be all-NaN floats).
        token = cabin_tokens.map(
            lambda tokens, i=i: tokens[i] if i < len(tokens) else np.nan
        ).astype(object)
        # NOTE: a code without digits (deck letter only) does not match the
        # pattern, so both its deck and its room come out missing.
        deck_room = token.str.extract(r'([a-zA-Z]+)(\d+)')
        deck_room.columns = [f"Deck_{i}", f"Room_{i}"]
        parts.append(deck_room)

    X = pd.concat(parts, axis=1)
    y = None if y_label is None else df[y_label]

    return X, y

# Training features and target
X, y = prepare_sample(train, y_label='Survived')

## Run modelling



Conduct modelling by running a grid search with cross-validation over multiple hyperparameters



In [None]:
# Candidate classifiers.
# NOTE(review): `clfs` is not referenced again in this cell; the pipeline
# below uses the XGBoost classifier directly.
clfs = {
    'rf': RandomForestClassifier(random_state=0),
}

# Column groups handled by each preprocessing branch
numeric_features=[
    "Age",
    "SibSp",
    "Parch"
]
ls = [
    "Deck_0", "Room_0",
    "Deck_1", "Room_1",
    "Deck_2", "Room_2",
    "Deck_3", "Room_3"
]
categorical_features = ['Pclass', 'Sex', 'Embarked'] + ls

# Numeric branch: fill gaps with the column mean, then standardise
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]
)
# Categorical branch: label gaps as "Missing", then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit pass through
# at prediction time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)
# Route each column group to its branch.
# NOTE(review): no `remainder` is given, so columns of X that appear in
# neither list do not reach the classifier — confirm that is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier',  xgb.XGBClassifier(objective="binary:logistic", random_state=42))
])

# Hyperparameter grid. The values are spelled out with linspace + round
# because a float-step np.arange has a rounding-dependent endpoint and yields
# values such as 0.30000000000000004.
# NOTE(review): whether the original arange included the 0.10 learning rate
# depended on floating-point rounding; it is included explicitly here — drop
# it if that was not intended.
param_grid = {
    "classifier__colsample_bytree": np.round(np.linspace(0.3, 0.6, 4), 2),
    "classifier__gamma": np.round(np.linspace(0.0, 0.4, 5), 1),
    "classifier__learning_rate": np.round(np.linspace(0.01, 0.10, 10), 2),
}
# Exhaustive 5-fold cross-validated search over the grid, scored on accuracy
g = GridSearchCV(
    cv=5,
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy'
).fit(X,y)

In [None]:
# Cross-validated score (scoring='accuracy', cv=5) of the best parameter
# combination found by the grid search
g.best_score_

## Use Gridsearch results to predict on the test data



In [None]:
# The test split has no target column, so y_test comes back as None
X_test, y_test = prepare_sample(df=test, y_label=None)

In [None]:
# Preview the prepared test features
X_test.head()

In [None]:
# Predict survival for the test passengers with the fitted grid search.
# NOTE(review): presumably this uses the best pipeline refit on all of X
# (GridSearchCV's default refit behaviour) — confirm.
predictions = g.predict(X_test)

## Submit predictions



In [None]:
import datetime as dt
import os

# Date stamp (DDMMYYYY) used in the submission file name.
# NOTE: `t` is also read by the cell that calls the Kaggle CLI.
t = dt.datetime.today().strftime("%d%m%Y")

# Pair ids and predictions by position; `predictions` carries no index, so
# index-based alignment would only be correct for a default RangeIndex.
submission = pd.DataFrame({
    'PassengerId': test.PassengerId.to_numpy(),
    'Survived': predictions,
})

# to_csv does not create missing parent directories.
os.makedirs("./submissions", exist_ok=True)
submission.to_csv(f"./submissions/submission_{t}.csv", index=False)

In [None]:
import os
import subprocess

# Submit the file written for date stamp `t` through the Kaggle CLI.
# Arguments are passed as a list, so the message needs no shell quoting
# and the call does not depend on the platform's shell.
submit_result = subprocess.run([
    "kaggle", "competitions", "submit",
    "-c", "titanic",
    "-f", f"submissions/submission_{t}.csv",
    "-m", f"Submission {t}",
])
# Show the CLI's exit status as the cell output (0 means success)
submit_result.returncode