## Data loading

In [187]:
import numpy as np
import pandas as pd
import seaborn as sns


In [188]:
from sklearn.model_selection import train_test_split

dataset_path = './data/train.csv'

# Read in data
X_full = pd.read_csv("./data/train.csv", index_col='PassengerId')
X_test_full = pd.read_csv("./data/test.csv", index_col='PassengerId')

# Extract target
y = X_full.Survived
X_full.drop(columns=['Survived'], inplace=True)


In [189]:
X_full.head()


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [190]:
# Split into training and testing data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y,
                                                                train_size=0.8, test_size=0.2, random_state=0
                                                                )

# Find the catagorical and numerical column types
catagorical_columns = [
    col for col in X_train_full.columns if X_train_full[col].nunique() < 10
    and X_train_full[col].dtype == 'object']
numerical_columns = [
    col for col in X_train_full.columns if X_train_full[col].dtype in ['int64', 'float64']]

columns_used = catagorical_columns + numerical_columns + ['Name']

X_train = X_train_full[columns_used]
X_valid = X_valid_full[columns_used]
X_test = X_test_full[columns_used]


In [191]:
X_train.head()


Unnamed: 0_level_0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
141,female,C,3,,0,2,15.2458,"Boulos, Mrs. Joseph (Sultana)"
440,male,S,2,31.0,0,0,10.5,"Kvillner, Mr. Johan Henrik Johannesson"
818,male,C,2,31.0,1,1,37.0042,"Mallet, Mr. Albert"
379,male,C,3,20.0,0,0,4.0125,"Betros, Mr. Tannous"
492,male,S,3,21.0,0,0,7.25,"Windelov, Mr. Einar"


In [192]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import re


class GetTitle(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Gets all the titles, indicated by a .
        return self

    def get_title(self, name):
        title_search = re.search(' ([A-Za-z]+)\.', name)
        return title_search.group(1) if title_search else ''

    def transform(self, X, y=None):
        titles = X['Name'].map(self.get_title)
        titles.replace(['Lady', 'Countess', 'Capt', 'Col', 'Don',
                       'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare', inplace=True)
        titles.replace('Mlle', 'Miss', inplace=True)
        titles.replace('Ms', 'Miss', inplace=True)
        titles.replace('Mme', 'Mrs', inplace=True)
        X['Name'] = titles
        return X


In [193]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'),),
])
name_transformer = Pipeline(steps=[
    ('classify', GetTitle()),
    ('ordinal', OrdinalEncoder(),),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('name', name_transformer, ['Name']),
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, catagorical_columns)
    ])


In [194]:
from xgboost import XGBRegressor

model = XGBRegressor(n_estimators=500, learning_rate=0.05,
                     early_stopping_rounds=10, n_jobs=4)

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                      ])

clf.fit(X_train, y_train, model__eval_set=[
        (preprocessor.fit_transform(X_valid), y_valid)], model__verbose=1)

preds = clf.predict(X_valid)

((preds > 0.5).astype(int) == y_valid).sum() / y_valid.size  # type: ignore


[0]	validation_0-rmse:0.48669
[1]	validation_0-rmse:0.47440
[2]	validation_0-rmse:0.46301
[3]	validation_0-rmse:0.45240
[4]	validation_0-rmse:0.44271
[5]	validation_0-rmse:0.43356
[6]	validation_0-rmse:0.42533
[7]	validation_0-rmse:0.41812
[8]	validation_0-rmse:0.41053
[9]	validation_0-rmse:0.40445
[10]	validation_0-rmse:0.39848
[11]	validation_0-rmse:0.39291
[12]	validation_0-rmse:0.38808
[13]	validation_0-rmse:0.38345
[14]	validation_0-rmse:0.37937
[15]	validation_0-rmse:0.37585
[16]	validation_0-rmse:0.37266
[17]	validation_0-rmse:0.36926
[18]	validation_0-rmse:0.36654
[19]	validation_0-rmse:0.36413
[20]	validation_0-rmse:0.36186
[21]	validation_0-rmse:0.35918
[22]	validation_0-rmse:0.35732
[23]	validation_0-rmse:0.35551
[24]	validation_0-rmse:0.35398
[25]	validation_0-rmse:0.35206
[26]	validation_0-rmse:0.35034
[27]	validation_0-rmse:0.34901
[28]	validation_0-rmse:0.34779
[29]	validation_0-rmse:0.34651
[30]	validation_0-rmse:0.34596
[31]	validation_0-rmse:0.34502
[32]	validation_0-

[46]	validation_0-rmse:0.34040
[47]	validation_0-rmse:0.33997
[48]	validation_0-rmse:0.34044
[49]	validation_0-rmse:0.34054
[50]	validation_0-rmse:0.34089
[51]	validation_0-rmse:0.34092
[52]	validation_0-rmse:0.34069
[53]	validation_0-rmse:0.34026
[54]	validation_0-rmse:0.34029
[55]	validation_0-rmse:0.34004
[56]	validation_0-rmse:0.33976
[57]	validation_0-rmse:0.33976
[58]	validation_0-rmse:0.33967
[59]	validation_0-rmse:0.33973
[60]	validation_0-rmse:0.33997
[61]	validation_0-rmse:0.34004
[62]	validation_0-rmse:0.33988
[63]	validation_0-rmse:0.34026
[64]	validation_0-rmse:0.34038
[65]	validation_0-rmse:0.34032
[66]	validation_0-rmse:0.34053
[67]	validation_0-rmse:0.34023


0.8603351955307262

In [195]:
final_model = XGBRegressor(n_estimators=67, learning_rate=0.1, n_jobs=4)

final_pipline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', final_model)
                                ])
final_pipline.fit(X_full, y)
predictions = (final_pipline.predict(X_test) > 0.5).astype(int).flatten()


In [196]:
output = pd.DataFrame(
    {'PassengerId': X_test.index, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
