# Kaggle Titanic competition
## Data set preprocessing
### Imports
#### General libraries

In [218]:
import pandas as pd
import numpy as np
import re

#### Scikit learn

In [219]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder, LabelEncoder, StandardScaler, KBinsDiscretizer
)
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

#### XGBoost

In [220]:
from xgboost import XGBClassifier

### Load training and test data set

In [221]:
df_training = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

num_training_samples = df_training.shape[0]
num_test_samples = df_test.shape[0]

# concat the samples of both sets without
df_dataset = pd.concat([df_training.drop('Survived', axis=1), df_test])

### Feature engineering

#### Sex

In [222]:
df_dataset = pd.concat([df_dataset, pd.get_dummies(df_dataset.Sex, prefix='Sex')], axis=1)

#### Class

In [223]:
df_dataset = pd.concat([df_dataset, pd.get_dummies(df_dataset.Pclass, prefix='Class')], axis=1)

#### Has cabin

In [224]:
df_dataset['HasCabin'] = ~df_dataset['Cabin'].isna()
# df_dataset[['Cabin', 'HasCabin']][:10]

#### Family size

In [225]:
df_dataset['FamilySize'] = df_dataset['Parch'] + df_dataset['SibSp']
# df_dataset[['FamilySize', 'Parch', 'SibSp']][:10]

#### Title
Investigate which titles exist and group them.

In [226]:
df_dataset['Title'] = df_dataset['Name'].map(lambda x: re.compile(',\s(\w*).?\s').findall(x)[0])

# Group low-occuring, related titles together
df_dataset.loc[df_dataset.Title == 'Jonkheer', 'Title'] = 'Master'
df_dataset.loc[df_dataset.Title.isin(['Ms','Mlle']), 'Title'] = 'Miss'
df_dataset.loc[df_dataset.Title == 'Mme', 'Title'] = 'Mrs'
df_dataset.loc[df_dataset.Title.isin(['Capt', 'Don', 'Major', 'Col']), 'Title'] = 'Sir'
df_dataset.loc[df_dataset.Title.isin(['Dona', 'the']), 'Title'] = 'Lady'

In [227]:
df_dataset = pd.concat([df_dataset, pd.get_dummies(df_dataset.Title, prefix='Title')], axis=1)

#### Same ticket number
Might have travelled together with somebody else.

In [228]:
df_dataset['Alone'] = df_dataset['FamilySize'] == 0 & ~df_dataset.duplicated('Ticket')

#### Has multiple cabins

In [229]:
df_dataset['HasMultipleCabins'] = df_dataset['Cabin'].map(lambda x: len(str(x).split(' ')) > 1)

#### Age
Use mean of age values for missing values.

In [230]:
age_mean = df_dataset['Age'].mean()
print('Mean', age_mean)
df_dataset['Age'].fillna(age_mean, inplace=True)
df_dataset['AgeGroup'] = pd.cut(df_dataset['Age'], 4)
df_dataset[['Age', 'AgeGroup']][:10]

Mean 29.881137667304014


Unnamed: 0,Age,AgeGroup
0,22.0,"(20.128, 40.085]"
1,38.0,"(20.128, 40.085]"
2,26.0,"(20.128, 40.085]"
3,35.0,"(20.128, 40.085]"
4,35.0,"(20.128, 40.085]"
5,29.881138,"(20.128, 40.085]"
6,54.0,"(40.085, 60.043]"
7,2.0,"(0.0902, 20.128]"
8,27.0,"(20.128, 40.085]"
9,14.0,"(0.0902, 20.128]"


In [231]:
df_dataset = pd.concat([df_dataset, pd.get_dummies(df_dataset.AgeGroup, prefix='AgeGroup')], axis=1)

#### Fare price

In [232]:
fare_price_mean = df_dataset['Fare'].mean()
print('Mean', fare_price_mean)

df_dataset['Fare'].fillna(fare_price_mean, inplace=True)
# Treat people who haven't paid for the ticket the same way as people with missing values
df_dataset.loc[df_dataset.Fare <= 0, 'Fare'] = fare_price_mean

df_dataset['FarePriceGroup'] = pd.cut(df_dataset['Fare'], 6)
df_dataset[['Fare', 'FarePriceGroup']][:10]

Mean 33.29547928134557


Unnamed: 0,Fare,FarePriceGroup
0,7.25,"(2.662, 88.031]"
1,71.2833,"(2.662, 88.031]"
2,7.925,"(2.662, 88.031]"
3,53.1,"(2.662, 88.031]"
4,8.05,"(2.662, 88.031]"
5,8.4583,"(2.662, 88.031]"
6,51.8625,"(2.662, 88.031]"
7,21.075,"(2.662, 88.031]"
8,11.1333,"(2.662, 88.031]"
9,30.0708,"(2.662, 88.031]"


In [233]:
df_dataset = pd.concat([df_dataset, pd.get_dummies(df_dataset.FarePriceGroup, prefix='FarePriceGroup')], axis=1)
df_dataset[:10]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,"AgeGroup_(20.128, 40.085]","AgeGroup_(40.085, 60.043]","AgeGroup_(60.043, 80.0]",FarePriceGroup,"FarePriceGroup_(2.662, 88.031]","FarePriceGroup_(88.031, 172.89]","FarePriceGroup_(172.89, 257.75]","FarePriceGroup_(257.75, 342.61]","FarePriceGroup_(342.61, 427.469]","FarePriceGroup_(427.469, 512.329]"
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,...,1,0,0,"(2.662, 88.031]",1,0,0,0,0,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,1,0,0,"(2.662, 88.031]",1,0,0,0,0,0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,...,1,0,0,"(2.662, 88.031]",1,0,0,0,0,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,...,1,0,0,"(2.662, 88.031]",1,0,0,0,0,0
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,...,1,0,0,"(2.662, 88.031]",1,0,0,0,0,0
5,6,3,"Moran, Mr. James",male,29.881138,0,0,330877,8.4583,,...,1,0,0,"(2.662, 88.031]",1,0,0,0,0,0
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,...,0,1,0,"(2.662, 88.031]",1,0,0,0,0,0
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,...,0,0,0,"(2.662, 88.031]",1,0,0,0,0,0
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,...,1,0,0,"(2.662, 88.031]",1,0,0,0,0,0
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,...,0,0,0,"(2.662, 88.031]",1,0,0,0,0,0


#### Delete unwanted columns

In [234]:
df_dataset.drop([
    'Parch',
    'Cabin',
    'Ticket',
    'Fare',
    'Name',
    'SibSp',
    'Embarked',
    'Age',
    'Sex', 
    'Title', 
    'Pclass', 
    'FarePriceGroup', 
    'AgeGroup'
], axis=1, inplace=True)

### Divide into training and test samples again

In [279]:
df_training_preprocessed = pd.concat([df_dataset[:num_training_samples], df_training['Survived']], axis=1)
df_test_preprocessed = pd.concat([df_dataset[num_training_samples:num_training_samples + num_test_samples]], axis=1)

assert len(df_training_preprocessed) == len(df_training)
assert len(df_test_preprocessed) == len(df_test)

In [280]:
df_training_preprocessed[:10]

Unnamed: 0,PassengerId,Sex_female,Sex_male,Class_1,Class_2,Class_3,HasCabin,FamilySize,Title_Dr,Title_Lady,...,"AgeGroup_(20.128, 40.085]","AgeGroup_(40.085, 60.043]","AgeGroup_(60.043, 80.0]","FarePriceGroup_(2.662, 88.031]","FarePriceGroup_(88.031, 172.89]","FarePriceGroup_(172.89, 257.75]","FarePriceGroup_(257.75, 342.61]","FarePriceGroup_(342.61, 427.469]","FarePriceGroup_(427.469, 512.329]",Survived
0,1,0,1,0,0,1,False,1,0,0,...,1,0,0,1,0,0,0,0,0,0
1,2,1,0,1,0,0,True,1,0,0,...,1,0,0,1,0,0,0,0,0,1
2,3,1,0,0,0,1,False,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,4,1,0,1,0,0,True,1,0,0,...,1,0,0,1,0,0,0,0,0,1
4,5,0,1,0,0,1,False,0,0,0,...,1,0,0,1,0,0,0,0,0,0
5,6,0,1,0,0,1,False,0,0,0,...,1,0,0,1,0,0,0,0,0,0
6,7,0,1,1,0,0,True,0,0,0,...,0,1,0,1,0,0,0,0,0,0
7,8,0,1,0,0,1,False,4,0,0,...,0,0,0,1,0,0,0,0,0,0
8,9,1,0,0,0,1,False,2,0,0,...,1,0,0,1,0,0,0,0,0,1
9,10,1,0,0,1,0,False,1,0,0,...,0,0,0,1,0,0,0,0,0,1


In [282]:
df_test_preprocessed

Unnamed: 0,PassengerId,Sex_female,Sex_male,Class_1,Class_2,Class_3,HasCabin,FamilySize,Title_Dr,Title_Lady,...,"AgeGroup_(0.0902, 20.128]","AgeGroup_(20.128, 40.085]","AgeGroup_(40.085, 60.043]","AgeGroup_(60.043, 80.0]","FarePriceGroup_(2.662, 88.031]","FarePriceGroup_(88.031, 172.89]","FarePriceGroup_(172.89, 257.75]","FarePriceGroup_(257.75, 342.61]","FarePriceGroup_(342.61, 427.469]","FarePriceGroup_(427.469, 512.329]"
0,892,0,1,0,0,1,False,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,893,1,0,0,0,1,False,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,894,0,1,0,1,0,False,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,895,0,1,0,0,1,False,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,896,1,0,0,0,1,False,2,0,0,...,0,1,0,0,1,0,0,0,0,0
5,897,0,1,0,0,1,False,0,0,0,...,1,0,0,0,1,0,0,0,0,0
6,898,1,0,0,0,1,False,0,0,0,...,0,1,0,0,1,0,0,0,0,0
7,899,0,1,0,1,0,False,2,0,0,...,0,1,0,0,1,0,0,0,0,0
8,900,1,0,0,0,1,False,0,0,0,...,1,0,0,0,1,0,0,0,0,0
9,901,0,1,0,0,1,False,2,0,0,...,0,1,0,0,1,0,0,0,0,0


In [284]:
X_train = df_training_preprocessed.drop(['PassengerId', 'Survived'], axis=1)
Y_train = df_training_preprocessed.Survived.values
X_test = df_test_preprocessed.drop(['PassengerId'], axis=1)

# Fix column naming by removing all special characters
X_train.rename(lambda x: ''.join(e for e in x if e.isalnum()), axis='columns', inplace=True)
X_test.rename(lambda x: ''.join(e for e in x if e.isalnum()), axis='columns', inplace=True)

In [294]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier


# A parameter grid for XGBoost
params = {
    'learning_rate': [0.01, 0.015, 0.02, 0.03],
    'min_child_weight': [1, 2, 3, 4, 5],
    'gamma': [0.0, 0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 600]
}

xgb = XGBClassifier(objective='binary:logistic')

folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True)

grid = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=8,
    cv=skf.split(X_train,Y_train),
    verbose=3
)
grid.fit(X_train, Y_train)
# print('\n All results:')
# print(grid.cv_results_)
print('\n Best estimator:')
print(grid.best_estimator_)
print('\n Best score:')
print(grid.best_score_ * 2 - 1)
print('\n Best parameters:')
print(grid.best_params_)
results = pd.DataFrame(grid.cv_results_)
results.to_csv('xgb-grid-search-results-04.csv', index=False)

Fitting 5 folds for each of 9720 candidates, totalling 48600 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 140 tasks      | elapsed:    9.7s
[Parallel(n_jobs=8)]: Done 300 tasks      | elapsed:   20.2s
[Parallel(n_jobs=8)]: Done 524 tasks      | elapsed:   37.3s
[Parallel(n_jobs=8)]: Done 812 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done 1164 tasks      | elapsed:  1.8min
[Parallel(n_jobs=8)]: Done 1580 tasks      | elapsed:  2.6min
[Parallel(n_jobs=8)]: Done 2060 tasks      | elapsed:  3.4min
[Parallel(n_jobs=8)]: Done 2604 tasks      | elapsed:  4.2min
[Parallel(n_jobs=8)]: Done 3212 tasks      | elapsed:  5.1min
[Parallel(n_jobs=8)]: Done 3884 tasks      | elapsed:  6.3min
[Parallel(n_jobs=8)]: Done 4620 tasks      | elapsed:  7.7min
[Parallel(n_jobs=8)]: Done 5420 tasks      | elapsed:  9.3min
[Parallel(n_jobs=8)]: Done 6284 tasks      | elapsed: 10.9min
[Parallel(n_jobs=8)]: Done 7212 tasks      | elapsed: 12.7min



 Best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=1, learning_rate=0.015,
       max_delta_step=0, max_depth=4, min_child_weight=2, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1.0)

 Best score:
0.7480012697553315

 Best parameters:
{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.015, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'subsample': 1.0}


In [295]:
model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=1, learning_rate=0.015,
       max_delta_step=0, max_depth=4, min_child_weight=2, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1.0)

model.fit(X_train, Y_train)

y_test = model.predict(X_test)
print(len(X_test))
print(len(y_test))
results_df = pd.DataFrame(data={'PassengerId':df_test_preprocessed['PassengerId'], 'Survived':y_test})
results_df

418
418


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [296]:
results_df.to_csv('titanic-submission-xgb-04.csv', index=False)