### Abstract

This notebook is intended to do the prediction on the suivavibility of passengers on board on Titanic. 
It is inspired by [a blog post](https://zhuanlan.zhihu.com/p/28739256) from zhihu.com.

In [65]:
import xgboost as xgb

import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [26]:
!pip list | grep learn

scikit-learn (0.19.0)


In [38]:
train = pd.read_csv('./titanic_train.csv')
test = pd.read_csv('./titanic_test.csv')
sample_submission = pd.read_csv('./titanic_gender_submission.csv')

print('testing data: ', len(test))
print('training data samples:', len(train))
train.head()

testing data:  418
training data samples: 891


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
print('testing data samples:')
test.head()

testing data samples:


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [39]:
sample_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### Feature Engineering

In [21]:
def feature_engineer(titanic):
    
    # missing the missing 'age' with median 
    titanic["age"] = titanic["Age"].fillna(titanic["Age"].median())
    
    # add a new feature to indicate whether a passenger is child
    titanic["child"] = titanic["Age"].apply(lambda x: 1 if x < 15 else 0)

    # numeralize the feature 'sex'
    titanic["sex"] = titanic["Sex"].apply(lambda x: 1 if x == "male" else 0)

    # fill the missing data in feature 'Embarked'
    titanic["Embarked"] = titanic["Embarked"].fillna("S")
    
    # embark
    def getEmbark(Embarked):
        if Embarked == "S":
            return 1
        elif Embarked == "C":
            return 2
        else:
            return 3
    titanic["embark"] = titanic["Embarked"].apply(getEmbark)

    # add a new feature to count the size of family that one belongs to
    titanic["fimalysize"] = titanic["SibSp"] + titanic["Parch"] + 1

    # fill the missing data for the feature 'cabin'
    def getCabin(cabin):
        if cabin == "N":
            return 0
        else:
            return 1
    titanic["cabin"] = titanic["Cabin"].apply(getCabin)

    # categorize the name based on the gender
    def getName(name):
        if "Mr" in str(name):
            return 1
        elif "Mrs" in str(name):
            return 2
        else:
            return 0
    titanic["name"] = titanic["Name"].apply(getName)

    # fill the missing fare with the median
    titanic["fare"] = titanic["Fare"].fillna(titanic["Fare"].median())

    return titanic[titanic.columns.difference(
        ['Fare', 'Age', 'Name', 'Cabin', 'Embarked', 'Sex', 'SibSb', 'Parch', 'Ticket'])]

In [22]:
df_train = feature_engineer(train)
df_test  = feature_engineer(test)

df_train.head()

x_train = df_train[df_train.columns.difference(['Survived'])]
y_train = df_train['Survived']
print('x_train:')
print(x_train.head())

print('y_train:')
print(y_train.values[0:5])

x_train:
   PassengerId  Pclass  SibSp   age  cabin  child  embark     fare  \
0            1       3      1  22.0      1      0       1   7.2500   
1            2       1      1  38.0      1      0       2  71.2833   
2            3       3      0  26.0      1      0       1   7.9250   
3            4       1      1  35.0      1      0       1  53.1000   
4            5       3      0  35.0      1      0       1   8.0500   

   fimalysize  name  sex  
0           2     1    1  
1           2     1    0  
2           1     0    0  
3           2     1    0  
4           1     1    1  
y_train:
[0 1 1 1 0]


In [31]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,SibSp,age,cabin,child,embark,fare,fimalysize,name,sex
0,892,3,0,34.5,1,0,3,7.8292,1,1,1
1,893,3,1,47.0,1,0,1,7.0,2,1,0
2,894,2,0,62.0,1,0,3,9.6875,1,1,1
3,895,3,0,27.0,1,0,1,8.6625,1,1,1
4,896,3,1,22.0,1,0,1,12.2875,3,1,0


### XGBoost Model

In [29]:
estimator = XGBClassifier(learning_rate=0.1,
                          max_depth=2,
                          n_estimators = 50,  # number of trees to train
                          silent=False,
                          # binary classification
                          objective='binary:logistic')

estimator.fit(x_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=50, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [40]:
y_train_pred = estimator.predict(x_train)

print('train accuracy:', accuracy_score(y_train, y_train_pred))

train accuracy: 0.838383838384


In [66]:
from sklearn.metrics import accuracy_score

def evaluate(estimator, x, y):
    y_pred = estimator.predict(x)
    return accuracy_score(y, y_pred)


def generate_submission(estimator, df_test):
    y_test_pred = estimator.predict(df_test)

    submission = pd.DataFrame()
    submission['PassengerId'] = df_test['PassengerId']
    submission['Survived'] = y_test_pred

    return submission

In [59]:
submission = generate_submission(estimator, df_test)

submission.head(10)

#submission.to_csv('titanic_submission.csv', index=False)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


### Evaluation

In [56]:
# cross validation with scikit learn

param_test = {
    'n_estimators': [x for x in range(30, 52, 2)],
    'max_depth':    [x for x in range(2, 8, 1)]
}

grid_search = GridSearchCV(estimator = estimator,
                           param_grid = param_test,
                           scoring='accuracy', 
                           cv=5,
                           verbose=1)

grid_search.fit(x_train, y_train)

print('Best model:', grid_search.best_params_)
print('Best accuracy:', grid_search.best_score_)

Fitting 5 folds for each of 66 candidates, totalling 330 fits
Best model: {'max_depth': 2, 'n_estimators': 30}
Best accuracy: 0.810325476992


[Parallel(n_jobs=1)]: Done 330 out of 330 | elapsed:   38.0s finished


In [68]:
submission_2 = generate_submission(grid_search.best_estimator_, df_test)

print('train accuracy:', evaluate(grid_search.best_estimator_, x_train, y_train))

train accuracy: 0.828282828283


In [62]:
np.where(submission != submission_2)

(array([ 33,  36,  37,  87, 138, 161, 165, 169, 194, 268, 280, 359, 376, 383]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))

In [64]:
submission_2.to_csv('titanic_submission_230.csv', index=False)