### Abstract

This notebook is intended to do the prediction on the suivavibility of passengers on board on Titanic. 
It is inspired by [a blog post](https://zhuanlan.zhihu.com/p/28739256) from zhihu.com.

In [70]:
import xgboost as xgb

import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

In [90]:
train = pd.read_csv('./titanic_train.csv')
test = pd.read_csv('./titanic_test.csv')
sample_submission = pd.read_csv('./titanic_gender_submission.csv')

print('testing data: ', len(test))
print('training data samples:', len(train))
train.head()

testing data:  418
training data samples: 891


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [100]:
train[train['Survived'] == 1].to_csv('titanic_survived.csv')

In [69]:
print('testing data samples:')
test.head(20)

testing data samples:


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [39]:
sample_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### Feature Engineering

In [101]:

def feature_engineer(titanic):
    
    # missing the missing 'age' with median 
    titanic["age"] = titanic["Age"].fillna(titanic["Age"].median())
    
    # add a new feature to indicate whether a passenger is child
    titanic["child"] = titanic["Age"].apply(lambda x: 1 if x < 15 else 0)

    # numeralize the feature 'sex'
    titanic["sex"] = titanic["Sex"].apply(lambda x: 1 if x == "male" else 0)

    # fill the missing data in feature 'Embarked'
    titanic["Embarked"] = titanic["Embarked"].fillna("NA")
    embark_encoder = preprocessing.LabelEncoder()
    titanic["embark"] = embark_encoder.fit_transform(titanic["Embarked"])

    # add a new feature to count the size of family that one belongs to
    titanic["fimalysize"] = titanic["SibSp"] + titanic["Parch"] + 1

    # fill the missing data for the feature 'cabin'
    def getCabin(cabin):
        if cabin == "N":
            return 0
        else:
            return 1
    titanic["cabin"] = titanic["Cabin"].apply(getCabin)

    # categorize the name based on the marital status
    def getCivil(name):
        if "Mr" in str(name):
            return 1
        elif "Mrs" in str(name):
            return 2
        elif 'Miss' in str(name):
            return 3
        elif 'Dr.' in str(name):
            return 4
        elif 'Master' in str(name):
            return 5
        else:
            return 0
    titanic["civil"] = titanic["Name"].apply(getCivil)

    # fill the missing fare with the median
    titanic["fare"] = titanic["Fare"].fillna(titanic["Fare"].median())

    return titanic[titanic.columns.difference(
        ['Fare', 'Age', 'Name', 'Cabin', 'Embarked', 'Sex', 'SibSb', 'Parch', 'Ticket'])]

In [102]:
df_train = feature_engineer(train)
df_test  = feature_engineer(test)

df_train.head()

x_train = df_train[df_train.columns.difference(['Survived'])]
y_train = df_train['Survived']
print('x_train:')
x_train.head(10)
#print('y_train:')
#print(y_train.values[0:5])

x_train:


Unnamed: 0,PassengerId,Pclass,SibSp,age,cabin,child,civil,embark,fare,fimalysize,sex
0,1,3,1,22.0,1,0,1,3,7.25,2,1
1,2,1,1,38.0,1,0,1,0,71.2833,2,0
2,3,3,0,26.0,1,0,3,3,7.925,1,0
3,4,1,1,35.0,1,0,1,3,53.1,2,0
4,5,3,0,35.0,1,0,1,3,8.05,1,1
5,6,3,0,28.0,1,0,1,2,8.4583,1,1
6,7,1,0,54.0,1,0,1,3,51.8625,1,1
7,8,3,3,2.0,1,1,5,3,21.075,5,1
8,9,3,0,27.0,1,0,1,3,11.1333,3,0
9,10,2,1,14.0,1,1,1,0,30.0708,2,0


### XGBoost Model

In [103]:
estimator = XGBClassifier(learning_rate=0.1,
                          max_depth=3,
                          n_estimators = 50,  # number of trees to train
                          silent=False,
                          # binary classification
                          objective='binary:logistic')

estimator.fit(x_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=50, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [112]:
y_train_pred = estimator.predict(x_train)

print('train accuracy:', accuracy_score(y_train, y_train_pred))

train accuracy: 0.864197530864


In [109]:
feature_factors = pd.DataFrame()
feature_factors['feature'] = x_train.columns
feature_factors['importance'] = estimator.feature_importances_

feature_factors

Unnamed: 0,feature,importance
0,PassengerId,0.174847
1,Pclass,0.128834
2,SibSp,0.030675
3,age,0.174847
4,cabin,0.0
5,child,0.0
6,civil,0.058282
7,embark,0.033742
8,fare,0.217791
9,fimalysize,0.08589


In [66]:
from sklearn.metrics import accuracy_score

def evaluate(estimator, x, y):
    y_pred = estimator.predict(x)
    return accuracy_score(y, y_pred)


def generate_submission(estimator, df_test):
    y_test_pred = estimator.predict(df_test)

    submission = pd.DataFrame()
    submission['PassengerId'] = df_test['PassengerId']
    submission['Survived'] = y_test_pred

    return submission

In [113]:
submission = generate_submission(estimator, df_test)

submission.head(10)

#submission.to_csv('titanic_submission_350.csv', index=False)

### Evaluation

In [116]:
# cross validation with scikit learn

param_test = {
    'n_estimators': [x for x in range(30, 52, 2)],
    'max_depth':    [x for x in range(2, 6, 1)]
}

grid_search = GridSearchCV(estimator = estimator,
                           param_grid = param_test,
                           scoring='accuracy', 
                           cv=10,
                           verbose=1)

grid_search.fit(x_train, y_train)

print('Best model:', grid_search.best_params_)
print('Best accuracy:', grid_search.best_score_)

Fitting 10 folds for each of 44 candidates, totalling 440 fits
Best model: {'max_depth': 4, 'n_estimators': 30}
Best accuracy: 0.83164983165


[Parallel(n_jobs=1)]: Done 440 out of 440 | elapsed:   42.7s finished


In [117]:
submission_2 = generate_submission(grid_search.best_estimator_, df_test)

print('train accuracy:', evaluate(grid_search.best_estimator_, x_train, y_train))

train accuracy: 0.870931537598


In [118]:
np.where(submission != submission_2)

(array([  4,  36,  37,  41,  72,  87,  90, 104, 138, 144, 148, 153, 158,
        159, 169, 199, 206, 225, 268, 280, 291, 323, 367, 376, 382]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]))

In [119]:
submission_2.to_csv('titanic_submission_430.csv', index=False)