In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree

import xgboost as xgb

In [3]:
# Load the Titanic dataset, using the PassengerId column as the row index.
# The index name is cleared so the frame has a plain, unnamed index.
data = pd.read_csv("data/titanic_dataset.csv", index_col='PassengerId')
data.index.name = None
print("dataset shape: " + str(data.shape))
data.head()

dataset shape: (891, 11)


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Prepare data (feature engineering)

# Integer codes for the categorical features (Sex, Embarked).
sex_codes = {'female': 1, 'male': 0}
port_codes = {'S': 0, 'C': 1, 'Q': 2, 'U': 3}  # 'U' is the placeholder used for a missing Embarked value

data = data.assign(
    # 1) Replace the string categories with their integer codes
    Sex=data['Sex'].map(sex_codes).astype(int),
    Embarked=data['Embarked'].fillna('U').map(port_codes).astype(int),
    # 2) New 0/1 feature 'HasCabin': 0 if Cabin is NaN, 1 otherwise
    HasCabin=data['Cabin'].notnull() * 1,
)

# 3) Drop unused features, then
# 4) drop every row that still contains a NaN (kept simple: no imputation)
data = data.drop(columns=['Name', 'Ticket', 'Cabin']).dropna()

# Look at the data
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HasCabin
1,0,3,0,22.0,1,0,7.25,0,0
2,1,1,1,38.0,1,0,71.2833,1,1
3,1,3,1,26.0,0,0,7.925,0,0
4,1,1,1,35.0,1,0,53.1,0,1
5,0,3,0,35.0,0,0,8.05,0,0


In [5]:
# Separate the features (X) from the labels (Y) as numpy arrays.
# Y is kept as a column vector with one row per sample.
X = data.drop(columns='Survived').to_numpy()
Y = data['Survived'].to_numpy().reshape(len(X), 1)

# Hold out 20% of the samples as the test set (fixed seed for a repeatable split)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

n_train, n_features = X_train.shape
n_test = X_test.shape[0]
print("Number of entries in the training set : {}".format(n_train))
print("Number of entries in the test set     : {}".format(n_test))
print("Number of features in the training set: {}".format(n_features))

Number of entries in the training set : 571
Number of entries in the test set     : 143
Number of features in the training set: 8


In [6]:
# Model comparison table: a single 'accuracy' row with one column per model,
# initialised to 0 and filled in as each model is evaluated.
model_names = ['Decision tree', 'Random forest', 'AdaBoost', 'XGBoost']
compModel = pd.DataFrame(0, index=['accuracy'], columns=model_names)

In [7]:
# DECISION TREE
print("DECISION TREE:")

# Build the decision tree and fit it on the training data.
# - random_state is fixed so the fitted tree (and therefore the reported
#   accuracy) is the same on every run.
# - The labels are flattened to 1-D, the shape scikit-learn classifiers expect
#   for a single-output problem (consistent with the other model cells).
dt = DecisionTreeClassifier(random_state=0).fit(X_train, Y_train.ravel())

# Predict once on the test set and reuse the result for the preview below
predictions = dt.predict(X_test)
print('Prediction exemples: ' + str(predictions[:10]))

# Mean accuracy of this model on the held-out test set
score = dt.score(X_test, Y_test.ravel())
compModel['Decision tree'] = score
print("Decision Tree accuracy: {}".format(score))

DECISION TREE:
Prediction exemples: [1 1 1 0 1 0 1 0 0 0]
Decision Tree accuracy: 0.7622377622377622


In [8]:
# RANDOM FOREST
print("RANDOM FOREST:")

# Build a random forest of 50 trees and fit it on the training data.
# - random_state is fixed: the forest draws bootstrap samples and feature
#   subsets at random, so without a seed the accuracy changes on every run.
# - The labels are flattened to 1-D, the shape scikit-learn classifiers expect.
rf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X_train, Y_train.ravel())

# Predict once on the test set and reuse the result for the preview below
predictions = rf.predict(X_test)
print('Prediction exemples: ' + str(predictions[:10]))

# Mean accuracy of this model on the held-out test set
score = rf.score(X_test, Y_test.ravel())
compModel['Random forest'] = score
print("Random Forest accuracy: {}".format(score))

RANDOM FOREST:
Prediction exemples: [1 1 1 0 1 0 0 0 0 0]
Random Forest accuracy: 0.7832167832167832


In [9]:
# AdaBoost
print("AdaBoost:")

# Build an AdaBoost ensemble of 50 weak learners and fit it on the training data.
# - random_state is fixed so repeated runs give the same model and accuracy.
# - The labels are flattened to 1-D, the shape scikit-learn classifiers expect.
adaB = AdaBoostClassifier(n_estimators=50, random_state=0).fit(X_train, Y_train.ravel())

# Predict once on the test set and reuse the result for the preview below
predictions = adaB.predict(X_test)
print('Prediction exemples: ' + str(predictions[:10]))

# Mean accuracy of this model on the held-out test set
score = adaB.score(X_test, Y_test.ravel())
compModel['AdaBoost'] = score
print("AdaBoost accuracy: {}".format(score))

AdaBoost:
Prediction exemples: [1 1 1 0 1 0 0 0 0 0]
AdaBoost accuracy: 0.8321678321678322


In [10]:
# XGBoost
print("XGBoost")
print()

# Wrap the train/test arrays in XGBoost's DMatrix containers
xgb_train = xgb.DMatrix(X_train, label = Y_train)
xgb_test = xgb.DMatrix(X_test, label = Y_test)
# Datasets evaluated after each boosting round; the last one ('valid') drives early stopping
watchlist = [(xgb_train, 'train'), (xgb_test, 'valid')]

# Prepare model (hyperparameters).
# eval_metric is pinned to the classification error so the logged metric (and
# the best_score printed below) does not depend on the library's default
# metric, which is not the same in every XGBoost release.
xgb_pars = {'min_child_weight': 5, 'eta': 0.9, 'max_depth': 15, 'gamma': 0.5,
            'booster' : 'gbtree', 'objective': 'binary:logistic',
            'eval_metric': 'error'}

# Train the XGBoost model (50 boosting rounds, progress logged every 10 rounds).
# NOTE(review): early_stopping_rounds equals the number of boosting rounds, so
# early stopping can never actually cut training short here.
# NOTE(review): the test set doubles as the validation set, so best_score is an
# optimistic figure rather than an independent estimate.
xgbModel = xgb.train(xgb_pars, xgb_train, 50, watchlist, early_stopping_rounds=50, maximize=False, verbose_eval=10)
# best_score is the lowest validation classification error seen during
# training (it is an error rate, not an RMSLE).
print('Best validation error %.5f' % xgbModel.best_score)
print()

# Make prediction: threshold the predicted probabilities at 0.5 to get 0/1 labels.
# Predict once and reuse the result for the preview below.
predictions = (xgbModel.predict(xgb_test) > 0.5) * 1
print('Prediction exemples: ' + str(predictions[:10]))

# Accuracy of this model: fraction of test labels predicted correctly
score = (Y_test.ravel() == predictions).mean()
compModel['XGBoost'] = score
print("XGBoost accuracy: {}".format(score))

XGBoost

[0]	train-error:0.182137	valid-error:0.195804
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 50 rounds.
[10]	train-error:0.103327	valid-error:0.174825
[20]	train-error:0.091068	valid-error:0.160839
[30]	train-error:0.087566	valid-error:0.160839
[40]	train-error:0.087566	valid-error:0.160839
[49]	train-error:0.087566	valid-error:0.160839
Modeling RMSLE 0.14685

Prediction exemples: [0 1 1 0 1 0 0 0 0 0]
XGBoost accuracy: 0.8391608391608392


In [11]:
# Rank the models from most to least accurate (one row per model)
ranking = compModel.transpose()
ranking.sort_values(by='accuracy', ascending=False)

Unnamed: 0,accuracy
XGBoost,0.839161
AdaBoost,0.832168
Random forest,0.783217
Decision tree,0.762238
