In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import clear_output, display

# Widen the notebook container so wide tables and outputs fit on screen.
# Both names come from the public IPython.display module: importing `display`
# from IPython.core.display is deprecated and emits a warning (or fails,
# depending on the IPython version).
from IPython.display import HTML, display
display(HTML("<style>.container { width:90% !important; }</style>"))

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree

import xgboost as xgb

In [None]:
# Load the Titanic dataset and promote the passenger id to the row index.
data = pd.read_csv("data/titanic_dataset.csv")

# pop() removes the column from the frame and hands back its values in one step;
# assigning the raw values keeps the index unnamed.
data.index = data.pop('PassengerId').values

print(f"dataset shape: {data.shape}")
data.head()

In [None]:
# Feature engineering, expressed as one chain that yields the cleaned frame.
#
#  - Sex / Embarked : string categories are mapped to integer codes
#                     (a missing Embarked value becomes the extra category 'U').
#  - HasCabin       : 1 when a Cabin value is present, 0 when it is NaN.
#  - Name / Ticket / Cabin are not used as features and are removed.
#  - Finally, every row that still contains a NaN is dropped
#    (for simplicity; the notebook's intent is the rows with a missing Age).
sex_codes = {'female': 1, 'male': 0}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2, 'U': 3 }

data = (
    data
    .assign(
        Sex=data['Sex'].map(sex_codes).astype(int),
        Embarked=data['Embarked'].fillna('U').map(embarked_codes).astype(int),
        HasCabin=data['Cabin'].notnull().astype(int),
    )
    .drop(columns=['Name', 'Ticket', 'Cabin'])
    .dropna()
)

# Inspect the prepared data
data.head()

In [None]:
# Separate the feature matrix X from the label column vector Y (numpy arrays).
X = data.drop(columns='Survived').to_numpy()
Y = data['Survived'].to_numpy().reshape(-1, 1)

# Hold out 20% of the rows as the test set; the split is seeded for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

n_train, n_features = X_train.shape
print(f"Number of entries in the training set : {n_train}")
print(f"Number of entries in the test set     : {X_test.shape[0]}")
print(f"Number of features in the training set: {n_features}")

In [None]:
# Model comparison table: a single 'accuracy' row with one column per model,
# initialised to 0 and filled in by the model cells below.
compModel = pd.DataFrame(
    0,
    index=['accuracy'],
    columns=['Decision tree', 'Random forest', 'AdaBoost', 'XGBoost'],
)

In [None]:
# DECISION TREE
print("DECISION TREE:")

# Build the decision tree and fit it on the training data.
#  - random_state is fixed (same seed as the train/test split) so the reported
#    accuracy is reproducible across runs.
#  - the labels are flattened to 1-D, which is the shape scikit-learn expects
#    for a single-output classifier.
dt = DecisionTreeClassifier(random_state=0).fit(X_train, Y_train.ravel())

# Predict on the test set and show the first few predictions
predictions = dt.predict(X_test)
print('Prediction exemples: ' + str(predictions[:10]))

# Accuracy of this model on the test set, recorded in the comparison table
score = dt.score(X_test, Y_test)
compModel['Decision tree'] = score
print("Decision Tree accuracy: {}".format(score))

In [None]:
# RANDOM FOREST
print("RANDOM FOREST:")

# Build a random forest of 50 trees and fit it on the training data.
#  - random_state is fixed (same seed as the train/test split) so the reported
#    accuracy is reproducible across runs.
#  - the labels are flattened to the 1-D shape scikit-learn expects.
rf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X_train, Y_train.ravel())

# Predict on the test set and show the first few predictions
predictions = rf.predict(X_test)
print('Prediction exemples: ' + str(predictions[:10]))

# Accuracy of this model on the test set, recorded in the comparison table
score = rf.score(X_test, Y_test)
compModel['Random forest'] = score
print("Random Forest accuracy: {}".format(score))

In [None]:
# AdaBoost
print("AdaBoost:")

# Build an AdaBoost ensemble of 50 estimators and fit it on the training data.
#  - random_state is fixed (same seed as the train/test split) so the reported
#    accuracy is reproducible across runs.
#  - the labels are flattened to the 1-D shape scikit-learn expects.
adaB = AdaBoostClassifier(n_estimators=50, random_state=0).fit(X_train, Y_train.ravel())

# Predict on the test set and show the first few predictions
predictions = adaB.predict(X_test)
print('Prediction exemples: ' + str(predictions[:10]))

# Accuracy of this model on the test set, recorded in the comparison table
score = adaB.score(X_test, Y_test)
compModel['AdaBoost'] = score
print("AdaBoost accuracy: {}".format(score))

In [None]:
# Gradient Boosting (using XGBoost)
print("Gradient boosting")
print()

# Wrap the train and test sets in XGBoost's DMatrix container.
xgb_train = xgb.DMatrix(X_train, label = Y_train)
xgb_test = xgb.DMatrix(X_test, label = Y_test)
# NOTE: the test set doubles as the validation set, so the validation score
# reported during training is measured on the same data as the final accuracy.
watchlist = [(xgb_train, 'train'), (xgb_test, 'valid')]

# Hyperparameters: boosted decision stumps (max_depth=1) with a logistic objective.
# (An earlier, immediately-overwritten parameter set was removed as dead code.)
xgb_pars = {'min_child_weight': 5, 'eta': 0.9, 'max_depth': 1, 'gamma': 0.3,
            'booster' : 'gbtree', 'objective': 'binary:logistic'}

# Train the XGBoost model for 50 boosting rounds.
# The early-stopping patience equals the number of rounds, so training never
# actually stops early; the argument is presumably kept so that best_score
# gets recorded on the booster -- TODO confirm against the installed version.
xgbModel = xgb.train(xgb_pars, xgb_train, 50, watchlist, early_stopping_rounds=50, maximize=False, verbose_eval=10)
# best_score is the validation value of xgboost's evaluation metric for this
# objective; it is not an RMSLE, so the label says what it really is.
print('Best validation score %.5f' % xgbModel.best_score)
print()

# Turn predicted probabilities into 0/1 class labels with a 0.5 threshold
predictions = (xgbModel.predict(xgb_test) > 0.5) * 1
print('Prediction exemples: ' + str(predictions[:10]))

# Accuracy of this model: fraction of test labels predicted correctly.
score = (Y_test.ravel() == predictions).mean()
# Store under the 'XGBoost' column declared in the comparison table; using a
# different key would leave that column at 0 and add a stray extra column.
compModel['XGBoost'] = score
print("Gradient boosting accuracy: {}".format(score))

In [None]:
# Rank the models from most to least accurate (one row per model).
compModel.transpose().sort_values(by='accuracy', ascending=False)