In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the Kaggle Titanic train/test splits.
train_data = pd.read_csv('../input/titanic/train.csv')
test_data = pd.read_csv('../input/titanic/test.csv')

# Tag each row with its origin so the two sets can be separated again after
# the shared preprocessing steps.
train_data['train_test'] = 1
test_data['train_test'] = 0

# The test set has no labels: add a NaN placeholder so both frames share the
# same columns. `np.NaN` was removed in NumPy 2.0; `np.nan` is the supported
# spelling on every NumPy version.
test_data['Survived'] = np.nan

# Stack both sets. ignore_index=True gives each row a unique label instead of
# repeating 0..N-1 for the test rows.
data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
train_data.head()

In [None]:
test_data.head()

# Data Exploration

In [None]:
train_data.shape

In [None]:
test_data.shape

In [None]:
train_data.describe()

In [None]:
train_data.info()

In [None]:
# Show the distinct values taken by each low-cardinality column.
for column in ('Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked'):
    print(column + ':', train_data[column].unique())

In [None]:
train_data.describe(include=['O'])

In [None]:
# Split the training columns into a numerical and a categorical group for exploration.
numeric_columns = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_columns = ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']

df_num = train_data.loc[:, numeric_columns]
df_cat = train_data.loc[:, categorical_columns]

In [None]:
# Histogram of every numeric variable, one figure per column.
for column in df_num.columns:
    fig, ax = plt.subplots()
    ax.hist(df_num[column])
    ax.set_title(column)
    plt.show()

In [None]:
# Pairwise correlations between the numeric features: printed as a table,
# then drawn as a heatmap. The matrix is computed once and reused.
num_corr = df_num.corr()
print(num_corr)
sns.heatmap(num_corr)

In [None]:
# Average Age, Fare, Parch and SibSp for each survival outcome
# (pivot_table aggregates with the mean by default).
train_data.pivot_table(index='Survived', values=['Age', 'SibSp', 'Parch', 'Fare'])

In [None]:
# Survival rate for each 'Parch' value, highest first.
(
    train_data
    .groupby('Parch', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Survival rate for each 'SibSp' value, highest first.
(
    train_data
    .groupby('SibSp', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Bar chart of the category frequencies of each categorical column.
for column in df_cat.columns:
    # Count once and reuse the result for both axes.
    counts = df_cat[column].value_counts()
    # seaborn >= 0.12 accepts x / y only as keyword arguments; the keyword form
    # also works on older releases.
    ax = sns.barplot(x=counts.index, y=counts.values)
    ax.set_title(column)
    ax.set_ylabel('count')
    plt.show()

In [None]:
# Passenger counts by survival outcome, cross-tabulated against each categorical
# variable. 'Ticket' is only the column being counted.
for position, column in enumerate(('Pclass', 'Sex', 'Embarked')):
    if position:
        print()  # blank line between consecutive tables
    print(pd.pivot_table(train_data, index='Survived', columns=column, values='Ticket', aggfunc='count'))

In [None]:
# Survival rate for each passenger class, highest first.
(
    train_data
    .groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Survival rate for each sex, highest first.
(
    train_data
    .groupby('Sex', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Survival rate for each port of embarkation, highest first.
(
    train_data
    .groupby('Embarked', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

# Feature Engineering

In [None]:
data

In [None]:
data.head()

In [None]:
# Name, Ticket and Cabin are not used as model inputs, so remove them.
data_prep = data.drop(columns=['Name', 'Ticket', 'Cabin'])
data_prep.head()

In [None]:
# Impute missing Age and Fare values with the medians of the training set.
data_prep = data_prep.fillna({
    'Age': train_data['Age'].median(),
    'Fare': train_data['Fare'].median(),
})
data_prep.head()

In [None]:
# Discard the rows that have no 'Embarked' value (only 2).
data_prep = data_prep.dropna(subset=['Embarked'])
data_prep.head()

In [None]:
# One-hot encode the categorical features.
categorical_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
data_prep = pd.get_dummies(data_prep, columns=categorical_features)
data_prep.head()

In [None]:
# Split back into the original train and test partitions.

X_train = data_prep[data_prep['train_test'] == 1].drop(columns=['train_test'])
X_test = data_prep[data_prep['train_test'] == 0].drop(columns=['train_test'])

# Take the labels from data_prep rather than from `data`: the rows with a
# missing 'Embarked' were dropped from data_prep only, so labels read from
# `data` would contain more rows than X_train and no longer line up with it.
y_train = data_prep[data_prep['train_test'] == 1]['Survived']

In [None]:
# Scale the continuous features Age and Fare to [0, 1] so that they match the
# 0/1 range of the one-hot encoded categorical features.

from sklearn.preprocessing import MinMaxScaler

continuous_cols = ['Age', 'Fare']

# Fit the scaler on the training rows only, so that no test-set statistic leaks
# into preprocessing, then apply that same transform to every row. Test values
# outside the training range will land slightly outside [0, 1].
scale = MinMaxScaler()
scale.fit(data_prep[data_prep['train_test'] == 1][continuous_cols])

data_prep_scaled = data_prep.copy()
data_prep_scaled[continuous_cols] = scale.transform(data_prep_scaled[continuous_cols])

X_train_scaled = data_prep_scaled[data_prep_scaled['train_test'] == 1].drop(columns=['train_test'])
X_test_scaled = data_prep_scaled[data_prep_scaled['train_test'] == 0].drop(columns=['train_test'])

# Labels come from the prepared frame so they stay aligned with X_train_scaled
# (`data` still contains the rows that were dropped for a missing 'Embarked').
y_train = data_prep_scaled[data_prep_scaled['train_test'] == 1]['Survived']

In [None]:
data_prep_scaled.head()

In [None]:
X_train_scaled.head()

In [None]:
X_test_scaled.head()

# Shortlist Promising Models

In [None]:
# Prepare Train and Test data

# 'PassengerId' is an identifier and 'Survived' is the target: neither is a feature.
non_feature_cols = ['PassengerId', 'Survived']

X_train = X_train_scaled.drop(columns=non_feature_cols)
# 'Survived' became float when NaN placeholders were added for the test rows.
# Cast the training labels back to integer class labels so every classifier
# predicts 0/1 integers. astype(int) raises if a NaN label slipped through.
Y_train = X_train_scaled["Survived"].astype(int)
# drop() already returns a new frame, so no extra copy is needed.
X_test = X_test_scaled.drop(columns=non_feature_cols)
X_train.shape, Y_train.shape, X_test.shape

In [None]:
# Import models to be used

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

## Possible models

* Naive Bayes (40.7%)
* Logistic Regression (80.4%)
* Decision Tree (78.5%)
* K Nearest Neighbor (79.1%)
* Random Forest (79.5%)
* Support Vector Classifier(80.3%)
* eXtreme Gradient Boosting, XGBoost (81.2%)

5-fold Cross Validation

In [None]:
# Naive Bayes: per-fold scores of a 5-fold cross-validation, then their mean.
gnb = GaussianNB()
cv = cross_val_score(estimator=gnb, X=X_train, y=Y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# Logistic Regression: per-fold scores of a 5-fold cross-validation, then their mean.
lr = LogisticRegression(max_iter=2000)
cv = cross_val_score(estimator=lr, X=X_train, y=Y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# Decision Tree: per-fold scores of a 5-fold cross-validation, then their mean.
dt = tree.DecisionTreeClassifier(random_state=1)
cv = cross_val_score(estimator=dt, X=X_train, y=Y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# K-Nearest Neighbor: per-fold scores of a 5-fold cross-validation, then their mean.
knn = KNeighborsClassifier()
cv = cross_val_score(estimator=knn, X=X_train, y=Y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# Random Forest: per-fold scores of a 5-fold cross-validation, then their mean.
rf = RandomForestClassifier(random_state=1)
cv = cross_val_score(estimator=rf, X=X_train, y=Y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# Support Vector Classifier: per-fold scores of a 5-fold cross-validation, then their mean.
svc = SVC(probability=True)
cv = cross_val_score(estimator=svc, X=X_train, y=Y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# eXtreme Gradient Boosting (XGBoost): per-fold scores of a 5-fold
# cross-validation, then their mean.

from xgboost import XGBClassifier

# Named `xgb_base` rather than `xgb`: the fine-tuning section binds `xgb` to the
# xgboost module (`import xgboost as xgb`) and `modelfit` calls `xgb.DMatrix` /
# `xgb.cv`, so re-running this cell later would otherwise break `modelfit`.
xgb_base = XGBClassifier(random_state=1)
cv = cross_val_score(xgb_base, X_train, Y_train, cv=5)
print(cv)
print(cv.mean())

# Fine-Tune the System

Chosen Model: eXtreme Gradient Boosting, XGBoost (base score: 81.2%)

In [None]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [None]:
# Helper that creates XGBoost models, calibrates the number of boosting rounds
# by cross-validation and reports the fit.

def modelfit(alg, X_train, predictors, Y_train, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBoost classifier and report how well it fits the training data.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first (AUC metric, early
    stopping) and ``alg``'s ``n_estimators`` is overwritten with the number of
    boosting rounds that cross-validation kept. The estimator is then fitted on
    the training data, its accuracy and AUC on that same data are printed, and
    its feature importances are plotted.

    Parameters
    ----------
    alg : XGBClassifier
        Estimator to fit. It is modified in place (``n_estimators``,
        ``eval_metric`` and the fitted state).
    X_train : pandas.DataFrame
        Training features.
    predictors : list of str
        Names of the ``X_train`` columns to use as features.
    Y_train : pandas.Series
        Binary training labels.
    useTrainCV : bool, default True
        Whether to calibrate ``n_estimators`` with ``xgb.cv`` before fitting.
    cv_folds : int, default 5
        Number of folds used by ``xgb.cv``.
    early_stopping_rounds : int, default 50
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None
        Results are printed and plotted.

    Notes
    -----
    The printed accuracy and AUC are computed on the training data, so they are
    not an estimate of generalization performance.
    """
    # Honor the `predictors` argument instead of silently using every column.
    features = X_train[predictors]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        # Explicit float conversion: one-hot columns may be bool/uint8 and a
        # mixed-dtype frame would otherwise convert to an object array.
        xgtrain = xgb.DMatrix(features.to_numpy(dtype=float), label=Y_train.values)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        # One row per boosting round kept after early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data. `eval_metric` is set on the estimator
    # because passing it to fit() was deprecated in XGBoost 1.6 and removed in 2.0.
    alg.set_params(eval_metric='auc')
    alg.fit(features, Y_train, verbose=True)

    # Predict training set:
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(Y_train.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(Y_train, dtrain_predprob))
    print(alg.n_estimators)

    # Get and plot feature importances ('weight' importance from the booster).
    feature_important = alg.get_booster().get_score(importance_type='weight')
    importance_df = pd.DataFrame(
        data=list(feature_important.values()),
        index=list(feature_important.keys()),
        columns=["score"],
    ).sort_values(by="score")
    importance_df.plot(kind='barh')

## Step 1: Fix learning rate and number of estimators for tuning tree-based parameters

In [None]:

# Use every column of X_train as a predictor (the target 'Survived' and
# 'PassengerId' were already removed when X_train was built).
predictors = list(X_train.columns)

# Starting hyperparameters
xgb1_params = dict(
    learning_rate=0.1,             # fixed
    n_estimators=100,              # fixed
    max_depth=5,                   # usually 3-6; too-high values tend to over-fit (tuned with CV afterwards)
    min_child_weight=1,            # default is 1; increase to control over-fitting (tuned with CV afterwards)
    gamma=0,                       # default is 0 (tuned afterwards)
    subsample=0.8,                 # typical 0.5-1; lower is more conservative, too low under-fits
    colsample_bytree=0.8,          # like subsample, but a fraction of columns (features) instead of rows
    objective='binary:logistic',   # logistic regression for binary classification; outputs a probability
    scale_pos_weight=1,            # default is 1; raise it when classes are highly imbalanced
    seed=10,                       # random seed, for reproducible results and parameter tuning
    use_label_encoder=False,
)
xgb1 = XGBClassifier(**xgb1_params)

# Fit the model and report its performance with the 'modelfit' function defined previously
modelfit(xgb1, X_train, predictors, Y_train)

## Step 2: Tune max_depth and min_child_weight

In [None]:
# Candidate values explored by the grid search
param_test1 = {
    'max_depth': range(3, 10, 2),        # was 5; now 3, 5, 7, 9 (deeper trees over-fit more easily)
    'min_child_weight': range(1, 6, 2),  # was 1; now 1, 3, 5 (larger values over-fit less)
}

# Base estimator: every parameter that is not in the grid stays at these values.
gsearch1_base = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=10,
)

# 5-fold grid search on ROC AUC for the optimum 'max_depth' & 'min_child_weight'
gsearch1 = GridSearchCV(estimator=gsearch1_base, param_grid=param_test1, scoring='roc_auc', cv=5)
gsearch1.fit(X_train, Y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

Optimum values:
* **'max_depth'**: 9
* **'min_child_weight'**: 3

Note: 'max_depth' value of 9 is the maximum of the proposed grid, so I should check increasing this.

I'll further tune both parameters by testing one value above and one value below each optimum:
* **'max_depth'**: [8, 9, 10]
* **'min_child_weight'**: [2, 3, 4]

In [None]:
# Candidate values explored by the grid search (one step either side of the previous optimum)
param_test2 = {
    'max_depth': [8, 9, 10],
    'min_child_weight': [2, 3, 4],
}

# Base estimator: every parameter that is not in the grid stays at these values.
gsearch2_base = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=10,
)

# 5-fold grid search on ROC AUC for the optimum 'max_depth' & 'min_child_weight'
gsearch2 = GridSearchCV(estimator=gsearch2_base, param_grid=param_test2, scoring='roc_auc', cv=5)
gsearch2.fit(X_train, Y_train)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

Optimum values:
* **'max_depth'**: 10
* **'min_child_weight'**: 3

The optimum 'max_depth' is again the largest value tested, so I'll extend the range further. I'll keep 'min_child_weight' fixed at 3, as it didn't change.

In [None]:
# Candidate values explored by the grid search (only 'max_depth' is varied here)
param_test3 = {
    'max_depth': [9, 10, 11, 12, 13],
}

# Base estimator: every parameter that is not in the grid stays at these values.
gsearch3_base = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=3,  # kept at the optimum found by the previous search
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=10,
)

# 5-fold grid search on ROC AUC for the optimum 'max_depth'
gsearch3 = GridSearchCV(estimator=gsearch3_base, param_grid=param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(X_train, Y_train)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

Optimum value:
* **'max_depth'**: 10

It didn't change, so I'll keep it fixed at 10 from now on.

## Step 3: Tune gamma

In [None]:
# Candidate values explored by the grid search
param_test4 = {
    # default is 0; a larger gamma demands a larger loss reduction before a
    # split is made, which makes the algorithm more conservative
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}

# Base estimator: every parameter that is not in the grid stays at these values.
gsearch4_base = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=10,        # found through GridSearchCV in Step 2
    min_child_weight=3,  # found through GridSearchCV in Step 2
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=10,
)

# 5-fold grid search on ROC AUC for the optimum 'gamma'
gsearch4 = GridSearchCV(estimator=gsearch4_base, param_grid=param_test4, scoring='roc_auc', cv=5)
gsearch4.fit(X_train, Y_train)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

Optimum value:
* **'gamma'**: 0.0

The default 'gamma' value of 0 was the optimum so I'll keep it fixed from now on.

Before proceeding, I'm going to re-calibrate the number of boosting rounds for the updated parameters.

In [None]:
# Use every column of X_train as a predictor (the target 'Survived' and
# 'PassengerId' were already removed when X_train was built).
predictors = list(X_train.columns)

# Hyperparameters after Steps 2 and 3
xgb2_params = dict(
    learning_rate=0.1,    # fixed
    n_estimators=100,     # fixed
    max_depth=10,         # found through GridSearchCV in Step 2
    min_child_weight=3,   # found through GridSearchCV in Step 2
    gamma=0,              # found through GridSearchCV in Step 3
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=10,
    use_label_encoder=False,
)
xgb2 = XGBClassifier(**xgb2_params)

# Fit the model and report its performance with the 'modelfit' function defined previously
modelfit(xgb2, X_train, predictors, Y_train)

Current optimization through tuning of hyperparameters:
* Accuracy : 0.856 --> 0.8886
* AUC Score (Train): 0.900415 --> 0.944586


Tuned hyperparameters so far:
* 'max_depth'
* 'min_child_weight'
* 'gamma'

## Step 4: Tune subsample and colsample_bytree

In [None]:
# Candidate values explored by the grid search
param_test5 = {
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
}

# Base estimator: every parameter that is not in the grid stays at these values.
gsearch5_base = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=10,        # found through GridSearchCV in Step 2
    min_child_weight=3,  # found through GridSearchCV in Step 2
    gamma=0,             # found through GridSearchCV in Step 3
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=10,
)

# 5-fold grid search on ROC AUC for the optimum 'subsample' & 'colsample_bytree'
gsearch5 = GridSearchCV(estimator=gsearch5_base, param_grid=param_test5, scoring='roc_auc', cv=5)
gsearch5.fit(X_train, Y_train)
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_

Optimum values:
* **'colsample_bytree'**: 0.8
* **'subsample'**: 0.9

Note: 'subsample' value of 0.9 is the maximum of the proposed grid, so I should check increasing this.

I'll further tune both parameters by testing 0.05 above and 0.05 below each optimum:
* **'colsample_bytree'**: [0.75, 0.8, 0.85]
* **'subsample'**: [0.85, 0.9, 0.95]

In [None]:
# Candidate values explored by the grid search (0.05 either side of the previous optimum)
param_test6 = {
    'subsample': [0.85, 0.9, 0.95],
    'colsample_bytree': [0.75, 0.8, 0.85],
}

# Base estimator: every parameter that is not in the grid stays at these values.
gsearch6_base = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=10,        # found through GridSearchCV in Step 2
    min_child_weight=3,  # found through GridSearchCV in Step 2
    gamma=0,             # found through GridSearchCV in Step 3
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=10,
)

# 5-fold grid search on ROC AUC for the optimum 'subsample' & 'colsample_bytree'
gsearch6 = GridSearchCV(estimator=gsearch6_base, param_grid=param_test6, scoring='roc_auc', cv=5)
gsearch6.fit(X_train, Y_train)
gsearch6.cv_results_, gsearch6.best_params_, gsearch6.best_score_

Optimum values:
* **'colsample_bytree'**: 0.8
* **'subsample'**: 0.9

They haven't changed so I'll keep both of the values fixed from now on.

## Step 5: Tuning Regularization Parameters

In [None]:
# Candidate values explored by the grid search
param_test7 = {
    'reg_alpha': [0, 1e-5, 1e-2, 0.1, 1, 100],
    'reg_lambda': [1, 10, 20, 50, 100, 1000],
}

# Base estimator: every parameter that is not in the grid stays at these values.
gsearch7_base = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=10,          # found through GridSearchCV in Step 2
    min_child_weight=3,    # found through GridSearchCV in Step 2
    gamma=0,               # found through GridSearchCV in Step 3
    subsample=0.9,         # found through GridSearchCV in Step 4
    colsample_bytree=0.8,  # found through GridSearchCV in Step 4
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=10,
)

# 5-fold grid search on ROC AUC for the optimum 'reg_alpha' & 'reg_lambda'
gsearch7 = GridSearchCV(estimator=gsearch7_base, param_grid=param_test7, scoring='roc_auc', cv=5)
gsearch7.fit(X_train, Y_train)
gsearch7.cv_results_, gsearch7.best_params_, gsearch7.best_score_

Optimum values:
* **'reg_alpha'**: 0
* **'reg_lambda'**: 1

The optimum values are the defaults for both parameters, so I'll leave them unchanged.

Apply changes from steps 4 and 5 to the Model:

In [None]:
# Use every column of X_train as a predictor (the target 'Survived' and
# 'PassengerId' were already removed when X_train was built).
predictors = list(X_train.columns)

# Hyperparameters after Steps 2 to 5
xgb3_params = dict(
    learning_rate=0.1,     # fixed
    n_estimators=100,      # fixed
    max_depth=10,          # found through GridSearchCV in Step 2
    min_child_weight=3,    # found through GridSearchCV in Step 2
    gamma=0,               # found through GridSearchCV in Step 3
    subsample=0.9,         # found through GridSearchCV in Step 4
    colsample_bytree=0.8,  # found through GridSearchCV in Step 4
    reg_alpha=0,           # found through GridSearchCV in Step 5
    reg_lambda=1,          # found through GridSearchCV in Step 5
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=10,
    use_label_encoder=False,
)
xgb3 = XGBClassifier(**xgb3_params)

# Fit the model and report its performance with the 'modelfit' function defined previously
modelfit(xgb3, X_train, predictors, Y_train)

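This model is slightly worse than the previous one, and the only change is the 'subsample' parameter: it was 0.8 in the previous model, while GridSearchCV found 0.9 to be the optimum. The two results are not directly comparable, though: GridSearchCV ranks candidates by 5-fold cross-validated AUC, whereas the model report printed by `modelfit` shows accuracy and AUC computed on the training data itself. A setting that generalizes better can therefore still score lower on the training set.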

I'll keep the previous model as it yielded better results.

## Step 6: Reducing Learning Rate

In [None]:
# Use every column of X_train as a predictor (the target 'Survived' and
# 'PassengerId' were already removed when X_train was built).
predictors = list(X_train.columns)

# Final hyperparameters: lower learning rate, more boosting rounds
xgb4_params = dict(
    learning_rate=0.05,    # halved from 0.1
    n_estimators=10000,    # upper bound: modelfit replaces it with the round count kept by xgb.cv early stopping
    max_depth=10,          # found through GridSearchCV in Step 2
    min_child_weight=3,    # found through GridSearchCV in Step 2
    gamma=0,               # found through GridSearchCV in Step 3
    subsample=0.8,         # Step 4's GridSearchCV optimum was 0.9, but it gave a worse model report
    colsample_bytree=0.8,  # found through GridSearchCV in Step 4
    reg_alpha=0,           # found through GridSearchCV in Step 5
    reg_lambda=1,          # found through GridSearchCV in Step 5
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=10,
    use_label_encoder=False,
)
xgb4 = XGBClassifier(**xgb4_params)

# Fit the model and report its performance with the 'modelfit' function defined previously
modelfit(xgb4, X_train, predictors, Y_train)

This model is slightly worse than the previous one, but I'll keep this one because a lower learning rate would make a more robust model that may generalize better on the test set.

Generate predictions on the test set:

In [None]:
# Predict the test set. The training labels are stored as floats, so cast the
# predicted classes to int: the submission must contain 0/1 integers, not 0.0/1.0.
Y_pred = xgb4.predict(X_test).astype(int)

Generate submission file:

In [None]:
# Pair every test passenger id with its predicted class and write the
# two-column submission file (without the index).
submission_xgb = test_data[["PassengerId"]].assign(Survived=Y_pred)

submission_xgb.to_csv('submission_xgb.csv', index=False)
print("Your submission was successfully saved!")