# Titanic

### Imports & Loading Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

In [2]:
# Load data
training_pre = pd.read_csv('train.csv')
testing_pre = pd.read_csv('test.csv')

# Data Exploration

In [3]:
# Print first 5 training examples and feature dtypes
display(training_pre.head(5))
display(testing_pre.head(5))
print(training_pre.dtypes)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


**Explore the Training Set**

In [4]:
num_records = training_pre.shape[0]                     # Total number of records
num_surv = training_pre['Survived'].value_counts()[1]   # Number of survivors
num_death = training_pre['Survived'].value_counts()[0]  # Number of deaths



print('Total number of records: {}'.format(num_records))
print('Number of survivors: {}'.format(num_surv))
print('Number of deaths: {}'.format(num_death))
print('Percentage of deaths: {}%'.format(round(100 * num_death/num_records, 1)))

# Print Correlation between Features
print('\nCorrelation between numerical Features: ')
training_pre.corr()

Total number of records: 891
Number of survivors: 342
Number of deaths: 549
Percentage of deaths: 61.6%

Correlation between numerical Features: 


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


# Pre-Processing

## NaN Values

Here we explore the NaN values in our training and testing sets.

In [5]:
# Count total NaNs and non-NaNs
num_nan_train = training_pre.isnull().sum().sum()
num_non_nan_train = training_pre.count().sum()
num_nan_test = testing_pre.isnull().sum().sum()
num_non_nan_test = testing_pre.count().sum()

# Print total NaN
print('Training Set\nNumber of NaNs: {}'.format(num_nan_train))
print('Number of non-NaNs: {}\n'.format(num_non_nan_train))
print('Testing Set\nNumber of NaNs: {}'.format(num_nan_test))
print('Number of non-NaNs: {}\n'.format(num_non_nan_test))

# Print NaN for each feature
print('NaN in training set: \n{}\n'.format(training_pre.isnull().sum()))
print('NaN in test set: \n{}\n'.format(testing_pre.isnull().sum()))

Training Set
Number of NaNs: 866
Number of non-NaNs: 9826

Testing Set
Number of NaNs: 414
Number of non-NaNs: 4184

NaN in training set: 
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

NaN in test set: 
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64



## Cleaning NaN
##### Key Points for NaN

- About 10% of our values are NaN for each set
- NaN's reside almost exclusively in the "Age" and "Cabin" features for both the training and the testing set

**Strategy**

Because our dataset is quite small, we will not delete any inputs containing NaN values. Instead, we will try *forward filling* and *backward filling* (i.e. replace NaN with the value from the previous row).

In [6]:
# Create non-NaN sets
training_pre_nan = training_pre.fillna(method = 'ffill', axis = 0)
training_pre_nan = training_pre_nan.fillna(method = 'backfill', axis = 0)
testing_pre_nan = training_pre.fillna(method = 'ffill', axis = 0)
testing_pre_nan = training_pre_nan.fillna(method = 'backfill', axis = 0)

# Print total NaN
print('Training Set NaNs: {}'.format(training_pre_nan.isnull().sum().sum()))
print('Testing Set NaNs: {}'.format(testing_pre_nan.isnull().sum().sum()))

Training Set NaNs: 0
Testing Set NaNs: 0


## Split Data

In [7]:
# Split data into features and labels
labels = training_pre_nan['Survived']
features_pre_nan = training_pre_nan.drop(['Survived', 'PassengerId'], axis = 1)

display(features_pre_nan.head(5))

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C85,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C85,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,C123,S


## Feature Engineering

Since the size of our dataset is small, it would help to give our model more features to work with. 
Let's add in (Age * Pclass) as a feature as well as the family size.
Finally, let's include the fare per person.

In [8]:
# Age * Class
features_pre_nan['Age*Class'] = features_pre_nan['Age'] * features_pre_nan['Pclass']
# Family Size
features_pre_nan['Family_Size']= features_pre_nan['SibSp'] + features_pre_nan['Parch']
# Fare per Person
features_pre_nan['Fare_Per_Person']= features_pre_nan['Fare'] / (features_pre_nan['Family_Size']+1)

display(features_pre_nan)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age*Class,Family_Size,Fare_Per_Person
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,C85,S,66.0,1,3.62500
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,1,35.64165
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,C85,S,78.0,0,7.92500
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,35.0,1,26.55000
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,C123,S,105.0,0,8.05000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,C50,S,54.0,0,13.00000
887,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,19.0,0,30.00000
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,19.0,1,2,W./C. 6607,23.4500,B42,S,57.0,3,5.86250
889,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,26.0,0,30.00000


## Normalise Numerical Features

**Normalise**: Age, SibSp, Parch, Fare, Age*Class, Family_Size, Fare_Per_Person

In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare', 'Age*Class', 'Family_Size', 'Fare_Per_Person']

features_pre_nan_norm = pd.DataFrame(data = features_pre_nan)
features_pre_nan_norm[numerical_features] = scaler.fit_transform(features_pre_nan[numerical_features])

display(features_pre_nan_norm)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age*Class,Family_Size,Fare_Per_Person
0,3,"Braund, Mr. Owen Harris",male,0.271174,0.125,0.000000,A/5 21171,0.014151,C85,S,0.294373,0.1,0.007076
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,0.472229,0.125,0.000000,PC 17599,0.139136,C85,C,0.167722,0.1,0.069568
2,3,"Heikkinen, Miss. Laina",female,0.321438,0.000,0.000000,STON/O2. 3101282,0.015469,C85,S,0.348652,0.0,0.015469
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,0.434531,0.125,0.000000,113803,0.103644,C123,S,0.154152,0.1,0.051822
4,3,"Allen, Mr. William Henry",male,0.434531,0.000,0.000000,373450,0.015713,C123,S,0.470780,0.0,0.015713
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,0.334004,0.000,0.000000,211536,0.025374,C50,S,0.240094,0.0,0.025374
887,1,"Graham, Miss. Margaret Edith",female,0.233476,0.000,0.000000,112053,0.058556,B42,S,0.081780,0.0,0.058556
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.233476,0.125,0.333333,W./C. 6607,0.045771,B42,S,0.253664,0.3,0.011443
889,1,"Behr, Mr. Karl Howell",male,0.321438,0.000,0.000000,111369,0.058556,C148,C,0.113443,0.0,0.058556


## One-Hot Encoding

**Assume** 0 correlation between name and survivability rate. Remove the Names column.

In [10]:
features_pre_nan_norm_noname = features_pre_nan_norm.drop('Name', axis = 1)

display(features_pre_nan_norm_noname)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age*Class,Family_Size,Fare_Per_Person
0,3,male,0.271174,0.125,0.000000,A/5 21171,0.014151,C85,S,0.294373,0.1,0.007076
1,1,female,0.472229,0.125,0.000000,PC 17599,0.139136,C85,C,0.167722,0.1,0.069568
2,3,female,0.321438,0.000,0.000000,STON/O2. 3101282,0.015469,C85,S,0.348652,0.0,0.015469
3,1,female,0.434531,0.125,0.000000,113803,0.103644,C123,S,0.154152,0.1,0.051822
4,3,male,0.434531,0.000,0.000000,373450,0.015713,C123,S,0.470780,0.0,0.015713
...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,male,0.334004,0.000,0.000000,211536,0.025374,C50,S,0.240094,0.0,0.025374
887,1,female,0.233476,0.000,0.000000,112053,0.058556,B42,S,0.081780,0.0,0.058556
888,3,female,0.233476,0.125,0.333333,W./C. 6607,0.045771,B42,S,0.253664,0.3,0.011443
889,1,male,0.321438,0.000,0.000000,111369,0.058556,C148,C,0.113443,0.0,0.058556


**One-Hot Encode**: Pclass, Sex, Ticket, Cabin, Embarked

In [11]:
categorical_features = ['Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']

# Make a copy of current DF
features_pre_nan_norm_noname_onehot = pd.DataFrame(data = features_pre_nan_norm_noname)
features_pre_nan_norm_noname_onehot = features_pre_nan_norm_noname_onehot.astype({'Pclass': str})
print(features_pre_nan_norm_noname_onehot.dtypes)
# Get encoded
features_pre_nan_norm_noname_onehot = pd.get_dummies(features_pre_nan_norm_noname_onehot)

# Print the number of features after one-hot encoding
encoded = list(features_pre_nan_norm_noname_onehot.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))
features_pre_nan_norm_noname_onehot

Pclass              object
Sex                 object
Age                float64
SibSp              float64
Parch              float64
Ticket              object
Fare               float64
Cabin               object
Embarked            object
Age*Class          float64
Family_Size        float64
Fare_Per_Person    float64
dtype: object
843 total features after one-hot encoding.


Unnamed: 0,Age,SibSp,Parch,Fare,Age*Class,Family_Size,Fare_Per_Person,Pclass_1,Pclass_2,Pclass_3,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,0.271174,0.125,0.000000,0.014151,0.294373,0.1,0.007076,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,0.472229,0.125,0.000000,0.139136,0.167722,0.1,0.069568,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.321438,0.000,0.000000,0.015469,0.348652,0.0,0.015469,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,0.434531,0.125,0.000000,0.103644,0.154152,0.1,0.051822,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0.434531,0.000,0.000000,0.015713,0.470780,0.0,0.015713,0,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.334004,0.000,0.000000,0.025374,0.240094,0.0,0.025374,0,1,0,...,0,0,0,0,0,0,0,0,0,1
887,0.233476,0.000,0.000000,0.058556,0.081780,0.0,0.058556,1,0,0,...,0,0,0,0,0,0,0,0,0,1
888,0.233476,0.125,0.333333,0.045771,0.253664,0.3,0.011443,0,0,1,...,0,0,0,0,0,0,0,0,0,1
889,0.321438,0.000,0.000000,0.058556,0.113443,0.0,0.058556,1,0,0,...,0,0,0,0,0,0,0,1,0,0


# Machine Learning

Now that the pre-processing is done, we are ready for the ML part!

Split data into training and cross-validation sets

In [12]:
X = features_pre_nan_norm_noname_onehot
y = labels

data = X.join(y)

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

## Naive Predictor

To have something to compare our model to, we'll calculate a *naive perdictor* accuracy.

Since more than 50% of people died, our naive predictor will assume that everyone died and we'll calculate its accuracy.

**NB:** Our metric of choice will be accuracy (instead of f-score) since there is not great imbalance in the classes (and since this is what Kaggle requests)

In [13]:
# Calculate and print "naive accuracy"
naive_accuracy = 100* num_death / (num_records)

print('The naive prediction is: {}%'.format(round(naive_accuracy, 1)))

The naive prediction is: 61.6%


## Import and Instantiate Classifiers


In [14]:
# Ensemble Methods
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier  # Decision Trees
from sklearn.naive_bayes import MultinomialNB    # Naive Bayes
from sklearn.naive_bayes import GaussianNB       # Gaussian Naive Bayes
from sklearn.svm import SVC                      # SVM
import random

random.seed(42)
decision_tree = DecisionTreeClassifier(random_state=42)
naive_bayes = MultinomialNB()
naive_bayes_g = GaussianNB()
bagging = BaggingClassifier(random_state=42, n_jobs=-1)
random_forest = RandomForestClassifier(random_state=42, n_jobs=-1)
svm = SVC(random_state=42)
adaboost = AdaBoostClassifier(random_state=42)

## Train-Predict Pipeline

## Grid Search

Evaluate each models performance on a cross-validation set using grid search.

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from time import time




# Naive Bayes
alpha = [1]

# Bagging
n_estimators_bagging = [10, 20, 30, 50, 100, 200, 300]
max_samples_bagging = [0.1, 0.2, 0.5, 1.0, 2.0, 3.0, 5.0]
max_features_bagging = [0.5, 1.0, 2.0, 3.0, 50.0, 200]

# Random Forests
criterion = ['gini', 'entropy']
n_estimators_rf = [100, 150, 200]
max_depth_rf = [None, 1, 3, 5, 10]
min_samples_leaf_rf = [5, 10]
min_samples_split_rf = [5, 10]


# SVC
kernel = ['linear', 'poly', 'rbf']
c_parameter = [0.001, 0.01, 0.1, 1, 10]
gamma = [0.001, 0.01, 0.1, 1]


# AdaBoost
n_estimators_ada = [10, 30, 50, 100, 200, 500]
learning_rate = [0.001, 0.01, 0.1, 0.5, 1, 1.5, 2]






# Hyperparameters
naive_bayes_parameters = {'alpha': alpha}
naive_bayes_g_parameters = {'var_smoothing': np.logspace(0,-9, num=1000)}
bagging_parameters = {'n_estimators': n_estimators_bagging, 'max_features': max_features_bagging, 'max_samples': max_samples_bagging}
random_forest_parameters = {'n_estimators': n_estimators_rf, 'max_depth': max_depth_rf, 'min_samples_leaf': min_samples_leaf_rf, 'min_samples_split': min_samples_split_rf}
svm_parameters = {'kernel': kernel, 'C': c_parameter, 'gamma': gamma}
adaboost_parameters = {'n_estimators': n_estimators_ada, 'learning_rate': learning_rate}

# Scoring object using accuracy
scorer = make_scorer(accuracy_score)


clfs_param =[(naive_bayes, naive_bayes_parameters), 
             (naive_bayes_g, naive_bayes_g_parameters), 
             (bagging, bagging_parameters), 
             (random_forest, random_forest_parameters), 
             (svm, svm_parameters), 
             (adaboost, adaboost_parameters)]

In [None]:
# Create list to store models
models = []
unopt_accuracies = []
accuracies = []

# Perform grid search
for clf, parameter in clfs_param:
    print('\n{}\n'.format(clf.__class__.__name__))
    
    grid_obj = GridSearchCV(clf, parameter, scoring=scorer, n_jobs = -1)
    
    # Perform grid search
    start = time()
    grid_fit = grid_obj.fit(X_train, y_train)
    end = time()
    print('Time to tune: {}s\n'.format(round(end - start), 2))
    
    # Get best estimator
    best_clf = grid_fit.best_estimator_
    models.append(best_clf)
    
    # Make predictions using the unoptimized and model
    predictions = (clf.fit(X_train, y_train)).predict(X_valid)
    best_predictions = best_clf.predict(X_valid)
    
    predictions_train = (clf.fit(X_train, y_train)).predict(X_train)
    best_predictions_train = best_clf.predict(X_train)
    
    # Check hyperparameters
    print('Unoptomised: {}\n'.format(clf.get_params(deep = True)))
    print('Optomised: {}\n'.format(best_clf.get_params(deep = True)))
    
    # Print Results
    print("\nUnoptimised-Accuracy-Training: {:.4f}".format(accuracy_score(y_train, predictions_train)))
    print("Optimised-Accuracy-training: {:.4f}".format(accuracy_score(y_train, best_predictions_train)))
    
    print("\nUnoptimised-Accuracy-validation: {:.4f}".format(accuracy_score(y_valid, predictions)))
    print("Optimised-Accuracy-validation: {:.4f}".format(accuracy_score(y_valid, best_predictions)))
    
    print('\n \n \n=============================================================================================')
    
    unopt_accuracies.append(accuracy_score(y_valid, predictions))
    accuracies.append(accuracy_score(y_valid, best_predictions))
    
print('All unoptimised accuracies (validation): {}'.format(unopt_accuracies))
print('Best unoptimised accuracy (validation): {}\n'.format(max(unopt_accuracies)))
print('All optimised accuracies (validation): {}'.format(accuracies))
print('Best optimised accuracy (validation): {}'.format(max(accuracies)))


MultinomialNB

Time to tune: 1s

Unoptomised: {'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

Optomised: {'alpha': 1, 'class_prior': None, 'fit_prior': True}


Unoptimised-Accuracy-Training: 0.9087
Optimised-Accuracy-training: 0.9087

Unoptimised-Accuracy-validation: 0.7821
Optimised-Accuracy-validation: 0.7821

 
 

GaussianNB

Time to tune: 24s

Unoptomised: {'priors': None, 'var_smoothing': 1e-09}

Optomised: {'priors': None, 'var_smoothing': 0.9593608287093143}


Unoptimised-Accuracy-Training: 0.9593
Optimised-Accuracy-training: 0.8104

Unoptimised-Accuracy-validation: 0.4525
Optimised-Accuracy-validation: 0.8045

 
 

BaggingClassifier

Time to tune: 38s

Unoptomised: {'base_estimator': None, 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

Optomised: {'base_estimator': None, 'bootstrap': True, 'bootstrap_features': False, '