# Titanic

### Imports & Loading Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

In [41]:
# Read the training and testing CSV files from the current working directory
training_pre, testing_pre = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# Data Exploration

In [31]:
# Preview the first five rows of each split (head() defaults to 5 rows),
# then list the dtype of every training column
display(training_pre.head(), testing_pre.head())
print(training_pre.dtypes)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


**Explore the Training Set**

In [43]:
# Tally the 'Survived' column once; the resulting Series is indexed by label
survival_counts = training_pre['Survived'].value_counts()

num_records = len(training_pre)   # Total number of records
num_surv = survival_counts[1]     # Number of survivors
num_death = survival_counts[0]    # Number of deaths

# Share of deaths as a percentage, rounded to one decimal place
death_percentage = round(100 * num_death/num_records, 1)

print('Total number of records: {}'.format(num_records))
print('Number of survivors: {}'.format(num_surv))
print('Number of deaths: {}'.format(num_death))
print('Percentage of deaths: {}%'.format(death_percentage))

Total number of records: 891
Number of survivors: 342
Number of deaths: 549
Percentage of deaths: 61.6%


# Pre-Processing

## NaN Values

Here we explore the NaN values in our training and testing sets.

In [5]:
# Per-feature NaN tallies, computed once and reused for the totals below
nan_by_feature_train = training_pre.isnull().sum()
nan_by_feature_test = testing_pre.isnull().sum()

# Totals across the whole frame: missing cells vs. populated cells
num_nan_train = nan_by_feature_train.sum()
num_non_nan_train = training_pre.count().sum()
num_nan_test = nan_by_feature_test.sum()
num_non_nan_test = testing_pre.count().sum()

# Report the totals for each split
print('Training Set\nNumber of NaNs: {}'.format(num_nan_train))
print('Number of non-NaNs: {}\n'.format(num_non_nan_train))
print('Testing Set\nNumber of NaNs: {}'.format(num_nan_test))
print('Number of non-NaNs: {}\n'.format(num_non_nan_test))

# Report the per-feature breakdown for each split
print('NaN in training set: \n{}\n'.format(nan_by_feature_train))
print('NaN in test set: \n{}\n'.format(nan_by_feature_test))

Training Set
Number of NaNs: 866
Number of non-NaNs: 9826

Testing Set
Number of NaNs: 414
Number of non-NaNs: 4184

NaN in training set: 
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

NaN in test set: 
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64



## Cleaning NaN
##### Key Points for NaN

- Roughly 8–9% of all values are NaN in each set (866 of 10,692 in the training set; 414 of 4,598 in the testing set)
- NaN's reside almost exclusively in the "Age" and "Cabin" features for both the training and the testing set

**Strategy**

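Because our dataset is quite small, we will not delete any rows containing NaN values. Instead, we will *forward fill* (i.e. replace each NaN with the last valid value from a previous row) and then *backward fill* (i.e. replace any NaN still left at the top of a column with the next valid value from a later row).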

In [6]:
# Create non-NaN sets.
# Each split is filled from its OWN raw frame: forward fill first (carry the
# last valid value down each column), then backward fill to cover NaNs in the
# leading rows that have no earlier value to copy from.
# `.ffill()` / `.bfill()` replace `fillna(method=...)`, which is deprecated in
# recent pandas releases.
training_pre_nan = training_pre.ffill(axis = 0).bfill(axis = 0)
testing_pre_nan = testing_pre.ffill(axis = 0).bfill(axis = 0)

# Print total NaN remaining (a column that is entirely NaN would stay NaN)
print('Training Set NaNs: {}'.format(training_pre_nan.isnull().sum().sum()))
print('Testing Set NaNs: {}'.format(testing_pre_nan.isnull().sum().sum()))

Training Set NaNs: 0
Testing Set NaNs: 0


## Split Data

In [7]:
# Separate the target column from the model inputs.
# 'PassengerId' is dropped from the features together with the label.
non_feature_columns = ['Survived', 'PassengerId']
labels_nan_pre = training_pre_nan['Survived']
features_pre_nan = training_pre_nan.drop(columns = non_feature_columns)

display(features_pre_nan.head())

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C85,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C85,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,C123,S


## Normalise Numerical Features

**Normalise**: Age, SibSp, Parch, Fare

In [8]:
# MinMaxScaler is already imported in the notebook's import cell.
scaler = MinMaxScaler()
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']

# Work on an explicit copy: wrapping a frame in pd.DataFrame(...) does not copy
# its data, so assigning the scaled columns could otherwise overwrite the
# unscaled values in `features_pre_nan` (depending on the pandas version).
features_pre_nan_norm = features_pre_nan.copy()

# Fit the scaler on the training features and rescale each numerical column
# to the [0, 1] range (MinMaxScaler's default feature_range).
# NOTE(review): any test-set features should be scaled with
# `scaler.transform`, not re-fitted, so both splits share the same scaling.
features_pre_nan_norm[numerical_features] = scaler.fit_transform(features_pre_nan[numerical_features])

display(features_pre_nan_norm)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,0.271174,0.125,0.000000,A/5 21171,0.014151,C85,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,0.472229,0.125,0.000000,PC 17599,0.139136,C85,C
2,3,"Heikkinen, Miss. Laina",female,0.321438,0.000,0.000000,STON/O2. 3101282,0.015469,C85,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,0.434531,0.125,0.000000,113803,0.103644,C123,S
4,3,"Allen, Mr. William Henry",male,0.434531,0.000,0.000000,373450,0.015713,C123,S
...,...,...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,0.334004,0.000,0.000000,211536,0.025374,C50,S
887,1,"Graham, Miss. Margaret Edith",female,0.233476,0.000,0.000000,112053,0.058556,B42,S
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.233476,0.125,0.333333,W./C. 6607,0.045771,B42,S
889,1,"Behr, Mr. Karl Howell",male,0.321438,0.000,0.000000,111369,0.058556,C148,C


## One-Hot Encoding

**Assume** 0 correlation between name and survivability rate. Remove the Names column.

In [9]:
# Discard the passenger name, which is assumed to carry no predictive signal
features_pre_nan_norm_noname = features_pre_nan_norm.drop(columns = ['Name'])

display(features_pre_nan_norm_noname)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,0.271174,0.125,0.000000,A/5 21171,0.014151,C85,S
1,1,female,0.472229,0.125,0.000000,PC 17599,0.139136,C85,C
2,3,female,0.321438,0.000,0.000000,STON/O2. 3101282,0.015469,C85,S
3,1,female,0.434531,0.125,0.000000,113803,0.103644,C123,S
4,3,male,0.434531,0.000,0.000000,373450,0.015713,C123,S
...,...,...,...,...,...,...,...,...,...
886,2,male,0.334004,0.000,0.000000,211536,0.025374,C50,S
887,1,female,0.233476,0.000,0.000000,112053,0.058556,B42,S
888,3,female,0.233476,0.125,0.333333,W./C. 6607,0.045771,B42,S
889,1,male,0.321438,0.000,0.000000,111369,0.058556,C148,C


**One-Hot Encode**: Pclass, Sex, Ticket, Cabin, Embarked

In [30]:
categorical_features = ['Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']

# Cast Pclass to string so it is treated as a category rather than a number.
# astype() returns a new frame, so `features_pre_nan_norm_noname` is untouched.
features_pre_nan_norm_noname_onehot = features_pre_nan_norm_noname.astype({'Pclass': str})
print(features_pre_nan_norm_noname_onehot.dtypes)

# One-hot encode exactly the listed categorical columns (explicit `columns=`
# instead of relying on get_dummies' dtype-based column inference).
# NOTE(review): Ticket and Cabin have many distinct values, which is what
# drives the large feature count; a test set encoded separately will likely
# produce a different set of dummy columns.
features_pre_nan_norm_noname_onehot = pd.get_dummies(
    features_pre_nan_norm_noname_onehot,
    columns = categorical_features,
)

# Print the number of features after one-hot encoding
encoded = list(features_pre_nan_norm_noname_onehot.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))
features_pre_nan_norm_noname_onehot

Pclass       object
Sex          object
Age         float64
SibSp       float64
Parch       float64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object
840 total features after one-hot encoding.


Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Ticket_110152,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,0.271174,0.125,0.000000,0.014151,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0.472229,0.125,0.000000,0.139136,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.321438,0.000,0.000000,0.015469,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.434531,0.125,0.000000,0.103644,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0.434531,0.000,0.000000,0.015713,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.334004,0.000,0.000000,0.025374,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
887,0.233476,0.000,0.000000,0.058556,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
888,0.233476,0.125,0.333333,0.045771,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
889,0.321438,0.000,0.000000,0.058556,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


# Machine Learning

Now that the pre-processing is done, we are ready for the ML part!

## Naive Predictor

To have something to compare our model to, we'll calculate a *naive predictor* accuracy.

Since more than 50% of people died, our naive predictor will assume that everyone died and we'll calculate its accuracy.

**NB:** Our metric of choice will be accuracy (instead of F-score), since the classes are not heavily imbalanced (and since accuracy is what Kaggle requests).

In [45]:
# Calculate and print "naive accuracy": the accuracy (in percent) obtained by
# predicting that every passenger died, i.e. the share of deaths in the
# training set. Relies on `num_death` and `num_records` from the exploration cell.
naive_accuracy = 100 * num_death / num_records

print('The naive prediction is: {}%'.format(round(naive_accuracy, 1)))

The naive prediction is: 61.6%


## Import and Instantiate Classifiers


In [46]:
# Ensemble Methods
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier  # Decision Trees
from sklearn.naive_bayes import MultinomialNB    # Naive Bayes
from sklearn.svm import SVC                      # SVM

# Fixed seed so the randomised estimators give the same results on every
# Restart & Run All.
RANDOM_STATE = 42

decision_tree = DecisionTreeClassifier(random_state = RANDOM_STATE)
# MultinomialNB has no random component, so it takes no seed.
# NOTE(review): it requires non-negative feature values - confirm that holds
# for whatever feature matrix is passed to it.
naive_bayes = MultinomialNB()
bagging = BaggingClassifier(random_state = RANDOM_STATE)
random_forest = RandomForestClassifier(random_state = RANDOM_STATE)


## Train-Predict Pipeline

## Grid Search