# Titanic

### Imports & Loading Data

In [46]:
import numpy as np
import pandas as pd
from IPython.display import display

In [44]:
# Read the training and testing CSV files from the working directory
pre_training_set, pre_testing_set = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Data Exploration

In [54]:
# Show the first 5 rows of the training set, then of the testing set
display(pre_training_set.head(5), pre_testing_set.head(5))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


**Explore**

In [55]:
# Tally the 'Survived' column once and read both outcomes from the same counts
outcome_counts = pre_training_set['Survived'].value_counts()

num_records = pre_training_set.shape[0]  # Total number of records
num_surv = outcome_counts[1]             # Rows labelled 1 (survived)
num_death = outcome_counts[0]            # Rows labelled 0 (died)

# Share of deaths among all records, as a percentage rounded to one decimal
death_pct = round(100 * num_death / num_records, 1)

print('Total number of records: {}'.format(num_records))
print('Number of survivors: {}'.format(num_surv))
print('Number of deaths: {}'.format(num_death))
print('Percentage of deaths: {}%'.format(death_pct))

Total number of records: 891
Number of survivors: 342
Number of deaths: 549
Percentage of deaths: 61.6%


## NaN Values

Here we explore the NaN values in our training and testing sets.

In [16]:
# Per-column NaN counts, computed once and reused for the totals and the report
train_nan_by_feature = pre_training_set.isnull().sum()
test_nan_by_feature = pre_testing_set.isnull().sum()

# Totals over every cell: missing vs. present
num_nan_train = train_nan_by_feature.sum()
num_non_nan_train = pre_training_set.count().sum()
num_nan_test = test_nan_by_feature.sum()
num_non_nan_test = pre_testing_set.count().sum()

# Overall totals for each set
print('Training Set\nNumber of NaNs: {}'.format(num_nan_train))
print('Number of non-NaNs: {}\n'.format(num_non_nan_train))
print('Testing Set\nNumber of NaNs: {}'.format(num_nan_test))
print('Number of non-NaNs: {}\n'.format(num_non_nan_test))

# Breakdown by feature
print('NaN in training set: \n{}\n'.format(train_nan_by_feature))
print('NaN in test set: \n{}\n'.format(test_nan_by_feature))

Training Set
Number of NaNs: 866
Number of non-NaNs: 9826

Testing Set
Number of NaNs: 414
Number of non-NaNs: 4184

NaN in training set: 
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

NaN in test set: 
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64



## Cleaning NaN
##### Key Points for NaN

- Roughly 8–9% of all values are NaN in each set (866 of 10,692 in the training set; 414 of 4,598 in the testing set)
- NaN's reside almost exclusively in the "Age" and "Cabin" features for both the training and the testing set

**Strategy**

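Because our dataset is quite small, we will not delete any inputs containing NaN values. Instead, we will *forward fill* (i.e. replace each NaN with the value from the previous row) and then *backward fill* (i.e. replace any remaining NaN with the value from the next row). The backward fill is needed because a NaN in the first rows of a column has no previous value to copy.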

In [18]:
# Create non-NaN sets.
# Forward fill first (copy the previous row's value down each column), then
# backward fill so NaNs in the leading rows, which have no previous row to
# copy from, are filled from the next valid row.
nan_pre_training_set = pre_training_set.ffill().bfill()

# The testing set must be filled from the testing data itself; building it
# from the training frame would silently replace the test set with training rows.
nan_pre_testing_set = pre_testing_set.ffill().bfill()

# Filling must not change the number of rows or columns of either set
assert nan_pre_training_set.shape == pre_training_set.shape
assert nan_pre_testing_set.shape == pre_testing_set.shape

# Print total NaN
print('Training Set NaNs: {}'.format(nan_pre_training_set.isnull().sum().sum()))
print('Testing Set NaNs: {}'.format(nan_pre_testing_set.isnull().sum().sum()))

Training Set NaNs: 0
Testing Set NaNs: 0


## Split Data

In [58]:
# Split data into features and labels
nan_pre_labels = nan_pre_training_set['Survived']
nan_pre_features = nan_pre_training_set.drop('Survived', axis = 1)

display(nan_pre_features.head(5))

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C85,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C85,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,C123,S


pandas.core.frame.DataFrame

**One-Hot Encode**: pclass, sex, ticket number, cabin, embarked

**Normalise**: Age, sibsp, parch, fare

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.58156,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.55459,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.0,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.035144,0.026653,-0.057527,-0.001652,0.012658
Pclass,-0.035144,1.0,-0.291157,0.083081,0.018443,-0.5495
Age,0.026653,-0.291157,1.0,-0.206104,-0.165191,0.083417
SibSp,-0.057527,0.083081,-0.206104,1.0,0.414838,0.159651
Parch,-0.001652,0.018443,-0.165191,0.414838,1.0,0.216225
Fare,0.012658,-0.5495,0.083417,0.159651,0.216225,1.0
