# Titanic

### Imports

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw training and testing tables from CSV files in the working directory.
# Both frames are kept untouched; later cells derive new frames from them.
pre_training_set, pre_testing_set = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Explore the Data

In [3]:
# Show the first five training rows (head() returns 5 rows by default)
pre_training_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
# Show the last five training rows (tail() returns 5 rows by default)
pre_training_set.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


## Cleaning Data | NaN Values

Here we explore the NaN values in our training and testing sets.

In [5]:
# Per-feature NaN tallies for each set; computed once and reused for both
# the grand totals and the per-feature report at the end of this cell.
nan_per_feature_train = pre_training_set.isnull().sum()
nan_per_feature_test = pre_testing_set.isnull().sum()

# Grand totals over every column: missing cells vs. populated cells
num_nan_train = nan_per_feature_train.sum()
num_non_nan_train = pre_training_set.count().sum()
num_nan_test = nan_per_feature_test.sum()
num_non_nan_test = pre_testing_set.count().sum()

# Report the overall totals for each set
print('Training Set\nNumber of NaNs: {}'.format(num_nan_train))
print('Number of non-NaNs (training): {}\n'.format(num_non_nan_train))
print('Testing Set\nNumber of NaNs (testing): {}'.format(num_nan_test))
print('Number of non-NaNs (testing): {}\n'.format(num_non_nan_test))

# Report the per-feature breakdown for each set
print('NaN in training set: \n{}\n'.format(nan_per_feature_train))
print('NaN in test set: \n{}\n'.format(nan_per_feature_test))

Training Set
Number of NaNs: 866
Number of non-NaNs (training): 9826

Testing Set
Number of NaNs (testing): 414
Number of non-NaNs (testing): 4184

NaN in training set: 
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

NaN in test set: 
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64



##### Key Points for NaN

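- Roughly 8–9% of all values are NaN in each set (866 of 10,692 in training; 414 of 4,598 in testing)
- NaNs reside almost exclusively in the "Age" and "Cabin" features for both the training and the testing set (the only others are 2 in "Embarked" for training and 1 in "Fare" for testing)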

**Strategy**

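Because our dataset is quite small, we will not delete any rows containing NaN values. Instead, we will apply *forward filling* (i.e. replace each NaN with the value from the previous row) followed by *backward filling* (i.e. replace any NaN that remains, such as one in the first row, with the value from the next row).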

In [13]:
# Create non-NaN sets.
# Forward fill first (copy the previous row's value down the column), then
# backward fill to catch any NaN left in the leading rows, which have no
# previous value to copy from.
# Each filled frame is derived from its OWN raw frame: the testing frame was
# previously built from the training data by mistake, so the test set was
# never actually filled.
# .ffill()/.bfill() replace fillna(method=...), which is deprecated in
# recent pandas releases; they operate along axis 0 (rows) by default.
nan_pre_training_set = pre_training_set.ffill().bfill()
nan_pre_testing_set = pre_testing_set.ffill().bfill()

# Print total NaN remaining after filling.
# The testing figures are recomputed from the filled frame rather than
# reusing the counts taken before filling.
print('Training Set\nNumber of NaNs: {}'.format(nan_pre_training_set.isnull().sum().sum()))
print('Testing Set\nNumber of NaNs (testing): {}'.format(nan_pre_testing_set.isnull().sum().sum()))
print('Number of non-NaNs (testing): {}\n'.format(nan_pre_testing_set.count().sum()))