## Importing Stuff

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import tree
from time import time

## Viewing the dataset

Let's take a view into the dataset itself.

In [2]:
data_raw = pd.read_csv("datasets/titanic_train.csv", index_col='PassengerId')
data_validate = pd.read_csv("datasets/titanic_test.csv", index_col='PassengerId')
data_raw.sample(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S
151,0,2,"Bateman, Rev. Robert James",male,51.0,0,0,S.O.P. 1166,12.525,,S
804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26.0,1,1,250651,26.0,,S
38,0,3,"Cann, Mr. Ernest Charles",male,21.0,0,0,A./5. 2152,8.05,,S
64,0,3,"Skoog, Master. Harald",male,4.0,3,2,347088,27.9,,S
596,0,3,"Van Impe, Mr. Jean Baptiste",male,36.0,1,1,345773,24.15,,S
544,1,2,"Beane, Mr. Edward",male,32.0,1,0,2908,26.0,,S
145,0,2,"Andrew, Mr. Edgardo Samuel",male,18.0,0,0,231945,11.5,,S
696,0,2,"Chapman, Mr. Charles Henry",male,52.0,0,0,248731,13.5,,S


In [3]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [4]:
data_raw.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [5]:
data_raw.describe(include='all')

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,891,2,,,,681,,147,3
top,,,"Douglas, Mr. Walter Donald",male,,,,CA. 2343,,B96 B98,S
freq,,,1,577,,,,7,,4,644
mean,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [6]:
data_raw['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [7]:
data_raw['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

## Cleaning and Wrangling the Data

We'll make a copy of the raw data and put it in a list along with the validation set. We can later separate it into training and testing data.

In [8]:
data_copy = data_raw.copy(deep=True)
data_cleaner = [data_copy, data_validate]

We see that there are 891 entries in the dataset and 12 columns including the PassengerId as the index.

Of the 891 entries for Cabin 687 entries in total are null. This means that there isn't much we can do with the information about the cabin.

In addition, both the Ticket and Fare columns are more or less random. Furthermore, PassengerId is only a unique identifier and will not affect our model.

While it is possible to separate the Name into titles alone, I believe it is not needed.

So all of them are dropped.


We note that there 177 entries for Age do not exist. Instead of deleting these entries completely, we shall instead fill these age columns with the median age. We choose median over mean because there are both babies(Age is a fraction less than one) and very old people as well which might skew the value of mean.

In the case of the port of Embarkation, we see that only 2 values are null. We will use the mode of this column to fill in these values.

In [9]:
for dataset in data_cleaner:
    dataset['Age'].fillna(dataset['Age'].median(), inplace = True)
    dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace = True)
    dataset.drop(['Cabin', 'Ticket', 'Fare', 'Name'], axis=1, inplace = True)

Now SibSp and Parch are described as follows:

    sibsp: The dataset defines family relations in this way...
    Sibling = brother, sister, stepbrother, stepsister
    Spouse = husband, wife (mistresses and fiancés were ignored)

    parch: The dataset defines family relations in this way...
    Parent = mother, father
    Child = daughter, son, stepdaughter, stepson
    Some children travelled only with a nanny, therefore parch=0 for them.
    
We can instead create a new feature 'FamilySize' by adding SibSp and Parch and 1(For the person themself). We will also create another feature 'IsAlone' for the people who travelled alone. And then we may remove the SibSp and Parch columns.

In [10]:
for dataset in data_cleaner:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # We set IsAlone to 1/True for everyone and then change it to 0/False depending on their FamilySize.
    dataset['IsAlone'] = 1
    dataset['IsAlone'].loc[dataset['FamilySize'] > 1] = 0
    dataset.drop(['SibSp', 'Parch'], axis=1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
data_cleaner[0].head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Embarked,FamilySize,IsAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,22.0,S,2,0
2,1,1,female,38.0,C,2,0
3,1,3,female,26.0,S,1,1
4,1,1,female,35.0,S,2,0
5,0,3,male,35.0,S,1,1


We know from our preliminary analysis that the Sex is either male or female. We also know the age and port of embarkation for all passengers. Let us set male = 0 and female = 1. Also, we can set the port such that C = 0, Q = 1, S = 2. We shall leave the age as it is

In [12]:
for dataset in data_cleaner:
    dataset['Sex'].loc[dataset['Sex'] == 'male'] = 0
    dataset['Sex'].loc[dataset['Sex'] == 'female'] = 1
    dataset['Embarked'].loc[dataset['Embarked'] == 'C'] = 0
    dataset['Embarked'].loc[dataset['Embarked'] == 'Q'] = 1
    dataset['Embarked'].loc[dataset['Embarked'] == 'S'] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [13]:
data_cleaner[0].head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Embarked,FamilySize,IsAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,0,22.0,2,2,0
2,1,1,1,38.0,0,2,0
3,1,3,1,26.0,2,1,1
4,1,1,1,35.0,2,2,0
5,0,3,0,35.0,2,1,1


## Splitting up the data

We can now split the data into the labels and features.

In [14]:
data_clean, data_validate = data_cleaner
data_labels = data_clean['Survived']
data_features = data_clean.drop('Survived', axis=1)

Splitting up the labels and features into training and testing sets.

In [15]:
features_train, features_test, labels_train, labels_test = train_test_split(data_features, data_labels,
                                                                            test_size=0.2, random_state=42)

Taking a look at our testing, training and validating data

##### Training Data

In [16]:
features_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,FamilySize,IsAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
332,1,0,45.5,2,1,1
734,2,0,23.0,2,1,1
383,3,0,32.0,2,1,1
705,3,0,26.0,2,2,0
814,3,1,6.0,2,7,0


In [17]:
labels_train.head()

PassengerId
332    0
734    0
383    0
705    0
814    0
Name: Survived, dtype: int64

##### Testing Data

In [18]:
features_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,FamilySize,IsAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
710,3,0,28.0,0,3,0
440,2,0,31.0,2,1,1
841,3,0,20.0,2,1,1
721,2,1,6.0,2,2,0
40,3,1,14.0,0,2,0


In [19]:
labels_test.head()

PassengerId
710    1
440    0
841    0
721    1
40     1
Name: Survived, dtype: int64

##### Validation Data

In [20]:
data_validate.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,FamilySize,IsAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,3,0,34.5,1,1,1
893,3,1,47.0,2,2,0
894,2,0,62.0,1,1,1
895,3,0,27.0,2,1,1
896,3,1,22.0,2,3,0


## Applying Naive Bayes

In [21]:
nb_classifier = GaussianNB()

In [22]:
t0 = time()
nb_classifier.fit(features_train, labels_train)
print("Training Time: ", time()-t0, "s.", sep='')

Training Time: 0.011408567428588867s.


In [23]:
t1 = time()
nb_pred = nb_classifier.predict(features_test)
print("Testing Time: ", time()-t1, "s.", sep='')

Testing Time: 0.0057599544525146484s.


In [24]:
print("Accuracy: ", accuracy_score(labels_test, nb_pred), ".", sep='')

Accuracy: 0.787709497207.


~79% accuracy on our testing data. Now we just predict for the given data and save it to a file.

## Using a Decision Tree

In [25]:
dt_classifier = tree.DecisionTreeClassifier(min_samples_split=40)

In [26]:
t0 = time()
dt_classifier.fit(features_train, labels_train)
print("Training Time: ", round(time() - t0), "s")

Training Time:  0 s


In [27]:
t1 = time()
dt_prediction = dt_classifier.predict(features_test)
print("Prediction Time: ", round(time() - t1), "s")

Prediction Time:  0 s


In [28]:
print(accuracy_score(labels_test, dt_prediction))

0.826815642458


In [29]:
features_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,FamilySize,IsAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
710,3,0,28.0,0,3,0
440,2,0,31.0,2,1,1
841,3,0,20.0,2,1,1
721,2,1,6.0,2,2,0
40,3,1,14.0,0,2,0


In [35]:
dt_classifier.predict(features_test.head())

array([0, 0, 0, 1, 1], dtype=int64)

In [37]:
labels_test[:5]

PassengerId
710    1
440    0
841    0
721    1
40     1
Name: Survived, dtype: int64

## Running the algorithms on the validation set

In [39]:
final = dt_classifier.predict(data_validate)

In [40]:
sample = pd.read_csv("datasets/titanic_sample.csv", index_col='PassengerId')
sample['Survived'] = final
sample.to_csv("datasets/titanic_output.csv", )