# Titanic Project

This is a small project for practicing my Machine Learning & Data Science knowledge, by doing the well-known Kaggle Titanic competition

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Checking if any field is missing

In [2]:
data = train.append(test, ignore_index=True)
print('The number of missing data')
print(data.isnull().sum())

The number of missing data
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


Let's look at the age distribution, grouped by class, and divided into 2 genders

In [3]:
data.groupby(['Pclass'])['Age'].hist(by=data['Sex'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Pclass
1    [AxesSubplot(0.1,0.15;0.347826x0.75), AxesSubp...
2    [AxesSubplot(0.1,0.15;0.347826x0.75), AxesSubp...
3    [AxesSubplot(0.1,0.15;0.347826x0.75), AxesSubp...
Name: Age, dtype: object

As we can see in the class 3, the histogram is a little skewed (to the left side of the distribution), so median is the best choice for us to fill in the NaN cells

In [4]:
data['Age'] = data.groupby(['Pclass', 'Sex'])['Age'].apply(lambda x: x.fillna(x.median()))
data['Age']

0       22.0
1       38.0
2       26.0
3       35.0
4       35.0
        ... 
1304    25.0
1305    39.0
1306    38.5
1307    25.0
1308    25.0
Name: Age, Length: 1309, dtype: float64

Once again, checking if any field is missing data

In [5]:
print('Missing data')
print(data.isnull().sum())

Missing data
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


Let's look into Fare field

In [6]:
data[data['Fare'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1043,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


We can simply fill this information by using the mean value

In [7]:
#data['Fare'].fillna(data['Fare'].mean())
data['Fare'] = data.groupby('Pclass')['Fare'].apply(lambda x: x.fillna(data['Fare'].mean()))
data[data['Fare'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [8]:
print('Missing data')
print(data.isnull().sum())

Missing data
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          2
dtype: int64


Now let's go into Embarked field

In [9]:
data[data['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1.0,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1.0,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


Since there are only 2 people missing this information, we can fill this information manually

They are all female, in the B cabin, and Pclass is also 1, so probably they are all embarked

We can just set them embarked in the most popular place

In [10]:
data[(data['Sex'] == 'female') & (data['Pclass'] == 1)].hist(by=data['Embarked'])

<IPython.core.display.Javascript object>

array([[<AxesSubplot:title={'center':'C'}>,
        <AxesSubplot:title={'center':'Q'}>],
       [<AxesSubplot:title={'center':'S'}>, <AxesSubplot:>]], dtype=object)

The most popular here is S

In [11]:
data['Embarked'].fillna('S', inplace=True)
data[data['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [12]:
print('Missing data')
print(data.isnull().sum())

Missing data
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          0
dtype: int64


The Cabin field is missing too much, with little meaning, so we can just drop it

Name and Ticket can also be dropped

In [13]:
data = data.drop(['Cabin', 'Name', 'Ticket'], axis=1)
data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,male,22.0,1,0,7.2500,S
1,2,1.0,1,female,38.0,1,0,71.2833,C
2,3,1.0,3,female,26.0,0,0,7.9250,S
3,4,1.0,1,female,35.0,1,0,53.1000,S
4,5,0.0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
1304,1305,,3,male,25.0,0,0,8.0500,S
1305,1306,,1,female,39.0,0,0,108.9000,C
1306,1307,,3,male,38.5,0,0,7.2500,S
1307,1308,,3,male,25.0,0,0,8.0500,S


In [18]:
print('Missing data')
print(data.isnull().sum())

Missing data
PassengerId      0
Survived       418
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         0
dtype: int64


Digitalize the string fields

In [14]:
genders = {'male': 1, 'female': 2}
data['Sex'] = data['Sex'].map(genders, na_action='ignore')
data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1,22.0,1,0,7.2500,S
1,2,1.0,1,2,38.0,1,0,71.2833,C
2,3,1.0,3,2,26.0,0,0,7.9250,S
3,4,1.0,1,2,35.0,1,0,53.1000,S
4,5,0.0,3,1,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
1304,1305,,3,1,25.0,0,0,8.0500,S
1305,1306,,1,2,39.0,0,0,108.9000,C
1306,1307,,3,1,38.5,0,0,7.2500,S
1307,1308,,3,1,25.0,0,0,8.0500,S


In [15]:
data.hist(by=data['Embarked'])

<IPython.core.display.Javascript object>

array([[<AxesSubplot:title={'center':'C'}>,
        <AxesSubplot:title={'center':'Q'}>],
       [<AxesSubplot:title={'center':'S'}>, <AxesSubplot:>]], dtype=object)

In [16]:
ports = {'S': 1, 'C': 2, 'Q': 3}
data['Embarked'] = data['Embarked'].map(ports, na_action='ignore')
data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1,22.0,1,0,7.2500,1
1,2,1.0,1,2,38.0,1,0,71.2833,2
2,3,1.0,3,2,26.0,0,0,7.9250,1
3,4,1.0,1,2,35.0,1,0,53.1000,1
4,5,0.0,3,1,35.0,0,0,8.0500,1
...,...,...,...,...,...,...,...,...,...
1304,1305,,3,1,25.0,0,0,8.0500,1
1305,1306,,1,2,39.0,0,0,108.9000,2
1306,1307,,3,1,38.5,0,0,7.2500,1
1307,1308,,3,1,25.0,0,0,8.0500,1


In [17]:
data[data['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked


In [20]:
train = data[data['Survived'].notnull()]
test = data[data['Survived'].isnull()]

In [21]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1,22.0,1,0,7.2500,1
1,2,1.0,1,2,38.0,1,0,71.2833,2
2,3,1.0,3,2,26.0,0,0,7.9250,1
3,4,1.0,1,2,35.0,1,0,53.1000,1
4,5,0.0,3,1,35.0,0,0,8.0500,1
...,...,...,...,...,...,...,...,...,...
886,887,0.0,2,1,27.0,0,0,13.0000,1
887,888,1.0,1,2,19.0,0,0,30.0000,1
888,889,0.0,3,2,22.0,1,2,23.4500,1
889,890,1.0,1,1,26.0,0,0,30.0000,2


In [22]:
test

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
891,892,,3,1,34.5,0,0,7.8292,3
892,893,,3,2,47.0,1,0,7.0000,1
893,894,,2,1,62.0,0,0,9.6875,3
894,895,,3,1,27.0,0,0,8.6625,1
895,896,,3,2,22.0,1,1,12.2875,1
...,...,...,...,...,...,...,...,...,...
1304,1305,,3,1,25.0,0,0,8.0500,1
1305,1306,,1,2,39.0,0,0,108.9000,2
1306,1307,,3,1,38.5,0,0,7.2500,1
1307,1308,,3,1,25.0,0,0,8.0500,1


In [26]:
X_train, Y_train = train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']], train['Survived']
X_test, Y_test = test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']], test['Survived']

### SVM

In [29]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

clf = SVC(kernel='rbf')
grid_values = {'gamma':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100], 'C':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]}

grid_clf_acc = GridSearchCV(clf, param_grid=grid_values, scoring='accuracy')
grid_clf_acc.fit(X_train, Y_train)

print('Best parameter for SVM: ', grid_clf_acc.best_params_)
print('Grid best score: ', grid_clf_acc.best_score_)

Best parameter for SVM:  {'C': 100, 'gamma': 0.01}
Grid best score:  0.8249011361496456


### kNN

In [33]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
param = {'n_neighbors': [i for i in range(1, 20)],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']}
grid_clf_acc = GridSearchCV(clf, param_grid=param, scoring='accuracy')
grid_clf_acc.fit(X_train, Y_train)

print('Best parameter for kNN: ', grid_clf_acc.best_params_)
print('Grid best score: ', grid_clf_acc.best_score_)

Best parameter for kNN:  {'metric': 'manhattan', 'n_neighbors': 12, 'weights': 'distance'}
Grid best score:  0.7968928504174252


### Decision Tree

In [34]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
param = {'max_depth': [i for i in range(1, 20)]}
grid_clf_acc = GridSearchCV(clf, param_grid=param, scoring='accuracy')
grid_clf_acc.fit(X_train, Y_train)

print('Best parameter for Decision Tree: ', grid_clf_acc.best_params_)
print('Grid best score: ', grid_clf_acc.best_score_)

Best parameter for Decision Tree:  {'max_depth': 4}
Grid best score:  0.8081162513338773


### Random Forest

In [40]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
param = {'max_depth':[i for i in range(1, 20)]}
grid_clf_acc = GridSearchCV(clf, param_grid=param, scoring='accuracy')
grid_clf_acc.fit(X_train, Y_train)

print('Best parameter for Random Forest: ', grid_clf_acc.best_params_)
print('Grid best score: ', grid_clf_acc.best_score_)

Best parameter for Random Forest:  {'max_depth': 4}
Grid best score:  0.8271483271608814


### Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
param = {'C': [i for i in range(1, 10)]}
grid_clf_acc = GridSearchCV(clf, param_grid=param, scoring='accuracy')
grid_clf_acc.fit(X_train, Y_train)

print('Best parameter for Logistic Regression: ', grid_clf_acc.best_params_)
print('Grid best score: ', grid_clf_acc.best_score_)

Best parameter for Logistic Regression:  {'C': 1}
Grid best score:  0.8058439520431863


### Neural Network

In [41]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier()
param = {'hidden_layer_sizes': [10, 30, 100, 300, 1000],
            'alpha':  [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1]}

grid_clf_acc = GridSearchCV(clf, param_grid=param, scoring='accuracy')
grid_clf_acc.fit(X_train, Y_train)

print('Best parameter for Neural Network: ', grid_clf_acc.best_params_)
print('Grid best score: ', grid_clf_acc.best_score_)









Best parameter for Neural Network:  {'alpha': 0.1, 'hidden_layer_sizes': 300}
Grid best score:  0.8316615403929445


2 best options are:

Best parameter for Neural Network:  {'alpha': 0.1, 'hidden_layer_sizes': 300}
Grid best score:  0.8316615403929445

Best parameter for Random Forest:  {'max_depth': 4}
Grid best score:  0.8271483271608814

In [49]:
ground_truth = pd.read_csv('titanic3.csv')
Y_true = ground_truth[891:]['survived']

In [42]:
clf = RandomForestClassifier(max_depth=4).fit(X_train, Y_train)
Y_test = clf.predict(X_test)
Y_test

array([0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [50]:
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_true, Y_test)

0.5125786163522013

In [51]:
clf = MLPClassifier(hidden_layer_sizes=300, alpha=0.1).fit(X_train, Y_train)
Y_test = clf.predict(X_test)
Y_test



array([0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [52]:
roc_auc_score(Y_true, Y_test)

0.5090106434446058