# Titanic Survival Prediction Notebook
Teammates: Kevin Moy, John Um, Joyce Li, Sid Karia, Jessica Wu

Team Name: Koolgle

Competition Link: https://www.kaggle.com/c/dma-fall2020/overview

In [112]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [113]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [114]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [115]:
#Drop Ticket and Cabin columns
df_train = df_train.drop(['Ticket', 'Cabin', 'PassengerId'], axis=1)
df_test = df_test.drop(['Ticket', 'Cabin'], axis=1)
df_train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C


In [116]:
#Title manipulation
impt_titles = [' Don.', ' Rev.', ' Dr.', ' Major.', ' Lady.',' Sir.', ' Col.', ' Capt.', ' the Countess.', ' Jonkheer.']
df_train['Title'] = df_train['Name'].str.extract(r',(.*?\.)', expand=True)
df_test['Title'] = df_test['Name'].str.extract(r',(.*?\.)', expand=True)
df_train['Title'] = df_train['Title'].replace(impt_titles, 'Important')
df_test['Title'] = df_test['Title'].replace(impt_titles, 'Important')

df_train['Title'] = df_train['Title'].replace([' Mlle.',' Ms.',' Mme.'], ' Miss.')
df_test['Title'] = df_test['Title'].replace([' Mlle.',' Ms.',' Mme.'], ' Miss.')


In [117]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,Mr.
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,Mrs.
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,Miss.
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,Mrs.
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,Mr.


Here we can see the percentage under each title that survived in the training set. Our next step is to numerically encode "Name" by title because we can infer that women and children, as well as important titled people (Sir, Capt, Don, etc.), got lifeboat priority. 

5 main categories:

- Master: Young boy (I think?)
- Miss: Single women
- Mr.: Men
- Mrs.: Married women
- Important: Anyone with one of the titles specified above.

In [118]:
df_train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master.,0.575
1,Miss.,0.704301
2,Mr.,0.156673
3,Mrs.,0.792
4,Important,0.347826


Unsurprisingly, kids and females were far and away the most saved group. We can now numerically encode this col, somewhat arbitrarily.

In [119]:
df_test['Title'].value_counts()

 Mr.         240
 Miss.        79
 Mrs.         72
 Master.      21
Important      5
 Dona.         1
Name: Title, dtype: int64

In [120]:
#Numerical Encoding for Training and Test set
title_encode = {" Mr.": 2, " Miss.": 3, " Mrs.": 4, " Master.": 5, "Important": 6}
df_train['Title'] = df_train['Title'].fillna(0) #Handles the one NaN row.
df_test['Title'] = df_test['Title'].fillna(0) 
df_train['Title'] = df_train['Title'].map(title_encode)
print(df_train['Title'].value_counts())
df_test['Title'] = df_test['Title'].map(title_encode)
df_test['Title'].value_counts()

2    517
3    186
4    125
5     40
6     23
Name: Title, dtype: int64


2.0    240
3.0     79
4.0     72
5.0     21
6.0      5
Name: Title, dtype: int64

In [121]:
#One Hot Encoding for Sex Variable
encode_sex = {'male': 0, 'female': 1}
df_train['Sex'] = df_train['Sex'].map(encode_sex).astype(int)
df_test['Sex'] = df_test['Sex'].map(encode_sex).astype(int)

In [122]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,S,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C,4
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,S,3
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,S,4
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,S,2


Next we have to handle NaN's in age, which is really tough, since it's definitely a super important feature, so incorrect predictions would definitely hurt rather than help. Ideally we could derive this from other characteristics but that would just be arbitrary as well.

Looking at the age distribution of ranges with NaNs dropped, looks like kids are still pretty well accounted for. So it'll probably be best to just drop.

In [123]:
df_train['Age'].dropna().astype(int).value_counts().sort_index()

0      7
1      7
2     10
3      6
4     10
      ..
66     1
70     3
71     2
74     1
80     1
Name: Age, Length: 71, dtype: int64

In [124]:
#Fill NA of Fare variable with median
df_test['Fare'].fillna(df_test.median(), inplace=True)
df_train['Fare'].fillna(df_train.median(), inplace=True)

In [125]:
#Define FamSize variable by adding siblings+ parents+ self
df_train['FamSize'] = df_train['SibSp'] + df_train['Parch'] + 1 #Sum siblings + parents + self. to get a representative family count
df_test['FamSize'] = df_test['SibSp'] + df_test['Parch'] + 1


In [126]:
df_train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamSize
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,S,2,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C,4,2
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,S,3,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,S,4,2
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,S,2,1


In [127]:
#Dealing with Embarked feature
port_map = {'S': 0, 'C': 1, 'Q': 2}
df_train['Embarked'] = df_train['Embarked'].fillna('S') #who gives a shit lol 
df_test['Embarked'] = df_test['Embarked'].fillna('S')
df_train['Embarked'] = df_train['Embarked'].map(port_map).astype(int)
df_test['Embarked'] = df_test['Embarked'].map(port_map).astype(int)

In [128]:
#Finishing up numeric encoding:
map_rank = {" Mr.": 1, " Miss.": 2, " Mrs.": 3, " Master.": 4, "Important": 5}
df_train['Title'] = df_train['Title'].map(map_rank)
df_train['Title'] = df_train['Title'].fillna(0)
df_test['Title'] = df_test['Title'].map(map_rank)
df_test['Title'] = df_test['Title'].fillna(0)
df_train['Title'] = df_train['Title'].astype(int)
df_test['Title'] = df_test['Title'].astype(int)

In [129]:
df_train = df_train.drop(['Name'], axis=1)
df_test = df_test.drop(['Name'], axis=1)

In [130]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamSize
0,0,3,0,22.0,1,0,7.25,0,0,2
1,1,1,1,38.0,1,0,71.2833,1,0,2
2,1,3,1,26.0,0,0,7.925,0,0,1
3,1,1,1,35.0,1,0,53.1,0,0,2
4,0,3,0,35.0,0,0,8.05,0,0,1


In [131]:
#Fillna with median values for Fare and Age
df_train['Fare'].fillna(df_train['Fare'].dropna().median(), inplace=True)
df_test['Fare'].fillna(df_test['Fare'].dropna().median(), inplace=True)
df_train['Age'].fillna(df_train['Age'].dropna().median(), inplace=True)
df_test['Age'].fillna(df_test['Age'].dropna().median(), inplace=True)

Train Models:

In [132]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [133]:
X_train, X_test, y_train, y_test = train_test_split(df_train.drop("Survived", axis=1), df_train["Survived"], test_size=0.2)

In [134]:
X_train, X_test

(     Pclass  Sex    Age  SibSp  Parch     Fare  Embarked  Title  FamSize
 619       2    0  26.00      0      0  10.5000         0      0        1
 555       1    0  62.00      0      0  26.5500         0      0        1
 160       3    0  44.00      0      1  16.1000         0      0        2
 831       2    0   0.83      1      1  18.7500         0      0        3
 647       1    0  56.00      0      0  35.5000         1      0        1
 ..      ...  ...    ...    ...    ...      ...       ...    ...      ...
 52        1    1  49.00      1      0  76.7292         1      0        2
 132       3    1  47.00      1      0  14.5000         0      0        2
 888       3    1  28.00      1      2  23.4500         0      0        4
 107       3    0  28.00      0      0   7.7750         0      0        1
 457       1    1  28.00      1      0  51.8625         0      0        2
 
 [712 rows x 9 columns],
      Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  Title  FamSize
 577       1

In [135]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
print(accuracy_score(clf.predict(X_train), y_train))
print(accuracy_score(clf.predict(X_test), y_test))

0.9803370786516854
0.7318435754189944


In [136]:
#Trying bagging
bag_clf = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=150,
                            bootstrap=True, n_jobs=-1,
                            random_state=42)
bag_clf.fit(X_train, y_train)
print(accuracy_score(bag_clf.predict(X_train), y_train))
print(accuracy_score(bag_clf.predict(X_test), y_test))

0.9803370786516854
0.7877094972067039


In [137]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [138]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print(accuracy_score(rf.predict(X_train), y_train))
print(accuracy_score(rf.predict(X_test), y_test))

0.9803370786516854
0.770949720670391


In [139]:
from sklearn.ensemble import ExtraTreesClassifier
xtrees = ExtraTreesClassifier(random_state=42)
xtrees.fit(X_train, y_train)
print(accuracy_score(xtrees.predict(X_train), y_train))
print(accuracy_score(xtrees.predict(X_test), y_test))

0.9803370786516854
0.7821229050279329


In [140]:
#Trying NN - this isn't very good
from sklearn.linear_model import Perceptron
nn = Perceptron(max_iter=50000)
nn.fit(X_train, y_train)
print(accuracy_score(nn.predict(X_train), y_train))
print(accuracy_score(nn.predict(X_test), y_test))

0.7345505617977528
0.6927374301675978


In [141]:
#Trying Linear Support Vectors
from sklearn.svm import SVC, LinearSVC
lsv = LinearSVC(max_iter=50000)
lsv.fit(X_train, y_train)
print(accuracy_score(lsv.predict(X_train), y_train))
print(accuracy_score(lsv.predict(X_test), y_test))

0.8132022471910112
0.770949720670391




In [142]:
dt = DecisionTreeClassifier() 
dt.fit(X_train, y_train)
print(accuracy_score(dt.predict(X_train), y_train))
print(accuracy_score(dt.predict(X_test), y_test))

0.9803370786516854
0.7486033519553073


In [158]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamSize
619,2,0,26.0,0,0,10.5,0,0,1
555,1,0,62.0,0,0,26.55,0,0,1
160,3,0,44.0,0,1,16.1,0,0,2
831,2,0,0.83,1,1,18.75,0,0,3
647,1,0,56.0,0,0,35.5,1,0,1


In [159]:
#OFFICIAL SUBMISSION CELL - used LSV
final_test = df_test.drop(columns ='PassengerId')
y_pred = lsv.predict(final_test)
submission = pd.DataFrame({
        "PassengerId": df_test["PassengerId"],
        "Survived": y_pred
    })

In [160]:
submission.to_csv('trial2.csv', index=False)