In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

In [2]:
# Acquire data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine = [train_df, test_df]

# Data Preparation

## Dropping features
Based on our assumptions, we want to drop the Cabin and Ticket features

In [3]:
# Drop Ticket and Cabin columns, use axis=1 to refer to columns
train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
combine = [train_df, test_df]

Name and PassengerId columns must be dropped too, but first we have to create the Title feature

## Creating new features

### Title feature

In [4]:
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract('(\w+\.)', expand=False)

In [5]:
display(combine[0]["Title"].value_counts())
display(combine[1]["Title"].value_counts())

Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Col.           2
Mlle.          2
Major.         2
Mme.           1
Capt.          1
Ms.            1
Lady.          1
Jonkheer.      1
Countess.      1
Don.           1
Sir.           1
Name: Title, dtype: int64

Mr.        240
Miss.       78
Mrs.        72
Master.     21
Col.         2
Rev.         2
Dr.          1
Ms.          1
Dona.        1
Name: Title, dtype: int64

- We can replace some rare titles like (Lady, Countess, Capt, Col, Don, Dr, Major, Rev, Sir, Jonkheer, Dona) by Rare name
- Replace Mlle by Miss
- Replace Ms by Miss
- Replace Mme by Mrs

In [6]:
rare_replacements = ['Lady.', 'Countess.', 'Capt.', 'Col.', 'Don.', 'Dr.', 'Major.',
                     'Rev.', 'Sir.', 'Jonkheer.', 'Dona.']

for dataset in combine:
    dataset["Title"] = dataset["Title"].replace(rare_replacements, "Rare")
    dataset["Title"] = dataset["Title"].replace("Mlle.", "Miss.")
    dataset["Title"] = dataset["Title"].replace("Ms.", "Miss.")
    dataset["Title"] = dataset["Title"].replace("Mme.", "Mrs.")

In [7]:
display(combine[0]["Title"].value_counts())
display(combine[1]["Title"].value_counts())

Mr.        517
Miss.      185
Mrs.       126
Master.     40
Rare        23
Name: Title, dtype: int64

Mr.        240
Miss.       79
Mrs.        72
Master.     21
Rare         6
Name: Title, dtype: int64

In [8]:
# Look survival rates depending of Title
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master.,0.575
1,Miss.,0.702703
2,Mr.,0.156673
3,Mrs.,0.793651
4,Rare,0.347826


- Passengers with Miss or Mrs title are more likely to survive. Both are female titles
- Males with Mr title are more likely to die, only a 15.66% survived

**Now we can drop Name and PassengerId features**

In [9]:
train_df = combine[0]
test_df = combine[1]

train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)

combine = [train_df, test_df]

train_df.shape, test_df.shape

((891, 9), (418, 9))

## Converting categorical features to numerical

In [10]:
# Now we will use a map function to map values from two series having one column same.
# Last columns of the first series must be equal to index column of second series
title_mapping = {"Mr.": 1, "Miss.": 2, "Mrs.": 3, "Master.": 4, "Rare": 5}
sex_mapping = {'female': 1, 'male': 0}

for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

train_df = combine[0]
test_df = combine[1]

In [11]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,S,1
1,1,1,1,38.0,1,0,71.2833,C,3
2,1,3,1,26.0,0,0,7.925,S,2
3,1,1,1,35.0,1,0,53.1,S,3
4,0,3,0,35.0,0,0,8.05,S,1


# Handle missing values
We will perform these operations:
- Imputation on "Age"
- Delete missing rows of "Embarked"

In [13]:
# Drop rows with missing values
for dataset in combine:
    # Impute missing values on Age with the mean age of passengers
    dataset['Age'] = dataset['Age'].fillna(dataset["Age"].mean())
    # Drop the rows that still have missing values on train set
    dataset['Embarked'] = dataset['Embarked'].fillna("S")
    
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].mean())

combine[0].shape, combine[1].shape

((891, 9), (418, 9))

In [14]:
combine[0].info(), combine[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null object
Title       891 non-null int64
dtypes: float64(2), int64(6), object(1)
memory usage: 62.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Embarked       418 non-null object
Title          418 non-null int64
dtypes: float64(2), int64(6), object(1)
memory usage: 29.5+ KB


(None, None)

# Creation of new features

## AgeBand feature
We have defined these age bands:
- 0-10, childhood
- 10-20, adolescence
- 20-36, youth
- 49-64, adulthood
- 65-max, old age

In [15]:
bins = [0, 10, 20, 36, 65, 80]
labels = [0, 1, 2, 3, 4]

for dataset in combine:
    dataset['AgeBand'] = pd.cut(dataset['Age'], bins=bins, labels=labels)

In [16]:
combine[0]['AgeBand'].value_counts()

2    517
3    187
1    115
0     64
4      8
Name: AgeBand, dtype: int64

In [17]:
combine[0].head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,AgeBand
0,0,3,0,22.0,1,0,7.25,S,1,2
1,1,1,1,38.0,1,0,71.2833,C,3,3
2,1,3,1,26.0,0,0,7.925,S,2,2
3,1,1,1,35.0,1,0,53.1,S,3,2
4,0,3,0,35.0,0,0,8.05,S,1,2


In [18]:
combine[0][['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,AgeBand,Survived
0,0,0.59375
1,1,0.382609
3,3,0.379679
2,2,0.363636
4,4,0.125


The passengers in their childhood (AgeBand == 0) are more likely to survive, all other ageBands don't show relevant information

## FamilySize feature
This feature is the sum of all family members, in order to achieve that, we perform the sum of SibSp and Parch features

In [19]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [20]:
combine[0][['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


Families with 4 members are more likely to survive, maybe because the lifeboats size.
Other families with 2 or 3 members shows a survivor percentage higher than 50%, but it is still a non-relevant percentage

## Encode "Embarked" into numerical
We will encode Embarked values into numerical:
- Cherbourg -> 0
- Queenstown -> 1
- Southampton -> 2

In [21]:
combine[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Sex           891 non-null int64
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null object
Title         891 non-null int64
AgeBand       891 non-null category
FamilySize    891 non-null int64
dtypes: category(1), float64(2), int64(7), object(1)
memory usage: 70.8+ KB


In [22]:
combine[0]['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [23]:
ports_map = {'C': 0, 'Q': 1, 'S': 2}

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map(ports_map)

In [24]:
combine[0].head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,AgeBand,FamilySize
0,0,3,0,22.0,1,0,7.25,2,1,2,2
1,1,1,1,38.0,1,0,71.2833,0,3,3,2
2,1,3,1,26.0,0,0,7.925,2,2,2,1
3,1,1,1,35.0,1,0,53.1,2,3,2,2
4,0,3,0,35.0,0,0,8.05,2,1,2,1


In [25]:
combine[0]['Embarked'].value_counts()

2    646
0    168
1     77
Name: Embarked, dtype: int64

In [26]:
combine[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Sex           891 non-null int64
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null int64
Title         891 non-null int64
AgeBand       891 non-null category
FamilySize    891 non-null int64
dtypes: category(1), float64(2), int64(8)
memory usage: 70.8 KB


In [27]:
combine[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Embarked       418 non-null int64
Title          418 non-null int64
AgeBand        418 non-null category
FamilySize     418 non-null int64
dtypes: category(1), float64(2), int64(8)
memory usage: 33.3 KB


### Modelling
Now we will train a model and predict the required solution

In [28]:
train_df = combine[0]
test_df = combine[1]

Y_train = train_df['Survived'].copy()
X_train = train_df.drop('Survived', axis=1).copy()
X_test = test_df.drop('PassengerId', axis=1).copy()

In [29]:
X_train.shape, Y_train.shape, X_test.shape

((891, 10), (891,), (418, 10))

We will try these algorithms:
- Logistic Regression
- SVM

In [30]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Pclass        891 non-null int64
Sex           891 non-null int64
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null int64
Title         891 non-null int64
AgeBand       891 non-null category
FamilySize    891 non-null int64
dtypes: category(1), float64(2), int64(7)
memory usage: 63.8 KB


In [31]:
# Logistic Regression model
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

logreg_model = LogisticRegression(max_iter=10000)
logreg_model.fit(X_train_scaled, Y_train)

Y_pred_logreg = logreg_model.predict(X_test)

acc_log = round(logreg_model.score(X_train_scaled, Y_train) * 100, 2)
acc_log

81.48

In [32]:
#SVM model
svc_model = SVC()
svc_model.fit(X_train_scaled, Y_train)

Y_pred_svm = svc_model.predict(X_test)

acc_svc = round(svc_model.score(X_train_scaled, Y_train) * 100, 2)
acc_svc

83.95

In [33]:
#Random forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train_scaled, Y_train)

Y_pred_random_forest = random_forest.predict(X_test)

acc_random_forest = round(random_forest.score(X_train_scaled, Y_train) * 100, 2)
acc_random_forest

98.43

This last result may be overfitting

If the Random forest accuracy is not due to overfitting, it is the best result, if not, SVM gives the best accuracy

# Creation of submission

In [37]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred_random_forest
    })
#submission.to_csv('../output/submission.csv', index=False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [38]:
submission.to_csv('submission_random_forest.csv', index=False)

In [39]:
submission.shape

(418, 2)