# Predict survival on Titanic
This is a supervised learning problem: we have a labeled dataset that we can train a model on.
Our target, survived or not (0,1), is categorical so we can use a classification model.

In [1]:
import pandas as pd
import numpy as np

Read the training data

In [2]:
# Load the labelled Titanic training set and preview its first rows.
train_csv = 'titanic_data/train.csv'
original_data = pd.read_csv(train_csv)
original_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Survived is our target.
Let's use Pclass, Sex, Age, Cabin, Embarked.

In [3]:
# Label column, followed by the working subset of columns (features plus label).
target = 'Survived'
kept_columns = ['Pclass', 'Sex', 'Age', 'Cabin', 'Embarked', target]
# Take an independent copy so later edits never touch original_data.
data = original_data[kept_columns].copy()

Some algorithms can't handle string values or categorical columns, so these columns need to be converted to numbers first.
## Data Preprocessing

In [4]:
# Turn Cabin into a numeric 0/1 flag: 1 when a cabin is recorded, 0 when it is missing.
# The flag is built in one step from the untouched original_data so that:
#   * the column gets a real integer dtype (assigning 1/0 value-by-value into the
#     string column can leave it as object dtype, which a numeric-only column
#     selection later on would skip), and
#   * the cell can be re-run without turning every row into 1.
data['Cabin'] = original_data['Cabin'].notnull().astype(int)

In [5]:
# One-hot encode Sex and Embarked into separate indicator columns.
# dtype=int forces 0/1 integers: newer pandas versions otherwise return boolean
# columns, which are not counted as numeric when columns are selected by dtype.
# Rows with a missing Embarked value get 0 in all three Embarked_* columns.
new_embarked = pd.get_dummies(data['Embarked'], prefix='Embarked', dtype=int)
new_sex = pd.get_dummies(data['Sex'], dtype=int)
data = pd.concat([data, new_embarked, new_sex], axis=1)

In [6]:
# Preview the first five rows after encoding.
data.head(5)

Unnamed: 0,Pclass,Sex,Age,Cabin,Embarked,Survived,Embarked_C,Embarked_Q,Embarked_S,female,male
0,3,male,22.0,0,S,0,0,0,1,0,1
1,1,female,38.0,1,C,1,1,0,0,1,0
2,3,female,26.0,0,S,1,0,0,1,1,0
3,1,female,35.0,1,S,1,0,0,1,1,0
4,3,male,35.0,0,S,0,0,0,1,0,1


Check if we have any NaN values

In [7]:
# Number of missing values in each column.
data.isna().sum()

Pclass          0
Sex             0
Age           177
Cabin           0
Embarked        2
Survived        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
female          0
male            0
dtype: int64

We can either remove the rows with missing values from the training data or replace the missing values with a sensible estimate.

In [8]:
# Summary statistics of the numeric columns, to help pick a replacement value for Age.
data.describe()

Unnamed: 0,Pclass,Age,Cabin,Survived,Embarked_C,Embarked_Q,Embarked_S,female,male
count,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.228956,0.383838,0.188552,0.08642,0.722783,0.352413,0.647587
std,0.836071,14.526497,0.420397,0.486592,0.391372,0.281141,0.447876,0.47799,0.47799
min,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,3.0,38.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
max,3.0,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Median of the known ages (missing values are skipped).
data['Age'].median()

28.0

In [10]:
# Fill in the missing ages with the median of the known ages.
median_age = data['Age'].median()
data['Age'] = data['Age'].fillna(median_age)

How does it look now?

In [11]:
# Re-count the missing values per column after filling in Age.
data.isna().sum()

Pclass        0
Sex           0
Age           0
Cabin         0
Embarked      2
Survived      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
female        0
male          0
dtype: int64

In [12]:
# Summary statistics again, to see how filling in Age changed its distribution.
data.describe()

Unnamed: 0,Pclass,Age,Cabin,Survived,Embarked_C,Embarked_Q,Embarked_S,female,male
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,29.361582,0.228956,0.383838,0.188552,0.08642,0.722783,0.352413,0.647587
std,0.836071,13.019697,0.420397,0.486592,0.391372,0.281141,0.447876,0.47799,0.47799
min,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,3.0,35.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
max,3.0,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Train a K Nearest Neighbour model

In [13]:
# Some imports for this section
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

Split the data into training and test sets. Note: you should always keep a third, held-out dataset for one final evaluation, to reduce bias in your reported score.

In [15]:
# Build the feature matrix X and the label vector y.
# The feature columns are listed explicitly instead of being picked by dtype:
# a dtype-based selection silently drops boolean indicator columns and any
# object-dtype 0/1 column, so the feature set would depend on the pandas version.
feature_columns = [
    'Pclass', 'Age', 'Cabin',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
    'female', 'male',
]
# Cast to float so every feature has one numeric dtype, whatever dtype the
# encoding steps produced.
X = data[feature_columns].astype(float)
y = data[target]
print(X.head())
print(y.head())

   Pclass   Age  Cabin  Embarked_C  Embarked_Q  Embarked_S  female  male
0       3  22.0      0           0           0           1       0     1
1       1  38.0      1           1           0           0       1     0
2       3  26.0      0           0           0           1       1     0
3       1  35.0      1           0           0           1       1     0
4       3  35.0      0           0           0           1       0     1
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [16]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
test_fraction = 0.20
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_fraction, random_state=split_seed
)

In [17]:
# Fit a 5-nearest-neighbour classifier on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

Make predictions on the test set and evaluate the model

In [18]:
# Predict labels for the held-out rows and print per-class precision, recall and F1.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       105
           1       0.82      0.74      0.78        74

   micro avg       0.83      0.83      0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179

