In [236]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt


df = pd.read_csv('titanic.csv')
df = df[['Age', 'SibSp', 'Pclass', 'Parch', 'Sex', 'Survived']]

df = df.dropna()

In [237]:
df.head()

Unnamed: 0,Age,SibSp,Pclass,Parch,Sex,Survived
0,22.0,1,3,0,male,0
1,38.0,1,1,0,female,1
2,26.0,0,3,0,female,1
3,35.0,1,1,0,female,1
4,35.0,0,3,0,male,0


In [238]:
dummies = pd.get_dummies(df['Sex'])
dummies.head(1)

df = pd.concat([df, dummies], axis=1)


df.head()

Unnamed: 0,Age,SibSp,Pclass,Parch,Sex,Survived,female,male
0,22.0,1,3,0,male,0,0,1
1,38.0,1,1,0,female,1,1,0
2,26.0,0,3,0,female,1,1,0
3,35.0,1,1,0,female,1,1,0
4,35.0,0,3,0,male,0,0,1


In [239]:

from sklearn.preprocessing import normalize #get the function needed to normalize our data.

X = df[['Age', 'SibSp', 'Pclass', 'Parch', 'male']] #create the X matrix
X = normalize(X) #normalize the matrix to put everything on the same scale
y = df['Survived'] #create the y-variable


df['Survived'].value_counts()

0    424
1    290
Name: Survived, dtype: int64

In [240]:
from sklearn.model_selection import train_test_split #We need this to split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) #split the data, store it into different variables

In [241]:
from sklearn.neighbors import KNeighborsClassifier #the object class we need

knn = KNeighborsClassifier(n_neighbors=3) #create a KNN-classifier with 5 neighbors (default)
knn = knn.fit(X_train, y_train) #this fits the k-nearest neigbor model with the train data
accuracy = knn.score(X_test, y_test) #calculate the fit on the test data

In [242]:
from sklearn.metrics import confusion_matrix
y_test_pred = knn.predict(X_test) #the predicted values
cm = confusion_matrix(y_test, y_test_pred) #creates a "confusion matrix"
cm

array([[102,  32],
       [ 21,  60]], dtype=int64)

In [243]:

#In order to read it easily , let's make a dataframe out of it, and add labels to it.
conf_matrix = pd.DataFrame(cm, index=['Not survive', 'Survived'], columns = ['Not survived P', 'Survived P']) 
conf_matrix

Unnamed: 0,Not survived P,Survived P
Not survive,102,32
Survived,21,60


In [244]:
y_test.value_counts()

0    134
1     81
Name: Survived, dtype: int64

In [245]:
recall  = conf_matrix.iloc[1,1] / (conf_matrix.iloc[1,1]+ conf_matrix.iloc[0,1])
precision = conf_matrix.iloc[1,1] / (conf_matrix.iloc[1,1] + conf_matrix.iloc[1,0])

In [246]:
print(f'The accuracy is {accuracy}')

The accuracy is 0.7534883720930232


In [247]:
print(f'The recall is {recall}')

The recall is 0.6521739130434783


In [248]:
print(f'The precision is {precision}')

The precision is 0.7407407407407407
