In [35]:
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import confusion_matrix

In [36]:
titanic = pd.read_csv("titanic.csv")

In [37]:
titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,


The easiest five variables to work with are Pclass, Sex, Age, Cabin, SibSp and Parch, trying to predict Survived.

In [38]:
titanice = titanic[['Sex', 'Age', 'SibSp','Parch','Pclass','Survived']].dropna()
titanice.head(3)

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass,Survived
0,male,22.0,1,0,3,0
1,female,38.0,1,0,1,1
2,female,26.0,0,0,3,1


We create dummy variables for male/female.

In [39]:
dummies = pd.get_dummies(titanice['Sex'])
dummies.rename(columns={'female': 'Sex_Dummies'}, inplace=True)
titanice["gender_dummy"] = dummies["Sex_Dummies"]

In [40]:
titanice.head(3)

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass,Survived,gender_dummy
0,male,22.0,1,0,3,0,0
1,female,38.0,1,0,1,1,1
2,female,26.0,0,0,3,1,1


ok, lets normalize the dataset and split up our dataset.

In [44]:
X = titanice[["Age","SibSp","Parch","Pclass","gender_dummy"]]
X = normalize(X) 
y = titanice['Survived'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 

now we calculate the accuracy of our model:

In [45]:
knn = KNeighborsClassifier(n_neighbors=3)
knn = knn.fit(X_train, y_train) #this fits the k-nearest neigbor model with the train data
knn.score(X_test, y_test) #calculate the fit on the test data

0.8

what values are predicted based on our model?

In [46]:
y_test_pred = knn.predict(X_test) #the predicted values
cm = confusion_matrix(y_test, y_test_pred) #creates a "confusion matrix"
cm

array([[111,  23],
       [ 20,  61]], dtype=int64)

1 is survived, 0 is not survived
What's actually the case (what happened) is ALWAYS IN THE ROWS. What's predicted is ALWAYS IN THE COLUMNS.

See below.

In [47]:
df_cm = pd.DataFrame(cm, index=['did_not_survive', 'survived'], columns = ['did_not_survive_predicted', 'survived_predicted'])
df_cm

Unnamed: 0,did_not_survive_predicted,survived_predicted
did_not_survive,111,23
survived,20,61


In [48]:
precision = df_cm.iloc[0,1]/(df_cm.iloc[0,1] + df_cm.iloc[0,0]) #how many of the survivors are actually survivors

In [49]:
recall = df_cm.iloc[0,1]/(df_cm.iloc[0,1] + df_cm.iloc[1,1]) # how many of the survivors did it catch

In [50]:
print(f"precision: {precision}")
print(f"recall: {recall}")

precision: 0.17164179104477612
recall: 0.27380952380952384
