# Exercise 1

Take the `titanic` dataset and use all attributes to predict the class `Survived` with a k-nearest neighbours classifier. Which distance measure do you think is the best, and why?

In [1]:
import pandas as pd

# Do not truncate long cell contents (the passenger names used as index are long).
pd.set_option('display.max_colwidth', None)

# Load the dataset, using the passenger name as the row index.
titanic = pd.read_csv('../Data/titanic.csv.zst', index_col='Name')

# Encode `Sex` numerically so it can take part in distance computations:
# 1 for 'male', 0 for any other value.
titanic['Sex'] = titanic['Sex'].eq('male').astype(int)

titanic.head(5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Mr. Owen Harris Braund,0,3,1,22.0,1,0,7.25
Mrs. John Bradley (Florence Briggs Thayer) Cumings,1,1,0,38.0,1,0,71.2833
Miss. Laina Heikkinen,1,3,0,26.0,0,0,7.925
Mrs. Jacques Heath (Lily May Peel) Futrelle,1,1,0,35.0,1,0,53.1
Mr. William Henry Allen,0,3,1,35.0,0,0,8.05


In [2]:
# Summary statistics for every column, to compare the value ranges of the features.
titanic.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,887.0,887.0,887.0,887.0,887.0,887.0,887.0
mean,0.385569,2.305524,0.645998,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,0.47848,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,20.25,0.0,0.0,7.925
50%,0.0,3.0,1.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,1.0,38.0,1.0,0.0,31.1375
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


Some preliminary definitions to use later.

In [3]:
# Distance measures compared first.
metrics = ['manhattan', 'euclidean', 'cosine']

# Further distance measures, evaluated separately.
# NOTE: 'minkowski' relies on the classifier's default power parameter, which in
# scikit-learn is p=2, so it should behave like 'euclidean'.
other_metrics = [
    'braycurtis',
    'canberra',
    'chebyshev',
    'correlation',
    'minkowski',
    'sqeuclidean',
    'hamming',
]

# Every column except the target `Survived` is used as a predictor.
all_features = titanic.columns[titanic.columns != 'Survived']

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def knn_accuracy(
    metric: str,
    n_neighbors: int = 5,
    test_size: float = 0.25,
    random_state: int = 224,
) -> float:
    """Fit a k-nearest-neighbours classifier on `titanic` and return its test accuracy.

    The predictors are the columns listed in `all_features` and the target is
    `Survived`. The features are passed to the classifier as-is; no scaling is
    applied in this function.

    Parameters
    ----------
    metric : str
        Name of the distance metric handed to `KNeighborsClassifier`.
    n_neighbors : int, default 5
        Number of neighbours used for voting (5 is scikit-learn's own default,
        so omitting it reproduces the original behaviour).
    test_size : float, default 0.25
        Fraction of the rows held out for evaluation.
    random_state : int, default 224
        Seed of the train/test split, fixed so every metric is evaluated on the
        same partition.

    Returns
    -------
    float
        Accuracy of the fitted classifier on the held-out rows.
    """
    X = titanic[all_features]
    y = titanic['Survived']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [5]:
def compute_accuracies(metrics: list[str]) -> pd.DataFrame:
    """Evaluate `knn_accuracy` for each metric and tabulate the results.

    Parameters
    ----------
    metrics : list[str]
        Distance metric names, each passed to `knn_accuracy`.

    Returns
    -------
    pd.DataFrame
        One row per metric with columns `Metric` and `Accuracy`, ordered from
        the highest accuracy to the lowest. The original positional index of
        each metric is kept.
    """
    rows = [[name, knn_accuracy(metric=name)] for name in metrics]
    table = pd.DataFrame(rows, columns=['Metric', 'Accuracy'])
    return table.sort_values(by='Accuracy', ascending=False)

In [6]:
# Accuracy of the three baseline distance measures, best first.
compute_accuracies(metrics)

Unnamed: 0,Metric,Accuracy
2,cosine,0.788288
0,manhattan,0.77027
1,euclidean,0.738739


In [7]:
# Accuracy of the additional distance measures, best first.
compute_accuracies(other_metrics)

Unnamed: 0,Metric,Accuracy
1,canberra,0.873874
6,hamming,0.792793
0,braycurtis,0.77027
3,correlation,0.752252
2,chebyshev,0.743243
4,minkowski,0.738739
5,sqeuclidean,0.738739
