# Exercise 1

Take the Titanic dataset and use all attributes to predict the class `Survived` (convert age and fare into classes; exclude names from the attribute list).

In [1]:
import pandas as pd

# Never truncate cell text, so the long passenger names stay fully readable.
pd.options.display.max_colwidth = None

# `Name` becomes the row index, which keeps it out of the data columns.
titanic = pd.read_csv(
    '../Data/titanic.csv.zst',
    index_col='Name',
)

# Summary statistics for every column, numeric and non-numeric alike.
titanic.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,887.0,887.0,887,887.0,887.0,887.0,887.0
unique,,,2,,,,
top,,,male,,,,
freq,,,573,,,,
mean,0.385569,2.305524,,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,,0.42,0.0,0.0,0.0
25%,0.0,2.0,,20.25,0.0,0.0,7.925
50%,0.0,3.0,,28.0,0.0,0.0,14.4542
75%,1.0,3.0,,38.0,1.0,0.0,31.1375


First we create classes and label them.

In [2]:
from sklearn import preprocessing

# Number of quantile bins used to discretise the continuous columns (4 = quartiles).
N_BINS = 4

# `labels=False` makes qcut return the ordinal bin number (0 = lowest quantile)
# directly, so the bin order is explicit and there is no need to build Interval
# labels first and then re-encode them.
titanic['Age Group'] = pd.qcut(x=titanic['Age'], q=N_BINS, labels=False)
titanic['Fare Group'] = pd.qcut(x=titanic['Fare'], q=N_BINS, labels=False)

# `Sex` holds string labels; LabelEncoder maps them to integer codes
# in sorted label order.
titanic['Sex'] = preprocessing.LabelEncoder().fit_transform(titanic['Sex'])

titanic.head(5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Age Group,Fare Group
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Mr. Owen Harris Braund,0,3,1,22.0,1,0,7.25,1,0
Mrs. John Bradley (Florence Briggs Thayer) Cumings,1,1,0,38.0,1,0,71.2833,2,3
Miss. Laina Heikkinen,1,3,0,26.0,0,0,7.925,1,0
Mrs. Jacques Heath (Lily May Peel) Futrelle,1,1,0,35.0,1,0,53.1,2,3
Mr. William Henry Allen,0,3,1,35.0,0,0,8.05,2,1


Some preliminary definitions to use later.

In [3]:
# Columns fed to every classifier: the original categorical/count attributes
# plus the binned replacements for the continuous `Age` and `Fare` columns.
all_features = [
    'Pclass',
    'Sex',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
    'Age Group',
    'Fare Group',
]

# One level more than the number of features.
max_depth = 1 + len(all_features)

## (a) Choose three classifiers and evaluate their performance using all attributes

In [4]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Fixed seed: a decision tree permutes the candidate features at every split,
# so without it two runs on the same data can produce different trees and scores.
RANDOM_STATE = 42

# Display name -> unfitted estimator; the evaluation cells iterate over this mapping.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    # Note: We use the `canberra` metric, as it has proven to be the best one in the past exercises.
    'KNN': KNeighborsClassifier(metric='canberra'),
    'Naïve Bayes': GaussianNB(),
}

In [5]:
from util import *

# Number of cross-validation folds; also determines the row labels of each table.
n_folds = 5

for model in models.values():
    # The estimator's repr doubles as the table heading.
    print(f'{model}:')

    accuracy, precision, recall, f1 = kfold_eval(
        model, titanic[all_features], titanic['Survived'], n_folds
    )

    # One row per fold, one column per metric.
    per_fold = {
        'Fold': range(1, n_folds + 1),
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
    }
    scores = pd.DataFrame(per_fold).set_index('Fold')
    display(scores)

DecisionTreeClassifier():


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.786517,0.627119,0.698113,0.660714
2,0.786517,0.683544,0.80597,0.739726
3,0.830508,0.742857,0.8125,0.776119
4,0.813559,0.633803,0.865385,0.731707
5,0.847458,0.746032,0.810345,0.77686


KNeighborsClassifier(metric='canberra'):


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.735955,0.779661,0.575,0.661871
2,0.820225,0.746835,0.830986,0.786667
3,0.824859,0.785714,0.774648,0.780142
4,0.824859,0.690141,0.844828,0.75969
5,0.870056,0.825397,0.8125,0.818898


GaussianNB():


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.719101,0.762712,0.555556,0.642857
2,0.752809,0.759494,0.705882,0.731707
3,0.785311,0.857143,0.681818,0.759494
4,0.79096,0.774648,0.723684,0.748299
5,0.813559,0.777778,0.720588,0.748092


## (b) Define a feature selection method and use it on all the classifiers;
We will be using PCA for this task.

In [6]:
from util import pca_eval

# Arguments forwarded to `pca_eval`.
n_folds = 5
n_components = 4  # presumably the number of principal components kept -- confirm against util.pca_eval

for name, model in models.items():
    a, p, r, f1 = pca_eval(model, titanic[all_features], titanic['Survived'], k=n_folds, c=n_components)

    print(f'{name}:')
    metrics = {
        'Accuracy': a,
        'Precision': p,
        'Recall': r,
        'F1-Score': f1,
    }

    # Wrap each metric in a Series so that a scalar becomes a single row instead of
    # being broadcast against a hard-coded fold column; with a fixed `range(1, 6)`
    # one overall score would silently show up as five identical "folds".
    scores = pd.DataFrame({
        label: pd.Series(values).reset_index(drop=True)
        for label, values in metrics.items()
    })
    # Number the rows from what was actually returned, not from the requested fold count.
    scores.index = pd.RangeIndex(start=1, stop=len(scores) + 1, name='Fold')

    if len(scores) != n_folds:
        # Make the mismatch visible instead of hiding it in the table.
        print(f'  note: pca_eval returned {len(scores)} score(s) per metric for {n_folds} requested folds')

    display(scores)

Decision Tree:


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.79096,0.634921,0.740741,0.683761
2,0.79096,0.634921,0.740741,0.683761
3,0.79096,0.634921,0.740741,0.683761
4,0.79096,0.634921,0.740741,0.683761
5,0.79096,0.634921,0.740741,0.683761


KNN:


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.824859,0.761905,0.75,0.755906
2,0.824859,0.761905,0.75,0.755906
3,0.824859,0.761905,0.75,0.755906
4,0.824859,0.761905,0.75,0.755906
5,0.824859,0.761905,0.75,0.755906


Naïve Bayes:


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.745763,0.555556,0.673077,0.608696
2,0.745763,0.555556,0.673077,0.608696
3,0.745763,0.555556,0.673077,0.608696
4,0.745763,0.555556,0.673077,0.608696
5,0.745763,0.555556,0.673077,0.608696


## (c) Compare the classifiers and explain the differences observed;

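PCA is a dimensionality-reduction technique that compresses all features into just a few principal components (strictly speaking, it is feature extraction rather than feature selection, since the components are combinations of the original attributes).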

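The performance seems to be consistently lower with PCA, though.
Further checking and changing of the code did not shed light on why.
Note that every fold row in the PCA tables above is identical, which suggests that `pca_eval` reports a single score rather than one score per fold, so these tables are not directly comparable with the per-fold results from (a).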