# Exercise 1

Take the titanic dataset and using all attributes to predict the class `Survived` (convert age and fare into classes ; exclude names from the attribute list)

In [1]:
import pandas as pd

pd.set_option('display.max_colwidth', None)
titanic = pd.read_csv('../Data/titanic.csv.zst', index_col='Name')

titanic.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,887.0,887.0,887,887.0,887.0,887.0,887.0
unique,,,2,,,,
top,,,male,,,,
freq,,,573,,,,
mean,0.385569,2.305524,,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,,0.42,0.0,0.0,0.0
25%,0.0,2.0,,20.25,0.0,0.0,7.925
50%,0.0,3.0,,28.0,0.0,0.0,14.4542
75%,1.0,3.0,,38.0,1.0,0.0,31.1375


First we create classes and label them.

In [2]:
from sklearn import preprocessing

titanic['Age Group'] = pd.qcut(x=titanic['Age'], q=4)
titanic['Fare Group'] = pd.qcut(x=titanic['Fare'], q=4)

for col in ['Sex', 'Age Group', 'Fare Group']:
    le = preprocessing.LabelEncoder()
    titanic[col] = le.fit_transform(titanic[col])

titanic.head(5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Age Group,Fare Group
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Mr. Owen Harris Braund,0,3,1,22.0,1,0,7.25,1,0
Mrs. John Bradley (Florence Briggs Thayer) Cumings,1,1,0,38.0,1,0,71.2833,2,3
Miss. Laina Heikkinen,1,3,0,26.0,0,0,7.925,1,0
Mrs. Jacques Heath (Lily May Peel) Futrelle,1,1,0,35.0,1,0,53.1,2,3
Mr. William Henry Allen,0,3,1,35.0,0,0,8.05,2,1


Some preliminary definitions to use later.

In [3]:
all_features = ['Pclass', 'Sex', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Age Group', 'Fare Group']
max_depth = len(all_features) + 1
X = titanic[all_features]
y = titanic['Survived']

In [4]:
## (a) Choose Three classifiers and evaluate their performance using all attributes

In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

models = {
    'Decision Tree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier(metric='canberra'), # Note: We use the `canberra` metric, as it has proven to be the best one in the past exercises.
    'Naïve Bayes': GaussianNB(),
}

In [6]:
def create_perf_db() -> pd.DataFrame:
    return pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']).set_index('Model')

In [7]:
from sklearn.model_selection import train_test_split
from util import *

perf = create_perf_db()
for name, model in models.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=224)
    a, p, r, f = evaluate(model, X_train, X_test, y_train, y_test)

    row = pd.Series({
        'Accuracy': a,
        'Precision': p,
        'Recall': r,
        'F1-Score': f,
    }, name=model)
    perf = perf.append(row)
    # Frickin pd.concat isn't worth a penny -.- Tried so many different things to make it work, but nah...

perf

  perf = perf.append(row)
  perf = perf.append(row)
  perf = perf.append(row)


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DecisionTreeClassifier(),0.855856,0.784091,0.841463,0.811765
KNeighborsClassifier(metric='canberra'),0.869369,0.818182,0.847059,0.83237
GaussianNB(),0.81982,0.875,0.726415,0.793814


## (b) Define a feature selection method and use it on all the classifiers ;

In [8]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, f_regression, chi2, mutual_info_classif, mutual_info_regression

X_train, _, y_train, _ = train_test_split(X, y, test_size=0.25, random_state=224)

feat_scores = []
for score_func in [f_classif, f_regression, chi2, mutual_info_classif, mutual_info_regression]:
    best_features = SelectKBest(score_func=score_func, k=len(all_features))
    fit = best_features.fit(X_train, y_train)

    df_scores = pd.DataFrame(fit.scores_)
    df_columns = pd.DataFrame(X_train.columns)

    feature_scores = pd.concat([df_columns, df_scores], axis=1)
    feature_scores.columns = ['Feature_Name', 'Score']
    feature_scores.set_index('Feature_Name', inplace=True)
    feature_scores.sort_values(by='Score', ascending=False, inplace=True)

    print(f'{score_func.__name__}:')
    display(feature_scores)

f_classif:


Unnamed: 0_level_0,Score
Feature_Name,Unnamed: 1_level_1
Sex,238.754111
Pclass,64.458204
Fare Group,48.799911
Parents/Children Aboard,4.773355
Siblings/Spouses Aboard,1.164731
Age Group,0.593326


f_regression:


Unnamed: 0_level_0,Score
Feature_Name,Unnamed: 1_level_1
Sex,238.754111
Pclass,64.458204
Fare Group,48.799911
Parents/Children Aboard,4.773355
Siblings/Spouses Aboard,1.164731
Age Group,0.593326


chi2:


Unnamed: 0_level_0,Score
Feature_Name,Unnamed: 1_level_1
Sex,60.366719
Fare Group,40.161871
Pclass,17.600476
Parents/Children Aboard,8.187396
Siblings/Spouses Aboard,2.82839
Age Group,0.503002


mutual_info_classif:


Unnamed: 0_level_0,Score
Feature_Name,Unnamed: 1_level_1
Sex,0.13877
Pclass,0.036356
Fare Group,0.014454
Siblings/Spouses Aboard,0.01372
Parents/Children Aboard,0.002635
Age Group,0.0


mutual_info_regression:


Unnamed: 0_level_0,Score
Feature_Name,Unnamed: 1_level_1
Sex,0.149948
Parents/Children Aboard,0.072817
Pclass,0.044094
Fare Group,0.038076
Siblings/Spouses Aboard,0.0
Age Group,0.0


I opt for the top three features `['Sex', 'Pclass', 'Fare Group']`.

In [9]:
top_features = ['Sex', 'Pclass', 'Fare Group']

perf = create_perf_db()
for name, model in models.items():
    X_train, X_test, y_train, y_test = train_test_split(X[top_features], y, test_size=0.25, random_state=224)
    a, p, r, f = evaluate(model, X_train, X_test, y_train, y_test)

    row = pd.Series({
        'Accuracy': a,
        'Precision': p,
        'Recall': r,
        'F1-Score': f,
    }, name=model)
    perf = perf.append(row)
    # Frickin pd.concat isn't worth a penny -.- Tried so many different things to make it work, but nah...

perf

  perf = perf.append(row)
  perf = perf.append(row)
  perf = perf.append(row)


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DecisionTreeClassifier(),0.824324,0.727273,0.810127,0.766467
KNeighborsClassifier(metric='canberra'),0.792793,0.761364,0.728261,0.744444
GaussianNB(),0.797297,0.818182,0.712871,0.761905


(c) Compare the classifiers and explain the differences observed;