# Exercise 1

Take the Titanic dataset and use all attributes to predict the class `Survived` (convert age and fare into classes; exclude names from the attribute list).
Build a boosting ensemble model with:

1. Adaboost
2. Gradientboost
3. XGB

In [1]:
import pandas as pd

# Lift the column-width cap so long cell values are rendered in full.
pd.set_option('display.max_colwidth', None)

# Passenger names become the index, which keeps them out of the feature columns.
# Age and Fare are each cut into four quantile-based bins (q=4).
titanic = pd.read_csv('../Data/titanic.csv.zst', index_col='Name').assign(**{
    'Age Group': lambda frame: pd.qcut(x=frame['Age'], q=4),
    'Fare Group': lambda frame: pd.qcut(x=frame['Fare'], q=4),
})

titanic.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Age Group,Fare Group
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Mr. Owen Harris Braund,0,3,male,22.0,1,0,7.25,"(20.25, 28.0]","(-0.001, 7.925]"
Mrs. John Bradley (Florence Briggs Thayer) Cumings,1,1,female,38.0,1,0,71.2833,"(28.0, 38.0]","(31.138, 512.329]"
Miss. Laina Heikkinen,1,3,female,26.0,0,0,7.925,"(20.25, 28.0]","(-0.001, 7.925]"
Mrs. Jacques Heath (Lily May Peel) Futrelle,1,1,female,35.0,1,0,53.1,"(28.0, 38.0]","(31.138, 512.329]"
Mr. William Henry Allen,0,3,male,35.0,0,0,8.05,"(28.0, 38.0]","(7.925, 14.454]"


In [2]:
# include='all' summarises the non-numeric columns (Sex and the two binned
# groups) alongside the numeric ones, instead of the numeric-only default.
titanic.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Age Group,Fare Group
count,887.0,887.0,887,887.0,887.0,887.0,887.0,887,887
unique,,,2,,,,,4,4
top,,,male,,,,,"(20.25, 28.0]","(-0.001, 7.925]"
freq,,,573,,,,,243,238
mean,0.385569,2.305524,,29.471443,0.525366,0.383315,32.30542,,
std,0.487004,0.836662,,14.121908,1.104669,0.807466,49.78204,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,
25%,0.0,2.0,,20.25,0.0,0.0,7.925,,
50%,0.0,3.0,,28.0,0.0,0.0,14.4542,,
75%,1.0,3.0,,38.0,1.0,0.0,31.1375,,


We encode `Sex` and the age and fare groups as integer class indices that the models can learn from.

In [3]:
from sklearn import preprocessing

# Every column gets its own freshly fitted LabelEncoder, so the integer codes
# of one column are independent of the codes of the others.
titanic = titanic.assign(**{
    column_name: preprocessing.LabelEncoder().fit_transform(titanic[column_name])
    for column_name in ('Sex', 'Age Group', 'Fare Group')
})

titanic.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Age Group,Fare Group
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Mr. Owen Harris Braund,0,3,1,22.0,1,0,7.25,1,0
Mrs. John Bradley (Florence Briggs Thayer) Cumings,1,1,0,38.0,1,0,71.2833,2,3
Miss. Laina Heikkinen,1,3,0,26.0,0,0,7.925,1,0
Mrs. Jacques Heath (Lily May Peel) Futrelle,1,1,0,35.0,1,0,53.1,2,3
Mr. William Henry Allen,0,3,1,35.0,0,0,8.05,2,1


Some preliminary definitions to use later.

In [4]:
# Columns used as model inputs. The raw 'Age' and 'Fare' columns are left out
# in favour of their binned 'Age Group' / 'Fare Group' counterparts, and
# 'Survived' is absent because it is the prediction target.
all_features = ['Pclass', 'Sex', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Age Group', 'Fare Group']

In [5]:
from typing import Any
import numpy as np
from util import kfold_eval


def boosting_performance(
        model: Any,
        k: int = 5,
        average: Any = 'binary',
        zero_division: Any = 'warn',
) -> 'tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]':
    """
    Performs a k-fold cross-validation of the given model on the Titanic data.

    The feature matrix is the ``all_features`` columns of the module-level
    ``titanic`` frame and the target is its ``Survived`` column. The actual
    evaluation is delegated to ``util.kfold_eval``.

    :param model: The classifier to evaluate (e.g. AdaBoost, GradientBoost, XGB)
    :param k: how many folds to perform
    :param average: default='binary': forwarded unchanged to ``kfold_eval``.
        This parameter is required for multiclass/multilabel targets.
    :param zero_division: default='warn': forwarded unchanged to ``kfold_eval``

    :return: accuracy, precision, recall, f1 -- exactly what ``kfold_eval``
        returns (the notebook treats each as one score per fold)
    """
    # Both inputs are read from notebook-level state, so the preprocessing
    # cells must have run before this function is called.
    X = titanic[all_features]
    y = titanic['Survived']

    return kfold_eval(model=model, X=X, y=y, k=k, average=average, zero_division=zero_division)

## Compare the performance of the models

In [6]:
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier


# Fixed seed so every estimator is constructed identically on each
# Restart & Run All. How the folds are drawn is decided inside kfold_eval
# and is not controlled from this cell.
RANDOM_STATE = 42

k_folds = 5
models = {
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'GradientBoost': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'XGB': XGBClassifier(random_state=RANDOM_STATE),
}

# Per-model mean of every metric, collected for the final comparison table.
mean_scores = {}

for name, model in models.items():
    print(f'{name}:')
    a, p, r, f = boosting_performance(model, k=k_folds)
    data = {
        'Fold': range(1, k_folds + 1),
        'Accuracy': a,
        'Precision': p,
        'Recall': r,
        'F1-Score': f,
    }

    scores = pd.DataFrame(data).set_index('Fold')
    display(scores)
    mean_scores[name] = scores.mean()

# One row per model, one column per metric: the actual side-by-side comparison.
print('Mean over folds:')
display(pd.DataFrame(mean_scores).T.rename_axis('Model'))

AdaBoost:


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.769663,0.694915,0.640625,0.666667
2,0.797753,0.734177,0.794521,0.763158
3,0.813559,0.785714,0.753425,0.769231
4,0.779661,0.591549,0.807692,0.682927
5,0.80791,0.714286,0.737705,0.725806


GradientBoost:


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.808989,0.728814,0.704918,0.716667
2,0.775281,0.670886,0.791045,0.726027
3,0.819209,0.771429,0.771429,0.771429
4,0.836158,0.690141,0.875,0.771654
5,0.836158,0.761905,0.774194,0.768


XGB:


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.797753,0.694915,0.694915,0.694915
2,0.780899,0.683544,0.794118,0.734694
3,0.841808,0.785714,0.808824,0.797101
4,0.819209,0.647887,0.867925,0.741935
5,0.864407,0.809524,0.809524,0.809524
