# Ensembles

sklearn allows you to easily create ensembles of multiple models.

We will just be looking at 'voting' classifiers today. These allow you to combine 'conceptually different machine learning classifiers', so they are very flexible.

You may also want to look into stacking models if this is an area you are interested in exploring further. 

The main new import is `VotingClassifier` from the `sklearn.ensemble` module.

In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, precision_score, \
                            recall_score, roc_auc_score

In [78]:
# Load the processed data from the local cache, downloading it on first use.
try:
    data = pd.read_csv("data/processed_data.csv")

except FileNotFoundError:
    # No local copy yet: download the processed data instead
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '1804_python_healthcare/master/titanic/data/processed_data.csv'

    data = pd.read_csv(address)

    # Create the data subfolder; exist_ok avoids a separate existence check
    # (and the failure if the folder appears between the check and the create)
    import os
    data_directory = './data/'
    os.makedirs(data_directory, exist_ok=True)

    # Save data so the next run reads the local copy
    data.to_csv(data_directory + 'processed_data.csv', index=False)

# Cast every column to float
data = data.astype(float)

# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop passenger ID as it is not original data
data = data.drop('PassengerId', axis=1)

X = data.drop('Survived', axis=1)  # X = all 'data' except the 'survived' column
y = data['Survived']  # y = 'survived' column from 'data'

feature_names = X.columns.tolist()

# Two-stage split: hold out 20% as the test set, then hold out 20% of the
# remainder as the validation set. Both splits use the same fixed seed.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)

print(f"Training Dataset Samples: {len(X_train)}")
print(f"Validation Dataset Samples: {len(X_validate)}")
print(f"Testing Dataset Samples: {len(X_test)}")

Training Dataset Samples: 569
Validation Dataset Samples: 143
Testing Dataset Samples: 179


Let's first set up a function to allow us to quickly pull back results for different machine learning models.

In [79]:
def fit_train(name="XGBoost",
              X_train=X_train, X_validate=X_validate,
              y_train=y_train, y_validate=y_validate,
              model=None
              ):
    """Fit a classifier and return a one-row summary of its performance.

    The model is fitted on the training data, then used to predict both the
    training and validation sets.

    Note: the default data arguments are bound when this function is defined,
    so re-creating the train/validation split afterwards has no effect unless
    the new data is passed in explicitly (or this cell is re-run).

    Args:
        name: Label used as the index of the returned row.
        X_train: Training features.
        X_validate: Validation features.
        y_train: Training labels.
        y_validate: Validation labels. The confusion matrix is built with
            labels [0, 1], so labels are assumed to be binary 0/1.
        model: Any estimator exposing ``fit`` and ``predict``. If None, a new
            ``XGBClassifier(random_state=42)`` is created for this call, so
            defaulted calls never share (and refit) a single instance.

    Returns:
        A single-row DataFrame indexed by ``name``, rounded to 3 decimal
        places, with training/validation accuracy, macro-averaged precision,
        recall and f1 on the validation set, AUC, and the validation
        false-positive and false-negative counts.
    """
    if model is None:
        model = XGBClassifier(random_state=42)

    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_validate)

    # labels=[0, 1] pins the matrix to 2x2, so ravel() always yields
    # tn, fp, fn, tp; only the two error counts are reported.
    _, fp, fn, _ = confusion_matrix(y_validate, y_pred_val, labels=[0, 1]).ravel()

    return pd.DataFrame({
        'Accuracy (training)': np.mean(y_pred_train == y_train),
        'Accuracy (validation)': np.mean(y_pred_val == y_validate),
        'Precision (validation)': precision_score(y_validate, y_pred_val, average='macro'),
        'Recall (validation)': recall_score(y_validate, y_pred_val, average='macro'),
        # NOTE: AUC is computed from the hard class predictions rather than
        # predicted probabilities, so every model (including hard-voting
        # ensembles) is scored the same way.
        "AUC": roc_auc_score(y_validate, y_pred_val),
        "f1": f1_score(y_validate, y_pred_val, average='macro'),
        "FP": fp,
        "FN": fn
    }, index=[name]).round(3)

Let's start by training a single XGBoost model to get an idea of baseline performance on this dataset.

In [80]:
# Baseline model: XGBoost with a fixed seed
clf1 = XGBClassifier(random_state=42)

# Start the results table with the baseline row
results_df = fit_train(name="XGBoost", model=clf1)
results_df

Unnamed: 0,Accuracy (training),Accuracy (validation),Precision (validation),Recall (validation),AUC,f1,FP,FN
XGBoost,0.979,0.797,0.788,0.786,0.786,0.787,14,15


Let's also train a decision tree for comparison.

In [81]:
# Comparison model: a decision tree limited to depth 6, with a fixed seed
clf2 = DecisionTreeClassifier(max_depth=6, random_state=42)

# Append the decision tree's row to the running results table
results_df = pd.concat([
    results_df,
    fit_train(name="Decision Tree", model=clf2),
])


## The ensemble

First, let's try creating an ensemble of these two models.

In [82]:
# Estimator labels must match the models they name: clf1 is the XGBoost
# classifier and clf2 is the decision tree. The order of the estimators is
# unchanged, so the fitted ensemble is the same.
#
# 'hard' voting takes a majority vote over the predicted class labels. With
# only two voters any disagreement is a tie, which scikit-learn resolves in
# favour of the lowest class label.
voting_classifier_1 = VotingClassifier(
    estimators=[('xgb', clf1), ('dt', clf2)],
    voting='hard')

results_df = pd.concat(
    [results_df,
     fit_train(model=voting_classifier_1, name="DT, XGB: hard")]
     )

results_df

Unnamed: 0,Accuracy (training),Accuracy (validation),Precision (validation),Recall (validation),AUC,f1,FP,FN
XGBoost,0.979,0.797,0.788,0.786,0.786,0.787,14,15
Decision Tree,0.886,0.818,0.814,0.8,0.8,0.805,10,16
"DT, XGB: hard",0.898,0.832,0.836,0.808,0.808,0.817,7,17


### Working with more classifiers

Now let's try this with some additional models.

In [83]:
clf3 = KNeighborsClassifier(n_neighbors=7)

# probability=True lets the SVC produce class probabilities (needed for soft
# voting). Those estimates rely on an internal shuffle of the data, so the
# seed is fixed here, matching the other seeded models, to keep results
# repeatable between runs.
clf4 = SVC(kernel='rbf', probability=True, random_state=42)

# Majority ('hard') vote across all four models
voting_classifier_2 = VotingClassifier(estimators=[
    ('XGBoost', clf1),
    ('Decision Tree', clf2),
    ('K-Nearest Neighbours', clf3),
    ('SVC', clf4)],
    voting='hard')

results_df = pd.concat([results_df,fit_train(model=voting_classifier_2, name="DT, XGBoost, KNN + SVC Voting: hard")])
results_df

Unnamed: 0,Accuracy (training),Accuracy (validation),Precision (validation),Recall (validation),AUC,f1,FP,FN
XGBoost,0.979,0.797,0.788,0.786,0.786,0.787,14,15
Decision Tree,0.886,0.818,0.814,0.8,0.8,0.805,10,16
"DT, XGB: hard",0.898,0.832,0.836,0.808,0.808,0.817,7,17
"DT, XGBoost, KNN + SVC Voting: hard",0.854,0.734,0.767,0.677,0.677,0.68,5,33


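## Hard and Soft Voting

So far we have used 'hard' voting, where each model casts one vote for its predicted class and the majority wins. With 'soft' voting, the predicted class probabilities from each model are averaged (optionally using weights) and the class with the highest average probability is chosen. Soft voting therefore requires every model in the ensemble to be able to output probabilities.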

In [84]:
# Estimator labels must match the models they name: clf1 is the XGBoost
# classifier and clf2 is the decision tree. The order of the estimators is
# unchanged, so the fitted ensemble is the same.
#
# 'soft' voting averages the predicted class probabilities of the two models
# rather than counting their predicted labels.
voting_classifier_1 = VotingClassifier(
    estimators=[('xgb', clf1), ('dt', clf2)],
    voting='soft')

results_df = pd.concat(
    [results_df,
     fit_train(model=voting_classifier_1, name="DT, XGB: soft")]
     )

results_df

Unnamed: 0,Accuracy (training),Accuracy (validation),Precision (validation),Recall (validation),AUC,f1,FP,FN
XGBoost,0.979,0.797,0.788,0.786,0.786,0.787,14,15
Decision Tree,0.886,0.818,0.814,0.8,0.8,0.805,10,16
"DT, XGB: hard",0.898,0.832,0.836,0.808,0.808,0.817,7,17
"DT, XGBoost, KNN + SVC Voting: hard",0.854,0.734,0.767,0.677,0.677,0.68,5,33
"DT, XGB: soft",0.951,0.804,0.795,0.795,0.795,0.795,14,14


In [85]:
# Estimator labels must match the models they name: clf1 is the XGBoost
# classifier and clf2 is the decision tree. The order of the estimators is
# unchanged, so the fitted ensemble is the same.
#
# Weights are applied in estimator order: XGBoost gets 1 and the decision
# tree gets 2, which is the "DT, XGB ... 2:1" weighting named in the results.
voting_classifier_1 = VotingClassifier(
    estimators=[('xgb', clf1), ('dt', clf2)],
    voting='soft',
    weights=[1, 2])

results_df = pd.concat(
    [results_df,
     fit_train(model=voting_classifier_1, name="DT, XGB: soft, 2:1")]
     )

results_df

Unnamed: 0,Accuracy (training),Accuracy (validation),Precision (validation),Recall (validation),AUC,f1,FP,FN
XGBoost,0.979,0.797,0.788,0.786,0.786,0.787,14,15
Decision Tree,0.886,0.818,0.814,0.8,0.8,0.805,10,16
"DT, XGB: hard",0.898,0.832,0.836,0.808,0.808,0.817,7,17
"DT, XGBoost, KNN + SVC Voting: hard",0.854,0.734,0.767,0.677,0.677,0.68,5,33
"DT, XGB: soft",0.951,0.804,0.795,0.795,0.795,0.795,14,14
"DT, XGB: soft, 2:1",0.924,0.818,0.81,0.806,0.806,0.808,12,14


In [86]:
# Same four models as before, now combined with 'soft' voting
voting_classifier_2 = VotingClassifier(
    estimators=[
        ('XGBoost', clf1),
        ('Decision Tree', clf2),
        ('K-Nearest Neighbours', clf3),
        ('SVC', clf4),
    ],
    voting='soft',
)

# Append this ensemble's row to the running results table and display it
results_df = pd.concat([
    results_df,
    fit_train(name="DT, XGBoost, KNN + SVC Voting: soft", model=voting_classifier_2),
])
results_df

Unnamed: 0,Accuracy (training),Accuracy (validation),Precision (validation),Recall (validation),AUC,f1,FP,FN
XGBoost,0.979,0.797,0.788,0.786,0.786,0.787,14,15
Decision Tree,0.886,0.818,0.814,0.8,0.8,0.805,10,16
"DT, XGB: hard",0.898,0.832,0.836,0.808,0.808,0.817,7,17
"DT, XGBoost, KNN + SVC Voting: hard",0.854,0.734,0.767,0.677,0.677,0.68,5,33
"DT, XGB: soft",0.951,0.804,0.795,0.795,0.795,0.795,14,14
"DT, XGB: soft, 2:1",0.924,0.818,0.81,0.806,0.806,0.808,12,14
"DT, XGBoost, KNN + SVC Voting: soft",0.923,0.783,0.775,0.765,0.765,0.769,13,18


In [87]:
# Four-model soft vote with weights given in estimator order:
# XGBoost and the decision tree count double relative to KNN and the SVC.
voting_classifier_2 = VotingClassifier(
    estimators=[
        ('XGBoost', clf1),
        ('Decision Tree', clf2),
        ('K-Nearest Neighbours', clf3),
        ('SVC', clf4),
    ],
    voting='soft',
    weights=[2, 2, 1, 1],
)

# Append this ensemble's row to the running results table and display it
results_df = pd.concat([
    results_df,
    fit_train(
        name="DT, XGBoost, KNN + SVC Voting: soft, 2:2:1:1",
        model=voting_classifier_2,
    ),
])
results_df

Unnamed: 0,Accuracy (training),Accuracy (validation),Precision (validation),Recall (validation),AUC,f1,FP,FN
XGBoost,0.979,0.797,0.788,0.786,0.786,0.787,14,15
Decision Tree,0.886,0.818,0.814,0.8,0.8,0.805,10,16
"DT, XGB: hard",0.898,0.832,0.836,0.808,0.808,0.817,7,17
"DT, XGBoost, KNN + SVC Voting: hard",0.854,0.734,0.767,0.677,0.677,0.68,5,33
"DT, XGB: soft",0.951,0.804,0.795,0.795,0.795,0.795,14,14
"DT, XGB: soft, 2:1",0.924,0.818,0.81,0.806,0.806,0.808,12,14
"DT, XGBoost, KNN + SVC Voting: soft",0.923,0.783,0.775,0.765,0.765,0.769,13,18
"DT, XGBoost, KNN + SVC Voting: soft, 2:1:2",0.94,0.818,0.812,0.803,0.803,0.807,11,15
