<a href="https://colab.research.google.com/github/kurtispykes/deep-learning-examples/blob/main/ensembling_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import typing as t
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# create training data
X, y = make_classification(
    n_samples=10000,
    n_informative=10,
    random_state=2022
)

# split into train and test
X_new, X_test = X[:9000, :], X[9000:, ]
y_new, y_test = y[:9000], y[9000:]

X_train, X_val, y_train, y_val = train_test_split(
    X_new, y_new, 
    test_size=0.3
)


print(f"Train data: {X_train.shape}\n\
Train labels: {y_train.shape}\n\
Testing data: {X_test.shape}\n\
Test labels: {y_test.shape}\n\
Validation data: {X_val.shape}\n\
Validation labels: {y_val.shape}")

Train data: (6300, 20)
Train labels: (6300,)
Testing data: (1000, 20)
Test labels: (1000,)
Validation data: (2700, 20)
Validation labels: (2700,)


# Baseline

In [3]:
# building and training model
model = tf.keras.Sequential([
                            tf.keras.layers.Dense(10, 
                                                  input_shape=(X_train.shape[1],),
                                                  activation="relu"),
                            tf.keras.layers.Dense(10,
                                                  activation="relu"),
                            tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=20, verbose=0)

_, val_acc = model.evaluate(X_val, y_val, verbose=0)
_, test_acc = model.evaluate(X_test, y_test, verbose=0)

print(f"Validation Accuracy: {val_acc}\n\
Test Accuracy: {test_acc}")

Validation Accuracy: 0.9603703618049622
Test Accuracy: 0.9610000252723694


# Cross validation Ensemble

In [4]:
# building a 3 layer NN 
def build_model(X:np.array,
                y:np.array, 
                X_val:np.array,
                y_val:np.array
                ) -> t.Union[t.List[tf.keras.Sequential], t.List[float]]: 
  model = tf.keras.Sequential([
                              tf.keras.layers.Dense(10, 
                                                    input_shape=(X.shape[1],),
                                                    activation="relu"),
                              tf.keras.layers.Dense(10,
                                                    activation="relu"),
                              tf.keras.layers.Dense(1, activation="sigmoid")
  ])

  model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
  model.fit(X, y, epochs=20, verbose=0)
  
  _, acc = model.evaluate(X_val, y_val, verbose=0)
  return model, acc

# to save models and their performance
# on the validation data
scores = []
ensemble = []

# create folds
kfold = KFold(n_splits=5, shuffle=True)
for train_idx, val_idx in kfold.split(X_new): 
  X_train, y_train = X_new[train_idx], y_new[train_idx]
  X_val, y_val = X_new[val_idx], y_new[val_idx]
  # train a model
  model, acc = build_model(X=X_train, y=y_train, X_val=X_val, y_val=y_val) 
  # saving model and performance
  scores.append(acc)
  ensemble.append(model)

print(f"Accuracy of Constituents: {scores}\n\
Expected Accuracy from Ensemble: {np.mean(scores)}")

Accuracy of Constituents: [0.9627777934074402, 0.9538888931274414, 0.9605555534362793, 0.9566666483879089, 0.9688888788223267]
Expected Accuracy from Ensemble: 0.9605555534362793


In [5]:
# combining the models to use on test data
y_hat = [model.predict(X_test) for model in ensemble]
# convert preds to binary 
y_hat = [list(map(lambda x: 0 if x < 0.5 else 1, preds)) for preds in y_hat]
# taking the most common prediction from each model for each instance
y_hat_preds = np.array(pd.DataFrame(y_hat).mode().T)
# final model on test data
accuracy_score(y_test, y_hat_preds)

0.964

# Bagging Ensemble 

In [8]:
def create_bootstrap(X:np.array, y:np.array, size:int):
  # create an index for each instance 
  idx = [i for i in range(len(X))]
  # create a bootstrap dataset of 7500 instances
  bootstrap_idx = np.random.choice(range(len(X)), size=size)
  # all other instances not in training data 
  # will be used to validate the model
  val_idx = [x for x in idx if x not in bootstrap_idx]
  # creating the training and validation datasets 
  train_data, train_labels = X[bootstrap_idx], y[bootstrap_idx]
  val_data, val_labels = X[val_idx], y[val_idx]
  return train_data, train_labels, val_data, val_labels

def build_model(X:np.array, y:np.array, X_val:np.array, y_val:np.array): 
  # building and training model
  model = tf.keras.Sequential([
                              tf.keras.layers.Dense(10, 
                                                    input_shape=(X.shape[1],),
                                                    activation="relu"),
                              tf.keras.layers.Dense(10,
                                                    activation="relu"),
                              tf.keras.layers.Dense(1, activation="sigmoid")
  ])

  model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
  model.fit(X, y, epochs=20, verbose=0)
  _, acc = model.evaluate(X_val, y_val, verbose=0)
  return model, val_acc

def ensemble_predict(test_data:np.array, ensemble:t.List[tf.keras.Sequential]) -> np.array:
  # combining the models to use on test data
  y_hat = [model.predict(test_data) for model in ensemble]
  # convert preds to binary 
  y_hat = [list(map(lambda x: 0 if x < 0.5 else 1, preds)) for preds in y_hat]
  # taking the most common prediction from each model for each instance
  y_hat_preds = np.array(pd.DataFrame(y_hat).mode().T)
  return y_hat_preds

def train(X:np.array, 
          y:np.array, 
          n_models:int, 
          n_samples:int
          ) -> t.Union[
                       t.List[tf.keras.Sequential],
                       t.List[float],
                       float
                       ]:

  scores = []
  ensemble = []
  expected_performance = None 
  
  for _ in range(n_models):
    train_data, train_labels, val_data, val_labels = create_bootstrap(
        X, y, n_samples)
    model, acc = build_model(
        train_data, train_labels, val_data, val_labels)
    ensemble.append(model)
    scores.append(acc)
  expected_performance = np.mean(scores)
  return ensemble, scores, expected_performance

ensemble, scores, expected_performance = train(X=X_new, y=y_new, n_models=5, n_samples=8000)
 

print(f"Accuracy of Constituents: {scores}\n\
Expected Accuracy from Ensemble: {expected_performance}")

Accuracy of Constituents: [0.9603703618049622, 0.9603703618049622, 0.9603703618049622, 0.9603703618049622, 0.9603703618049622]
Expected Accuracy from Ensemble: 0.9603703618049622


In [9]:
# ensemble model predictions
y_hat_preds = ensemble_predict(X_test, ensemble)
accuracy_score(y_test, y_hat_preds)

0.964