# COMP47590: Advanced Machine Learning
# Assignment 1: The Super Learner
# SOLUTION

## Imports Use Packages Etc

In [None]:
from IPython.display import display, HTML, Image

from TAS_Python_Utilities import data_viz
from TAS_Python_Utilities import data_viz_target
from TAS_Python_Utilities import visualize_tree

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
from random import randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from scipy.spatial import distance
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn import metrics
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn import linear_model
from sklearn import neighbors
from sklearn.utils import resample
from sklearn.metrics import cohen_kappa_score

%matplotlib inline
#%qtconsole

## Define Super Learner Classifier

The *Super Learner* is a heterogeneous stacked ensemble classifier. This is a classification model that uses a set of base classifiers of different types, the outputs of which are then combined in another classifier at the stacked layer. The Super Learner was described in [(van der Laan et al, 2007)](https://pdfs.semanticscholar.org/19e9/c732082706f39d2ba12845851309714db135.pdf) but the stacked ensemble idea has been around for a long time. 

Figure 1 shows a flow diagram of the Super Learner process (this is from (van der Laan et al, 2007) and the process is also described in the COMP47590 lecture "[COMP47590 2017-2018 L04 Supervised Learning Ensembles 3](https://www.dropbox.com/s/1ksx94nxtuyn4l8/COMP47590%202017-2018%20L04%20Supervised%20Learning%20Ensembles%203.pdf?raw=1)"). The base classifiers are trained and their outputs are combined along with the training dataset labels into a training set for the stack layer classifier. To avoid overfitting, the generation of the stacked layer training set uses a k-fold cross validation process (described as V-fold in Figure 1). To further add variety to the base estimators, each one is trained on a bootstrap sample of its training data (as is used in the bagging ensemble approach).
 
![Super Learner Process Flow](SuperLearnerProcessFlow.png "Logo Title Text 1")
Figure 1: A flow diagram for the Super Learner


### Define the SuperLearnerClassifier Class

In [None]:
# Create a new classifier which is based on the scikit-learn BaseEstimator and ClassifierMixin classes
class SuperLearnerClassifier(BaseEstimator, ClassifierMixin):

    """An ensemble classifier that uses heterogeneous models at the base layer and an aggregation model at the stack layer. A k-fold cross validation is used to generate training data for the stack layer model.

    Parameters
    ----------
    base_estimators: list, optional (default = ["svm", "logreg", "knn", "tree", "randomforest"])
        A list of the classifiers in the base layer of the ensemble. Supported types are
        - "svm" Support Vector Machine implemented by sklearn.svm.SVC
        - "logreg" Logistic Regression implemented by sklearn.linear_model.LogisticRegression
        - "knn" k Nearest Neighbour implemented by sklearn.neighbors.KNeighborsClassifier
        - "tree" Decision Tree implemented by sklearn.tree.DecisionTreeClassifier
        - "randomforest" RandomForest implemented by sklearn.ensemble.RandomForestClassifier
    estimator_duplicates: int, optional (default = 1)
        How many instances of each classifier type listed in base_estimators are included in the ensemble
    use_probs: boolean, optional (default = False)
        Whether probabilities (True) or labels (False) are generated and used for the stack layer training data.
    stack_layer_classifier_type: string, optional (default = "tree")
        The classifier type used at the stack layer. The same classifier types as are supported at the base layer are supported
    training_folds: int, optional (default = 4)
        How many folds will be used to generate the training set for the stacked layer
    include_base_features_at_stack: boolean, optional (default = False)
        Whether or not the base feature values should be included as inputs at the stack layer

    Attributes
    ----------
    classes_ : array of shape = [n_classes]
        The classes labels (single output problem).
    n_estimators_ : int
        The number of base estimators in the ensemble.
    base_estimator_types : list of str
        The type name of each base estimator, in the same order as classifiers_.
        Empty until fit is called.
    classifiers_ : list
        The fitted base estimators, each trained on a bootstrap sample of the full training set.
    stack_layer_classifier_ : estimator
        The fitted stack layer classifier.
    X_stack_train, y_stack_train : arrays
        The training set generated for the stack layer during fit.
    last_X_stack_queries : array of shape = [n_samples, n_estimators_]
        The label predictions made by each base estimator during the most recent
        call to predict or predict_proba (one column per base estimator).

    Notes
    -----
    The default values for most base learners are used. Decision trees are
    created with min_samples_split=200, and a logistic regression used at the
    stack layer is created with multi_class='ovr', solver='liblinear' and
    max_iter=1000.

    References
    ----------
    .. [1]  van der Laan, M., Polley, E. & Hubbard, A. (2007). 
            Super Learner. Statistical Applications in Genetics 
            and Molecular Biology, 6(1) 
            doi:10.2202/1544-6115.1309

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> clf = SuperLearnerClassifier()
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)

    """
    # Constructor for the classifier object
    def __init__(self, base_estimators = ["svm", "logreg", "knn", "tree", "randomforest"], estimator_duplicates = 1, use_probs = False, stack_layer_classifier_type = "tree", training_folds = 4, include_base_features_at_stack = False):
        """Setup a SuperLearner classifier.

        The parameters are described in the class docstring. They are stored
        exactly as given (no copying, no validation); validation happens in fit.
        The shared default list for base_estimators is only ever read, never
        mutated, by this class.
        """
        self.base_estimators = base_estimators
        # Kept here so the attribute exists before fit; it is rebuilt on every fit.
        self.base_estimator_types = list()
        self.estimator_duplicates = estimator_duplicates
        self.use_probs = use_probs
        self.stack_layer_classifier_type = stack_layer_classifier_type
        self.training_folds = training_folds
        self.include_base_features_at_stack = include_base_features_at_stack

    @staticmethod
    def _make_estimator(estimator_type, stack_layer=False):
        """Create an unfitted classifier for a supported type name.

        Parameters
        ----------
        estimator_type : str
            One of "svm", "logreg", "knn", "tree" or "randomforest".
        stack_layer : boolean, optional (default = False)
            Whether the classifier is for the stack layer. This only changes
            how a "logreg" classifier is configured.

        Returns
        -------
        A new, unfitted scikit-learn classifier.

        Raises
        ------
        ValueError
            If estimator_type is not one of the supported type names.
        """
        if estimator_type == "svm":
            return svm.SVC(probability=True)
        if estimator_type == "logreg":
            if stack_layer:
                return linear_model.LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=1000)
            return linear_model.LogisticRegression()
        if estimator_type == "knn":
            return neighbors.KNeighborsClassifier()
        if estimator_type == "tree":
            return tree.DecisionTreeClassifier(min_samples_split=200)
        if estimator_type == "randomforest":
            return ensemble.RandomForestClassifier()
        raise ValueError("Unsupported classifier type %r; supported types are "
                         "'svm', 'logreg', 'knn', 'tree' and 'randomforest'" % (estimator_type,))

    def _aligned_probabilities(self, classifier, X):
        """Return classifier.predict_proba(X) with one column per entry of self.classes_.

        A base estimator trained on a bootstrap sample may not have seen every
        class, in which case its predict_proba output has fewer columns. The
        missing classes are given probability zero so that every estimator
        always contributes the same number of stack layer inputs.
        """
        probs = classifier.predict_proba(X)
        if len(classifier.classes_) == len(self.classes_):
            return probs
        aligned = np.zeros((X.shape[0], len(self.classes_)))
        # self.classes_ is sorted, so searchsorted maps each seen class to its column
        aligned[:, np.searchsorted(self.classes_, classifier.classes_)] = probs
        return aligned

    def _build_stack_inputs(self, classifiers, X, collect_labels):
        """Generate the stack layer inputs for X from a list of fitted base classifiers.

        Parameters
        ----------
        classifiers : list
            Fitted base classifiers.
        X : array of shape = [n_samples, n_features]
            The instances to generate stack layer inputs for.
        collect_labels : boolean
            Whether to also return the label predictions of every classifier.

        Returns
        -------
        stack_inputs : array of shape = [n_samples, n_stack_features]
            The base features (if include_base_features_at_stack is set)
            followed by the outputs of each classifier in order.
        label_matrix : array of shape = [n_samples, n_classifiers] or None
            One column of label predictions per classifier, or None when
            collect_labels is False.
        """
        feature_blocks = [X] if self.include_base_features_at_stack else []
        label_columns = []

        for classifier in classifiers:
            # Only ask for the outputs that are actually needed
            labels = None
            if collect_labels or not self.use_probs:
                labels = classifier.predict(X)

            if self.use_probs:
                feature_blocks.append(self._aligned_probabilities(classifier, X))
            else:
                feature_blocks.append(labels.reshape(-1, 1))

            if collect_labels:
                label_columns.append(labels)

        if not feature_blocks:
            raise ValueError("The stack layer has no inputs: at least one base estimator is "
                             "required unless include_base_features_at_stack is True")
        stack_inputs = np.hstack(feature_blocks)

        label_matrix = None
        if collect_labels:
            if label_columns:
                label_matrix = np.column_stack(label_columns)
            else:
                label_matrix = np.empty((X.shape[0], 0))
        return stack_inputs, label_matrix

    # The fit function to train a classifier
    def fit(self, X, y):
        """Build a SuperLearner classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. 
        y : array-like, shape = [n_samples] 
            The target values (class labels) as integers or strings.
        Returns
        -------
        self : object
        Raises
        ------
        ValueError
            If a base or stack layer classifier type is not supported.
        """    
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        # Rebuild (rather than extend) the list of base estimator types so that
        # calling fit more than once does not accumulate entries
        self.base_estimator_types = [t for _ in range(self.estimator_duplicates) for t in self.base_estimators]
        self.n_estimators_ = len(self.base_estimator_types)

        # Create the estimators up front so unsupported type names fail before any training is done
        fold_classifiers = [self._make_estimator(t) for t in self.base_estimator_types]
        stack_layer_classifier = self._make_estimator(self.stack_layer_classifier_type, stack_layer=True)

        # divide the dataset into k-folds
        skf = StratifiedKFold(n_splits=self.training_folds)

        # Iterate through the folds training models and using the test splits to generate training data for the stack layer
        stack_X_folds = []
        stack_y_folds = []
        for train_index, test_index in skf.split(X, y):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for classifier in fold_classifiers:
                # Each base estimator sees its own bootstrap sample of the fold's training split
                X_train_samp, y_train_samp = resample(X_train, y_train, replace=True)
                classifier.fit(X_train_samp, y_train_samp)

            fold_inputs, _ = self._build_stack_inputs(fold_classifiers, X_test, collect_labels=False)
            stack_X_folds.append(fold_inputs)
            stack_y_folds.append(y_test)

        self.X_stack_train = np.vstack(stack_X_folds)
        # Concatenating the label arrays directly keeps the dtype of y, so the
        # stack layer predicts labels of the same type as the training labels
        self.y_stack_train = np.concatenate(stack_y_folds)

        # Train the stack layer once, using the complete newly created dataset
        self.stack_layer_classifier_ = stack_layer_classifier
        self.stack_layer_classifier_.fit(self.X_stack_train, self.y_stack_train)

        # Retrain the base classifiers in the ensemble using the full dataset
        self.classifiers_ = list()
        for t in self.base_estimator_types:
            c = self._make_estimator(t)
            # Perform a bootstrap sample
            X_samp, y_samp = resample(X, y, replace=True)
            c.fit(X_samp, y_samp)
            self.classifiers_.append(c)

        # Return the classifier
        return self

    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        """Predict class labels of the input samples X.

        As a side effect, the label predictions of the individual base
        estimators are stored in self.last_X_stack_queries.

        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, ].
            The predicted class labels of the input samples. 
        """
        # Check that fit has been called by confirming that the stack layer classifier has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        X_stack_queries, self.last_X_stack_queries = self._build_stack_inputs(self.classifiers_, X, collect_labels=True)

        return self.stack_layer_classifier_.predict(X_stack_queries)

    # The predict_proba function to make a set of probability predictions for a set of query instances
    def predict_proba(self, X):
        """Predict class probabilities of the input samples X.

        As a side effect, the label predictions of the individual base
        estimators are stored in self.last_X_stack_queries.

        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, n_labels].
            The predicted class label probabilities of the input samples. 
        """
        # Check that fit has been called by confirming that the stack layer classifier has been set up
        check_is_fitted(self, ['stack_layer_classifier_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        X_stack_queries, self.last_X_stack_queries = self._build_stack_inputs(self.classifiers_, X, collect_labels=True)

        return self.stack_layer_classifier_.predict_proba(X_stack_queries)

### Test the SuperLearnerClassifier

Perform a simple test using the SuperLearnerClassifier on the Iris dataset

In [None]:
# Smoke test: cross validate a default SuperLearnerClassifier on the Iris dataset using 10 folds.
from sklearn.datasets import load_iris
iris = load_iris()
clf = SuperLearnerClassifier()
# Left as the last expression of the cell so that the notebook displays the returned scores.
cross_val_score(clf, iris.data, iris.target, cv=10)

Do a grid search using Iris data

In [None]:
# Set up the parameter grid to search: every combination of base layer
# composition, stack layer classifier, number of duplicates, number of
# training folds, label/probability outputs and use of base features
param_grid = [
 {'base_estimators': [["tree", "logreg"], ["svm", "logreg", "knn", "tree"], ["svm", "logreg", "knn", "tree", "randomforest"]], 
  'stack_layer_classifier_type':["tree", "logreg"],
  'estimator_duplicates':[1,2,5,10], 
  'training_folds': list(range(4, 11, 3)), 
  'use_probs':[False, True], 
  'include_base_features_at_stack':[False, True]}  
]

# Perform the search
my_tuned_model = GridSearchCV(SuperLearnerClassifier(), param_grid, cv=2, verbose = 1, n_jobs=-1)
my_tuned_model.fit(iris.data, iris.target)

# Print details
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
print(my_tuned_model.best_score_)
# GridSearchCV no longer provides grid_scores_ (removed in scikit-learn 0.20);
# cv_results_ holds the per-candidate results and is shown here as a table
display(pd.DataFrame(my_tuned_model.cv_results_))

## Load & Partition Data

### Setup - IMPORTANT

Take only a sample of the dataset for fast testing

In [None]:
# Fraction of the rows of the loaded dataset that is kept (passed as `frac` to DataFrame.sample when the data is loaded).
data_sampling_rate = 0.5

Setup the number of folds for all grid searches (should be 5 - 10)

In [None]:
# Number of folds passed as `cv` to the cross_val_score and GridSearchCV calls that use the Fashion-MNIST data.
cv_folds = 5

### Load Dataset

Load the dataset and explore it.

In [None]:
# Read the Fashion-MNIST training CSV and keep only a random fraction of its
# rows (data_sampling_rate) so that everything below runs in reasonable time
dataset = pd.read_csv('fashion-mnist_train.csv').sample(frac=data_sampling_rate)

# Number of target classes and a readable name for each integer class label
num_classes = 10
classes = {
    0: "T-shirt/top",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle boot",
}

# Show the first few rows of the sampled data
display(dataset.head())

### Pre-process & Partition Data

Perform data pre-processing and manipulation as required

In [None]:
# Descriptive features: every column except the first one
feature_columns = dataset.columns[1:]
X = dataset[feature_columns]

# Target feature: the "label" column as a NumPy array
Y = np.array(dataset["label"])

In [None]:
# Divide every descriptive feature by 255 (presumably 8-bit pixel intensities, giving values in [0, 1] - confirm against the dataset).
X = X/255

In [None]:
# First hold out 30% of the instances as the test set ...
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=0,
)

# ... then split the remaining 70% so that the training set is 50% of the
# original data (0.5/0.7 of the remainder) and the validation set is the rest
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_plus_valid,
    y_train_plus_valid,
    train_size=0.5/0.7,
    random_state=0,
)

## Train and Evaluate a Simple Model

Train a Super Learner Classifier using the prepared dataset

In [None]:
# Create a SuperLearnerClassifier with its default parameters and train it on the training partition.
my_model = SuperLearnerClassifier()
# fit returns the estimator itself, so leaving it as the last expression makes the notebook display the fitted model.
my_model.fit(X_train, y_train)

In [None]:
# Predict labels for the same instances the model was trained on
y_pred = my_model.predict(X_train)

# Report the accuracy and the per-class performance
accuracy = metrics.accuracy_score(y_train, y_pred)
print(f"Accuracy: {accuracy}")
print(metrics.classification_report(y_train, y_pred))

# Cross-tabulate true against predicted labels (with row and column totals)
print("Confusion Matrix")
confusion = pd.crosstab(
    np.array(y_train),
    y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)
display(confusion)

Measure the performance of the individual models in the stack

In [None]:
# Calling predict stores the label predictions of every base estimator in
# my_model.last_X_stack_queries (one column per estimator)
my_model.predict(X_train)

true_labels = np.array(y_train)
for i in range(my_model.n_estimators_):
    print("** ", i, " ", my_model.base_estimator_types[i])

    # Column i holds the predictions of base estimator i
    y_pred = my_model.last_X_stack_queries[:, i]

    # Report the accuracy and the per-class performance of this estimator
    accuracy = metrics.accuracy_score(y_train, y_pred)
    print(f"Accuracy: {accuracy}")
    print(metrics.classification_report(y_train, y_pred))

    # Cross-tabulate true against predicted labels (with row and column totals)
    print("Confusion Matrix")
    confusion = pd.crosstab(
        true_labels,
        y_pred,
        rownames=['True'],
        colnames=['Predicted'],
        margins=True,
    )
    display(confusion)

In [None]:
# Predict labels for the held-out test instances
y_pred = my_model.predict(X_test)

# Report the accuracy and the per-class performance
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(metrics.classification_report(y_test, y_pred))

# Cross-tabulate true against predicted labels (with row and column totals);
# left as the last expression of the cell so the notebook renders the table
print("Confusion Matrix")
pd.crosstab(
    np.array(y_test),
    y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Calling predict stores the label predictions of every base estimator in
# my_model.last_X_stack_queries (one column per estimator)
my_model.predict(X_test)

true_labels = np.array(y_test)
for i in range(my_model.n_estimators_):
    print("** ", i, " ", my_model.base_estimator_types[i])

    # Column i holds the test-set predictions of base estimator i
    y_pred = my_model.last_X_stack_queries[:, i]

    # Report the accuracy and the per-class performance of this estimator
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(metrics.classification_report(y_test, y_pred))

    # Cross-tabulate true against predicted labels (with row and column totals)
    print("Confusion Matrix")
    confusion = pd.crosstab(
        true_labels,
        y_pred,
        rownames=['True'],
        colnames=['Predicted'],
        margins=True,
    )
    display(confusion)

## Do a Cross Validation Experiment With SuperLearnerClassifier

Perform a k-fold cross validation experiment (with k set by `cv_folds`) to evaluate the performance of the SuperLearnerClassifier

In [None]:
# Cross validate a SuperLearnerClassifier with default parameters on the
# combined training and validation partitions
my_model = SuperLearnerClassifier()
scores = cross_val_score(
    my_model,
    X_train_plus_valid,
    y_train_plus_valid,
    cv=cv_folds,
    n_jobs=-1,
    verbose=2,
)
print(scores)

## Comparing the Performance of Different Stack Layer Approaches (Task 5)

Compare the performance of the ensemble when a label based stack layer training set and a probability based stack layer training set is used.

In [None]:
# Cross validate a SuperLearnerClassifier whose stack layer is a logistic
# regression trained on the base estimators' class probabilities
my_model = SuperLearnerClassifier(
    use_probs=True,
    stack_layer_classifier_type="logreg",
)
scores = cross_val_score(
    my_model,
    X_train_plus_valid,
    y_train_plus_valid,
    cv=cv_folds,
    n_jobs=-1,
    verbose=2,
)
print(scores)

## Grid Search Through SuperLearnerClassifier Architectures & Parameters (Task 7)

Perform a grid search experiment to determine the optimal architecture and hyper-parameter values for the SuperLearnerClassifier for the MNIST Fashion classification problem.

In [None]:
# Set up the parameter grid to search: every combination of base layer
# composition, stack layer classifier, number of duplicates, number of
# training folds, label/probability outputs and use of base features
param_grid = [
 {'base_estimators': [["tree", "logreg"], ["svm", "logreg", "knn", "tree"], ["svm", "logreg", "knn", "tree", "randomforest"]], 
  'stack_layer_classifier_type':["tree", "logreg"],
  'estimator_duplicates':[1,2,5,10], 
  'training_folds': list(range(4, 11, 3)), 
  'use_probs':[False, True], 
  'include_base_features_at_stack':[False, True]}  
]

# Perform the search
my_tuned_model = GridSearchCV(SuperLearnerClassifier(), param_grid, cv=cv_folds, verbose = 1, n_jobs=-1)
my_tuned_model.fit(X_train_plus_valid, y_train_plus_valid)

# Print details
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
print(my_tuned_model.best_score_)
# GridSearchCV no longer provides grid_scores_ (removed in scikit-learn 0.20);
# cv_results_ holds the per-candidate results and is shown here as a table
display(pd.DataFrame(my_tuned_model.cv_results_))

Evaluate the performance of the model selected by the grid search on a hold-out dataset

In [None]:
# Predict labels for the held-out test instances with the grid search's model
y_pred = my_tuned_model.predict(X_test)

# Report the accuracy and the per-class performance
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(metrics.classification_report(y_test, y_pred))

# Cross-tabulate true against predicted labels (with row and column totals);
# left as the last expression of the cell so the notebook renders the table
print("Confusion Matrix")
pd.crosstab(
    np.array(y_test),
    y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

## Evaluating the Impact of Adding Original Descriptive Features at the Stack Layer (Task 8)

Evaluate the impact of adding original descriptive features at the stack layer.

In [None]:
# Cross validate a SuperLearnerClassifier whose logistic regression stack layer
# receives the original descriptive features as well as the base estimators'
# class probabilities
my_model = SuperLearnerClassifier(use_probs=True, include_base_features_at_stack = True, stack_layer_classifier_type = "logreg")
scores = cross_val_score(my_model, X_train_plus_valid, y_train_plus_valid, cv=cv_folds, n_jobs=-1, verbose = 2)
# There is no bare `mean` in scope in this notebook; use NumPy's
print("Mean accuracy ", np.mean(scores))
print(scores)

## Explore the Ensemble Model (Task 9)

Perform an analysis to investigate the strength of the base estimators and the strengths of the correlations between them.

Measure the strength of the individual classifiers within the ensemble by measuring the accuracy of their predictions on a test set. 

In [None]:
print("Individual accuracies")

# cross_val_score only fits clones of the estimator it is given, so my_model
# itself has not been trained yet; fit it before asking it for predictions
my_model.fit(X_train_plus_valid, y_train_plus_valid)

# Make a set of predictions for the test data. This also stores the label
# predictions of every base estimator in my_model.last_X_stack_queries
y_pred = my_model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred) # , normalize=True, sample_weight=None
print("** Ensemble  Accuracy: " +  str(accuracy))
    

for i in range(my_model.n_estimators_):
    
    # Isolate the test-set predictions made by base estimator i
    y_pred = my_model.last_X_stack_queries[:, i]

    # Print performance details
    accuracy = metrics.accuracy_score(y_test, y_pred) # , normalize=True, sample_weight=None
    print("** ", i, " ", my_model.base_estimator_types[i], " Accuracy: " +  str(accuracy))


Measure the disagreement between base estimators by calculating the Cohen's kappa metric between each of their classifications.

In [None]:
# Cohen's kappa between the label predictions of every pair of base
# estimators, taken from the most recent predict call on my_model
base_labels = my_model.last_X_stack_queries
n_base = my_model.n_estimators_

kappa_matrix = np.zeros((n_base, n_base))
for i, j in np.ndindex(n_base, n_base):
    kappa_matrix[i, j] = cohen_kappa_score(base_labels[:, i], base_labels[:, j])

print(kappa_matrix)