# COMP47590: Advanced Machine Learning
# Assignment 1: The Super Learner

## Import Packages Etc

In [1]:
from IPython.display import display, HTML, Image
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier


# Add more packages as required

%matplotlib inline
#%qtconsole

## Define Super Learner Classifier

The *Super Learner* is a heterogeneous stacked ensemble classifier. This is a classification model that uses a set of base classifiers of different types, the outputs of which are then combined in another classifier at the stacked layer. The Super Learner was described in [(van der Laan et al, 2007)](https://pdfs.semanticscholar.org/19e9/c732082706f39d2ba12845851309714db135.pdf) but the stacked ensemble idea has been around for a long time. 

Figure 1 shows a flow diagram of the Super Learner process (this is from (van der Laan et al, 2007) and the process is also described in the COMP47590 lecture "[COMP47590 2017-2018 L04 Supervised Learning Ensembles 3](https://www.dropbox.com/s/1ksx94nxtuyn4l8/COMP47590%202017-2018%20L04%20Supervised%20Learning%20Ensembles%203.pdf?raw=1)"). The base classifiers are trained and their outputs are combined along with the training dataset labels into a training set for the stack layer classifier. To avoid overfitting the generation of the stacked layer training set uses a k-fold cross validation process (described as V-fold in Figure 1). To further add variety to the base estimators a bootstrapping selection (as is used in the bagging ensemble approach).
 
![Super Learner Process Flow](SuperLearnerProcessFlow.png "Logo Title Text 1")
Figure 1: A flow diagram for the Super Learner


### Define the SuperLearnerClassifier Class

In [2]:
true_label = []
test_index = []
k_store =[]
train_index = []
class SuperLearnerClassifier(BaseEstimator, ClassifierMixin):
       
    def __init__(self, training_type, layer_type, estimators):
        self.dt_model = None
        self.rf_model = None
        self.svm_model = None
        self.lr_model = None
        self.knn_model = None
        self.nb_model = None
        self.base_models = None
        self.superlearner = None
        self.layer_model =None
        self.training_type  = training_type
        self.layer_type = layer_type
        self.estimators = estimators
           
    def fit(self, X, y):
        
        self.dt_model = DecisionTreeClassifier()
        self.rf_model = RandomForestClassifier()
        self.svm_model = SVC(probability=True)
        self.lr_model = LogisticRegression()
        self.knn_model = KNeighborsClassifier()
        self.nb_model= GaussianNB()
        
        estimator_list = []
        
        for k,(train, test) in enumerate(kfold.split(X, y)):
            
            true_label.append(y[test])
            train_index.append(train)
            test_index.append(test)
            k_store.append(k)
            
            if 'SVM' in self.estimators:
                svm_fit = self.svm_model.fit(X.iloc[train], y[train])
                print("[SVM fold {0}], fit SVM model".format(k))
                estimator_list.append(('svm', svm_fit))
                
            if 'Decision Tree' in self.estimators:
                dt_fit = self.dt_model.fit(X.iloc[train], y[train])            
                print("[Decision Tree fold {0}], fit decision tree model".format(k))
                estimator_list.append(('dt', dt_fit))
            
            if 'Ramdon Forest' in self.estimators:
                rf_fit = self.rf_model.fit(X.iloc[train], y[train])
                print("[Random Forest fold {0}], fit random forest model".format(k))
                estimator_list.append(('rf', rf_fit))
            
            if 'Logistical Regression' in self.estimators:
                lr_fit = self.lr_model.fit(X.iloc[train], y[train])
                print("[Logistical Regression fold {0}], fit logistical regression model".format(k))
                estimator_list.append(('lr', lr_fit))
            
            if 'KNN' in self.estimators:
                knn_fit = self.knn_model.fit(X.iloc[train], y[train])
                print("[KNN fold {0}], fit KNN model".format(k))
                estimator_list.append(('knn', knn_fit))
            
            if 'Navie Bayes' in self.estimators:
                nb_fit = self.nb_model.fit(X.iloc[train], y[train])
                print("[Navie Bayes fold {0}], fit navie bayes model".format(k))
                estimator_list.append(('nb', nb_fit))
                
            print("**************************")
        
        print(estimator_list)
        self.base_models = VotingClassifier(estimators = estimator_list, voting ='hard')
            
#             self.base_models = VotingClassifier(estimators = [('dt',self.dt_model),('rf',self.rf_model),
#                                                          ('svm',self.svm_model), ('lr',self.lr_model),
#                                                          ('knn',self.knn_model), ('nb',self.nb_model)], voting ='hard')
            
        label = np.concatenate(true_label[0:len(k_store)], axis =0)
        if self.layer_type == 'Decision Tree':
            self.layer_model = DecisionTreeClassifier() 
            
            if self.training_type == 'label_base': 
                self.layer_model.fit(self.predict(X), label)
                print('using label base traning data to fit stack layer classifier')
            elif self.training_type == 'proba_base': 
                self.layer_model.fit(self.predict_proba(X), label)
                print('using probability base traning data to fit stack layer classifier')
                
            print('stack layer model is Decision Tree ')    
            self.superlearner = VotingClassifier(estimators = [('bm', self.base_models), 
                                                               ('dt', self.layer_model)],voting = 'hard')
                                 
        elif self.layer_type == 'Random Forest':
            self.layer_model = RandomForestClassifier()
            
            if self.training_type == 'label_base': 
                self.layer_model.fit(self.predict(X), label)
                print('using label base traning data to fit stack layer classifier')
            elif self.training_type == 'proba_base': 
                self.layer_model.fit(self.predict_proba(X), label)
                print('using probability base traning data to fit stack layer classifier')
                
            self.superlearner = VotingClassifier(estimators = [('bm',self.base_models), 
                                                               ('rf', self.layer_model)],voting = 'hard')
            print('stack layer model is Random Forest ')
            
        elif self.layer_type == 'SVM':
            self.layer_model = SVC(probability=True) 
            
            if self.training_type == 'label_base': 
                self.layer_model.fit(self.predict(X), label)
                print('using label base traning data to fit stack layer classifier')
            elif self.training_type == 'proba_base': 
                self.layer_model.fit(self.predict_proba(X), label)
                print('using probability base traning data to fit stack layer classifier')
                
            self.superlearner = VotingClassifier(estimators = [('bm',self.base_models), 
                                                               ('svm', self.layer_model)],voting = 'hard')
            print('stack layer model is SVM')
            
            
        elif self.layer_type == 'Logistic Regression':
            self.layer_model = LogisticRegression()
            
            if self.training_type == 'label_base': 
                self.layer_model.fit(self.predict(X), label)
                print('using label base traning data to fit stack layer classifier')
            elif self.training_type == 'proba_base': 
                self.layer_model.fit(self.predict_proba(X), label)
                print('using probability base traning data to fit stack layer classifier')
                
            self.superlearner = VotingClassifier(estimators = [('bm',self.base_models), 
                                                               ('lr', self.layer_model)],voting = 'hard')
            print('stack layer model is Logistic Regression ')
            
        elif self.layer_type == 'KNN':
            self.layer_model = KNeighborsClassifier()

            if self.training_type == 'label_base': 
                self.layer_model.fit(self.predict(X), label)
                print('using label base traning data to fit stack layer classifier')
            elif self.training_type == 'proba_base': 
                self.layer_model.fit(self.predict_proba(X), label)
                print('using probability base traning data to fit stack layer classifier')
                
            self.superlearner = VotingClassifier(estimators = [('bm', self.base_models), 
                                                               ('knn', self.layer_model)],voting = 'hard')        
            print('stack layer model is KNN ')
                 
        print('**************************')
        print('CONSTRUCT AND FIT A SUPERLEARNER')
        return self.superlearner.fit(X,y)
          
#         return self.superlearner.fit(X_train_plus_valid, y_train_plus_valid)
                                        
    def predict(self, X):
        
        base_svm_pred_list = []
        base_rf_pred_list = []
        base_dt_pred_list = []
        base_lr_pred_list = []
        base_knn_pred_list = []
        base_nb_pred_list = []
            
        for i in test_index:
            base_svm_pred = self.svm_model.predict(X.iloc[i])
            base_svm_pred_list.append(base_svm_pred)

            base_dt_pred = self.dt_model.predict(X.iloc[i])
            base_dt_pred_list.append(base_dt_pred)

            base_rf_pred = self.rf_model.predict(X.iloc[i])
            base_rf_pred_list.append(base_rf_pred)

            base_lr_pred = self.lr_model.predict(X.iloc[i])
            base_lr_pred_list.append(base_lr_pred)

            base_knn_pred = self.knn_model.predict(X.iloc[i])
            base_knn_pred_list.append(base_knn_pred)

            base_nb_pred = self.nb_model.predict(X.iloc[i])
            base_nb_pred_list.append(base_nb_pred)

        # construct traning data for stack layer classifier (label-based)
        svm_data = np.concatenate(base_svm_pred_list[0:len(base_svm_pred_list)])
        dt_data = np.concatenate(base_dt_pred_list[0:len(base_dt_pred_list)])
        rf_data = np.concatenate(base_rf_pred_list[0:len(base_rf_pred_list)])
        lr_data = np.concatenate(base_lr_pred_list[0:len(base_lr_pred_list)])
        knn_data = np.concatenate(base_knn_pred_list[0:len(base_knn_pred_list)])
        nb_data = np.concatenate(base_nb_pred_list[0:len(base_nb_pred_list)])

        conbination_list = [svm_data, dt_data, rf_data, lr_data, knn_data, nb_data]
        training_data_label_base = pd.DataFrame(conbination_list).T
            
        return training_data_label_base
                

    def predict_proba(self, X):
        
        base_svm_proba_list = []
        base_rf_proba_list = []
        base_dt_proba_list = []
        base_lr_proba_list = []
        base_knn_proba_list = []
        base_nb_proba_list = []
        
        for j in test_index:
            base_svm_proba = self.svm_model.predict_proba(X.iloc[j])
            base_svm_proba_list.append(base_svm_proba)
            
            base_dt_proba = self.dt_model.predict_proba(X.iloc[j])
            base_dt_proba_list.append(base_dt_proba)
            
            base_rf_proba = self.rf_model.predict_proba(X.iloc[j])
            base_rf_proba_list.append(base_rf_proba)
            
            base_lr_proba = self.lr_model.predict_proba(X.iloc[j])
            base_lr_proba_list.append(base_lr_proba)
            
            base_knn_proba = self.knn_model.predict_proba(X.iloc[j])
            base_knn_proba_list.append(base_knn_proba)
                        
            base_nb_proba = self.nb_model.predict_proba(X.iloc[j])
            base_nb_proba_list.append(base_nb_proba)
        
        # construct traning data for stack layer classifier (probability-based)
        svm_proba_data = np.concatenate(base_svm_proba_list[0:len(base_svm_proba_list)])
        dt_proba_data = np.concatenate(base_dt_proba_list[0:len(base_dt_proba_list)])
        rf_proba_data = np.concatenate(base_rf_proba_list[0:len(base_rf_proba_list)])
        lr_proba_data = np.concatenate(base_lr_proba_list[0:len(base_lr_proba_list)])
        knn_proba_data = np.concatenate(base_knn_proba_list[0:len(base_knn_proba_list)])
        nb_proba_data = np.concatenate(base_nb_proba_list[0:len(base_nb_proba_list)])
        training_data_proba_base = np.concatenate([svm_proba_data, dt_proba_data, rf_proba_data, 
                                      lr_proba_data, knn_proba_data, nb_proba_data ], axis = 1)        
        return training_data_proba_base

### Test the SuperLearnerClassifier

Perform a simple test using the SuperLearnClassifier on the Iris dataset

In [3]:
# from sklearn.datasets import load_iris
# iris = load_iris()
# clf = SuperLearnerClassifier()
# cv_folds = 2
# clf.fit(iris.data, iris.target)
# clf.predict(iris.data)
# clf.predict_proba(iris.data)
# x = cross_val_score(clf, iris.data, iris.target, cv=10)

## Load & Partition Data

### Setup - IMPORTANT

Take only a sample of the dataset for fast testing

In [4]:
data_sampling_rate = 0.05

Setup the number of folds for all grid searches (should be 5 - 10)

In [5]:
cv_folds = 5
kfold = KFold(5)

### Load Dataset

Load the dataset and explore it.

In [6]:
dataset = pd.read_csv('fashion-mnist_train.csv')
dataset = dataset.sample(frac=data_sampling_rate) #take a sample from the dataset so everyhting runs smoothly
num_classes = 10
classes = {0: "T-shirt/top", 1:"Trouser", 2: "Pullover", 3:"Dress", 4:"Coat", 5:"Sandal", 6:"Shirt", 7:"Sneaker", 8:"Bag", 9:"Ankle boot"}
display(dataset.head())

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
2947,2,0,0,0,0,0,0,0,0,25,...,123,128,123,0,0,0,0,0,0,0
17989,6,0,0,0,0,0,0,0,0,0,...,96,10,0,0,0,0,0,0,0,0
6447,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37821,8,0,0,0,0,0,0,0,0,0,...,93,93,93,91,85,97,22,0,0,0
2858,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Pre-process & Partition Data

Perform data pre-processing and manipulation as required

In [7]:
# X - train set y - test set 
X = dataset[dataset.columns[1:]]
y = np.array(dataset["label"])

In [8]:
# Normalise the data
X = X/255

In [None]:
#Split the data into a training set, a vaidation set, and a test set
X_train_plus_valid, X_test, y_train_plus_valid, y_test \
    = train_test_split(X, y, random_state=0, \
                                    train_size = 0.7)
X_train, X_valid, y_train, y_valid \
    = train_test_split(X_train_plus_valid, \
                                        y_train_plus_valid, \
                                        random_state=0, \
                                        train_size = 0.5/0.7)

## Train and Evaluate a Simple Model

Train a Super Learner Classifier using the prepared dataset

In [None]:
#stack learner is decision tree
clf1 = SuperLearnerClassifier('label_base', 'Decision Tree', ['SVM', 'Decision Tree','Ramdon Forest','Logistical Regression','KNN','Navie Bayes'])
superlearner1 = clf1.fit(X_train_plus_valid, y_train_plus_valid)
# clf.predict(X_train_plus_valid)

[SVM fold 0], fit SVM model
[Decision Tree fold 0], fit decision tree model
[Random Forest fold 0], fit random forest model
[Logistical Regression fold 0], fit logistical regression model
[KNN fold 0], fit KNN model
[Navie Bayes fold 0], fit navie bayes model
**************************
[SVM fold 1], fit SVM model
[Decision Tree fold 1], fit decision tree model
[Random Forest fold 1], fit random forest model
[Logistical Regression fold 1], fit logistical regression model
[KNN fold 1], fit KNN model
[Navie Bayes fold 1], fit navie bayes model
**************************
[SVM fold 2], fit SVM model
[Decision Tree fold 2], fit decision tree model
[Random Forest fold 2], fit random forest model
[Logistical Regression fold 2], fit logistical regression model
[KNN fold 2], fit KNN model
[Navie Bayes fold 2], fit navie bayes model
**************************
[SVM fold 3], fit SVM model
[Decision Tree fold 3], fit decision tree model
[Random Forest fold 3], fit random forest model
[Logistical Reg

using label base traning data to fit stack layer classifier
stack layer model is Decision Tree 
**************************
CONSTRUCT AND FIT A SUPERLEARNER


In [None]:
pred = superlearner1.predict(X_valid)

Evaluate the trained classifier

In [None]:
# Evaluate the superLearnerClassifier
accuracy = metrics.accuracy_score(pred, y_valid)
accuracy

## Cross Validation Experiment (Task 2)

Perfrom a 10-fold cross validation experiment to evaluate the performance of the SuperLearnerClassifier

In [None]:
scores = cross_val_score(superlearner1, X_test, y_test, cv=10)
scores

## Comparing the Performance of Different Stack Layer Approaches (Task 5)

Compare the performance of the ensemble when a label based stack layer training set and a probability based stack layer training set is used.

In [None]:
# stack layer is decision tree

In [None]:
pred1 = superlearner1.predict(X_test)
accuracy1 = metrics.accuracy_score(pred1, y_test)
accuracy1

In [None]:
clf2 = SuperLearnerClassifier('proba_base', 'Decision Tree', ['SVM', 'Decision Tree','Ramdon Forest','Logistical Regression','KNN','Navie Bayes'])
superlearner2 = clf2.fit(X_train_plus_valid, y_train_plus_valid)

In [None]:
pred2 = superlearner2.predict(X_test)
accuracy2 = metrics.accuracy_score(pred2, y_test)
accuracy2

In [None]:
# Stack layer is logistic regression | input - label base data

In [None]:
clf3 = SuperLearnerClassifier('label_base', 'Logistic Regression', ['SVM', 'Decision Tree','Ramdon Forest','Logistical Regression','KNN','Navie Bayes'])
superlearner3 = clf3.fit(X_train_plus_valid, y_train_plus_valid)

In [None]:
pred3 = superlearner3.predict(X_test)
accuracy = metrics.accuracy_score(pred3, y_test)
accuracy

In [None]:
# Stack layer is logistic regression | input - probability base data

In [None]:
clf4 = SuperLearnerClassifier('proba_base', 'Logistic Regression', ['SVM', 'Decision Tree','Ramdon Forest','Logistical Regression','KNN','Navie Bayes'])
superlearner4 = clf4.fit(X_train_plus_valid, y_train_plus_valid)

In [None]:
pred4 = superlearner4.predict(X_test)
accuracy4 = metrics.accuracy_score(pred4, y_test)
accuracy4

## Grid Search Through SuperLearnerClassifier Architectures & Parameters (Task 7)

Perfrom a grid search experiment to detemrine the optimal architecture and hyper-parameter values for the SuperLearnClasssifier for the MNIST Fashion classification problem.

In [None]:
# Add code here

Evaluate the performance of the model selected by the grid search on a hold-out dataset

In [None]:
# Add code here

## Evaluating the Impact of Adding Original Descriptive Features at the Stack Layer (Task 8)

Evaluate the impact of adding original descriptive features at the stack layer.

In [None]:
# Add code here

## Explore the Ensemble Model (Task 9)

Perform an analysis to investigate the strength of the base estimators and the strengths of the correlations between them.

In [None]:
# Add code here