In [1]:
from deslib.des.des_knn import DESKNN
from sklearn import datasets
import numpy as np
import pandas as pd
import keras
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import mnist
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, BatchNormalization
import tensorflow as tf
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from keras.models import load_model
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.ensemble import BaggingClassifier

## MNIST dataset
---

In [28]:
# Load MNIST, divide pixel values by 255 and flatten each 28x28 image into a 784-long vector.
mnist_data = mnist.load_data()
(X, y), (X_test, y_test) = mnist_data
X = (X / 255.0).reshape(-1, 28 * 28)
X_test = (X_test / 255.0).reshape(-1, 28 * 28)

# Hold out 10,000 of the training images; they are later used to fit the dynamic selector (DSEL).
X_train, X_dsel, y_train, y_dsel = train_test_split(
    X, y, test_size=10000, random_state=42
)

# m = number of training samples, n = number of features per sample.
m, n = X_train.shape

In [3]:
m, n

(50000, 784)

In [4]:
# Random forest whose trees form the pool of base classifiers.
# random_state is pinned so that Restart & Run All rebuilds the same trees; re-run the
# cells that follow to refresh the accuracies computed from this pool.
pool_of_classifiers = RandomForestClassifier(n_estimators=100, max_features='sqrt', n_jobs=-1, verbose=1, random_state=42)

In [5]:
pool_of_classifiers.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   15.8s finished


In [24]:
y_pred = pool_of_classifiers.predict(X_test[:5000])

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [25]:
accuracy_score(y_test[:5000], y_pred)

0.954

In [11]:
des_knn = DESKNN(pool_classifiers=pool_of_classifiers, DFP=False, with_IH=False, pct_accuracy=0.3, pct_diversity=0.3, n_jobs=-1)

In [12]:
des_knn.fit(X_dsel, y_dsel)

In [26]:
y_pred_des = des_knn.predict(X_test[:5000])

In [27]:
accuracy_score(y_test[:5000], y_pred_des)

0.9474

## Breast Cancer
---

In [2]:
# Load the breast-cancer dataset shipped with scikit-learn and separate features from labels.
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
X, y = data.data, data.target

In [3]:
# Standardize the feature matrix with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any train/DSEL/test
# split is made, so statistics of the held-out samples leak into preprocessing — consider
# fitting it on the training split only and applying transform() to the other splits.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [4]:
# Two-stage split: 200 samples are held out from training; 50 of those become the test set
# and the remaining ones form the DSEL set.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=200, random_state=42)
X_dsel, X_test, y_dsel, y_test = train_test_split(X_holdout, y_holdout, test_size=50, random_state=42)

### Training Random Forest Classifiers

In [73]:
# 400-tree random forest that serves as the classifier pool for DES-KNN.
# random_state is pinned so the pool (and every score derived from it) is reproducible
# after Restart & Run All; re-run the dependent cells to refresh their outputs.
clf1 = RandomForestClassifier(n_estimators=400, max_features='sqrt', n_jobs=-1, verbose=1, random_state=42)

In [74]:
X.shape

(569, 30)

In [75]:
clf1.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.0s finished


In [76]:
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.0s finished


0.94

In [105]:
des_knn = DESKNN(pool_classifiers=clf1, DFP=False, k=10 ,with_IH=False, pct_accuracy=0.5, pct_diversity=0.3, n_jobs=-1, voting='hard')

In [106]:
des_knn.fit(X_dsel, y_dsel)

In [107]:
y_pred_des = des_knn.predict(X_test)
accuracy_score(y_test, y_pred_des)

0.98

#### Using DES-KNN

In [104]:
# NOTE(review): StackedClassifier, SingleBest and StaticSelection are imported here but not
# used in this cell; only Oracle is.
from deslib.static import (StackedClassifier,
                           SingleBest,
                           StaticSelection,
                           Oracle)
# Wrap the fitted forest in deslib's Oracle and print its score on the test split.
# NOTE(review): presumably this is an upper bound for any selection scheme over this pool
# (a sample counts as correct if some base classifier gets it right) — confirm against the
# deslib Oracle documentation.
oracle = Oracle(clf1).fit(X_train, y_train)
print('Oracle result: {}' .format(oracle.score(X_test, y_test)))

Oracle result: 1.0


In [108]:
dist, indx = des_knn.get_competence_region(query=X_test, k=10)

In [111]:
competence_region = indx

In [115]:
predictions = des_knn._predict_base(X_test)

In [116]:
predictions.shape

(50, 400)

In [161]:
predictions # predictions (class given to each sample by each base classifier) 

array([[1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [119]:
accuracy, diversity = des_knn.estimate_competence(competence_region=competence_region, distances=dist, predictions=predictions)

In [130]:
accuracy # shape would be -> (n_test_samples, n_base_clf)

array([[1. , 1. , 1. , ..., 1. , 1. , 1. ],
       [1. , 1. , 1. , ..., 0.9, 1. , 1. ],
       [1. , 1. , 1. , ..., 1. , 1. , 1. ],
       ...,
       [1. , 1. , 1. , ..., 1. , 1. , 1. ],
       [1. , 1. , 1. , ..., 1. , 0.9, 0.8],
       [0.9, 0.9, 1. , ..., 0.7, 0.9, 1. ]])

In [131]:
diversity # shape would be -> (n_test_samples, n_base_clf)

array([[  0. ,   0. ,   0. , ...,   0. ,   0. ,   0. ],
       [  0. ,   0. ,   0. , ...,  -3.9,   0. ,   0. ],
       [  0. ,   0. ,   0. , ...,   0. ,   0. ,   0. ],
       ...,
       [  0. ,   0. ,   0. , ...,   0. ,   0. ,   0. ],
       [  0. ,   0. ,   0. , ...,   0. , -11.3, -15.6],
       [-28. , -28. ,   0. , ..., -33.1, -28. ,   0. ]])

In [146]:
# Ask DES-KNN which base classifiers it selects for every query sample, given the
# accuracy and diversity estimates computed above.
selected_clf = des_knn.select(accuracy=accuracy, diversity=diversity)
print ("The shape of selected_clf - " , selected_clf.shape) # Shape = (n_test_samples, J)
# 400*0.3 = 120 classifiers selected for each query sample

The shape of selected_clf -  (50, 120)


In [154]:
selected_clf # The elements are the indices of classifiers in the pool which is selected for further voting for that query sample

array([[348, 189, 398, ...,  85,  84,  83],
       [341, 209, 164, ...,  80,  81, 104],
       [326, 193, 180, ...,  90,  88,  87],
       ...,
       [199, 188, 198, ...,  82,  81,  80],
       [343, 192, 213, ..., 368, 303, 366],
       [399,  71, 135, ..., 390,  17,  38]], dtype=int64)

In [4]:
X.shape, y.shape

((569, 30), (569,))

In [5]:
# Two-stage split: half of the data is used for training, the other half is split evenly
# into the DSEL set and the test set.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.5, random_state=42)
X_dsel, X_test, y_dsel, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=42)

In [6]:
X_train.shape, X_dsel.shape, X_test.shape

((284, 30), (142, 30), (143, 30))

In [7]:
# 284 samples for training (50% data)
# 142 for DSEL (25% data)
# 143 for test (25% data)

In [22]:
num_classifiers = 100

# Bagging ensemble of `num_classifiers` perceptrons; max_samples / max_features = 0.5 means
# every member is configured to draw half of the samples and half of the features.
base_perceptron = Perceptron(early_stopping=True, verbose=1, n_jobs=-1)
pool_of_classifiers = BaggingClassifier(
    estimator=base_perceptron,
    n_estimators=num_classifiers,
    max_samples=0.5,
    max_features=0.5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

In [23]:
pool_of_classifiers

In [24]:
pool_of_classifiers.fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.3s remaining:    7.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.3s finished


In [25]:
pool_of_classifiers.score(X_test, y_test)*100

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.0s finished


98.6013986013986

#### Using KNORA-U

In [26]:
from deslib.des.knora_u import KNORAU

# KNORA-U over the bagged perceptron pool, using k=7 neighbours and hard voting.
knora_u = KNORAU(
    pool_classifiers=pool_of_classifiers,
    k=7,
    voting='hard',
    DFP=False,
    with_IH=False,
    n_jobs=-1,
)

In [27]:
knora_u.fit(X_dsel, y_dsel)

In [28]:
knora_u.score(X_test, y_test)

0.986013986013986

In [29]:
dist, indx = knora_u.get_competence_region(X_test)

In [30]:
dist, dist.shape

(array([[1.70096636, 1.79422219, 2.04943359, ..., 2.55897825, 2.62398469,
         2.66412718],
        [1.98302909, 2.12452483, 2.23690662, ..., 2.33189566, 2.39503271,
         2.43112084],
        [2.80329045, 3.55623735, 3.90831616, ..., 4.20191363, 4.20493542,
         4.33698177],
        ...,
        [2.85857113, 2.86758023, 3.06148603, ..., 3.48591018, 3.51521823,
         3.55405064],
        [1.44489092, 1.89647688, 1.94433377, ..., 2.11097084, 2.32832155,
         2.37367061],
        [2.0722089 , 2.3219208 , 2.5219492 , ..., 2.56852142, 2.59515977,
         2.62025291]]),
 (143, 7))

In [31]:
indx, indx.shape # the shape should be (n_test_samples, k)
# k is the number of nearest neighbours we want around each sample; here k = 7

(array([[ 43,  88, 116, ...,  12, 135,  62],
        [100,  44,  21, ...,  70,   1, 123],
        [ 10,  56,  67, ...,  64,  48,  25],
        ...,
        [  4,   2,  94, ...,  53,  51,  80],
        [113,  88, 105, ..., 111,  43,  60],
        [ 77,  21,  52, ..., 104,  85,  44]], dtype=int64),
 (143, 7))

In [32]:
predictions = knora_u._predict_base(X_test) # get predictions for each base present in the pool of classifiers for each test sample
predictions, predictions.shape

(array([[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1]], dtype=int64),
 (143, 100))

In [33]:
competence_level = knora_u.estimate_competence(competence_region=indx, distances=dist, predictions=predictions)

In [34]:
competence_level, competence_level.shape # get competence level for all base classifiers for each test sample

(array([[7., 7., 7., ..., 7., 7., 7.],
        [7., 7., 7., ..., 7., 7., 7.],
        [7., 7., 7., ..., 7., 7., 7.],
        ...,
        [7., 7., 7., ..., 7., 7., 7.],
        [6., 7., 6., ..., 6., 6., 6.],
        [7., 7., 7., ..., 7., 7., 7.]]),
 (143, 100))

In [35]:
selected_classifiers = knora_u.select(competence_level) # Basically, where competence_level > 0 then selected

In [36]:
selected_classifiers, selected_classifiers.shape

(array([[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]]),
 (143, 100))

In [37]:
# Row / column positions (query sample, classifier) at which the selection mask is False.
i, j = np.nonzero(np.logical_not(selected_classifiers))

In [38]:
# One entry of `i` per unselected (query sample, classifier) pair, so its length is the count.
num_clf_notSelected = len(i)
num_clf_notSelected

0

So, the number of (query sample, classifier) pairs that are <span style="color:red">**NOT**</span> selected is <span style="color:red">**0**</span> in the run shown above. This may be the reason why using this technique results in **little** or **no** difference in accuracy: in KNORA-U we end up selecting all classifiers for almost every test sample, because by definition it selects every classifier that produces at least one correct output for any sample in the competence region of the test sample.

##### Selecting Subset from Selected models from KNORA-U

In [39]:
def predict_by_subset(selected_classifiers, competence_level, knora_u, X_test, num_minusone_rem=23, random_state=None):
    '''
    -----------------------------------------------------------------
    Competence-weighted voting over a subset of the classifier pool.
    This function calculates the final output label (0 or 1) for each sample.
    -----------------------------------------------------------------
    For every query sample, the base classifiers flagged in `selected_classifiers`
    keep their vote and all other classifiers are masked with -1. If fewer than
    `num_minusone_rem` classifiers are masked for a sample, additional ones are
    masked at random until exactly `num_minusone_rem` are removed. The surviving
    votes are weighted by `competence_level`; the class with the larger total
    weight wins and ties are resolved in favour of class 1.

    Only binary problems with labels 0 and 1 are handled: votes for any other
    label are ignored.

    Parameters:
        selected_classifiers : boolean mask, shape -> (num_query_sample, num_classifiers);
                               True where a base classifier is selected for a query sample.
        competence_level : shape -> (num_query_sample, num_classifiers); voting weight of
                           each base classifier for each query sample.
        knora_u : object exposing `_predict_base(X_test)`, which must return the base
                  classifiers' predictions with shape (num_query_sample, num_classifiers).
        X_test : query samples, passed through to `knora_u._predict_base`.
        num_minusone_rem : minimum number of base classifiers removed (masked) per query sample.
        random_state : None, int, numpy RandomState or numpy Generator. None keeps using
                       numpy's global random state; any other value makes the random
                       removal reproducible.
    Returns:
        y_pred : shape -> (num_query_sample, ), integer array with the final prediction
                 for each query sample.
    Raises:
        ValueError : if `num_minusone_rem` exceeds the number of base classifiers.
    '''
    selection_mask = np.asarray(selected_classifiers, dtype=bool)
    competence_level = np.asarray(competence_level)
    n_queries, n_classifiers = selection_mask.shape

    # Removing more models than the pool holds can never succeed; fail with a clear message
    # instead of the opaque error numpy raises when sampling without replacement.
    if num_minusone_rem > n_classifiers:
        raise ValueError(
            f"num_minusone_rem={num_minusone_rem} exceeds the number of base classifiers ({n_classifiers})"
        )

    if random_state is None:
        rng = np.random  # legacy global state, i.e. the behaviour of a plain np.random.choice
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    base_predictions = np.asarray(knora_u._predict_base(X_test))

    # Every vote starts out masked (-1); only the selected classifiers get their prediction
    # back. Building the mask from the predictions themselves would leave the 0-votes of
    # unselected classifiers in play.
    subset_predictions = np.where(selection_mask, base_predictions, -1)

    print(f"{num_minusone_rem} models will be removed for each query sample!")
    for row in range(n_queries):  # per query sample, so each sample gets its own random draw
        n_masked = np.sum(subset_predictions[row] == -1)
        if n_masked < num_minusone_rem:
            candidates = np.where(subset_predictions[row] != -1)[0]
            extra_masked = rng.choice(candidates, num_minusone_rem - n_masked, replace=False)
            subset_predictions[row, extra_masked] = -1

    # Sum the competence of the surviving voters separately for class 1 and class 0.
    votes_ones = np.sum(np.where(subset_predictions == 1, competence_level, 0), axis=1)
    votes_zeros = np.sum(np.where(subset_predictions == 0, competence_level, 0), axis=1)

    # Ties (including "nobody left to vote") go to class 1.
    return np.where(votes_ones >= votes_zeros, 1, 0).astype(int)

In [126]:
y_subset_pred = predict_by_subset(selected_classifiers=selected_classifiers, competence_level=competence_level, knora_u=knora_u, X_test=X_test, num_minusone_rem=90)

90 models will be removed for each query sample!


In [127]:
accuracy_score(y_test, y_subset_pred)

0.993006993006993