## Stream and Pool Active Learning using QBC Strategy

In [1]:
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

### Load the Breast Cancer dataset

In [3]:
# load dataset
x, y = fetch_covtype(return_X_y=True)
NUM_CLASSES = 7

In [4]:
# pre-processing, scale features between 0 and 1
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

In [5]:
# divide the whole dataset in train & test splits
x, _, y, _ = train_test_split(x, y, test_size = 0.70, random_state = 4) # discard because data is huge
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.40, random_state = 4)

# further divide the train data in labelled & unlabelled data
# retain only 10% of labelled points for training
NUM_EXAMPLES = len(X_train)
X_labelled, X_unlabelled, y_labelled, y_oracle = train_test_split(X_train, y_train, test_size = 0.90, random_state = 4)

## Query By Committee
Committee used:
- Decision Tree Classifier
- Random Forest Classifier
- Gaussian NB
- K Neighbors Classifier
- Ada Boost Classifier

Disagreement measures used:
- Vote entropy
- KL Divergence

In [6]:
committee = []
committee.append(DecisionTreeClassifier())
committee.append(RandomForestClassifier())
committee.append(GaussianNB())
committee.append(KNeighborsClassifier())
committee.append(AdaBoostClassifier())

NUM_COMMITTEE_MEMBERS = len(committee)

for i in range(NUM_COMMITTEE_MEMBERS):
    committee[i].fit(X_labelled, y_labelled)

In [22]:
def qbc(committee, measure, ALtype, labelPercent):
    
    if ALtype == "stream":
        queriedData, newLabels = [], []
        # stream of points - one by one
        for i in range(len(X_unlabelled)):
            predictions = []
            # each committee member gives their prediction
            for j in range(NUM_COMMITTEE_MEMBERS):
                committee_model = committee[j]
                predictions.append(committee_model.predict(X_unlabelled[i].reshape(1, -1)))
            # vote entropy based disagreement
            if measure == "voteEntropy":
                entropy = 0
                # finding counts for each class
                for k in range(NUM_CLASSES):
                    count = predictions.count(k)
                    # no entropy if vote doesn't exist
                    if count == 0:
                        continue
                    entropy -= (count / NUM_COMMITTEE_MEMBERS) * np.log2(count / NUM_COMMITTEE_MEMBERS)
                if entropy > 0.9:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            # KL Divergence based disagreement   
            elif measure == "KLDivergence":
                kld = 0
                # finding counts for each class
                for k in range(NUM_CLASSES):
                    count = predictions.count(k)
                    # no KLD if vote doesn't exist
                    if count == 0:
                        continue
                    kld -= count * np.log2(count / NUM_COMMITTEE_MEMBERS)
                if kld > 7:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            else:
                assert False, "Disagreement measure not implemented"
        return np.asarray(queriedData), np.asarray(newLabels)
            
        
    elif ALtype == "pool":
        predictions = []
        disagreement = []
        num_queries = int(NUM_EXAMPLES * labelPercent)
        for k in range(NUM_COMMITTEE_MEMBERS):
            predictions.append(committee[k].predict(X_unlabelled))
        predictions = np.asarray(predictions)
        for i in range(len(X_unlabelled)):
            instanceDisagreement = 0
            for j in range(NUM_CLASSES):
                count = sum(predictions[:, i] == (j + 1))
                # no contibution for classes not present in prediction
                if count == 0:
                    continue
                if measure == "voteEntropy":
                    instanceDisagreement -= (count / NUM_COMMITTEE_MEMBERS) * np.log2(count / NUM_COMMITTEE_MEMBERS)
                elif measure == "KLDivergence":
                    instanceDisagreement -= count * np.log2(count / NUM_COMMITTEE_MEMBERS)
                else:
                    assert False, "Disagreement measure not implemented"
            disagreement.append(instanceDisagreement)
        disagreement = np.asarray(disagreement)
        indices = np.argsort(disagreement, axis = 0)[:num_queries]
        return X_unlabelled[indices], y_oracle[indices]
    else: 
        assert False, "Active learning type not implemented"

### Machine Learning Model Training without Additional Data

Model used is `Naive Bayes Classifier`

In [8]:
model = GaussianNB()

model.fit(X_labelled, y_labelled)
prediction = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, prediction))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, prediction))

Accuracy: 0.09507759387280916

Confusion Matrix:
[[  537   178   259     0  2541   624 21260]
 [ 2954   873  2912   347  9256  1747 16059]
 [    0     0  1701  2532     3    10    15]
 [    0     0     5   341     0     0     0]
 [    0     3   286     0   668    67   148]
 [    0     0   635  1141    42   181    21]
 [    9     0     7     0    23     9  2328]]


## Results for different Active Learning Types with various Information Measures

In [9]:
def evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent):
    X_active, y_active = X_labelled.copy(), y_labelled.copy()

    queriedData, newLabels = qbc(committee, measure, al_type, label_percent)

    X_active = np.concatenate((X_active, queriedData))
    y_active = np.concatenate((y_active, newLabels))
    
    new_model = GaussianNB()

    new_model.fit(X_active, y_active)
    prediction = new_model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, prediction))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, prediction))

### Stream using Vote Entropy

In [17]:
al_type = 'stream'
measure = 'voteEntropy'
label_percent = 0.2
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent)

KeyboardInterrupt: 

### Stream using KL Divergence

In [11]:
al_type = 'stream'
measure = 'KLDivergence'
label_percent = 0.1
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent)

Accuracy: 0.13239723473222226

Confusion Matrix:
[[  262   453   259     0  5352    19 19054]
 [   64  3763  2914   345 13837   211 13014]
 [    0     0  1706  2532     6     3    14]
 [    0     0     5   341     0     0     0]
 [    0     3   286     0   763     0   120]
 [    0     0   651  1140   154    50    25]
 [    9     0     7     0    14     0  2346]]


### Pool using Vote Entropy

In [23]:
al_type = 'pool'
measure = 'voteEntropy'
label_percent = 0.1
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent)

[1 2 3 4 5 6 7]
Accuracy: 0.0924385416367861

Confusion Matrix:
[[  729    70   233     0  2862  1683 19822]
 [ 3568   337  2801   351  9924  2175 14992]
 [    0     0  1797  2404     9    36    15]
 [    0     0     9   337     0     0     0]
 [    0     2   211     0   709    89   161]
 [    0     1   607  1101    53   230    28]
 [   21     9     7     0    23    10  2306]]


### Pool using KL Divergence

In [24]:
al_type = 'pool'
measure = 'KLDivergence'
label_percent = 0.1
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent)

[1 2 3 4 5 6 7]
Accuracy: 0.0924385416367861

Confusion Matrix:
[[  729    70   233     0  2862  1683 19822]
 [ 3568   337  2801   351  9924  2175 14992]
 [    0     0  1797  2404     9    36    15]
 [    0     0     9   337     0     0     0]
 [    0     2   211     0   709    89   161]
 [    0     1   607  1101    53   230    28]
 [   21     9     7     0    23    10  2306]]
