## Stream and Pool Active Learning using QBC Strategy

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

### Load the Breast Cancer dataset

In [3]:
# load dataset
x, y = load_breast_cancer(return_X_y=True)
NUM_EXAMPLES = len(x)
NUM_CLASSES = 2

In [4]:
# pre-processing, scale features between 0 and 1
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

In [5]:
# divide the whole dataset in train & test splits
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 4)

# further divide the train data in labelled & unlabelled data
# retain only 3% of labelled points for training
X_labelled, X_unlabelled, y_labelled, y_oracle = train_test_split(X_train, y_train, test_size = 0.97, random_state = 4)

## Query By Committee
Committee used:
- Logistic Regression
- Decision Tree Classifier
- Random Forest Classifier
- SVC
- Gaussian NB
- Linear Discriminant Analysis
- K Neighbors Classifier
- Ada Boost Classifier

Disagreement measures used:
- Vote entropy
- KL Divergence

In [6]:
committee = []
committee.append(LogisticRegression())
committee.append(DecisionTreeClassifier())
committee.append(RandomForestClassifier())
committee.append(SVC())
committee.append(GaussianNB())
committee.append(LinearDiscriminantAnalysis())
committee.append(KNeighborsClassifier())
committee.append(AdaBoostClassifier())

NUM_COMMITTEE_MEMBERS = len(committee)

for i in range(NUM_COMMITTEE_MEMBERS):
    committee[i].fit(X_labelled, y_labelled)

In [7]:
def qbc(committee, measure, ALtype, labelPercent):
    
    if ALtype == "stream":
        queriedData, newLabels = [], []
        # stream of points - one by one
        for i in range(len(X_unlabelled)):
            predictions = []
            # each committee member gives their prediction
            for j in range(NUM_COMMITTEE_MEMBERS):
                committee_model = committee[j]
                predictions.append(committee_model.predict(X_unlabelled[i].reshape(1, -1)))
            # vote entropy based disagreement
            if measure == "voteEntropy":
                entropy = 0
                # finding counts for each class
                for k in range(NUM_CLASSES):
                    count = predictions.count(k)
                    # no entropy if vote doesn't exist
                    if count == 0:
                        continue
                    entropy -= (count / NUM_COMMITTEE_MEMBERS) * np.log2(count / NUM_COMMITTEE_MEMBERS)
                if entropy > 0.9:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            # KL Divergence based disagreement   
            elif measure == "KLDivergence":
                kld = 0
                # finding counts for each class
                for k in range(NUM_CLASSES):
                    count = predictions.count(k)
                    # no KLD if vote doesn't exist
                    if count == 0:
                        continue
                    kld -= count * np.log2(count / NUM_COMMITTEE_MEMBERS)
                if kld > 7:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            else:
                assert False, "Disagreement measure not implemented"
        return np.asarray(queriedData), np.asarray(newLabels)
            
        
    elif ALtype == "pool":
        predictions = []
        disagreement = []
        num_queries = int(NUM_EXAMPLES * labelPercent)
        for k in range(NUM_COMMITTEE_MEMBERS):
            predictions.append(committee[k].predict(X_unlabelled))
        predictions = np.asarray(predictions)
        for i in range(len(X_unlabelled)):
            instanceDisagreement = 0
            for j in range(NUM_CLASSES):
                count = predictions[j][i]
                # no contibution for classes not present in prediction
                if count == 0:
                    continue
                if measure == "voteEntropy":
                    instanceDisagreement -= (count / NUM_COMMITTEE_MEMBERS) * np.log2(count / NUM_COMMITTEE_MEMBERS)
                elif measure == "KLDivergence":
                    instanceDisagreement -= count * np.log2(count / NUM_COMMITTEE_MEMBERS)
                else:
                    assert False, "Disagreement measure not implemented"
            disagreement.append(instanceDisagreement)
        disagreement = np.asarray(disagreement)
        indices = np.argsort(disagreement, axis = 0)[:num_queries]
        return X_unlabelled[indices], y_oracle[indices]
    else: 
        assert False, "Active learning type not implemented"

### Machine Learning Model Training without Additional Data

Model used is `Logistic Regression`

In [8]:
model = LogisticRegression(random_state=0)

model.fit(X_labelled, y_labelled)
prediction = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, prediction))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, prediction))

Accuracy: 0.8070175438596491

Confusion Matrix:
[[34  0]
 [22 58]]


In [9]:
strategy_list = ['uncertainty', 'qbc']
al_type_list = ['stream', 'pool']
# measure_list = ['leastConfident', 'marginSampling', 'entropy']
measure_list = ['voteEntropy', 'KLDivergence']

strategy = strategy_list[1]
al_type = al_type_list[1]
measure = measure_list[1]

labelPercent = 0.3 # percent of newly labelled data needed

## Results for different Active Learning Types with various Information Measures

In [10]:
def evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent=None):
    X_active, y_active = X_labelled.copy(), y_labelled.copy()

    queriedData, newLabels = qbc(committee, measure, al_type, labelPercent=label_percent)

    X_active = np.concatenate((X_active, queriedData))
    y_active = np.concatenate((y_active, newLabels))
    
    newModel = LogisticRegression(random_state=0)

    model.fit(X_active, y_active)
    prediction = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, prediction))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, prediction))

### Stream using Vote Entropy

In [11]:
al_type = 'stream'
measure = 'voteEntropy'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type)

Accuracy: 0.9736842105263158

Confusion Matrix:
[[31  3]
 [ 0 80]]


### Stream using KL Divergence

In [12]:
al_type = 'stream'
measure = 'KLDivergence'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type)

Accuracy: 0.9736842105263158

Confusion Matrix:
[[31  3]
 [ 0 80]]


### Pool using Vote Entropy

In [13]:
al_type = 'pool'
measure = 'voteEntropy'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent = 0.3)

Accuracy: 0.8947368421052632

Confusion Matrix:
[[34  0]
 [12 68]]


### Pool using KL Divergence

In [14]:
al_type = 'pool'
measure = 'KLDivergence'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent = 0.3)

Accuracy: 0.8947368421052632

Confusion Matrix:
[[34  0]
 [12 68]]
