# Stream and Pool Active Learning using Uncertainty Strategy

In [1]:
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

### Load the Breast Cancer dataset

In [2]:
# load dataset
x, y = fetch_covtype(return_X_y=True)

In [3]:
# pre-processing, scale features between 0 and 1
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

In [4]:
# divide the whole dataset in train & test splits
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.40, random_state = 4)

# further divide the train data in labelled & unlabelled data
# retain only 3% of labelled points for training
NUM_EXAMPLES = len(X_train)
X_labelled, X_unlabelled, y_labelled, y_oracle = train_test_split(X_train, y_train, test_size = 0.90, random_state = 4)

### Uncertainty Sampling Implementation
Information measures used:
- Least Confident
- Margin Sampling
- Entropy

In [5]:
def uncertaintySampling(model, measure, ALtype, labelPercent):

    probabilities = model.predict_proba(X_unlabelled)

    # stream-based active learning
    if ALtype == "stream":
        queriedData, newLabels = [], []
        # stream of points - one by one
        num_queries = int(NUM_EXAMPLES * labelPercent)
        for i in range(len(X_unlabelled)):
            if i == num_queries:
                break
            # return all points with confidence threshold
            if measure == "leastConfident":
                if probabilities[i].max() < 0.6:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            # return all points with margin threshold
            elif measure == "marginSampling":
                sortedProbabilities = np.sort(probabilities[i])
                margin = abs(sortedProbabilities[1] - sortedProbabilities[0])
                if margin < 0.2:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            # return all points with entropy threshold
            elif measure == "entropy":
#                 print(probabilities[i])
                entropy = -np.sum(probabilities[i] * np.log2(probabilities[i] + 1e-100))
#                 print(entropy)
                if entropy > 0.99:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            else:
                assert False, "Information measure not implemented"
        return np.asarray(queriedData), np.asarray(newLabels)

    
    # pool-based active learning
    elif ALtype == "pool":
        # return labelPercent points of all points
        num_queries = int(NUM_EXAMPLES * labelPercent)
        if measure == "leastConfident":
            sortedProbabilities = np.argsort(probabilities.max(axis=1))
            indices = sortedProbabilities[-num_queries:]
        elif measure == "marginSampling":
            sortedProbabilities = np.sort(probabilities)
            indices = np.argsort(sortedProbabilities[:, -2] - sortedProbabilities[:, -1])[:num_queries]
        elif measure == "entropy":
            entropies = -np.sum(probabilities * np.log2(probabilities + 1e-100), axis=1)
            indices = np.argsort(entropies)[:num_queries]
        else:
            assert False, "Information measure not implemented"            
        return X_unlabelled[indices], y_oracle[indices]

    else:
        assert False, "Active learning type not implemented"

### Machine Learning Model Training without Additional Data

Model used is `Naive Bayes Classifier`

In [6]:
model = GaussianNB()

model.fit(X_labelled, y_labelled)
prediction = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, prediction))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, prediction))

Accuracy: 0.09537660549471827

Confusion Matrix:
[[ 2987   130   833     0 19534   813 60517]
 [13070   895  9629  1239 45354  1240 41955]
 [    0     0  6134  8024    59    46     5]
 [    0     0    33  1057     0     0     0]
 [    0     3   996     0  2590    27   206]
 [    0     0  2446  3475   367   496     5]
 [    4     0    32     0   197     0  8007]]


## Results for different Active Learning Types with various Information Measures

In [7]:
def evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent):
    X_active, y_active = X_labelled.copy(), y_labelled.copy()

    queriedData, newLabels = uncertaintySampling(model, measure, al_type, label_percent)

    X_active = np.concatenate((X_active, queriedData))
    y_active = np.concatenate((y_active, newLabels))
    
    new_model = GaussianNB()

    new_model.fit(X_active, y_active)
    prediction = new_model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, prediction))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, prediction))

### Stream using Least Confident

In [8]:
al_type = 'stream'
measure = 'leastConfident'
label_percent = 0.1
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent)

Accuracy: 0.09588003700436738

Confusion Matrix:
[[ 2983   130   833     0 19710   868 60290]
 [13064   895  9629  1239 46081  1517 40957]
 [    0     0  6134  8024    62    46     2]
 [    0     0    33  1057     0     0     0]
 [    0     3   996     0  2678    32   113]
 [    0     0  2446  3475   344   519     5]
 [    4     0    32     0   187     0  8017]]


### Stream using Margin Sampling

In [9]:
al_type = 'stream'
measure = 'marginSampling'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent)

Accuracy: 0.0908112992405499

Confusion Matrix:
[[ 2635   163   833     0 17892   729 62562]
 [12401   550  9629  1239 45224   845 43494]
 [    0     0  5763  8395    59    46     5]
 [    0     0    14  1076     0     0     0]
 [    0     0   996     0  2561    24   241]
 [    0     0  2224  3697   404   459     5]
 [    4     0    32     0   143     0  8061]]


### Stream using Entropy

In [10]:
al_type = 'stream'
measure = 'entropy'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent)

Accuracy: 0.09575955766872485

Confusion Matrix:
[[ 2985   130   833     0 19674   822 60370]
 [13064   895  9629  1239 45951  1303 41301]
 [    0     0  6134  8024    62    46     2]
 [    0     0    33  1057     0     0     0]
 [    0     3   996     0  2663    27   133]
 [    0     0  2446  3475   354   509     5]
 [    4     0    32     0   192     0  8012]]


### Pool using Least Confident

In [11]:
al_type = 'pool'
measure = 'leastConfident'
label_percent = 0.3
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent)

Accuracy: 0.1834297885157376

Confusion Matrix:
[[13863  1362   833     0 49866   544 18346]
 [ 4726 11920  9616  1252 72873   381 12614]
 [    0     0  5680  8107    40   417    24]
 [    0     0     0  1071     0    19     0]
 [    0     3   996     0  2606    24   193]
 [    0     0  2107  3592   622   465     3]
 [  123     0    32     0  1060     0  7025]]


### Pool using Margin Sampling

In [12]:
al_type = 'pool'
measure = 'marginSampling'
label_percent = 0.2
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent)

Accuracy: 0.1758223790365956

Confusion Matrix:
[[12092  1362   833     0 57539   544 12444]
 [ 3976 11920  9618  1250 84004   381  2233]
 [    0     0  6067  8091    64    46     0]
 [    0     0    19  1071     0     0     0]
 [    0     3   996     0  2799    24     0]
 [    0     0  2329  3592   625   243     0]
 [  123     0    32     0  1415     0  6670]]


### Pool using Entropy

In [13]:
al_type = 'pool'
measure = 'entropy'
label_percent = 0.2
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent)

Accuracy: 0.20475032809104796

Confusion Matrix:
[[14746  1362   833     0 57577   545  9751]
 [ 4966 11920 10437   431 84004   381  1243]
 [    0     0 10121  3748    64   335     0]
 [    0     0   157   914     0    19     0]
 [    0     3   996     0  2799    24     0]
 [    0     0  4388  1347   625   429     0]
 [  137     0    32     0  1415     0  6656]]
