# Stream and Pool Active Learning using Uncertainty Strategy

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

### Load the Breast Cancer dataset

In [2]:
# load dataset
x, y = load_breast_cancer(return_X_y=True)
NUM_EXAMPLES = len(x)

In [3]:
# pre-processing, scale features between 0 and 1
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

In [4]:
# divide the whole dataset in train & test splits
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 4)

# further divide the train data in labelled & unlabelled data
# retain only 3% of labelled points for training
X_labelled, X_unlabelled, y_labelled, y_oracle = train_test_split(X_train, y_train, test_size = 0.97, random_state = 4)

### Uncertainty Sampling Implementation
Information measures used:
- Least Confident
- Margin Sampling
- Entropy

In [5]:
def uncertaintySampling(model, measure, ALtype, labelPercent=None):

    probabilities = model.predict_proba(X_unlabelled)

    # stream-based active learning
    if ALtype == "stream":
        queriedData, newLabels = [], []
        # stream of points - one by one
        for i in range(len(X_unlabelled)):
            # return all points with confidence threshold
            if measure == "leastConfident":
                if probabilities[i].max() < 0.6:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            # return all points with margin threshold
            elif measure == "marginSampling":
                sortedProbabilities = np.sort(probabilities[i])
                margin = abs(sortedProbabilities[1] - sortedProbabilities[0])
                if margin < 0.2:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            # return all points with entropy threshold
            elif measure == "entropy":
                entropy = -np.sum(probabilities[i] * np.log2(probabilities[i]))
                if entropy > 0.99:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            else:
                assert False, "Information measure not implemented"
        return np.asarray(queriedData), np.asarray(newLabels)

    
    # pool-based active learning
    elif ALtype == "pool":
        # return labelPercent points of all points
        num_queries = int(NUM_EXAMPLES * labelPercent)
        if measure == "leastConfident":
            sortedProbabilities = np.argsort(probabilities.max(axis=1))
            indices = sortedProbabilities[:num_queries]
        elif measure == "marginSampling":
            sortedProbabilities = np.sort(probabilities)
            indices = np.argsort(sortedProbabilities[:,1] - sortedProbabilities[:,0])[:num_queries]
        elif measure == "entropy":
            entropies = -np.sum(probabilities * np.log2(probabilities), axis=1)
            indices = np.argsort(entropies)[-num_queries:]
        else:
            assert False, "Information measure not implemented"            
        return X_unlabelled[indices], y_oracle[indices]

    else:
        assert False, "Active learning type not implemented"

### Machine Learning Model Training without Additional Data

Model used is `Logistic Regression`

In [6]:
model = LogisticRegression(random_state=0)

model.fit(X_labelled, y_labelled)
prediction = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, prediction))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, prediction))

Accuracy: 0.8070175438596491

Confusion Matrix:
[[34  0]
 [22 58]]


## Results for different Active Learning Types with various Information Measures

In [7]:
def evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent=None):
    X_active, y_active = X_labelled.copy(), y_labelled.copy()

    queriedData, newLabels = uncertaintySampling(model, measure, al_type, labelPercent=label_percent)

    X_active = np.concatenate((X_active, queriedData))
    y_active = np.concatenate((y_active, newLabels))
    
    newModel = LogisticRegression(random_state=0)

    model.fit(X_active, y_active)
    prediction = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, prediction))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, prediction))

### Stream using Least Confident

In [8]:
al_type = 'stream'
measure = 'leastConfident'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type)

Accuracy: 0.9649122807017544

Confusion Matrix:
[[30  4]
 [ 0 80]]


### Stream using Margin Sampling

In [9]:
al_type = 'stream'
measure = 'marginSampling'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type)

Accuracy: 0.2982456140350877

Confusion Matrix:
[[34  0]
 [80  0]]


### Stream using Entropy

In [10]:
al_type = 'stream'
measure = 'entropy'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type)

Accuracy: 0.9649122807017544

Confusion Matrix:
[[34  0]
 [ 4 76]]


### Pool using Least Confident

In [11]:
al_type = 'pool'
measure = 'leastConfident'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent = 0.3)

Accuracy: 0.9912280701754386

Confusion Matrix:
[[34  0]
 [ 1 79]]


### Pool using Margin Sampling

In [12]:
al_type = 'pool'
measure = 'marginSampling'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent = 0.3)

Accuracy: 0.9912280701754386

Confusion Matrix:
[[34  0]
 [ 1 79]]


### Pool using Entropy

In [13]:
al_type = 'pool'
measure = 'entropy'
evaluate_using_al(X_labelled, y_labelled, model, measure, al_type, label_percent = 0.3)

Accuracy: 0.9912280701754386

Confusion Matrix:
[[34  0]
 [ 1 79]]
