# ML Assignment 2 - Active Learning

In [13]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import os
import sys
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

## Load the MNIST digit recognition dataset

In [14]:
# Fetch the OpenML "mnist_784" dataset and pull out features and targets.
mnist = fetch_openml('mnist_784')
x, y = mnist.data, mnist.target
# Total number of examples in the full dataset (before any splitting).
num_examples = len(x)

In [15]:
# Rescale every feature to [0, 1]. fit_transform returns a NEW array rather
# than modifying its input, so the result must be assigned back to x;
# otherwise the splits below would silently use the unscaled data.
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
x  # echo the scaled matrix as the cell output

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Hold out 20% of the data for testing, then keep the labels of only 10% of
# the remaining training points. The other 90% form the unlabelled pool, with
# y_oracle holding the labels that can be revealed on request.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=4
)
X_labelled, X_unlabelled, y_labelled, y_oracle = train_test_split(
    X_train, y_train, test_size=0.90, random_state=4
)

## Uncertainty Sampling
Three information measures are supported: least confident, margin sampling, and entropy.

In [18]:
def uncertaintySampling(model, measure, ALtype, labelPercent):
    """Query the unlabelled points the model is least certain about.

    Every row of the module-level ``X_unlabelled`` pool is scored with
    ``model.predict_proba``; the rows selected by the requested uncertainty
    measure are returned together with their labels from ``y_oracle``.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``.
    measure : str
        Information measure used to rank uncertainty:

        * ``"leastConfident"`` - probability of the most likely class
          (low means uncertain).
        * ``"marginSampling"`` - gap between the two most likely classes
          (small means uncertain).
        * ``"entropy"`` - Shannon entropy in bits of the predicted
          distribution (high means uncertain).
    ALtype : str
        ``"stream"``: each point is judged on its own against a fixed
        threshold (max probability < 0.4, margin < 0.05, entropy > 1.09).
        ``"pool"``: the ``int(num_examples * labelPercent)`` most uncertain
        points of the whole pool are returned, most uncertain first.
    labelPercent : float
        Fraction of the module-level ``num_examples`` to query in pool
        mode. Ignored in stream mode.

    Returns
    -------
    tuple of numpy.ndarray
        ``(queried_points, queried_labels)``.

    Raises
    ------
    ValueError
        If ``measure`` or ``ALtype`` is not one of the supported values.
    """
    # Validate up front with real exceptions: ``assert`` is stripped under
    # ``python -O`` and must not be relied on for argument checking.
    if ALtype not in ("stream", "pool"):
        raise ValueError("Active learning type not implemented")
    if measure not in ("leastConfident", "marginSampling", "entropy"):
        raise ValueError("Information measure not implemented")

    probabilities = np.asarray(model.predict_proba(X_unlabelled))

    # Index positionally through plain arrays so that pandas objects with a
    # shuffled index (as produced by train_test_split) are handled correctly.
    pool_points = np.asarray(X_unlabelled)
    pool_labels = np.asarray(y_oracle)

    # Compute one value per point, the stream threshold for that value, and
    # whether a HIGH value (True) or a LOW value (False) signals uncertainty.
    if measure == "leastConfident":
        values = probabilities.max(axis=1)
        threshold, high_is_uncertain = 0.4, False
    elif measure == "marginSampling":
        # Margin is the gap between the two MOST probable classes, i.e. the
        # last two columns after an ascending sort.
        ranked = np.sort(probabilities, axis=1)
        values = ranked[:, -1] - ranked[:, -2]
        threshold, high_is_uncertain = 0.05, False
    else:  # "entropy"
        # Treat 0 * log2(0) as 0: substituting 1 for zero probabilities
        # makes the log term vanish instead of producing NaN.
        safe = np.where(probabilities > 0, probabilities, 1.0)
        values = -np.sum(probabilities * np.log2(safe), axis=1)
        threshold, high_is_uncertain = 1.09, True

    if ALtype == "stream":
        # The decision for each streamed point depends only on its own
        # probabilities, so the one-by-one scan reduces to a boolean mask.
        if high_is_uncertain:
            selected = values > threshold
        else:
            selected = values < threshold
        return pool_points[selected], pool_labels[selected]

    # Pool-based: the query budget must be an integer to be used as a slice
    # bound, and is clamped at zero for non-positive fractions.
    num_queries = max(int(num_examples * labelPercent), 0)
    # Sort so the most uncertain points come first; taking a prefix (rather
    # than a ``[-n:]`` suffix) correctly yields nothing when the budget is 0.
    ranking_key = -values if high_is_uncertain else values
    indices = np.argsort(ranking_key, kind="mergesort")[:num_queries]
    return pool_points[indices], pool_labels[indices]

## Query By Committee
Two disagreement measures are supported: vote entropy and KL divergence.