# ML Assignment 2 - Active Learning

Group Members
1. Anirudh Srinivasan Chakravarthy - 2017A7PS1195P
2. S Hariharan - 2017A7PS0134P
3. Divyam Goel - 2017A7PS1196P

In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import os
import sys
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix

## Load the MNIST digit recognition dataset

In [3]:
mnist = fetch_openml('mnist_784')
x = mnist.data
y = mnist.target

In [4]:
NUM_EXAMPLES = len(x)
NUM_CLASSES = 10

In [5]:
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

In [6]:
# retaining only 10% of labelled points
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 4)
X_labelled, X_unlabelled, y_labelled, y_oracle = train_test_split(X_train, y_train, test_size = 0.90, random_state = 4)

## Uncertainty Sampling
3 possible information measures: Least confident, margin sampling, entropy

In [7]:
def uncertaintySampling(model, measure, ALtype, labelPercent):
    
    probabilities = model.predict_proba(X_unlabelled)
    
    # stream-based active learning
    if ALtype == "stream":
        
        queriedData, newLabels = [], []
        
        # stream of points - one by one
        for i in range(len(X_unlabelled)):
            
            # slight modification - return all points with confidence threshold
            if measure == "leastConfident":
                if probabilities[i].max() < 0.4:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            
            
            # slight modification - return all points with margin threshold
            elif measure == "marginSampling":
                sortedProbabilities = np.sort(probabilities[i])
                margin = abs(sortedProbabilities[1] - sortedProbabilities[0][0])
                if margin < 0.05:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
    
                
            # slight modification - return all points with entropy threshold
            elif measure == "entropy":
                entropy = -np.sum(probabilities[i] * np.log2(probabilities[i]))
                if entropy > 1.09:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
            
            else:
                assert False, "Information measure not implemented"
                
        return np.asarray(queriedData), np.asarray(newLabels)

    
    # pool-based active learning
    elif ALtype == "pool":
        
        num_queries = int(NUM_EXAMPLES * labelPercent)
        
        if measure == "leastConfident":
            sortedProbabilities = np.argsort(probabilities.max(axis=1))
            indices = sortedProbabilities[:num_queries]
            
        elif measure == "marginSampling":
            sortedProbabilities = np.sort(probabilities)
            indices = np.argsort(sortedProbabilities[:,1] - np.sort(prob)[:,0])[:num_queries]
            
        elif measure == "entropy":
            entropies = -np.sum(probabilities * np.log2(probabilities), axis=1)
            indices = np.argsort(entropies)[-num_queries:]
            
        else:
            assert False, "Information measure not implemented"
            
        return X_unlabelled[indices], y_oracle[indices]

    else:
        assert False, "Active learning type not implemented"

## Query By Committee
2 disagreement measures: Vote entropy and KL Divergence

Number of committee members = 6

In [8]:
# create appropriate committee models though
NUM_COMMITTEE_MEMBERS = 9

def qbc(committee, measure, ALtype, labelPercent):
    
    if ALtype == "stream":
        
        queriedData, newLabels = [], []
            
        # stream of points - one by one
        for i in range(len(X_unlabelled)):
            predictions = []
            
            # each committee member gives their prediction
            for j in range(NUM_COMMITTEE_MEMBERS):
                predictions.append(committee[j].predict(X_unlabelled[i])[0])
        

            # vote entropy based disagreement
            if measure == "voteEntropy":
                entropy = 0
                
                # finding counts for each class
                for i in range(NUM_CLASSES):
                    
                    count = predictions.count(i)
                    
                    # no entropy if vote doesn't exist
                    if count == 0:
                        continue
                        
                    entropy -= (count / NUM_COMMITTEE_MEMBERS) * np.log2(count / NUM_COMMITTEE_MEMBERS)
                    
                if entropy > 1.09:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
                    
            # KL Divergence based disagreement   
            elif measure == "KLDivergence":
                kld = 0
                
                # finding counts for each class
                for i in range(NUM_CLASSES):
                    
                    count = predictions.count(i)
                    
                    # no KLD if vote doesn't exist
                    if count == 0:
                        continue
                        
                    kld -= count * np.log2(count / NUM_COMMITTEE_MEMBERS)
                    
                if kld > 600:
                    queriedData.append(X_unlabelled[i])
                    newLabels.append(y_oracle[i])
                
            else:
                assert False, "Disagreement measure not implemented"
                
            return np.asarray(queriedData), np.asarray(newLabels)
            
        
    elif ALtype == "pool":
        
        predictions = []
        disagreement = []
        num_queries = int(NUM_EXAMPLES * labelPercent)
        
        for j in range(NUM_COMMITTEE_MEMBERS):
            predictions.append(committee[j].predict(X_unlabelled))
            
        for i in range(len(X_unlabelled)):

            instanceDisagreement = 0
            for j in range(NUM_CLASSES):
                count = predictions[i].count(j)
                
                # no contibution for classes not present in prediction
                if count == 0:
                    continue
                
                if measure == "voteEntropy":
                    instanceDisagreement -= (count / NUM_COMMITTEE_MEMBERS) * np.log2(count / NUM_COMMITTEE_MEMBERS)
                    
                elif measure == "KLDivergence":
                    instanceDisagreement -= count * np.log2(count / NUM_COMMITTEE_MEMBERS)
                    
                else:
                    assert False, "Disagreement measure not implemented"
                    
            disagreement.append(instanceDisagreement)
            
        disagreement = np.asarray(disagreement)
        indices = np.argsort(disagreement, axis = 0)[-num_queries:]
        return X_unlabelled[indices], y_oracle[indices]
        
    else: 
        assert False, "Active learning type not implemented"

## Training

Active Learning on Decision tree classifier

In [9]:
model = DecisionTreeClassifier()

model.fit(X_labelled, y_labelled)
prediction = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, prediction))
print(confusion_matrix(y_test, prediction))

Accuracy: 0.758
[[1167    4   31   31   16   37   26   29   17   20]
 [   2 1442   28   23    7   20    7   23   25   15]
 [  23   22  983   83   40   21   52   39   79   25]
 [  18   30   55  981   23  134   18   41   77   47]
 [   1    8   38   17  979   24   48   45   55  147]
 [  54   25   32  111   26  809   58   28   71   50]
 [  24   25   60   19   43   31 1130    9   42   30]
 [   9   24   50   11   18   15    8 1181   23   52]
 [  18   28   55   64   50   66   30   26  922   88]
 [  17   20   42   26  134   49   13   79   64 1018]]


In [10]:
committee = []
committee.append(LogisticRegression())
committee.append(DecisionTreeClassifier())
committee.append(RandomForestClassifier())
committee.append(SVC())
committee.append(GaussianNB())
committee.append(LinearDiscriminantAnalysis())
committee.append(KNeighborsClassifier())
committee.append(MLPClassifier())
committee.append(AdaBoostClassifier())


for i in range(NUM_COMMITTEE_MEMBERS):
    committee[i].fit(X_labelled, y_labelled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Inputs

In [11]:
ALtype = "stream"
strategy = "uncertainty"
measure = "leastConfident"
labelPercent = 0.2 # percent of newly labelled data needed

In [13]:
X_active, y_active = X_labelled, y_labelled

if strategy == "uncertainty":
    queriedData, newLabels = uncertaintySampling(model, measure, ALtype, labelPercent)
    X_active = np.concatenate(X_active, queriedData)
    y_active = np.concatenate(y_active, newLabels)
    
elif strategy == "qbc":
    queriedData, newLabels = qbc(model, measure, ALtype, labelPercent)
    X_active = np.concatenate(X_active, queriedData)
    y_active = np.concatenate(y_active, newLabels)
    
else:
    assert False, "Strategy not implemented"

TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
newModel = DecisionTreeClassifier()

model.fit(X_active, y_active)
prediction = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, prediction))
print(confusion_matrix(y_test, prediction))