# ML Assignment 2 - Active Learning

Group Members
1. Anirudh Srinivasan Chakravarthy - 2017A7PS1195P
2. S Hariharan - 2017A7PS0134P
3. Divyam Goel - 2017A7PS1196P

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

### Load the Breast Cancer dataset

In [2]:
# Fetch the breast-cancer feature matrix and target vector in a single call.
x, y = load_breast_cancer(return_X_y=True)
# Total sample count; the pool-based query strategies size their budget from it.
NUM_EXAMPLES = x.shape[0]

In [3]:
# Pre-processing: rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split that follows, so test-set min/max values influence the preprocessing.
# Consider fitting on the training split only.
scaler = MinMaxScaler()
scaler.fit(x)
x = scaler.transform(x)

In [4]:
# Split configuration, kept in one place so the fractions are easy to tune.
SPLIT_SEED = 4
TEST_FRACTION = 0.20   # share of the dataset held out for evaluation
POOL_FRACTION = 0.97   # share of the training data treated as unlabelled

# Hold out the evaluation set first.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Keep only 3% of the training data as the labelled seed set. The remaining
# 97% is the unlabelled pool; its true labels (y_oracle) are only read when a
# query strategy asks for them.
X_labelled, X_unlabelled, y_labelled, y_oracle = train_test_split(
    X_train, y_train, test_size=POOL_FRACTION, random_state=SPLIT_SEED
)

### Uncertainty Sampling
3 possible information measures: Least confident, margin sampling, entropy

In [5]:
def _prediction_entropy(probabilities):
    """Return the row-wise Shannon entropy (in bits) of a 2-D probability array."""
    # log2(0) is -inf and 0 * -inf is NaN. Substituting 1 wherever p == 0 makes
    # those terms contribute exactly 0 instead of poisoning the whole row.
    safe = np.where(probabilities > 0, probabilities, 1.0)
    return -np.sum(probabilities * np.log2(safe), axis=1)


def uncertaintySampling(model, measure, ALtype, labelPercent,
                        X_pool=None, y_pool=None, num_examples=None):
    """Select the points the model is least certain about and fetch their labels.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    measure : {"leastConfident", "marginSampling", "entropy"}
        Information measure used to score uncertainty.
    ALtype : {"stream", "pool"}
        "stream" queries every point that crosses a fixed threshold
        (max probability < 0.6, margin < 0.2, entropy > 0.99 bits).
        "pool" queries the ``int(num_examples * labelPercent)`` most uncertain
        points.
    labelPercent : float
        Fraction of ``num_examples`` to query in pool mode (unused for stream).
    X_pool, y_pool : array-like, optional
        Unlabelled points and their oracle labels. Default to the notebook
        globals ``X_unlabelled`` / ``y_oracle``.
    num_examples : int, optional
        Size the pool budget is computed from. Defaults to the notebook global
        ``NUM_EXAMPLES``.

    Returns
    -------
    (queried_X, queried_y) : tuple of ndarray
        Rows of the pool that were queried and their labels. When nothing is
        queried the arrays have zero rows but keep the feature dimension, so
        they can be concatenated onto the labelled set unchanged.

    Raises
    ------
    AssertionError
        If ``ALtype`` or ``measure`` is not one of the supported values.
    """
    if ALtype not in ("stream", "pool"):
        raise AssertionError("Active learning type not implemented")
    if measure not in ("leastConfident", "marginSampling", "entropy"):
        raise AssertionError("Information measure not implemented")

    # Fall back to the notebook-level split when no explicit pool is supplied.
    X_pool = np.asarray(X_unlabelled if X_pool is None else X_pool)
    y_pool = np.asarray(y_oracle if y_pool is None else y_pool)

    # Nothing to score; avoid calling the model on an empty array.
    if len(X_pool) == 0:
        return X_pool[:0], y_pool[:0]

    probabilities = np.asarray(model.predict_proba(X_pool))

    # One score per pool point. For "leastConfident" and "marginSampling" a
    # LOW score means uncertain; for "entropy" a HIGH score means uncertain.
    if measure == "leastConfident":
        scores = probabilities.max(axis=1)
    elif measure == "marginSampling":
        # Margin between the two MOST likely classes: after an ascending sort
        # those are the last two columns (correct for any number of classes).
        ascending = np.sort(probabilities, axis=1)
        scores = ascending[:, -1] - ascending[:, -2]
    else:
        scores = _prediction_entropy(probabilities)

    # stream-based active learning: threshold each point independently
    if ALtype == "stream":
        if measure == "leastConfident":
            selected = scores < 0.6
        elif measure == "marginSampling":
            selected = scores < 0.2
        else:
            selected = scores > 0.99
        return X_pool[selected], y_pool[selected]

    # pool-based active learning: take the num_queries most uncertain points
    if num_examples is None:
        num_examples = NUM_EXAMPLES
    num_queries = min(max(int(num_examples * labelPercent), 0), len(X_pool))
    if num_queries == 0:
        # Guard explicitly: a [-0:] slice would return the entire pool.
        return X_pool[:0], y_pool[:0]

    ranking = np.argsort(scores)
    if measure == "entropy":
        indices = ranking[-num_queries:]
    else:
        indices = ranking[:num_queries]
    return X_pool[indices], y_pool[indices]

## Query By Committee
2 disagreement measures: Vote entropy and KL Divergence

Number of committee members = 9 (set by `NUM_COMMITTEE_MEMBERS` in the cell below)

In [21]:
# Committee size the notebook uses when building a committee for qbc().
NUM_COMMITTEE_MEMBERS = 9

# Default stream-mode query thresholds, one per disagreement measure.
# NOTE(review): with only two classes the vote entropy never exceeds 1 bit and
# the count-weighted score never exceeds the committee size, so these defaults
# select nothing for a binary task with a 9-member committee. Pass `threshold`
# to qbc() explicitly in that case.
_QBC_STREAM_THRESHOLDS = {"voteEntropy": 1.09, "KLDivergence": 600}


def _committee_disagreement(votes, measure):
    """Score how strongly the committee disagrees on each sample.

    ``votes`` is a 2-D array with one row per sample and one column per
    committee member, holding the predicted class labels.
    """
    num_members = votes.shape[1]
    scores = np.zeros(votes.shape[0])
    for row, sample_votes in enumerate(votes):
        # Only classes that received at least one vote are returned, so the
        # log never sees a zero and no fixed class count is needed.
        counts = np.unique(sample_votes, return_counts=True)[1]
        fractions = counts / num_members
        if measure == "voteEntropy":
            scores[row] = -np.sum(fractions * np.log2(fractions))
        else:
            # NOTE(review): this is the original "KLDivergence" formula, which
            # equals num_members * vote entropy. It is not the mean KL
            # divergence of each member from the consensus distribution.
            scores[row] = -np.sum(counts * np.log2(fractions))
    return scores


def qbc(committee, measure, ALtype, labelPercent,
        X_pool=None, y_pool=None, num_examples=None, threshold=None):
    """Query-by-committee: select the points the committee disagrees on most.

    Parameters
    ----------
    committee : iterable of fitted estimators exposing ``predict``.
        Every member supplied takes part in the vote.
    measure : {"voteEntropy", "KLDivergence"}
        Disagreement measure computed from the members' hard votes.
    ALtype : {"stream", "pool"}
        "stream" queries every point whose disagreement exceeds ``threshold``.
        "pool" queries the ``int(num_examples * labelPercent)`` points with the
        highest disagreement.
    labelPercent : float
        Fraction of ``num_examples`` to query in pool mode (unused for stream).
    X_pool, y_pool : array-like, optional
        Unlabelled points and their oracle labels. Default to the notebook
        globals ``X_unlabelled`` / ``y_oracle``.
    num_examples : int, optional
        Size the pool budget is computed from. Defaults to the notebook global
        ``NUM_EXAMPLES``.
    threshold : float, optional
        Stream-mode cutoff. Defaults to 1.09 for "voteEntropy" and 600 for
        "KLDivergence".

    Returns
    -------
    (queried_X, queried_y) : tuple of ndarray
        Queried rows of the pool and their labels. Zero rows, with the feature
        dimension preserved, when nothing is queried.

    Raises
    ------
    AssertionError
        If ``ALtype`` or ``measure`` is not one of the supported values.
    ValueError
        If ``committee`` is empty.
    """
    if ALtype not in ("stream", "pool"):
        raise AssertionError("Active learning type not implemented")
    if measure not in _QBC_STREAM_THRESHOLDS:
        raise AssertionError("Disagreement measure not implemented")

    members = list(committee)
    if not members:
        raise ValueError("committee must contain at least one fitted model")

    # Fall back to the notebook-level split when no explicit pool is supplied.
    X_pool = np.asarray(X_unlabelled if X_pool is None else X_pool)
    y_pool = np.asarray(y_oracle if y_pool is None else y_pool)

    # Nothing to score; avoid calling the members on an empty array.
    if len(X_pool) == 0:
        return X_pool[:0], y_pool[:0]

    # Each member predicts the whole pool in one call. Stacking the results
    # column-wise gives one row of votes per sample.
    votes = np.column_stack([np.asarray(member.predict(X_pool)) for member in members])
    disagreement = _committee_disagreement(votes, measure)

    if ALtype == "stream":
        cutoff = _QBC_STREAM_THRESHOLDS[measure] if threshold is None else threshold
        selected = disagreement > cutoff
        return X_pool[selected], y_pool[selected]

    # pool-based: keep the num_queries samples with the highest disagreement
    if num_examples is None:
        num_examples = NUM_EXAMPLES
    num_queries = min(max(int(num_examples * labelPercent), 0), len(X_pool))
    if num_queries == 0:
        # Guard explicitly: a [-0:] slice would return the entire pool.
        return X_pool[:0], y_pool[:0]

    indices = np.argsort(disagreement)[-num_queries:]
    return X_pool[indices], y_pool[indices]

## Training

Active Learning on a logistic regression classifier

In [6]:
# Baseline: fit on the small labelled seed set only, then score on the
# held-out test split so later results have a reference point.
model = LogisticRegression(random_state=0)
model.fit(X_labelled, y_labelled)

prediction = model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, prediction)
baseline_confusion = confusion_matrix(y_test, prediction)

print("Accuracy:", baseline_accuracy)
print("\nConfusion Matrix:")
print(baseline_confusion)

Accuracy: 0.8070175438596491

Confusion Matrix
[[34  0]
 [22 58]]


### Inputs

In [7]:
# Available options for each experiment setting.
strategy_list = ['uncertainty', 'qbc']
al_type_list = ['stream', 'pool']
measure_list = ['leastConfident', 'marginSampling', 'entropy']  # for 'uncertainty'
qbc_measure_list = ['voteEntropy', 'KLDivergence']              # for 'qbc'

strategy = strategy_list[0]
al_type = al_type_list[0]
# Each strategy accepts only its own measures, so pick from the matching list.
measure = (measure_list if strategy == 'uncertainty' else qbc_measure_list)[0]

# Pool mode queries int(NUM_EXAMPLES * labelPercent) points, so this is a
# fraction in [0, 1], not a percentage. Stream mode ignores it.
labelPercent = 0.3

In [8]:
# Start from the labelled seed set; queried points are appended to it below.
X_active, y_active = X_labelled, y_labelled

if strategy == "uncertainty":
    queriedData, newLabels = uncertaintySampling(model, measure, al_type, labelPercent)

elif strategy == "qbc":
    # qbc indexes into `committee`, so it needs a sequence of fitted models
    # rather than a single estimator. Each member is trained on a bootstrap
    # resample of the labelled seed set. Resampling is done within each class
    # so every member sees all classes even though the seed set is tiny.
    # NOTE(review): bagging is one reasonable way to build the committee;
    # swap in a different construction if the assignment specifies one.
    committee_rng = np.random.RandomState(0)
    committee = []
    for member_index in range(NUM_COMMITTEE_MEMBERS):
        resampled = np.concatenate([
            committee_rng.choice(np.flatnonzero(y_labelled == label),
                                 size=np.count_nonzero(y_labelled == label))
            for label in np.unique(y_labelled)
        ])
        member = LogisticRegression(random_state=0)
        member.fit(X_labelled[resampled], y_labelled[resampled])
        committee.append(member)

    queriedData, newLabels = qbc(committee, measure, al_type, labelPercent)

else:
    assert False, "Strategy not implemented"

# np.concatenate takes a single sequence of arrays. A stream query may select
# nothing, and an empty result cannot always be concatenated onto a 2-D
# feature matrix, so skip the append in that case.
if len(queriedData) > 0:
    X_active = np.concatenate((X_active, queriedData))
    y_active = np.concatenate((y_active, newLabels))

In [9]:
# Train a fresh model on the augmented labelled set. Fitting and evaluating
# `newModel` (rather than refitting `model`) leaves the baseline untouched, so
# re-running the query cell above still scores points with the baseline model.
newModel = LogisticRegression(random_state=0)

newModel.fit(X_active, y_active)
prediction = newModel.predict(X_test)

print("Accuracy:", accuracy_score(y_test, prediction))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, prediction))

Accuracy: 0.9649122807017544
[[30  4]
 [ 0 80]]
