# Active Learning
Load the `mnist` dataset. Split it into training and test sets. Choose a default classification model. Compare the following tuple sampling strategies for different labeling budgets $ B $:
- Random sampling of $ B $ tuples from the training set. 
- Sampling $ B $ tuples from the training set based on an active learning strategy, such as uncertainty sampling.
- Clustering the data points of the training set into $ B $ clusters and sampling the centroids of these $ B $ clusters. 

For different labeling budgets $ B $, you should:
- Repeat the following 5 times:
    - Run the above sampling strategies to sample 3 different sets of tuples for labeling.
    - Label the sampled tuples.
    - Train 3 models on these 3 sets of sampled tuples separately. 
    - Test the models on the test set and store the results.
- Calculate the accuracy mean and standard deviation of the models for each labeling budget $ B $

Finally, draw a line chart to show how the accuracy of the models increases with more labeled tuples, when using different tuple sampling strategies.

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import accuracy_score
import sklearn.model_selection
import sklearn.ensemble
import sklearn.cluster
from sklearn.model_selection import train_test_split
from sklearn.utils.random import sample_without_replacement
import plotly.graph_objects as go

In [2]:
# Read the MNIST CSV and use its `id` column as the row index, so the
# remaining columns are the `class` label plus the pixel features.
df = pd.read_csv('mnist.csv', index_col='id')

# Quick sanity check: dimensions, then a preview of the first rows.
print(df.shape)
df.head(3)

(4000, 785)


Unnamed: 0_level_0,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31953,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34452,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60897,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Splitting the Data into Training and Test Sets

In [3]:
# Separate the label column from the pixel features.
x = df.drop(columns=['class'])
y = df['class']

# Hold out a test set. A fixed `random_state` makes the split reproducible,
# so every sampling strategy below is evaluated on the same test set across
# notebook runs; `stratify=y` keeps the class proportions of the full data in
# both the training and the test portion.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=42
)

# Preliminaries

In [4]:
# Labeling-budget schedule used by the sampling experiments below.

# Number of train/evaluate rounds run for each sampling strategy.
number_of_iterations = 20

# Number of rows requested per sampling round (the clustering-based
# strategy also uses it as the number of KMeans clusters).
budget_per_iteration = 30

# Size of the random seed set drawn before the first round of the random
# and uncertainty sampling strategies.
initial_budget = 10

# Random Sampling

In [5]:
# Random-sampling baseline: grow the labeled set with uniformly drawn rows,
# retrain a random forest each round, and record labeled-set size and test
# accuracy.
number_of_labels_random_sampling = []
accuracies_random_sampling = []

# Row *positions* in x_train (not `id` labels); they are used with .iloc below.
all_row_positions = range(x_train.shape[0])

# Seed set: `initial_budget` distinct rows.
sampled_indexes = random.sample(all_row_positions, initial_budget)

model = None

for i in range(number_of_iterations):

    # Sample data points for labeling. The first round trains on the seed
    # set only; every later round adds new rows.
    if model is not None:
        # Draw only from rows that are not labeled yet. Sampling from the
        # whole training set could re-draw labeled rows, which were then
        # silently dropped by de-duplication, so the labeled set grew by
        # less than the per-round budget.
        already_labeled = set(sampled_indexes)
        unlabeled = [idx for idx in all_row_positions if idx not in already_labeled]
        sampled_indexes += random.sample(
            unlabeled, min(budget_per_iteration, len(unlabeled))
        )

    # Train and test the model
    x_train_new = x_train.iloc[sampled_indexes, :]
    y_train_new = y_train.iloc[sampled_indexes]
    model = sklearn.ensemble.RandomForestClassifier()
    model.fit(x_train_new, y_train_new)
    y_predicted = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_predicted)

    # Save the results
    number_of_labels_random_sampling.append(len(sampled_indexes))
    accuracies_random_sampling.append(accuracy)
    

# Uncertainty Sampling

In [18]:
# Uncertainty sampling: each round, label the rows the current model is least
# confident about, retrain a random forest, and record labeled-set size and
# test accuracy.
number_of_labels_uncertainty_sampling = []
accuracies_uncertainty_sampling = []

# Seed set: `initial_budget` distinct row positions of x_train (used with .iloc).
sampled_indexes = random.sample(range(x_train.shape[0]), initial_budget)
model = None

for i in range(number_of_iterations):
    # Sample data points for labeling (skipped in the first round, when no
    # model exists yet and only the seed set is used).
    if model is not None:
        probabilities = model.predict_proba(x_train)
        # Least-confidence score: 1 - probability of the most likely class.
        uncertainty = 1 - probabilities.max(axis=1)

        # Rows that are already labeled must not be selected again; otherwise
        # part of the round's budget is wasted on duplicates. Scores are >= 0,
        # so -inf pushes labeled rows below every unlabeled row.
        uncertainty[sampled_indexes] = -np.inf

        # Never request more rows than are still unlabeled.
        number_of_new_labels = min(
            budget_per_iteration, x_train.shape[0] - len(sampled_indexes)
        )
        if number_of_new_labels > 0:
            # argpartition places the `number_of_new_labels` largest scores in
            # the tail of the returned index array (in no particular order).
            most_uncertain = np.argpartition(uncertainty, -number_of_new_labels)
            sampled_indexes += most_uncertain[-number_of_new_labels:].tolist()

    # Train and test the model
    x_train_new = x_train.iloc[sampled_indexes, :]
    y_train_new = y_train.iloc[sampled_indexes]
    model = sklearn.ensemble.RandomForestClassifier()
    model.fit(x_train_new, y_train_new)
    y_predicted = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_predicted)

    # Save the results
    number_of_labels_uncertainty_sampling.append(len(sampled_indexes))
    accuracies_uncertainty_sampling.append(accuracy)

# Clustering-Based Sampling

In [6]:
# Clustering-based sampling: cluster the training set once, then in every
# round label one row from each cluster, retrain a random forest, and record
# labeled-set size and test accuracy.
number_of_labels_clustering_based_sampling = []
accuracies_clustering_based_sampling = []

# One cluster per label requested in a round.
clustering_model = sklearn.cluster.KMeans(n_clusters=budget_per_iteration)
clustering_model.fit(x_train)

# Map each row position of x_train (used with .iloc below) to its cluster.
x_train_cluster_ids = pd.DataFrame({"Row ID": range(x_train.shape[0]), "Cluster ID": clustering_model.labels_})
sampled_indexes = []

for i in range(number_of_iterations):

    # Sample data points for labeling: one random, still-unlabeled row per
    # cluster. Restricting the draw to unlabeled rows prevents re-picking a
    # row that is already labeled (which would shrink the round's budget);
    # clusters with no unlabeled rows left simply drop out of the groupby.
    is_unlabeled = ~x_train_cluster_ids["Row ID"].isin(sampled_indexes)
    unlabeled_rows = x_train_cluster_ids[is_unlabeled]
    sampled_indexes += [
        int(cluster_rows["Row ID"].sample(n=1).iloc[0])
        for _, cluster_rows in unlabeled_rows.groupby("Cluster ID")
    ]

    # Train and test the model
    x_train_new = x_train.iloc[sampled_indexes, :]
    y_train_new = y_train.iloc[sampled_indexes]
    model = sklearn.ensemble.RandomForestClassifier()
    model.fit(x_train_new, y_train_new)
    y_predicted = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_predicted)

    # Save the results
    number_of_labels_clustering_based_sampling.append(len(sampled_indexes))
    accuracies_clustering_based_sampling.append(accuracy)
    


# Visualization

In [None]:
# Line chart of test accuracy against the number of labeled tuples, one
# trace per sampling strategy.
fig = go.Figure()

fig.add_trace(go.Scatter(x=number_of_labels_random_sampling, y=accuracies_random_sampling,
                         mode='lines', name='Random Sampling'))

fig.add_trace(go.Scatter(x=number_of_labels_uncertainty_sampling, y=accuracies_uncertainty_sampling,
                         mode='lines', name='Uncertainty Sampling'))

fig.add_trace(go.Scatter(x=number_of_labels_clustering_based_sampling, y=accuracies_clustering_based_sampling,
                         mode='lines', name='Clustering-Based Sampling'))

# Label the chart so it can be read without the surrounding code.
fig.update_layout(
    title='Test Accuracy vs. Number of Labeled Tuples',
    xaxis_title='Number of labeled tuples',
    yaxis_title='Accuracy on the test set',
    legend_title='Sampling strategy',
)

# Render explicitly so the chart also appears when this is not the last
# expression of a notebook cell (e.g. when run as a script).
fig.show()

