<a href="https://colab.research.google.com/github/hafizbudi/active-learning/blob/main/Digits_RS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Digits with random sampling**

## **Connect to Google Drive**

In [None]:
# Mount Google Drive inside the Colab VM so the project folder becomes reachable.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
%cd gdrive/My Drive/Project/active_learning

In [None]:
pip install modal

## **Import libraries**

In [184]:
from dataclasses import dataclass
import dataclasses
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import numpy as np
import statistics
from tqdm import tqdm
import scipy

from sklearn.datasets import load_digits
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling


## **Load and split data**

In [185]:
# Load the scikit-learn digits dataset.
digits = load_digits()

# Lists reserved for the number of instances and the accuracy results.
instance_number = []
accuracy_result = []

# Flatten every image into a 1-D feature vector: X has one row per sample.
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
Y = digits.target

# Hold out 25% of the data as a test set. The split is seeded so that the same
# test set is used every time the notebook is re-run.
SPLIT_SEED = 123
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=SPLIT_SEED
)


In [192]:
# Training-set sizes at which the learning curve is evaluated: 30, 40, ..., 390.
data_points = range(30, 400, 10)

# train a classifier on a random subset of the training data and predict
def train_and_predict(n_instances, X_train, y_train, X_test):
    """Fit a random forest on ``n_instances`` random training rows and predict ``X_test``.

    Args:
        n_instances: Number of distinct rows of ``X_train`` to train on.
        X_train: Training features; must support indexing with an integer array.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features to predict.

    Returns:
        Whatever ``clf.predict(X_test)`` returns for the fitted classifier.

    Raises:
        ValueError: If ``n_instances`` is larger than ``len(X_train)``.
    """
    clf = RandomForestClassifier()

    # Draw distinct row indexes through NumPy's global RNG: the original used the
    # never-imported ``random`` module, and NumPy's RNG also honours np.random.seed.
    train_indexes = np.random.choice(len(X_train), size=n_instances, replace=False)
    clf.fit(X_train[train_indexes], y_train[train_indexes])
    return clf.predict(X_test)

def random_selection_learning(n_instances, X_train, y_train):
    """Train a random forest on a random subset of ``n_instances`` examples.

    The training pool is split with ``train_test_split``; the selected part is
    used to fit the classifier and the rest is handed back as the unlabeled pool.

    Returns:
        ``(clf, X_selected, X_not_selected, y_selected, y_not_selected)``.
    """
    # Swap in LogisticRegression() here to try a linear model instead.
    forest = RandomForestClassifier()
    partition = train_test_split(
        X_train, y_train, train_size=n_instances, shuffle=True
    )
    X_selected, X_not_selected, y_selected, y_not_selected = partition
    forest.fit(X_selected, y_selected)
    return (forest, X_selected, X_not_selected, y_selected, y_not_selected)

In [None]:
def evaluate(model, X_real, y_real):
    """Return the accuracy of ``model`` on the evaluation set ``(X_real, y_real)``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_real: Features to evaluate on.
        y_real: True labels aligned with ``X_real``.

    Returns:
        The value of ``metrics.accuracy_score(y_real, predictions)``.
    """
    # Predict on the features that were passed in; the previous version read the
    # notebook-level X_test and silently ignored X_real.
    y_predictions = model.predict(X_real)
    return metrics.accuracy_score(y_real, y_predictions)

@dataclasses.dataclass
class Experiment:
    """One accuracy measurement on a learning curve."""

    n_samples: int   # training-set size the model was fitted on
    repeat: int      # index of the repetition that produced this measurement
    accuracy: float  # accuracy obtained by the model at that training-set size

def random_sampling_policy(model, n_new_samples, X_remaining, y_remaining):
    """Pick ``n_new_samples`` examples from the remaining pool at random.

    ``model`` is not used; it is accepted so that every selection policy can be
    called with the same arguments.

    Returns:
        ``(X_new_selected, X_remaining, y_new_selected, y_remaining)`` as produced
        by ``train_test_split`` with ``train_size=n_new_samples``.
    """
    return tuple(
        train_test_split(
            X_remaining, y_remaining, train_size=n_new_samples, shuffle=True
        )
    )

def uncertainty_sampling_policy(model, n_new_samples, X_remaining, y_remaining):
    """Pick the ``n_new_samples`` pool examples the model is least confident about.

    Uncertainty is ``1 - max class probability`` taken from
    ``model.predict_proba``. (An entropy-based score would be a drop-in
    alternative.)

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        n_new_samples: How many examples to select; must be non-negative. Values
            larger than the pool select the whole pool.
        X_remaining: Pool features; must support integer-array indexing.
        y_remaining: Pool labels aligned with ``X_remaining``.

    Returns:
        ``(X_selected, X_not_selected, y_selected, y_not_selected)``; the selected
        part is ordered by increasing uncertainty.

    Raises:
        ValueError: If ``n_new_samples`` is negative.
    """
    if n_new_samples < 0:
        raise ValueError("n_new_samples must be non-negative")

    y_proba = model.predict_proba(X_remaining)
    uncertainty = 1. - np.max(y_proba, axis=1)
    ordered_positions = uncertainty.argsort()

    # Split at an explicit position instead of slicing with -n_new_samples:
    # for n_new_samples == 0 the slice [-0:] is the whole array, which would
    # select every example instead of none.
    split_at = max(len(ordered_positions) - n_new_samples, 0)
    selected = ordered_positions[split_at:]
    non_selected = ordered_positions[:split_at]
    return X_remaining[selected], X_remaining[non_selected], y_remaining[selected], y_remaining[non_selected]

def learning_curve(data_points, repeats, selection_policy):
    """Measure test accuracy as the labeled set grows under ``selection_policy``.

    For every repetition the NumPy global RNG is seeded with ``123 + repetition``,
    an initial model is trained on ``data_points[0]`` randomly chosen examples of
    the notebook-level ``X_train``/``y_train``, and the labeled set is then grown
    to each following size in ``data_points`` using ``selection_policy``. After
    every fit the model is scored on the notebook-level ``X_test``/``y_test``.

    Args:
        data_points: Increasing sequence of labeled-set sizes.
        repeats: Number of repetitions to run.
        selection_policy: Callable ``(model, n_new_samples, X_remaining, y_remaining)``
            returning ``(X_new, X_remaining, y_new, y_remaining)``.

    Returns:
        A list with one ``Experiment`` per (repetition, size) pair.
    """
    results = []
    for run in tqdm(range(repeats)):
        np.random.seed(123 + run)

        # Bootstrap: the first model is always trained on a random subset.
        size = data_points[0]
        model, X_selected, X_remaining, y_selected, y_remaining = \
            random_selection_learning(size, X_train, y_train)
        score = evaluate(model, X_test, y_test)
        results.append(Experiment(n_samples=size, repeat=run, accuracy=score))

        for size in data_points[1:]:
            # Ask the policy for just enough examples to reach the target size.
            batch_size = size - len(X_selected)
            X_batch, X_remaining, y_batch, y_remaining = \
                selection_policy(model, batch_size, X_remaining, y_remaining)

            # Grow the labeled set and refit the same model on all of it.
            X_selected = np.vstack((X_selected, X_batch))
            y_selected = np.hstack((y_selected, y_batch))
            model.fit(X_selected, y_selected)

            score = evaluate(model, X_test, y_test)
            results.append(Experiment(n_samples=size, repeat=run, accuracy=score))

    return results

# Run both policies with the same number of repetitions; learning_curve seeds
# each repetition identically, so both start from the same initial subsets.
experiments_uncertainty = learning_curve(data_points, 5, uncertainty_sampling_policy)
experiments_random = learning_curve(data_points, 5, random_sampling_policy)

# `experiments` only exists inside learning_curve, so report the two result
# lists by name and show a short preview rather than every record.
print(len(experiments_uncertainty), len(experiments_random))
print(experiments_uncertainty[:3])
print(experiments_random[:3])

100%|██████████| 5/5 [00:24<00:00,  4.89s/it]
 80%|████████  | 4/5 [00:16<00:04,  4.17s/it]

In [None]:
# Inspect a single record as a plain dict. Uses experiments_random because no
# notebook-level variable named `experiments` is ever defined.
print(dataclasses.asdict(experiments_random[0]))

In [None]:
# Tabulate the random-sampling runs, one row per measurement. Uses
# experiments_random because no notebook-level `experiments` is ever defined.
df = pd.DataFrame([dataclasses.asdict(e) for e in experiments_random])
print(df)

In [None]:
def plot_learning_curve(df, name):
    """Plot median accuracy per training-set size with a 10th-90th percentile band.

    Draws on the current pyplot axes, so calling it several times overlays the
    curves in one figure.

    Args:
        df: DataFrame with ``n_samples`` and ``accuracy`` columns.
        name: Label used in the title and the legend entry.
    """
    # Compute percentiles of the accuracy for every training-set size.
    accuracy_by_size = df.groupby('n_samples')['accuracy']
    p10 = accuracy_by_size.quantile(.10)
    p50 = accuracy_by_size.median()
    p90 = accuracy_by_size.quantile(.90)
    training_sizes = p50.index.to_numpy()

    # 'seaborn-white' was renamed 'seaborn-v0_8-white' in Matplotlib 3.6 and the
    # old name was later removed; use whichever this installation provides.
    style = next(
        (s for s in ('seaborn-v0_8-white', 'seaborn-white') if s in plt.style.available),
        'default',
    )
    with plt.style.context(style):
        plt.title('Learning curve for classifier '+name)
        plt.plot(training_sizes, p50, label=name, marker='o')

        # shade the band between the 10th and 90th percentiles
        plt.fill_between(training_sizes, p10, p90, alpha=0.1)
    plt.legend(loc='lower right')
    plt.xlabel("N sample")
    plt.ylabel("Accuracy")
    # Pass True explicitly: a bare plt.grid() toggles visibility, so a second
    # overlay call on the same axes would switch the grid off again.
    plt.grid(True)

In [None]:
# Turn each list of Experiment records into a DataFrame (one row per record)
# and overlay the two learning curves on the same axes.
df_random = pd.DataFrame(list(map(dataclasses.asdict, experiments_random)))
df_us = pd.DataFrame(list(map(dataclasses.asdict, experiments_uncertainty)))
for frame, label in ((df_random, "Random"), (df_us, "Uncertainty")):
    plot_learning_curve(frame, label)