# Active Learning Notebook

In [1]:
import random
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier as rf

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

## Init config

In [2]:
# Path of the pickled DataFrame that the pre-processing cell loads.
filename: str = './df_pickles/df_150000_20.pkl'
# True keeps the labels exactly as loaded; False makes the pre-processing
# cell merge labels 2 and 3 into label 1.
multiclass: bool = True

## Data Pre-processing

In [3]:
# Load the pickled DataFrame named by `filename`, then split it into features
# and target purely by column position.
# NOTE(review): the second-to-last column ends up in neither X nor y --
# presumably an alternate label column; confirm this is intended.
df = pd.read_pickle(filename)
X = df.iloc[:, :-2]  # every column except the last two
y = df.iloc[:, -1]   # the last column

In [4]:
# Binary mode: fold labels 2 and 3 into label 1. Nothing happens when
# multiclass is enabled, so y keeps every label value it was loaded with.
if not multiclass:
    y = y.replace([2, 3], 1)

In [5]:
# Split into 70% train / 30% test as plain NumPy arrays. The split is
# stratified on y and seeded (random_state=77) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    test_size=0.30,
    stratify=y,
    random_state=77,
)

In [6]:
# Report the shapes of the four arrays produced by the split, one per line.
print(
    f'X train shape:{X_train.shape}',
    f'y train shape:{y_train.shape}',
    f'X test shape:{X_test.shape}',
    f'y test shape:{y_test.shape}',
    sep='\n',
)

X train shape:(105000, 18)
y train shape:(105000,)
X test shape:(45000, 18)
y test shape:(45000,)


In [7]:
# Isolate our examples for our labeled dataset.
# Draw 100 distinct row positions from [0, n). np.random.randint treats
# `high` as exclusive, so passing n + 1 would make n itself reachable, which
# is one past the last valid row and raises IndexError when used to index
# X_train. Sampling without replacement also keeps the seed free of
# duplicate rows.
n_labeled_examples = X_train.shape[0]
training_indices = np.random.choice(n_labeled_examples,
                                    size=100,
                                    replace=False)

In [8]:
# Initial Seed: the rows selected by training_indices form the labeled
# starting set (features and labels picked with the same positions).
X_init, y_init = X_train[training_indices], y_train[training_indices]

In [9]:
# Pool
# Drop the rows already used for the initial seed so the pool does not
# contain them.
X_pool = np.delete(X_train, training_indices, axis=0)
y_pool = np.delete(y_train, training_indices, axis=0)

# Make Pool
# Subsample up to 10000 distinct rows of what is left. Positions are drawn
# from [0, n) without replacement: np.random.randint's `high` is exclusive,
# so high=n + 1 would allow position n (out of bounds -> IndexError), and
# sampling with replacement would put duplicate rows in the pool.
n_labeled_examples = X_pool.shape[0]  # NB: rows left in the pool, despite the name
pool_indices = np.random.choice(n_labeled_examples,
                                size=min(10000, n_labeled_examples),
                                replace=False)
X_pool = X_pool[pool_indices]
y_pool = y_pool[pool_indices]

In [10]:
# Report the shapes of the seed set and of the pool, one per line.
print(
    f'X init shape:{X_init.shape}',
    f'y init shape:{y_init.shape}',
    f'X pool shape:{X_pool.shape}',
    f'y pool shape:{y_pool.shape}',
    sep='\n',
)

X init shape:(100, 18)
y init shape:(100,)
X pool shape:(10000, 18)
y pool shape:(10000,)


In [None]:
# Build the classifier under its own name. `rf` is the import alias of the
# RandomForestClassifier class; rebinding it to an instance would make every
# later `rf()` call fail with "object is not callable".
clf = rf()
# Wrap the classifier in an ActiveLearner, handing it the seed set as its
# initial training data.
learner = ActiveLearner(estimator=clf, X_training=X_init, y_training=y_init)

In [None]:
# Run the learner's predict() on the test features. The result is not read
# by any later cell visible in this notebook.
predictions = learner.predict(X_test)

In [None]:
# Score the learner on the held-out test split before any query is made,
# and print that baseline.
init_score = learner.score(X_test, y_test)
print(f'Initial Score: {init_score}')

In [None]:
# Number of rows in the initial labeled seed; also used to name the results
# file written at the end of this cell.
init_size = 100
n_labeled_examples = X_train.shape[0]
# Draw init_size distinct row positions from [0, n). np.random.randint's
# `high` is exclusive, so high=n + 1 would allow position n, which is out of
# bounds for X_train and raises IndexError. Sampling without replacement
# also avoids duplicate rows in the seed.
training_indices = np.random.choice(n_labeled_examples, size=init_size, replace=False)

# Initial
X_init = X_train[training_indices]
y_init = y_train[training_indices]

# Pool (Unlabeled): every training row that was not picked for the seed.
X_pool, y_pool = (
    np.delete(arr, training_indices, axis=0) for arr in (X_train, y_train)
)

# Report the shapes of the seed set and of the pool, one per line.
print(
    f'X init shape:{X_init.shape}',
    f'y init shape:{y_init.shape}',
    f'X pool shape:{X_pool.shape}',
    f'y pool shape:{y_pool.shape}',
    sep='\n',
)

# Fresh classifier wrapped in an ActiveLearner that receives the seed set as
# its initial training data.
clf = rf()
learner = ActiveLearner(
    estimator=clf,
    X_training=X_init,
    y_training=y_init,
)

# Predictions on the test features (not read again in this cell).
predictions = learner.predict(X_test)

# Baseline score on the held-out test split, before any query.
init_score = learner.score(X_test, y_test)
print(f'Initial Score:{init_score}')

# Score trace: starts with the baseline and grows by one entry per query.
performance_history = [init_score]

# Allow our model to query our unlabeled dataset for the most
# informative points according to our query strategy (uncertainty sampling).
n_queries = 1000  # labeling budget: one pool row is labeled per iteration
for index in range(n_queries):
    query_index, query_instance = learner.query(X_pool)

    # Teach our ActiveLearner model the record it has requested.
    # Dedicated names are used on purpose: `X` and `y` hold the full feature
    # frame and label series from pre-processing, and overwriting them here
    # would break a re-run of the train/test split cell.
    X_query = X_pool[query_index].reshape(1, -1)
    y_query = y_pool[query_index].reshape(1, )
    learner.teach(X=X_query, y=y_query)

    # Remove the queried instance from the unlabeled pool so it cannot be
    # requested again.
    X_pool = np.delete(X_pool, query_index, axis=0)
    y_pool = np.delete(y_pool, query_index)

    # Calculate and report our model's accuracy.
    model_accuracy = learner.score(X_test, y_test)
    print(f'Accuracy after query {index + 1}: {model_accuracy:0.4f}')

    # Save our model's performance for plotting.
    performance_history.append(model_accuracy)

# Persist the score trace as '<init_size>_uncert.pkl' in the working
# directory. The `with` block closes the handle even if pickling raises.
file_name = f'{init_size}_uncert.pkl'
with open(file_name, 'wb') as file:
    pickle.dump(performance_history, file)