# Active Learning Notebook

In [157]:
import random
import glob
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier as rf

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

## Init config

In [134]:
# Pickled DataFrame that the pre-processing section loads with pd.read_pickle.
FILENAME = 'df_pickles/df_180000_20.pkl'
# When False, labels 2 and 3 are merged into label 1 before the train/test split.
MULTICLASS = True
# Number of query/teach iterations performed in each active-learning run.
QUERY_NUM = 5
# How many times the experiment loop repeats the sweep over POOL_SIZE_PATTERN.
REPEAT = 2
# Sizes of the initial labelled seed set to try.
INIT_SEED_PATTERN = [10, 15]
# Sizes of the unlabelled pool to try.
POOL_SIZE_PATTERN = [17, 18]

## Data Pre-processing

In [135]:
# Read the pickled frame, then slice it by position: the feature matrix is every
# column except the final two, and the target is the final column.
# NOTE(review): the second-to-last column is used for neither X nor y — confirm
# that dropping it is intentional.
df = pd.read_pickle(FILENAME)
X = df.iloc[:, :-2]
y = df.iloc[:, -1]

In [136]:
# Class balance of the 'label' column before any relabelling or splitting.
df['label'].value_counts()

0    90000
1    30000
2    30000
3    30000
Name: label, dtype: int64

In [137]:
# Binary mode: fold labels 2 and 3 into label 1 so that only two label values
# (0 and 1) remain. The replacement is idempotent, so re-running this cell is safe.
if not MULTICLASS:
    y = y.replace([2, 3], 1)

In [138]:
# Hold out 60000 rows as the test set. The split is stratified on y so both parts
# keep the label proportions, and random_state is fixed (77) for a repeatable split.
# Both X and y are converted to NumPy arrays, so everything downstream indexes by position.
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(),
                                                    y.to_numpy(),
                                                    test_size=60000,
                                                    stratify=y,
                                                    random_state=77)

In [139]:
# Per-class sample counts in the training labels (sanity check of the stratified split).
np.unique(y_train, return_counts=True)

(array([0, 1, 2, 3]), array([60000, 20000, 20000, 20000]))

In [140]:
# Per-class sample counts in the test labels (sanity check of the stratified split).
np.unique(y_test, return_counts=True)

(array([0, 1, 2, 3]), array([30000, 10000, 10000, 10000]))

In [6]:
# Report the array shapes of both splits as one banner-delimited block.
print(
    '===== Train & Test Data =====\n'
    f'X train shape:{X_train.shape}\n'
    f'y train shape:{y_train.shape}\n'
    f'X test shape:{X_test.shape}\n'
    f'y test shape:{y_test.shape}\n'
    '=============================\n'
)

===== Train & Test Data =====
X train shape:(100000, 18)
y train shape:(100000,)
X test shape:(50000, 18)
y test shape:(50000,)



In [7]:
def split_seeds(init_size, pool_size, X_train, y_train, rng=None):
    """Draw a disjoint initial labelled set and unlabelled pool from the training data.

    Rows are sampled WITHOUT replacement: a single random permutation of the row
    indices is cut into the init part and the pool part, so no row appears twice
    and no row is shared between the two sets. (Sampling with ``randint`` would
    allow duplicates, which silently shrinks the amount of distinct data.)

    Parameters
    ----------
    init_size : int
        Number of rows for the initial labelled set.
    pool_size : int
        Number of rows for the unlabelled pool.
    X_train, y_train : np.ndarray
        Training features and labels, indexed along axis 0.
    rng : np.random.Generator or np.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is used,
        so ``np.random.seed`` still controls reproducibility.

    Returns
    -------
    tuple
        ``(X_init, y_init, X_pool, y_pool)``.

    Raises
    ------
    ValueError
        If a size is negative or ``init_size + pool_size`` exceeds the number
        of training rows.
    """
    n_labeled_examples = X_train.shape[0]

    if init_size < 0 or pool_size < 0:
        raise ValueError('init_size and pool_size must be non-negative')
    if init_size + pool_size > n_labeled_examples:
        raise ValueError(
            f'init_size + pool_size ({init_size + pool_size}) exceeds the '
            f'number of training rows ({n_labeled_examples})'
        )

    # One shuffle, two consecutive slices: guarantees uniqueness and disjointness.
    sampler = np.random if rng is None else rng
    shuffled = sampler.permutation(n_labeled_examples)
    training_indices = shuffled[:init_size]
    pool_indices = shuffled[init_size:init_size + pool_size]

    X_init, y_init = X_train[training_indices], y_train[training_indices]
    X_pool, y_pool = X_train[pool_indices], y_train[pool_indices]

    print('===== Init & Pool Seed =====')
    print(f'X init shape:{X_init.shape}')
    print(f'y init shape:{y_init.shape}')
    print(f'X pool shape:{X_pool.shape}')
    print(f'y pool shape:{y_pool.shape}')
    print('============================\n')

    return X_init, y_init, X_pool, y_pool

In [11]:
def active_learning(X_init, y_init, X_pool, y_pool, X_test, y_test):
    """Run QUERY_NUM rounds of pool-based active learning with a random forest.

    The learner is first fitted on the initial labelled set. In every round it
    queries one instance from the pool, is taught that instance with its label
    from ``y_pool``, and the instance is removed from the pool. The test set is
    predicted once before the first round and once after every round.

    Parameters
    ----------
    X_init, y_init : np.ndarray
        Initial labelled samples used to fit the learner.
    X_pool, y_pool : np.ndarray
        Candidate samples and their labels. The caller's arrays are not
        modified (``np.delete`` returns copies). The pool shrinks by one
        sample per round, so it should hold at least QUERY_NUM samples.
    X_test, y_test : np.ndarray
        Held-out data used for evaluation.

    Returns
    -------
    list of np.ndarray
        ``QUERY_NUM + 1`` prediction arrays over ``X_test``: the initial
        model's predictions followed by one entry per query round.
    """
    print('===== Active Learning =====')

    learner = ActiveLearner(estimator=rf(), X_training=X_init, y_training=y_init)

    # Predictions of the model trained on the initial seed only.
    predictions_history = [learner.predict(X_test)]

    for index in range(QUERY_NUM):
        query_index, _ = learner.query(X_pool)

        # Teach the queried sample together with its label from y_pool.
        learner.teach(
            X=X_pool[query_index].reshape(1, -1),
            y=y_pool[query_index].reshape(1, ),
        )

        # Remove the queried instance from the unlabeled pool
        X_pool = np.delete(X_pool, query_index, axis=0)
        y_pool = np.delete(y_pool, query_index)

        predictions = learner.predict(X_test)
        predictions_history.append(predictions)

        # Mean accuracy computed from the predictions already in hand; calling
        # learner.score here would run a second full prediction pass over X_test.
        accuracy = np.mean(predictions == y_test)
        print(f'Query:{index+1} Accuracy:{accuracy}')
    print('==========================\n')
    return predictions_history

In [12]:
for init_size in INIT_SEED_PATTERN:

    # Collect one prediction history per (repeat, pool size) combination, in run order.
    runs = []
    for r in range(REPEAT):
        for pool_size in POOL_SIZE_PATTERN:
            X_init, y_init, X_pool, y_pool = split_seeds(init_size, pool_size, X_train, y_train)
            predictions = active_learning(X_init, y_init, X_pool, y_pool, X_test, y_test)
            runs.append(predictions)

    # Stack this init size's runs into one array and display it.
    # (Nothing is written to disk in this cell.)
    history = np.array(runs)
    print(history)
    print(history.shape)

===== Init & Pool Seed =====
X init shape:(10, 18)
y init shape:(10,)
X pool shape:(17, 18)
y pool shape:(17,)

===== Active Learning =====
Query:1 Accuracy:0.70172
Query:2 Accuracy:0.82804
Query:3 Accuracy:0.8188
Query:4 Accuracy:0.8532
Query:5 Accuracy:0.88724

===== Init & Pool Seed =====
X init shape:(10, 18)
y init shape:(10,)
X pool shape:(18, 18)
y pool shape:(18,)

===== Active Learning =====
Query:1 Accuracy:0.86368
Query:2 Accuracy:0.87974
Query:3 Accuracy:0.86964
Query:4 Accuracy:0.88984
Query:5 Accuracy:0.86558

===== Init & Pool Seed =====
X init shape:(10, 18)
y init shape:(10,)
X pool shape:(17, 18)
y pool shape:(17,)

===== Active Learning =====
Query:1 Accuracy:0.77094
Query:2 Accuracy:0.85238
Query:3 Accuracy:0.92834
Query:4 Accuracy:0.94632
Query:5 Accuracy:0.9518

===== Init & Pool Seed =====
X init shape:(10, 18)
y init shape:(10,)
X pool shape:(18, 18)
y pool shape:(18,)

===== Active Learning =====
Query:1 Accuracy:0.78594
Query:2 Accuracy:0.93062
Query:3 Accurac

## Evaluation

In [21]:
# Shape of the stacked prediction histories from the last init size processed:
# (number of runs, QUERY_NUM + 1 prediction snapshots, number of test samples).
history.shape

(4, 6, 50000)

In [141]:
# Load one saved result array for inspection.
# The path must name a pickle file: the previous value ended at the '10_init/'
# directory, which open() cannot read. The file used here follows the
# '<init>_init_<pool>_pool.pkl' naming that the loading loop further down reports.
# NOTE(review): confirm this is the file that was meant to be inspected.
# The with-block closes the handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code — only load files produced by this project.
with open('./result_pickles/test_run/uncert_5_times/10_init/10_init_17_pool.pkl', 'rb') as file:
    result = pickle.load(file)

In [142]:
# Dimensions of the loaded result array.
# NOTE(review): presumably (repeats, QUERY_NUM + 1, metrics per step) — confirm
# against the code that wrote the pickle, which is not part of this notebook.
result.shape

(3, 11, 5)

In [151]:
# Evaluation settings.
# NOTE(review): the next cell re-assigns QUERY_NUM, REPEAT, INIT_SEED_PATTERN and
# POOL_SIZE_PATTERN, so after running both cells in order only DIR_NAME from this
# cell is still in effect.
QUERY_NUM = 1000
REPEAT = 5
INIT_SEED_PATTERN = [2, 10, 40, 200]
POOL_SIZE_PATTERN = [1000, 3000, 5000, 7000, 9000, 15000, 50000, 100000]
# Glob prefix (note the trailing '*') under which the loading loop looks for saved result pickles.
DIR_NAME = './result_pickles/test_run/uncert_5_times/*'

In [167]:
# Settings used by the evaluation cells below; they replace the values assigned
# earlier in the notebook. DIR_NAME is not set here and must already be defined.
QUERY_NUM = 10
REPEAT = 3
INIT_SEED_PATTERN = [10, 15]
POOL_SIZE_PATTERN = [17, 18, 19, 20]

In [187]:
def getAvg(result, metric_index=4):
    """Average one metric over all repeated runs, separately for each query step.

    The number of runs and steps is taken from ``result`` itself rather than
    from the notebook-level REPEAT / QUERY_NUM settings, so the function keeps
    working when those settings do not match the file that was loaded.

    Parameters
    ----------
    result : sequence or np.ndarray
        Indexed as ``result[run][step][metric]``. Every run is expected to
        have the same number of steps as the first one.
    metric_index : int, default 4
        Position of the metric to average inside each step record. The default
        keeps the column this notebook has always used; which metric that
        column holds depends on the code that wrote the result file.

    Returns
    -------
    list
        One average per step, in step order. Empty when ``result`` has no runs.
    """
    n_runs = len(result)
    if n_runs == 0:
        return []

    n_steps = len(result[0])
    pool_result = []
    for step in range(n_steps):
        # Plain sequential sum, then divide once by the run count.
        total = 0
        for run in result:
            total += run[step][metric_index]
        pool_result.append(total / n_runs)
    return pool_result

In [189]:
# DIR_NAME ends with a '*' wildcard; strip it to get the directory that holds
# one '<init>_init' sub-directory per initial seed size.
results_root = DIR_NAME.rstrip('*')

for init_size in INIT_SEED_PATTERN:
    for pool_size in POOL_SIZE_PATTERN:
        # Anchor the pattern on the '_init' / '_pool' parts of the names. A bare
        # '*{init_size}*/*{pool_size}*' is ambiguous: init size 2 would also match
        # '200_init', and pool size 1000 would also match '10000'.
        target = os.path.join(results_root, f'{init_size}_init', f'*_{pool_size}_pool*')
        matches = sorted(glob.glob(target))  # sorted: deterministic if several match
        if not matches:
            raise FileNotFoundError(f'No result file matches {target}')
        file_name = matches[0]
        print(f'Loading {file_name} ...')

        # NOTE: pickle.load can execute arbitrary code — only load files produced by this project.
        with open(file_name, 'rb') as f:
            rs = pickle.load(f)

        # The file is closed before the result is processed.
        tmp = getAvg(rs)
        print(len(tmp))

    print('Next Init ...')

Loading ./result_pickles/test_run/uncert_5_times/10_init/10_init_17_pool.pkl ...
11
Loading ./result_pickles/test_run/uncert_5_times/10_init/10_init_18_pool.pkl ...
11
Loading ./result_pickles/test_run/uncert_5_times/10_init/10_init_19_pool.pkl ...
11
Loading ./result_pickles/test_run/uncert_5_times/10_init/10_init_20_pool.pkl ...
11
Next Init ...
Loading ./result_pickles/test_run/uncert_5_times/15_init/15_init_17_pool.pkl ...
11
Loading ./result_pickles/test_run/uncert_5_times/15_init/15_init_18_pool.pkl ...
11
Loading ./result_pickles/test_run/uncert_5_times/15_init/15_init_19_pool.pkl ...
11
Loading ./result_pickles/test_run/uncert_5_times/15_init/15_init_20_pool.pkl ...
11
Next Init ...
