# Question Answering Model 
## no trainer

- dataset
- torch
- transformers
- transformers[torch]
- evaluate

import packages

In [5]:
from datasets import load_dataset, disable_caching
from transformers import (
    AutoTokenizer,
    default_data_collator,
    get_scheduler,
    AutoModelForQuestionAnswering
)
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch
from torch.cuda import amp

from collections import defaultdict, Counter
from tqdm.auto import tqdm
import numpy as np

# for badge_query
from scipy import stats
from sklearn.metrics import pairwise_distances
import pdb

import os
import re
import string

arguments.py

In [6]:
# STRATEGY_NAME = 'RandomSampling'
STRATEGY_NAME = 'LeastConfidence'
# STRATEGY_NAME = 'MarginSampling'
# STRATEGY_NAME = 'BALDDropout'
DATA_NAME = 'BioASQ'
# EXPE_ROUND = 5
MODEL_BATCH = 8
MAX_LENGTH = None
LEARNING_RATE = 3e-5
MODEL_NAME = 'RoBERTa'
LOW_RES = True
NUM_TRAIN_EPOCH = 3
UNIQ_CONTEXT = True
if LOW_RES:
    args_input_quota = 200
    NUM_QUERY = 50
else:
    NUM_INIT_LB = 500 # 1000
    args_input_quota = 2000 # 200
    NUM_QUERY = 500 # 50
# ITERATION = int(args_input_quota / NUM_QUERY)

stride = 128

model_dir = '/mount/arbeitsdaten31/studenten1/linku/dev_models'
CACHE_DIR = '/mount/arbeitsdaten31/studenten1/linku/.cache'

preprocess.py

In [7]:
def preprocess_training_examples(examples, tokenizer):
    # no ['offset_mapping'], for .train() and .eval()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


def preprocess_training_features(examples, tokenizer):
    # keep ["offset_mapping"] and ["example_id"], for compute_metrics()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs["offset_mapping"]
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    example_ids = []
    contexts = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        
        # added for used in unlabel data predict
        example_ids.append(examples["id"][sample_idx]) 
        # added for unique context filter
        contexts.append(examples["context"][sample_idx])

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["context"] = contexts
    inputs["example_id"] = example_ids
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def preprocess_validation_examples(examples, tokenizer):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

def preprocess_training_examples_lowRes(examples, tokenizer):
    # no ['offset_mapping'], for .train() and .eval()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    detected_answers = examples["detected_answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = detected_answers[sample_idx]
        start_char = answer["char_spans"][0]["start"][0]
        end_char = answer["char_spans"][0]["end"][0]
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def preprocess_training_features_lowRes(examples, tokenizer):
    # keep ["offset_mapping"] and ["example_id"], for compute_metrics()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs["offset_mapping"]
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["detected_answers"]
    start_positions = []
    end_positions = []
    example_ids = []
    contexts = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["char_spans"][0]["start"][0]
        end_char = answer["char_spans"][0]["end"][0]
        sequence_ids = inputs.sequence_ids(i)
        
        # added for used in unlabel data predict
        example_ids.append(examples["qid"][sample_idx])
        # added for unique context filter
        contexts.append(examples['context'][sample_idx])

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["context"] = contexts
    inputs["example_id"] = example_ids
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def preprocess_validation_examples_lowRes(examples, tokenizer):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["qid"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs


utils.py

In [8]:
def get_aubc(quota, bsize, resseq):
	# it is equal to use np.trapz for calculation
	ressum = 0.0
	if quota % bsize == 0:
		for i in range(len(resseq)-1):
			ressum = ressum + (resseq[i+1] + resseq[i]) * bsize / 2

	else:
		for i in range(len(resseq)-2):
			ressum = ressum + (resseq[i+1] + resseq[i]) * bsize / 2
		k = quota % bsize
		ressum = ressum + ((resseq[-1] + resseq[-2]) * k / 2)
	ressum = round(ressum / quota,3)
	
	return ressum


def get_mean_stddev(datax):
	return round(np.mean(datax),4),round(np.std(datax),4)


def get_unlabel_data(n_pool, labeled_idxs, train_dataset):
    unlabeled_idxs = np.arange(n_pool)[~labeled_idxs]
    unlabeled_data = train_dataset.select(indices=unlabeled_idxs)
    return unlabeled_idxs, unlabeled_data


def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)


def get_model(m):
	if m.lower() == 'bert':
		return 'bert-base-uncased'
	elif m.lower() == 'bertlarge':
		return 'bert-large-uncased'
	elif m.lower() == 'roberta':
		return 'roberta-base'
	elif m.lower() == 'robertalarge':
		return 'roberta-large'


def get_context_id(data):
    context_id = {}
    for i, c in enumerate(set(data['context'])):
        context_id[c] = i+1
    return context_id


def preprocess_data(train_data, val_data):
	tokenizer = AutoTokenizer.from_pretrained(get_model(MODEL_NAME))

	if LOW_RES:
		train_dataset = train_data.map(
			preprocess_training_examples_lowRes,
			batched=True,
			remove_columns=train_data.column_names,
			fn_kwargs=dict(tokenizer=tokenizer)
		)
		train_features = train_data.map(
			preprocess_training_features_lowRes,
			batched=True,
			remove_columns=train_data.column_names,
			fn_kwargs=dict(tokenizer=tokenizer)
		)
		val_dataset = val_data.map(
			preprocess_validation_examples_lowRes,
			batched=True,
			remove_columns=val_data.column_names,
			fn_kwargs=dict(tokenizer=tokenizer)
		)
		val_features = val_data.map(
			preprocess_validation_examples_lowRes,
			batched=True,
			remove_columns=val_data.column_names,
			fn_kwargs=dict(tokenizer=tokenizer)
		)
	else:
		train_dataset = train_data.map(
			preprocess_training_examples,
			batched=True,
			remove_columns=train_data.column_names,
			fn_kwargs=dict(tokenizer=tokenizer)
		)
		train_features = train_data.map(
			preprocess_training_features,
			batched=True,
			remove_columns=train_data.column_names,
			fn_kwargs=dict(tokenizer=tokenizer)
		)
		val_dataset = val_data.map(
			preprocess_validation_examples,
			batched=True,
			remove_columns=val_data.column_names,
			fn_kwargs=dict(tokenizer=tokenizer)
		)
		val_features = val_data.map(
			preprocess_validation_examples,
			batched=True,
			remove_columns=val_data.column_names,
			fn_kwargs=dict(tokenizer=tokenizer)
		)

	train_dataset.set_format("torch")
	train_features.set_format("torch")
	val_dataset = val_dataset.remove_columns(["offset_mapping"])
	val_dataset.set_format("torch")
	val_features.set_format("torch")

	return train_dataset, train_features, val_dataset, val_features


def load_dataset_mrqa(d):
	'''
	return train_set, val_set
	'''
	data = load_dataset("mrqa", cache_dir=CACHE_DIR)
	if d == 'squad':
		# the first to 86588th in train set
		# the first to 10507th in val set
		squad_train = data['train'].select(range(86588))
		squad_val = data['validation'].select(range(10507))
		for t in squad_train: assert t['subset'] == 'SQuAD', 'Please select corrrect train data for SQuAD.'
		for v in squad_val: assert v['subset'] == 'SQuAD', 'Please select corrrect validation data for SQuAD.'
		return squad_train, squad_val
	elif d == 'newsqa':
		# the 86589th to 160748th in train set
		# the 10508th to 14719th in val set
		data_set = data['train'].select(range(86588, 160748))
		newsqa_train = data_set.shuffle(1127).select(range(10000))
		newsqa_val = data['validation'].select(range(10507, 14719))
		for t in newsqa_train: assert t['subset'] == 'NewsQA', 'Please select corrrect train data for NewQA.'
		for v in newsqa_val: assert v['subset'] == 'NewsQA', 'Please select corrrect validation data for NewQA.'
		return newsqa_train, newsqa_val
	elif d == 'searchqa':
		# the 222437th to 339820th in train set
		# the 22505th to 39484th in val set
		data_set = data['train'].select(range(222436, 339820))
		searchqa_train = data_set.shuffle(1127).select(range(10000))
		searchqa_val = data['validation'].select(range(22504, 39484))	
		for t in searchqa_train: assert t['subset'] == 'SearchQA', 'Please select corrrect train data for SearchQA.'
		for v in searchqa_val: assert v['subset'] == 'SearchQA', 'Please select corrrect validation data for SearchQA.'
		return searchqa_train, searchqa_val
	elif d == 'bioasq':
		# the first to the 1504th in the test set
		sub = data['test'].select(range(1504))
		len_sub_val = len(sub) // 10
		bioasq_train = sub.select(range(len_sub_val, len(sub)))
		bioasq_val = sub.select(range(len_sub_val))
		for t in bioasq_train: assert t['subset'] == 'BioASQ', 'Please select corrrect train data for BioASQ.'
		for v in bioasq_val: assert v['subset'] == 'BioASQ', 'Please select corrrect validation data for BioASQ.'
		return bioasq_train, bioasq_val
	elif d == 'textbookqa':
		# the 8131st to 9633rd
		sub = data['test'].select(range(8130, 9633))
		len_sub_val = len(sub) // 10
		textbookqa_train = sub.select(range(len_sub_val, len(sub)))
		textbookqa_val = sub.select(range(len_sub_val)) 
		for t in textbookqa_train: assert t['subset'] == 'TextbookQA', 'Please select corrrect train data for TextbookQA.'
		for v in textbookqa_val: assert v['subset'] == 'TextbookQA', 'Please select corrrect validation data for TextbookQA.'
		return textbookqa_train, textbookqa_val
	elif d == 'drop': # Discrete Reasoning Over Paragraphs
		# the 1505th to 3007th in test set
		sub = data['test'].select(range(1504, 3007))
		len_sub_val = len(sub) // 10
		drop_train = sub.select(range(len_sub_val, len(sub)))
		drop_val = sub.select(range(len_sub_val))
		for t in drop_train: assert t['subset'] == 'DROP', 'Please select corrrect train data for DROP.'
		for v in drop_val: assert v['subset'] == 'DROP', 'Please select corrrect validation data for DROP.'
		return drop_train, drop_val
	

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def evaluation(theoretical_answers, predicted_answers, skip_no_answer=False):
    '''
	theoretical_answers, datatype=dict
	{strings of id: list of ground truth answers}
	predicted_answers, datatype=dict
	{strings of id: strings of prediction text}
	'''
    f1 = exact_match = total = 0
    for qid, ground_truths in theoretical_answers.items():
        if qid not in predicted_answers:
            if not skip_no_answer:
                message = 'Unanswered question %s will receive score 0.' % qid
                print(message)
                total += 1
            continue
        total += 1
        prediction = predicted_answers[qid]
        exact_match += metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}


def save_model(device, pretrain_dir, strategy_dir):
    '''
    Copy and save model from pretrain_models to current trained models.
    '''
    pretrain_model = AutoModelForQuestionAnswering.from_pretrained(pretrain_dir).to(device)
    model_to_save = pretrain_model.module if hasattr(pretrain_model, 'module') else pretrain_model 
    model_to_save.save_pretrained(strategy_dir)


def init_centers(X, K):
    ind = np.argmax([np.linalg.norm(s, 2) for s in X])
    mu = [X[ind]]
    indsAll = [ind]
    centInds = [0.] * len(X)
    cent = 0
    print('#Samps\tTotal Distance')
    while len(mu) < K:
        if len(mu) == 1:
            D2 = pairwise_distances(X, mu).ravel().astype(float)
        else:
            newD = pairwise_distances(X, [mu[-1]]).ravel().astype(float)
            for i in range(len(X)):
                if D2[i] >  newD[i]:
                    centInds[i] = cent
                    D2[i] = newD[i]
        print(str(len(mu)) + '\t' + str(sum(D2)), flush=True)
        if sum(D2) == 0.0: pdb.set_trace()
        D2 = D2.ravel().astype(float)
        Ddist = (D2 ** 2)/ sum(D2 ** 2)
        customDist = stats.rv_discrete(name='custm', values=(np.arange(len(D2)), Ddist))
        ind = customDist.rvs(size=1)[0]
        while ind in indsAll: ind = customDist.rvs(size=1)[0]
        mu.append(X[ind])
        indsAll.append(ind)
        cent += 1
    return indsAll

In [23]:
def get_unique_context(q_idxs, features, context_dict, exist_c_id=None):	
	# create a new_q_idxs with unique context_id
	if exist_c_id:
		c_id_lst = exist_c_id
	else:
		c_id_lst = []

	new_q_idxs = []
	for q_i in tqdm(q_idxs, "Creating unique context idxs"):
		sample = features.select(indices=[q_i])
		c_id = context_dict[sample['context'][0]]
		if c_id not in c_id_lst:
			new_q_idxs.append(q_i)
			c_id_lst.append(c_id)
	return new_q_idxs

def get_final_c_id(iter_labeled_idxs, features, context_dict):
	c_id_lst = []
	for i in tqdm(iter_labeled_idxs, "Creating final context id"):
		sample = features.select(indices=[i])
		c_id_lst.append(context_dict[sample['context'][0]])
	return c_id_lst

def get_unique_sample(labeled_idxs, q_idxs, n_pool, train_features, iteration=0):
	if LOW_RES:
		num_query_i = NUM_QUERY * iteration
		print('num_query_i in get_unique_sample in LOW_RES:', num_query_i)
	else:
		num_query_i = NUM_QUERY * iteration + NUM_INIT_LB
		print('num_query_i in get_unique_sample:', num_query_i)

	difference_i = 0
	num_set_ex_id_i = 0

	while num_set_ex_id_i < num_query_i:
		labeled_idxs[q_idxs[:NUM_QUERY + difference_i]] = True	# get first num_query, e.g. 50
		iter_i_labeled_idxs = np.arange(n_pool)[labeled_idxs]
		print('len(iter_i_labeled_idxs):', len(iter_i_labeled_idxs))

		iter_i_samples = train_features.select(indices=iter_i_labeled_idxs)
		num_set_ex_id_i = len(set(iter_i_samples['example_id']))
		print('number of unique example id:', num_set_ex_id_i)

		assert num_set_ex_id_i <= num_query_i, 'Select too many examples!'
		assert num_set_ex_id_i > 0, "Did not select examples!"

		difference_i = num_query_i - num_set_ex_id_i
		print('difference_i', difference_i)
	
	return iter_i_labeled_idxs

### load dataset

In [9]:
disable_caching()
if LOW_RES:
	## set dir
	pretrain_model_dir = '/mount/arbeitsdaten31/studenten1/linku/pretrain_models' + '/' + MODEL_NAME + '_SQuAD_full_dataset_lr_3e-5'
	strategy_model_dir = model_dir + '/lowRes_' + str(args_input_quota) + '_' + STRATEGY_NAME + '_' + MODEL_NAME +  '_' + DATA_NAME
	## load data
	train_data, val_data = load_dataset_mrqa(DATA_NAME.lower())
else:
	## set dir
	strategy_model_dir = model_dir + '/' + str(NUM_INIT_LB) + '_' + str(args_input_quota) + '_' + STRATEGY_NAME + '_' + MODEL_NAME +  '_' + DATA_NAME
	## load data
	squad = load_dataset(DATA_NAME.lower(), cache_dir=CACHE_DIR)
	train_data = squad["train"]
	val_data = squad["validation"]
	print('Use full training data and full testing data.')

Found cached dataset mrqa (/mount/arbeitsdaten31/studenten1/linku/.cache/mrqa/plain_text/1.1.0/1f2cf5ac32b43f864e6f91d384057a16b69b7d13ba9bcaa200ac277c90938d19)


  0%|          | 0/3 [00:00<?, ?it/s]

model.py

In [10]:
def to_train(num_train_epochs, train_dataloader, device, model, optimizer, lr_scheduler, record_loss=False):
	if LOW_RES:
		print('Training was performed using {} query data, i.e. {} data.'.format(NUM_QUERY, len(train_dataloader.dataset)))
	else:
		print('Training was performed using the sum of {} initial data and {} query data, i.e. {} data.'.format(NUM_INIT_LB, NUM_QUERY, len(train_dataloader.dataset)))
	
	for epoch in range(num_train_epochs):
		model.train()
		for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
			batch = {key: value.to(device) for key, value in batch.items()}
			outputs = model(**batch)
			loss = outputs.loss
			loss.backward()

			optimizer.step()
			lr_scheduler.step()
			optimizer.zero_grad()

		if record_loss:
			print('Train Epoch: {}\tLoss: {:.6f}'.format(epoch, loss.item()))

	model_to_save = model.module if hasattr(model, 'module') else model 
	model_to_save.save_pretrained(strategy_model_dir)
	print('TRAIN done!')

def to_pretrain(num_train_epochs, train_dataloader, device, model, optimizer, lr_scheduler, scaler):
	print('Training was performed using the full dataset ({} data).'.format(len(train_dataloader.dataset)))
	for epoch in range(num_train_epochs):
		model.train()
		for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
			batch = {key: value.to(device) for key, value in batch.items()}
			with amp.autocast():
				outputs = model(**batch)
				loss = outputs.loss
			
			scaler.scale(loss).backward()

			scaler.step(optimizer)
			scaler.update()
			lr_scheduler.step()
			optimizer.zero_grad()

		print('Train Epoch: {}\tLoss: {:.6f}'.format(epoch, loss.item()))

	model_to_save = model.module if hasattr(model, 'module') else model 
	model_to_save.save_pretrained(pretrain_model_dir)
	print('TRAIN done!')

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = defaultdict(list)
    max_answer_length = 30
    n_best = 20
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples, desc="Computing metrics"):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

def compute_metrics_lowRes(start_logits, end_logits, features, examples):
    example_to_features = defaultdict(list)
    max_answer_length = 30
    n_best = 20
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = dict()
    for example in tqdm(examples, desc="Computing metrics"):
        example_id = example["qid"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers[example_id] = best_answer["text"]
        else:
            predicted_answers[example_id] = ""

    theoretical_answers = dict()
    for ex in examples: theoretical_answers[ex["qid"]] = ex["answers"]
    return evaluation(theoretical_answers, predicted_answers)

def get_pred(dataloader, device, features, examples):
    model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)
    
    model.eval()
    start_logits = []
    end_logits = []

    for batch in tqdm(dataloader, desc="Evaluating_pred"):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(features)]
    end_logits = end_logits[: len(features)]

    if LOW_RES:
        return compute_metrics_lowRes(start_logits, end_logits, features, examples)
    else:
        return compute_metrics(start_logits, end_logits, features, examples)

def get_pretrain_pred(dataloader, device, features, examples):
    model = AutoModelForQuestionAnswering.from_pretrained(pretrain_model_dir).to(device)
    
    model.eval()
    start_logits = []
    end_logits = []

    for batch in tqdm(dataloader, desc="Evaluating_pred"):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(features)]
    end_logits = end_logits[: len(features)]

    return compute_metrics(start_logits, end_logits, features, examples)

def get_prob(dataloader, device, features, examples):
    model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)

    model.eval()
    start_logits = []
    end_logits = []

    for batch in tqdm(dataloader, desc="Evaluating_prob"):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(features)]
    end_logits = end_logits[: len(features)]

    prob_dict = {}
    example_to_features = defaultdict(list)
    max_answer_length = 30
    n_best = 20
    
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    for example in tqdm(examples):
        if LOW_RES:
            example_id = example["qid"]
        else:
            example_id = example["id"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append(start_logit[start_index] + end_logit[end_index])
        
            if len(answers) > 1:
                prob_dict[feature_index] = softmax(answers)
            elif example_to_features[example_id] != []:
                prob_dict[feature_index] = np.array([0])
    
    return prob_dict

def get_prob_dropout(dataloader, device, features, examples, n_drop=10):
    model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)
    
    model.train()
    prob_dict = {}
    
    for i in range(n_drop):
        start_logits = []
        end_logits = []
        for batch in tqdm(dataloader, desc="Evaluating_prob_dropout"):
            batch = {key: value.to(device) for key, value in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            start_logits.append(outputs.start_logits.cpu().numpy())
            end_logits.append(outputs.end_logits.cpu().numpy())

        start_logits = np.concatenate(start_logits)
        end_logits = np.concatenate(end_logits)
        start_logits = start_logits[: len(features)]
        end_logits = end_logits[: len(features)]

        example_to_features = defaultdict(list)
        max_answer_length = 30
        n_best = 20
            
        for idx, feature in enumerate(features):
            example_to_features[feature["example_id"]].append(idx)

        n = 0
        for example in tqdm(examples):
            if LOW_RES:
                example_id = example["qid"]
            else:
                example_id = example["id"]
            answers = []

            # Loop through all features associated with that example
            for feature_index in example_to_features[example_id]:
                start_logit = start_logits[feature_index]
                end_logit = end_logits[feature_index]
                offsets = features[feature_index]["offset_mapping"]

                start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
                end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        # Skip answers that are not fully in the context
                        if offsets[start_index] is None or offsets[end_index] is None:
                            continue
                        # Skip answers with a length that is either < 0 or > max_answer_length
                        if (
                            end_index < start_index
                            or end_index - start_index + 1 > max_answer_length
                        ):
                            continue

                        answers.append(start_logit[start_index] + end_logit[end_index])

            if 1 < len(answers) < 200: # pad to same numbers of possible answers
                zero_list = [0] * (200 - len(answers))
                answers.extend(zero_list)
            elif len(answers) >= 200:
                answers = answers[:200]

            if len(answers) > 1:
                if example_to_features[example_id][0] not in prob_dict:
                    prob_dict[example_to_features[example_id][0]] = softmax(answers)
                else:
                    prob_dict[example_to_features[example_id][0]] += softmax(answers)
            elif example_to_features[example_id] != []:
                if example_to_features[example_id][0] not in prob_dict:
                    prob_dict[example_to_features[example_id][0]] = np.array([0])   

    for key in prob_dict.keys():
        prob_dict[key] /= n_drop

    return prob_dict

def get_prob_dropout_split(dataloader, device, features, examples, n_drop=10):
    ## use tensor to save the answers

    model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)
    
    model.train()

    probs = torch.zeros([n_drop, len(dataloader.dataset), 200])
    
    for i in range(n_drop):
        start_logits = []
        end_logits = []
        for batch in tqdm(dataloader, desc="Evaluating_prob_dropout"):
            batch = {key: value.to(device) for key, value in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            start_logits.append(outputs.start_logits.cpu().numpy())
            end_logits.append(outputs.end_logits.cpu().numpy())

        start_logits = np.concatenate(start_logits)
        end_logits = np.concatenate(end_logits)
        start_logits = start_logits[: len(features)]
        end_logits = end_logits[: len(features)]

        example_to_features = defaultdict(list)
        max_answer_length = 30
        n_best = 20
            
        for idx, feature in enumerate(features):
            example_to_features[feature["example_id"]].append(idx)

        n = 0
        for example in tqdm(examples, desc="Computing metrics"):
            if LOW_RES:
                example_id = example["qid"]
            else:
                example_id = example["id"]
            answers = []

            # Loop through all features associated with that example
            for feature_index in example_to_features[example_id]:
                start_logit = start_logits[feature_index]
                end_logit = end_logits[feature_index]
                offsets = features[feature_index]["offset_mapping"]

                start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
                end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        # Skip answers that are not fully in the context
                        if offsets[start_index] is None or offsets[end_index] is None:
                            continue
                        # Skip answers with a length that is either < 0 or > max_answer_length
                        if (
                            end_index < start_index
                            or end_index - start_index + 1 > max_answer_length
                        ):
                            continue

                        answers.append(start_logit[start_index] + end_logit[end_index])

            
                if 1 < len(answers) < 200: # pad to same numbers of possible answers
                    zero_list = [0] * (200 - len(answers))
                    answers.extend(zero_list)
                elif len(answers) >= 200:
                    answers = answers[:200]

                probs[i][feature_index] += torch.tensor(softmax(answers))

    return probs

def get_embeddings(dataloader, device):
    model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir, output_hidden_states=True).to(device)
    
    model.eval()
    embeddings = torch.zeros([len(dataloader.dataset), model.config.to_dict()['hidden_size']])
    idxs_start = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating_prob"):
            batch = {key: value.to(device) for key, value in batch.items()}
        
            outputs = model(**batch)

            hidden_states = outputs.hidden_states
            embedding_of_last_layer = hidden_states[-2][:, 0, :]

            idxs_end = idxs_start + len(hidden_states[-2])
            embeddings[idxs_start:idxs_end] = embedding_of_last_layer.cpu()
            idxs_start = idxs_end
        
    return embeddings

def get_grad_embeddings(dataloader, device, features, examples):
    model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir, output_hidden_states=True).to(device)
    
    model.eval()

    nLab = 20
    embDim = model.config.to_dict()['hidden_size']
    embeddings = np.zeros([len(dataloader.dataset), embDim * nLab])

    prob_dict = []
    idxs_start = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating_prob"):
            batch = {key: Variable(value.to(device)) for key, value in batch.items()}
                
            # deepAL+: out, e1 = self.clf(x)
            outputs = model(**batch)
            # deepAL+: e1 = e1.data.cpu().numpy()
            hidden_states = outputs.hidden_states
            embedding_of_last_layer = hidden_states[-2][:, 0, :]
            embedding_of_last_layer = embedding_of_last_layer.data.cpu().numpy()

            # matually create features batch
            data_len_batch = len(outputs.start_logits)
            idxs_end = idxs_start + data_len_batch
            batch_idx = list(range(idxs_start, idxs_end))
            batch_feat = features.select(batch_idx)
            idxs_start = idxs_end

            out = logits_to_prob(outputs.start_logits.cpu().numpy(), outputs.end_logits.cpu().numpy(), batch_feat, batch_idx, examples, 200)
            batchProbs = F.softmax(out, dim=1).data.cpu().numpy()
            maxInds = np.argmax(batchProbs, 1)

            for j in range(data_len_batch):
                for c in range(nLab):
                    if c == maxInds[j]:
                        embeddings[batch_idx[j]][embDim * c : embDim * (c+1)] = deepcopy(embedding_of_last_layer[j]) * (1 - batchProbs[j][c]) * -1.0
                    else:
                        embeddings[batch_idx[j]][embDim * c : embDim * (c+1)] = deepcopy(embedding_of_last_layer[j]) * (-1 * batchProbs[j][c]) * -1.0
            
    return embeddings

def logits_to_prob(start_logits, end_logits, features, batch_idx, examples, num_classes):
    probs = torch.zeros([len(batch_idx), num_classes])
    
    example_to_features = defaultdict(list)
    max_answer_length = 30
    n_best = 20

    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append((idx, batch_idx[idx]))
    
    for example in tqdm(examples, desc="Computing metrics"):
        if LOW_RES:
            example_id = example["qid"]
        else:
            example_id = example["id"]
        answers = []
        
        # Loop through all features associated with that example
        for (feature_index, i) in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue
                    answers.append(start_logit[start_index] + end_logit[end_index])


            if 1 < len(answers) < num_classes: # pad to same numbers of possible answers
                zero_list = [0] * (num_classes - len(answers))
                answers.extend(zero_list)
            elif len(answers) >= num_classes:
                answers = answers[:num_classes]
            probs[feature_index] = torch.tensor(answers)

    return probs

query.py

In [11]:
def random_sampling_query(labeled_idxs, n):
    print('Random querying starts!')
    return np.random.choice(np.where(labeled_idxs==0)[0], n, replace=False)
def least_confidence_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n):
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
		unlabeled_data,
		collate_fn=default_data_collator,
		batch_size=MODEL_BATCH,
	)

    print('LC querying starts!')
    print('Query {} data from {} unlabeled training data.'.format(n, len(unlabeled_data)))

    prob_dict = get_prob(unlabeled_dataloader, device, unlabeled_features, examples)
    print('Got probability!')

    confidence_dict = {}
    for idx, probs in prob_dict.items():
        if len(probs) > 1: # if prob_dict['probs'] is not 0
            confidence_dict[idx] = max(probs)
        elif idx:
            confidence_dict[idx] = np.array([0])

    sorted_confidence_list = sorted(confidence_dict.items(), key=lambda x: x[1])
    return unlabeled_idxs[[idx for (idx, confidence) in sorted_confidence_list[:n]]]
def least_confidence_dropout_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n):
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
		unlabeled_data,
		collate_fn=default_data_collator,
		batch_size=MODEL_BATCH,
	)
    
    print('LC dropout querying starts!')
    print('Query {} data from {} unlabeled training data.'.format(n, len(unlabeled_data)))
    
    prob_dict = get_prob_dropout(unlabeled_dataloader, device, unlabeled_features, examples, n_drop=10)
    print('Got probability!')

    confidence_dict = {}
    for idx, probs in prob_dict.items():
        if len(probs) > 1: # if prob_dict['probs'] is not 0
            confidence_dict[idx] = max(probs)
        elif idx:
            confidence_dict[idx] = np.array([0])

    sorted_confidence_list = sorted(confidence_dict.items(), key=lambda x: x[1])
    return unlabeled_idxs[[idx for (idx, confidence) in sorted_confidence_list[:n]]]
def bald_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n):
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
        unlabeled_data,
        collate_fn=default_data_collator,
        batch_size=MODEL_BATCH,
    )
    print('BALD querying starts!')
    print('Query {} data from {} unlabeled training data.'.format(n, len(unlabeled_data)))
    
    probs = get_prob_dropout_split(unlabeled_dataloader, device, unlabeled_features, examples, n_drop=10)
    print('Got probability!')
    probs_mean = probs.mean(0)
    entropy1 = (-probs_mean*torch.log(probs_mean)).sum(1)
    entropy2 = (-probs*torch.log(probs)).sum(2).mean(0)
    uncertainties = entropy2 - entropy1
    # later on, we can use batch
    return unlabeled_idxs[uncertainties.sort()[1][:n]]
def kcenter_greedy_query(n_pool, labeled_idxs, train_dataset, device, n):
    labeled_idxs_in_query = labeled_idxs.copy()
    # train_data = train_dataset
    train_dataloader = DataLoader(train_dataset,
                                  collate_fn=default_data_collator,
                                  batch_size=MODEL_BATCH,
                                )
    print('KCenter greedy querying starts!')
    print('Query {} data.'.format(n))
    
    embeddings = get_embeddings(train_dataloader, device)
    print('Got embeddings!')
    embeddings = embeddings.numpy()

    dist_mat = np.matmul(embeddings, embeddings.transpose())
    sq = np.array(dist_mat.diagonal()).reshape(len(labeled_idxs_in_query), 1)
    dist_mat *= -2
    dist_mat += sq
    dist_mat += sq.transpose()
    dist_mat = np.sqrt(dist_mat)

    mat = dist_mat[~labeled_idxs_in_query, :][:, labeled_idxs_in_query]

    for i in tqdm(range(n), ncols=100):
        mat_min = mat.min(axis=1)
        q_idx_ = mat_min.argmax()
        q_idx = np.arange(n_pool)[~labeled_idxs_in_query][q_idx_]
        labeled_idxs_in_query[q_idx] = True
        mat = np.delete(mat, q_idx_, 0)
        mat = np.append(mat, dist_mat[~labeled_idxs_in_query, q_idx][:, None], axis=1)
        
    return np.arange(n_pool)[(labeled_idxs ^ labeled_idxs_in_query)]
def badge_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n):
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(unlabeled_data,
                                      collate_fn=default_data_collator,
                                      batch_size=MODEL_BATCH,
                                    )
    print('BADGE querying starts!')
    print('Query {} data from {} unlabeled training data.'.format(n, len(unlabeled_data)))

    gradEmbedding = get_grad_embeddings(unlabeled_dataloader, device, unlabeled_features, examples)
    print('Got embeddings!')
    chosen = init_centers(gradEmbedding, n)
    return unlabeled_idxs[chosen]

Next we will preprocess the dataset (training and evaluation data).

In [12]:
train_dataset, train_features, val_dataset, val_features = preprocess_data(train_data, val_data)
context_dict = get_context_id(train_data)
print('len(context_dict):', len(context_dict))

# get the number of extra data after preprocessing
extra = len(train_dataset) - len(train_data)

Map:   0%|          | 0/1354 [00:00<?, ? examples/s]

Map:   0%|          | 0/1354 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

len(context_dict): 1341


Set seed and device

In [13]:
SEED = 1127
# os.environ['TORCH_HOME']='./basicmodel'
os.environ["CUDA_VISIBLE_DEVICES"] = str(1)

# fix random seed
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

all_acc = []
acq_time = []

Experiment Round 1

In [19]:
## record acc performance 
# acc = np.zeros(ITERATION + 1) # quota/batch runs + iter_0
# acc_em = np.zeros(ITERATION + 1)
acc = np.zeros(4) # quota/batch runs + iter_0
acc_em = np.zeros(4)

## generate initial labeled pool
n_pool = len(train_dataset)
print('n_pool:', n_pool)
labeled_idxs = np.zeros(n_pool, dtype=bool)

if LOW_RES:
    print('in low res setting')
    save_model(device, pretrain_model_dir, strategy_model_dir)
    ## print info
    print(DATA_NAME)
    print(STRATEGY_NAME)
else:
    print('not in low res setting')
    tmp_idxs = np.arange(n_pool)
    print('len(tmp_idxs):', len(tmp_idxs))
    np.random.shuffle(tmp_idxs)
    print('len(tmp_idxs):', len(tmp_idxs))
    
    if UNIQ_CONTEXT:
        print('in uc setting')
        tmp_idxs = tmp_idxs[:NUM_INIT_LB+extra]
        uc_tmp_idxs = get_unique_context(tmp_idxs, train_features, context_dict) # len() = almost num_query + extra
        print('len(uc_tmp_idxs):', len(uc_tmp_idxs))
        iter_0_labeled_idxs = get_unique_sample(labeled_idxs, uc_tmp_idxs, n_pool, train_features)
        c_id = get_final_c_id(iter_0_labeled_idxs, train_features) # len() = num_query
    else:
        print('not in uc setting')
        iter_0_labeled_idxs = get_unique_sample(labeled_idxs, tmp_idxs, n_pool, train_features)

    ## load the selected train data to DataLoader
    train_dataloader = DataLoader(
        train_dataset.select(indices=iter_0_labeled_idxs),
        shuffle=True,
        collate_fn=default_data_collator,
        batch_size=MODEL_BATCH,
    )

    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = NUM_TRAIN_EPOCH * num_update_steps_per_epoch

    ## network
    model = AutoModelForQuestionAnswering.from_pretrained(get_model(MODEL_NAME)).to(device)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    ## print info
    print(DATA_NAME)
    print(STRATEGY_NAME)

    ## iteration 0 accuracy
    to_train(NUM_TRAIN_EPOCH, train_dataloader, device, model, optimizer, lr_scheduler)

## load the selected validation data to DataLoader
eval_dataloader = DataLoader(
    val_dataset, 
    collate_fn=default_data_collator, 
    batch_size=MODEL_BATCH
)

acc_scores_0 = get_pred(eval_dataloader, device, val_features, val_data) # add i=1 to use model from models_dir
acc[0] = acc_scores_0['f1']
acc_em[0] = acc_scores_0['exact_match']

print('Round 0\ntesting accuracy {}'.format(acc[0]))
print('testing accuracy em {}'.format(acc_em[0]))
# time = datetime.datetime.now()
# print('Time spent for init training:', (time - start))
print('\n')

n_pool: 1442
in low res setting
BioASQ
LeastConfidence


Evaluating_pred:   0%|          | 0/21 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/150 [00:00<?, ?it/s]

Round 0
testing accuracy 57.91499048455571
testing accuracy em 35.333333333333336




Iteration 1

In [16]:
i = 1
total_query = NUM_QUERY + extra
		
## query
if STRATEGY_NAME == 'RandomSampling':
    q_idxs = random_sampling_query(labeled_idxs, total_query)
elif STRATEGY_NAME == 'LeastConfidence':
    q_idxs = least_confidence_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
elif STRATEGY_NAME == 'LeastConfidenceDropout':
    q_idxs = least_confidence_dropout_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
elif STRATEGY_NAME == 'BALDDropout':
    q_idxs = bald_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
elif STRATEGY_NAME == 'KCenterGreedy':
    q_idxs = kcenter_greedy_query(n_pool, labeled_idxs, train_dataset, device, total_query)
elif STRATEGY_NAME == 'BadgeSampling':
    q_idxs = badge_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
else:
    raise NotImplementedError

LC querying starts!
Query 138 data from 1442 unlabeled training data.


Evaluating_prob:   0%|          | 0/181 [00:00<?, ?it/s]

  0%|          | 0/1354 [00:00<?, ?it/s]

Got probability!


In [25]:
## update
if UNIQ_CONTEXT:
    print('in uc setting')
    if LOW_RES:
        print('in lr setting')
        uc_q_idxs = get_unique_context(q_idxs, train_features, context_dict)
    else:
        print('not in lr setting')
        uc_q_idxs = get_unique_context(q_idxs, train_features, context_dict, c_id)
    
    iter_i_labeled_idxs = get_unique_sample(labeled_idxs, uc_q_idxs, n_pool, train_features, i)
    c_id = get_final_c_id(iter_i_labeled_idxs, train_features, context_dict)
else:
    print('not in uc setting')
    iter_i_labeled_idxs = get_unique_sample(labeled_idxs, q_idxs, n_pool, train_features, i)

in uc setting
in lr setting


Creating unique context idxs:   0%|          | 0/138 [00:00<?, ?it/s]

num_query_i in get_unique_sample in LOW_RES: 50
len(iter_i_labeled_idxs): 50
number of unique example id: 50
difference_i 0


Creating final context id:   0%|          | 0/50 [00:00<?, ?it/s]

In [26]:
train_dataloader_i = DataLoader(
    train_dataset.select(indices=iter_i_labeled_idxs),
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=MODEL_BATCH,
)

num_update_steps_per_epoch_i = len(train_dataloader_i)
num_training_steps_i = NUM_TRAIN_EPOCH * num_update_steps_per_epoch_i

model_i = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)
optimizer_i = AdamW(model_i.parameters(), lr=LEARNING_RATE)

lr_scheduler_i = get_scheduler(
    "linear",
    optimizer=optimizer_i,
    num_warmup_steps=0,
    num_training_steps=num_training_steps_i,
)

## train
to_train(NUM_TRAIN_EPOCH, train_dataloader_i, device, model_i, optimizer_i, lr_scheduler_i)

## iteration i accuracy
print('iter_{} get_pred!'.format(i))
acc_scores_i = get_pred(eval_dataloader, device, val_features, val_data)
acc[i] = acc_scores_i['f1']
acc_em[i] = acc_scores_i['exact_match']
print('testing accuracy {}'.format(acc[i]))
print('testing accuracy em {}'.format(acc_em[i]))
# print('Time spent for training after querying:', (datetime.datetime.now() - time))
# time = datetime.datetime.now()
print('\n')

torch.cuda.empty_cache()

Training was performed using 50 query data, i.e. 50 data.


Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

TRAIN done!
iter_1 get_pred!


Evaluating_pred:   0%|          | 0/21 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/150 [00:00<?, ?it/s]

testing accuracy 60.03403810360334
testing accuracy em 35.333333333333336




Iteration 2

In [27]:
i = 2
total_query = NUM_QUERY + extra
		
## query
if STRATEGY_NAME == 'RandomSampling':
    q_idxs = random_sampling_query(labeled_idxs, total_query)
elif STRATEGY_NAME == 'LeastConfidence':
    q_idxs = least_confidence_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
elif STRATEGY_NAME == 'LeastConfidenceDropout':
    q_idxs = least_confidence_dropout_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
elif STRATEGY_NAME == 'BALDDropout':
    q_idxs = bald_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
elif STRATEGY_NAME == 'KCenterGreedy':
    q_idxs = kcenter_greedy_query(n_pool, labeled_idxs, train_dataset, device, total_query)
elif STRATEGY_NAME == 'BadgeSampling':
    q_idxs = badge_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
else:
    raise NotImplementedError


LC querying starts!
Query 138 data from 1392 unlabeled training data.


Evaluating_prob:   0%|          | 0/174 [00:00<?, ?it/s]

  0%|          | 0/1354 [00:00<?, ?it/s]

Got probability!


In [28]:
## update
if UNIQ_CONTEXT:
    if LOW_RES:
        uc_q_idxs = get_unique_context(q_idxs, train_features, context_dict)
    else:
        uc_q_idxs = get_unique_context(q_idxs, train_features, context_dict, c_id)
    
    iter_i_labeled_idxs = get_unique_sample(labeled_idxs, uc_q_idxs, n_pool, train_features, i)
    c_id = get_final_c_id(iter_i_labeled_idxs, train_features, context_dict)
else:
    iter_i_labeled_idxs = get_unique_sample(labeled_idxs, q_idxs, n_pool, train_features, i)

Creating unique context idxs:   0%|          | 0/138 [00:00<?, ?it/s]

num_query_i in get_unique_sample in LOW_RES: 100
len(iter_i_labeled_idxs): 100
number of unique example id: 99
difference_i 1
len(iter_i_labeled_idxs): 101
number of unique example id: 100
difference_i 0


Creating final context id:   0%|          | 0/101 [00:00<?, ?it/s]

In [29]:

    
train_dataloader_i = DataLoader(
    train_dataset.select(indices=iter_i_labeled_idxs),
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=MODEL_BATCH,
)

num_update_steps_per_epoch_i = len(train_dataloader_i)
num_training_steps_i = NUM_TRAIN_EPOCH * num_update_steps_per_epoch_i

model_i = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)
optimizer_i = AdamW(model_i.parameters(), lr=LEARNING_RATE)

lr_scheduler_i = get_scheduler(
    "linear",
    optimizer=optimizer_i,
    num_warmup_steps=0,
    num_training_steps=num_training_steps_i,
)

## train
to_train(NUM_TRAIN_EPOCH, train_dataloader_i, device, model_i, optimizer_i, lr_scheduler_i)

## iteration i accuracy
print('iter_{} get_pred!'.format(i))
acc_scores_i = get_pred(eval_dataloader, device, val_features, val_data)
acc[i] = acc_scores_i['f1']
acc_em[i] = acc_scores_i['exact_match']
print('testing accuracy {}'.format(acc[i]))
print('testing accuracy em {}'.format(acc_em[i]))
# print('Time spent for training after querying:', (datetime.datetime.now() - time))
# time = datetime.datetime.now()
print('\n')

torch.cuda.empty_cache()

Training was performed using 50 query data, i.e. 101 data.


Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

TRAIN done!
iter_2 get_pred!


Evaluating_pred:   0%|          | 0/21 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/150 [00:00<?, ?it/s]

testing accuracy 55.64636474636475
testing accuracy em 30.666666666666668




Iteration 3

In [30]:
i = 3
total_query = NUM_QUERY + extra
		
## query
if STRATEGY_NAME == 'RandomSampling':
    q_idxs = random_sampling_query(labeled_idxs, total_query)
elif STRATEGY_NAME == 'LeastConfidence':
    q_idxs = least_confidence_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
elif STRATEGY_NAME == 'LeastConfidenceDropout':
    q_idxs = least_confidence_dropout_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
elif STRATEGY_NAME == 'BALDDropout':
    q_idxs = bald_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
elif STRATEGY_NAME == 'KCenterGreedy':
    q_idxs = kcenter_greedy_query(n_pool, labeled_idxs, train_dataset, device, total_query)
elif STRATEGY_NAME == 'BadgeSampling':
    q_idxs = badge_query(n_pool, labeled_idxs, train_dataset, train_features, train_data, device, total_query)
else:
    raise NotImplementedError



LC querying starts!
Query 138 data from 1341 unlabeled training data.


Evaluating_prob:   0%|          | 0/168 [00:00<?, ?it/s]

  0%|          | 0/1354 [00:00<?, ?it/s]

Got probability!


Creating unique context idxs:   0%|          | 0/138 [00:00<?, ?it/s]

num_query_i in get_unique_sample in LOW_RES: 150
len(iter_i_labeled_idxs): 151
number of unique example id: 150
difference_i 0


TypeError: get_final_c_id() missing 1 required positional argument: 'context_dict'

In [31]:
## update
if UNIQ_CONTEXT:
    if LOW_RES:
        uc_q_idxs = get_unique_context(q_idxs, train_features, context_dict)
    else:
        uc_q_idxs = get_unique_context(q_idxs, train_features, context_dict, c_id)
    
    iter_i_labeled_idxs = get_unique_sample(labeled_idxs, uc_q_idxs, n_pool, train_features, i)
    c_id = get_final_c_id(iter_i_labeled_idxs, train_features, context_dict)
else:
    iter_i_labeled_idxs = get_unique_sample(labeled_idxs, q_idxs, n_pool, train_features, i)
    
train_dataloader_i = DataLoader(
    train_dataset.select(indices=iter_i_labeled_idxs),
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=MODEL_BATCH,
)

num_update_steps_per_epoch_i = len(train_dataloader_i)
num_training_steps_i = NUM_TRAIN_EPOCH * num_update_steps_per_epoch_i

model_i = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)
optimizer_i = AdamW(model_i.parameters(), lr=LEARNING_RATE)

lr_scheduler_i = get_scheduler(
    "linear",
    optimizer=optimizer_i,
    num_warmup_steps=0,
    num_training_steps=num_training_steps_i,
)

## train
to_train(NUM_TRAIN_EPOCH, train_dataloader_i, device, model_i, optimizer_i, lr_scheduler_i)

## iteration i accuracy
print('iter_{} get_pred!'.format(i))
acc_scores_i = get_pred(eval_dataloader, device, val_features, val_data)
acc[i] = acc_scores_i['f1']
acc_em[i] = acc_scores_i['exact_match']
print('testing accuracy {}'.format(acc[i]))
print('testing accuracy em {}'.format(acc_em[i]))
# print('Time spent for training after querying:', (datetime.datetime.now() - time))
# time = datetime.datetime.now()
print('\n')

torch.cuda.empty_cache()

Creating unique context idxs:   0%|          | 0/138 [00:00<?, ?it/s]

num_query_i in get_unique_sample in LOW_RES: 150
len(iter_i_labeled_idxs): 151
number of unique example id: 150
difference_i 0


Creating final context id:   0%|          | 0/151 [00:00<?, ?it/s]

Training was performed using 50 query data, i.e. 151 data.


Training:   0%|          | 0/19 [00:00<?, ?it/s]

Training:   0%|          | 0/19 [00:00<?, ?it/s]

Training:   0%|          | 0/19 [00:00<?, ?it/s]

TRAIN done!
iter_3 get_pred!


Evaluating_pred:   0%|          | 0/21 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/150 [00:00<?, ?it/s]

testing accuracy 52.76296296296296
testing accuracy em 26.666666666666668


