# Question Answering Model 
## no trainer

- dataset
- torch
- transformers
- transformers[torch]
- evaluate

import packages

In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    default_data_collator,
    get_scheduler,
    AutoModelForQuestionAnswering,
    BertConfig
)
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch

import evaluate
import collections
from tqdm.auto import tqdm
import numpy as np
from sklearn.cluster import KMeans

import os
import re
import datetime

from torch.autograd import Variable
import torch.nn.functional as F
from copy import deepcopy

Set cache directory.

In [2]:
CACHE_DIR='/mount/arbeitsdaten31/studenten1/linku/.cache'
%set_env TRANSFORMERS_CACHE $CACHE_DIR
%set_env HF_MODULES_CACHE $CACHE_DIR
%set_env HF_DATASETS_CACHE $CACHE_DIR

model_dir = '/mount/arbeitsdaten31/studenten1/linku/models'

env: TRANSFORMERS_CACHE=/mount/arbeitsdaten31/studenten1/linku/.cache
env: HF_MODULES_CACHE=/mount/arbeitsdaten31/studenten1/linku/.cache
env: HF_DATASETS_CACHE=/mount/arbeitsdaten31/studenten1/linku/.cache


### arguments.py

args_input.

In [3]:
args_input_ALstrategy = 'RandomSampling'
args_input_initseed = 100 # 1000
args_input_quota = 90 # 1000
args_input_batch = 30 # 128
args_input_dataset_name = 'SQuAD'
args_input_iteration = 5
args_input_model_batch = 8 # already add in arguments.py
args_input_max_length = None
args_input_learning_rate = 1e-4
args_input_model = 'RoBERTa'

stride = 128

in main.py

In [4]:
MAX_LENGTH = args_input_max_length
NUM_QUERY = args_input_batch
NUM_INIT_LB = args_input_initseed
NUM_ROUND = int(args_input_quota / args_input_batch)
DATA_NAME = args_input_dataset_name
STRATEGY_NAME = args_input_ALstrategy
MODEL_NAME = args_input_model
LEARNING_RATE = args_input_learning_rate
strategy_model_dir = model_dir + '/' + str(NUM_INIT_LB) + '_' + str(args_input_quota) + '_' + STRATEGY_NAME + '_' + MODEL_NAME +  '_' + DATA_NAME

### load dataset

In [5]:
squad = load_dataset(args_input_dataset_name.lower(), cache_dir=CACHE_DIR)
# squad["train"] = squad["train"].shuffle(42).select(range(2000))
squad["train"] = squad["train"].select(range(4000))
squad["validation"] = squad["validation"].select(range(1200))
# squad["train"] = squad["train"].select(range(670, 680))

Found cached dataset squad (/mount/arbeitsdaten31/studenten1/linku/.cache/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
len(squad["train"])

4000

The function for preprocessing the dataset (training and evaluation data).

In [7]:
def preprocess_training_features(examples, tokenizer):
    # keep ["offset_mapping"], for compute_metrics()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs["offset_mapping"]
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    example_ids = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        
        example_ids.append(examples["id"][sample_idx]) # newly added for used in unlabel data predict

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["example_id"] = example_ids
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [8]:
def preprocess_training_examples(examples, tokenizer):
    # no ['offset_mapping'], for .train() and .eval()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    example_ids = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        
        example_ids.append(examples["id"][sample_idx]) # newly added for used in unlabel data predict

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [9]:
def preprocess_validation_examples(examples, tokenizer):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [10]:
# load tokenizer for dataset preprocessing
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# preprocess data
train_dataset = squad["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=squad["train"].column_names,
	fn_kwargs=dict(tokenizer=tokenizer)
)
train_features = squad["train"].map(
    preprocess_training_features,
    batched=True,
    remove_columns=squad["train"].column_names,
	fn_kwargs=dict(tokenizer=tokenizer)
)
val_dataset = squad["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
	fn_kwargs=dict(tokenizer=tokenizer)
)
val_features = squad["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
	fn_kwargs=dict(tokenizer=tokenizer)
)

Loading cached processed dataset at /mount/arbeitsdaten31/studenten1/linku/.cache/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-3faa3291c999dafc.arrow
Loading cached processed dataset at /mount/arbeitsdaten31/studenten1/linku/.cache/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-5109a7b4f5a5a8e4.arrow
Loading cached processed dataset at /mount/arbeitsdaten31/studenten1/linku/.cache/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-5fea3a5e35231e0f.arrow
Loading cached processed dataset at /mount/arbeitsdaten31/studenten1/linku/.cache/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-5fea3a5e35231e0f.arrow


In [11]:
train_dataset.set_format("torch")
train_features.set_format("torch")
val_dataset = val_dataset.remove_columns(["offset_mapping"])
val_dataset.set_format("torch")
val_features.set_format("torch")

# get the number of extra data after preprocessing
extra = len(train_dataset) - len(squad['train'])

In [12]:
len(train_dataset)

4009

## model.py

In [13]:
def to_train(num_train_epochs, train_dataloader, device, model, optimizer, lr_scheduler, record_loss=False):
	print('Training was performed using the sum of {} initial data and {} query data, i.e. {} data.'.format(NUM_INIT_LB, NUM_QUERY, len(train_dataloader.dataset)))
	for epoch in range(num_train_epochs):
		model.train()
		for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
			batch = {key: value.to(device) for key, value in batch.items()}
			outputs = model(**batch)
			loss = outputs.loss
			loss.backward()

			optimizer.step()
			lr_scheduler.step()
			optimizer.zero_grad()

		if record_loss:
			print('Train Epoch: {}\tLoss: {:.6f}'.format(epoch, loss.item()))

	model_to_save = model.module if hasattr(model, 'module') else model 
	model_to_save.save_pretrained(strategy_model_dir)
	print('TRAIN done!')

In [14]:
metric = evaluate.load("squad")

In [15]:
def get_pred(dataloader, device, features, examples, lowRes=False):
    if lowRes:
        model = AutoModelForQuestionAnswering.from_pretrained(pretrain_model_dir).to(device)
    else:
        model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)
    
    model.eval()
    start_logits = []
    end_logits = []

    for batch in tqdm(dataloader, desc="Evaluating_pred"):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(features)]
    end_logits = end_logits[: len(features)]

    return compute_metrics(start_logits, end_logits, features, examples)


In [16]:
def get_prob(dataloader, device, features, examples, lowRes=False):
    if lowRes:
        model = AutoModelForQuestionAnswering.from_pretrained(pretrain_model_dir).to(device)
    else:
        model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)

    model.eval()
    start_logits = []
    end_logits = []

    for batch in tqdm(dataloader, desc="Evaluating_prob"):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(features)]
    end_logits = end_logits[: len(features)]

    prob_dict = {}
    example_to_features = collections.defaultdict(list)
    max_answer_length = 30
    n_best = 20
    
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    for example in tqdm(examples):
        example_id = example["id"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append(start_logit[start_index] + end_logit[end_index])
        
            if len(answers) > 1:
                prob_dict[feature_index] = softmax(answers)
            elif example_to_features[example_id] != []:
                prob_dict[feature_index] = np.array([0])
    
    return prob_dict

In [17]:
def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    max_answer_length = 30
    n_best = 20
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples, desc="Computing metrics"):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

## utils.py

In [18]:
def get_unlabel_data(n_pool, labeled_idxs, train_dataset):
    unlabeled_idxs = np.arange(n_pool)[~labeled_idxs]
    unlabeled_data = train_dataset.select(indices=unlabeled_idxs)
    return unlabeled_idxs, unlabeled_data

In [19]:
def softmax(x): 
    """Compute softmax values for each sets of scores in x."""
    softmax_num = np.exp(x) / np.sum(np.exp(x), axis=0)
    return np.round(softmax_num, decimals=4)

In [20]:
def get_aubc(quota, bsize, resseq):
	# it is equal to use np.trapz for calculation
	ressum = 0.0
	if quota % bsize == 0:
		for i in range(len(resseq)-1):
			ressum = ressum + (resseq[i+1] + resseq[i]) * bsize / 2

	else:
		for i in range(len(resseq)-2):
			ressum = ressum + (resseq[i+1] + resseq[i]) * bsize / 2
		k = quota % bsize
		ressum = ressum + ((resseq[-1] + resseq[-2]) * k / 2)
	ressum = round(ressum / quota,3)
	
	return ressum

In [21]:
def get_mean_stddev(datax):
	return round(np.mean(datax),4),round(np.std(datax),4)

## query.py

In [22]:
def random_sampling_query(labeled_idxs, n):
    return np.random.choice(np.where(labeled_idxs==0)[0], n, replace=False)

In [23]:
def least_confidence_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n):
    # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
		unlabeled_data,
		shuffle=False,
		collate_fn=default_data_collator,
		batch_size=8,
	)
    # TODO: print for recording
    print('LC querying starts!')
    # deepAL+: probs = self.predict_prob(unlabeled_data)
    prob_dict = get_prob(unlabeled_dataloader, device, unlabeled_features, examples)
    # TODO: print for recording
    print('Got probability!')
    # deepAL+: uncertainties = probs.max(1)[0]
    confidence_dict = {}
    for idx, probs in prob_dict.items():
        if len(probs) > 1: # if prob_dict['probs'] is not 0
            confidence_dict[idx] = max(probs)
        elif idx:
            confidence_dict[idx] = np.array([0])

    # deepAL+: return unlabeled_idxs[uncertainties.sort()[1][:n]]
    sorted_confidence_list = sorted(confidence_dict.items, key=lambda x: x[1])
    return unlabeled_idxs[[idx for (idx, confidence) in sorted_confidence_list[:n]]]

# main.py

### seed and device

In [24]:
SEED = 1127
# os.environ['TORCH_HOME']='./basicmodel'
os.environ["CUDA_VISIBLE_DEVICES"] = str(2)

# fix random seed
np.random.seed(SEED)
torch.manual_seed(SEED)
# torch.backends.cudnn.enabled  = True
# torch.backends.cudnn.benchmark= True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### start experiment

In [25]:
ITERATION = args_input_iteration
MODEL_BATCH = args_input_model_batch
# NUM_TRAIN_EPOCH = args_input.train_epochs
NUM_TRAIN_EPOCH = 3

all_acc = []
acq_time = []

In [26]:
begin = datetime.datetime.now()

# repeate # iteration trials
while (ITERATION > 0): 
	ITERATION = ITERATION - 1
	
	start = datetime.datetime.now()

	## generate initial labeled pool
	n_pool = len(train_dataset)
	labeled_idxs = np.zeros(n_pool, dtype=bool)

	tmp_idxs = np.arange(n_pool)
	np.random.shuffle(tmp_idxs)
	
	difference_0 = 0
	num_set_ex_id_0 = 0

	while num_set_ex_id_0 != NUM_INIT_LB:        
		labeled_idxs[tmp_idxs[:NUM_INIT_LB + difference_0]] = True
		run_0_labeled_idxs = np.arange(n_pool)[labeled_idxs]

		run_0_samples = train_features.select(indices=run_0_labeled_idxs)
		num_set_ex_id_0 = len(set(run_0_samples['example_id']))

		difference_0 = NUM_INIT_LB - num_set_ex_id_0

	## record acc performance 
	acc = np.zeros(NUM_ROUND + 1) # quota/batch runs + run_0
	acc_em = np.zeros(NUM_ROUND + 1)

	## load the selected train data to DataLoader
	train_dataloader = DataLoader(
		train_dataset.select(indices=run_0_labeled_idxs),
		shuffle=True,
		collate_fn=default_data_collator,
		batch_size=MODEL_BATCH,
	)

	eval_dataloader = DataLoader(
		val_dataset, 
		collate_fn=default_data_collator, 
		batch_size=MODEL_BATCH
	)

	num_update_steps_per_epoch = len(train_dataloader)
	num_training_steps = NUM_TRAIN_EPOCH * num_update_steps_per_epoch

    ## network
	model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased").to(device)
	optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
	
	lr_scheduler = get_scheduler(
		"linear",
		optimizer=optimizer,
		num_warmup_steps=0,
		num_training_steps=num_training_steps,
	)

	## print info
	print(DATA_NAME)
	print(STRATEGY_NAME)
	
	## round 0 accuracy
	to_train(NUM_TRAIN_EPOCH, train_dataloader, device, model, optimizer, lr_scheduler)
	
	acc_scores_0 = get_pred(eval_dataloader, device, val_features, squad['validation']) # add rd=1 to use model from models_dir
	acc[0] = acc_scores_0['f1']
	acc_em[0] = acc_scores_0['exact_match']

	print('Round 0\ntesting accuracy {}'.format(acc[0]))
	print('testing accuracy em {}'.format(acc_em[0]))
	time = datetime.datetime.now()
	print('Time spent for init training:', (time - start))
	print('\n')
	
	## round 1 to rd
	for rd in range(1, NUM_ROUND+1):
		print('Round {} in Iteration {}'.format(rd, 5 - ITERATION))

		## use total_query (NUM_QUERY + extra) to query instead of just NUM_QUERY
		total_query = NUM_QUERY + extra
		
		## query
		if STRATEGY_NAME == 'RandomSampling':
			q_idxs = random_sampling_query(labeled_idxs, total_query)
		elif STRATEGY_NAME == 'LeastConfidence':
			q_idxs = least_confidence_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, total_query)
		else:
			raise NotImplementedError

		print('Time spent for querying:', (datetime.datetime.now() - time))
		time = datetime.datetime.now()

		## update
		 
		## goal of total query data: sum NUM_QUERY and the number of set run_0_data
		num_set_query_rd = NUM_QUERY * rd + NUM_INIT_LB
		
		difference_rd = 0
		num_set_ex_id_rd = 0

		while num_set_ex_id_rd != num_set_query_rd:        
			labeled_idxs[q_idxs[:NUM_QUERY + difference_rd]] = True
			run_rd_labeled_idxs = np.arange(n_pool)[labeled_idxs]

			run_rd_samples = train_features.select(indices=run_rd_labeled_idxs)
			num_set_ex_id_rd = len(set(run_rd_samples['example_id']))

			difference_rd = num_set_query_rd - num_set_ex_id_rd

		train_dataloader_rd = DataLoader(
			train_dataset.select(indices=run_rd_labeled_idxs),
			shuffle=True,
			collate_fn=default_data_collator,
			batch_size=MODEL_BATCH,
		)

		num_update_steps_per_epoch_rd = len(train_dataloader_rd)
		num_training_steps_rd = NUM_TRAIN_EPOCH * num_update_steps_per_epoch_rd

		model_rd = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)
		optimizer_rd = AdamW(model_rd.parameters(), lr=LEARNING_RATE)
		
		lr_scheduler_rd = get_scheduler(
			"linear",
			optimizer=optimizer_rd,
			num_warmup_steps=0,
			num_training_steps=num_training_steps_rd,
		)
		
		## train
		to_train(NUM_TRAIN_EPOCH, train_dataloader_rd, device, model_rd, optimizer_rd, lr_scheduler_rd)

		## round rd accuracy
		print('rd_{} get_pred!'.format(rd))
		acc_scores_rd = get_pred(eval_dataloader, device, val_features, squad['validation'])
		acc[rd] = acc_scores_rd['f1']
		acc_em[rd] = acc_scores_rd['exact_match']
		print('testing accuracy {}'.format(acc[rd]))
		print('testing accuracy em {}'.format(acc_em[rd]))
		print('Time spent for training after querying:', (datetime.datetime.now() - time))
		time = datetime.datetime.now()
		print('\n')

		torch.cuda.empty_cache()
	
	## print results
	print('SEED {}'.format(SEED))
	print(STRATEGY_NAME)
	print(acc)
	all_acc.append(acc)
	
	## save model and record acq time
	timestamp = re.sub('\.[0-9]*', '_', str(datetime.datetime.now())).replace(" ", "_").replace("-", "").replace(":","")
	final_model_dir = model_dir + '/' + timestamp + str(NUM_INIT_LB) + '_' + str(args_input_quota) + '_' + STRATEGY_NAME + '_' + MODEL_NAME + '_' + DATA_NAME
	os.makedirs(final_model_dir, exist_ok=True)
	end = datetime.datetime.now()
	print('Time spent in iteration {}: {}'.format(5 - ITERATION, datetime.datetime.now() - begin))
	acq_time.append(round(float((end-start).seconds), 3))

	final_model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)
	model_to_save = final_model.module if hasattr(final_model, 'module') else final_model 
	model_to_save.save_pretrained(final_model_dir)

# cal mean & standard deviation
print('Time spent in total:', (datetime.datetime.now() - begin))
acc_m = []
file_name_res = str(NUM_INIT_LB) + '_' + str(args_input_quota) + '_' + STRATEGY_NAME + '_' + MODEL_NAME + '_' + DATA_NAME + '_normal_res.txt'
file_res =  open(os.path.join(os.path.abspath('') + '/results', '%s' % file_name_res),'w')

# save result
for i in range(len(all_acc)):
	acc_m.append(get_aubc(args_input_quota, NUM_QUERY, all_acc[i]))
	print(str(i) + ': ' + str(acc_m[i]))
	file_res.writelines(str(i) + ': ' + str(acc_m[i]) + '\n')
mean_acc, stddev_acc = get_mean_stddev(acc_m)
mean_time, stddev_time = get_mean_stddev(acq_time)

print('mean AUBC(acc): ' + str(mean_acc) + '. std dev AUBC(acc): ' + str(stddev_acc))
print('mean time: ' + str(mean_time) + '. std dev time: ' + str(stddev_time))

avg_acc = np.mean(np.array(all_acc),axis=0)
for i in range(len(avg_acc)):
	tmp = 'Size of training set is ' + str(NUM_INIT_LB + i * NUM_QUERY) + ', ' + 'accuracy is ' + str(round(avg_acc[i],4)) + '.' + '\n'
	file_res.writelines(tmp)

file_res.writelines('mean acc: ' + str(mean_acc) + '. std dev acc: ' + str(stddev_acc) + '\n')
file_res.writelines('mean time: ' + str(mean_time) + '. std dev acc: ' + str(stddev_time) + '\n')

file_res.close()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

SQuAD
RandomSampling
Training was performed using the sum of 100 initial data and 30 query data, i.e. 100 data.


Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

TRAIN done!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

Round 0
testing accuracy 19.615624405461663
testing accuracy em 13.083333333333334
Time spent for init training: 0:01:28.881604


Round 1 in Iteration 1
Time spent for querying: 0:00:00.000888
Training was performed using the sum of 100 initial data and 30 query data, i.e. 130 data.


Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/17 [00:00<?, ?it/s]

TRAIN done!
rd_1 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 29.60985733247591
testing accuracy em 22.0
Time spent for training after querying: 0:01:32.936987


Round 2 in Iteration 1
Time spent for querying: 0:00:00.289659
Training was performed using the sum of 100 initial data and 30 query data, i.e. 160 data.


Training:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/20 [00:00<?, ?it/s]

TRAIN done!
rd_2 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 40.80897464294515
testing accuracy em 31.583333333333332
Time spent for training after querying: 0:01:42.923820


Round 3 in Iteration 1
Time spent for querying: 0:00:00.281211
Training was performed using the sum of 100 initial data and 30 query data, i.e. 190 data.


Training:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/24 [00:00<?, ?it/s]

TRAIN done!
rd_3 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 42.06004921442168
testing accuracy em 33.583333333333336
Time spent for training after querying: 0:01:49.175038


SEED 1127
RandomSampling
[19.61562441 29.60985733 40.80897464 42.06004921]
Time spent in iteration 1: 0:06:34.789439


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

SQuAD
RandomSampling
Training was performed using the sum of 100 initial data and 30 query data, i.e. 100 data.


Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

TRAIN done!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

Round 0
testing accuracy 11.921907115317625
testing accuracy em 7.166666666666667
Time spent for init training: 0:01:26.528355


Round 1 in Iteration 2
Time spent for querying: 0:00:00.001152
Training was performed using the sum of 100 initial data and 30 query data, i.e. 130 data.


Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/17 [00:00<?, ?it/s]

TRAIN done!
rd_1 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 15.724019813892113
testing accuracy em 10.75
Time spent for training after querying: 0:01:34.228541


Round 2 in Iteration 2
Time spent for querying: 0:00:00.325824
Training was performed using the sum of 100 initial data and 30 query data, i.e. 160 data.


Training:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/20 [00:00<?, ?it/s]

TRAIN done!
rd_2 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 20.846701033628175
testing accuracy em 15.5
Time spent for training after querying: 0:01:46.380528


Round 3 in Iteration 2
Time spent for querying: 0:00:00.288264
Training was performed using the sum of 100 initial data and 30 query data, i.e. 190 data.


Training:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/24 [00:00<?, ?it/s]

TRAIN done!
rd_3 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 35.29815255375614
testing accuracy em 27.666666666666668
Time spent for training after querying: 0:01:49.708837


SEED 1127
RandomSampling
[11.92190712 15.72401981 20.84670103 35.29815255]
Time spent in iteration 2: 0:13:18.843675


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

SQuAD
RandomSampling
Training was performed using the sum of 100 initial data and 30 query data, i.e. 100 data.


Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

TRAIN done!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

Round 0
testing accuracy 16.96558218089185
testing accuracy em 10.416666666666666
Time spent for init training: 0:01:26.225082


Round 1 in Iteration 3
Time spent for querying: 0:00:00.000804
Training was performed using the sum of 100 initial data and 30 query data, i.e. 130 data.


Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/17 [00:00<?, ?it/s]

TRAIN done!
rd_1 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 28.151801593119618
testing accuracy em 22.083333333333332
Time spent for training after querying: 0:01:36.513933


Round 2 in Iteration 3
Time spent for querying: 0:00:00.317919
Training was performed using the sum of 100 initial data and 30 query data, i.e. 160 data.


Training:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/20 [00:00<?, ?it/s]

TRAIN done!
rd_2 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 37.35432185348445
testing accuracy em 29.833333333333332
Time spent for training after querying: 0:01:50.367065


Round 3 in Iteration 3
Time spent for querying: 0:00:00.327477
Training was performed using the sum of 100 initial data and 30 query data, i.e. 190 data.


Training:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/24 [00:00<?, ?it/s]

TRAIN done!
rd_3 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 46.74545300741741
testing accuracy em 39.583333333333336
Time spent for training after querying: 0:01:48.434524


SEED 1127
RandomSampling
[16.96558218 28.15180159 37.35432185 46.74545301]
Time spent in iteration 3: 0:20:07.839963


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

SQuAD
RandomSampling
Training was performed using the sum of 100 initial data and 30 query data, i.e. 100 data.


Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

TRAIN done!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

Round 0
testing accuracy 11.59979995202721
testing accuracy em 7.833333333333333
Time spent for init training: 0:01:26.242925


Round 1 in Iteration 4
Time spent for querying: 0:00:00.000932
Training was performed using the sum of 100 initial data and 30 query data, i.e. 130 data.


Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/17 [00:00<?, ?it/s]

TRAIN done!
rd_1 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 17.19505919268186
testing accuracy em 12.666666666666666
Time spent for training after querying: 0:01:32.914909


Round 2 in Iteration 4
Time spent for querying: 0:00:00.313557
Training was performed using the sum of 100 initial data and 30 query data, i.e. 160 data.


Training:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/20 [00:00<?, ?it/s]

TRAIN done!
rd_2 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 28.208959186128315
testing accuracy em 21.416666666666668
Time spent for training after querying: 0:01:43.053982


Round 3 in Iteration 4
Time spent for querying: 0:00:00.280128
Training was performed using the sum of 100 initial data and 30 query data, i.e. 190 data.


Training:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/24 [00:00<?, ?it/s]

TRAIN done!
rd_3 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 31.673036499827024
testing accuracy em 23.916666666666668
Time spent for training after querying: 0:01:53.032492


SEED 1127
RandomSampling
[11.59979995 17.19505919 28.20895919 31.6730365 ]
Time spent in iteration 4: 0:26:50.285096


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

SQuAD
RandomSampling
Training was performed using the sum of 100 initial data and 30 query data, i.e. 100 data.


Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

Training:   0%|          | 0/13 [00:00<?, ?it/s]

TRAIN done!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

Round 0
testing accuracy 22.698700551223624
testing accuracy em 15.5
Time spent for init training: 0:01:29.311562


Round 1 in Iteration 5
Time spent for querying: 0:00:00.000872
Training was performed using the sum of 100 initial data and 30 query data, i.e. 130 data.


Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/17 [00:00<?, ?it/s]

TRAIN done!
rd_1 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 32.21582186841597
testing accuracy em 22.666666666666668
Time spent for training after querying: 0:01:35.666288


Round 2 in Iteration 5
Time spent for querying: 0:00:00.302900
Training was performed using the sum of 100 initial data and 30 query data, i.e. 160 data.


Training:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/20 [00:00<?, ?it/s]

TRAIN done!
rd_2 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 36.851237826586065
testing accuracy em 28.916666666666668
Time spent for training after querying: 0:01:42.545570


Round 3 in Iteration 5
Time spent for querying: 0:00:00.274390
Training was performed using the sum of 100 initial data and 30 query data, i.e. 190 data.


Training:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/24 [00:00<?, ?it/s]

TRAIN done!
rd_3 get_pred!


Evaluating_pred:   0%|          | 0/150 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/1200 [00:00<?, ?it/s]

testing accuracy 35.88593562640517
testing accuracy em 27.833333333333332
Time spent for training after querying: 0:01:54.154897


SEED 1127
RandomSampling
[22.69870055 32.21582187 36.85123783 35.88593563]
Time spent in iteration 5: 0:33:38.954103
Time spent in total: 0:33:44.804133
0: 33.752
1: 20.06
2: 32.454
3: 22.347
4: 32.786
mean AUBC(acc): 28.2798. std dev AUBC(acc): 5.8385
mean time: 398.2. std dev time: 3.2496
