# Question Answering Model 
## no trainer

- dataset
- torch
- transformers
- transformers[torch]
- evaluate

import packages

In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DefaultDataCollator,
    default_data_collator,
    get_scheduler,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer
)
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch

import evaluate
import collections
from tqdm.auto import tqdm
import numpy as np

import os
import re
import datetime

  from .autonotebook import tqdm as notebook_tqdm


Set cache directory.

In [2]:
model_dir = '/mount/arbeitsdaten31/studenten1/linku/models'
CACHE_DIR='/mount/arbeitsdaten31/studenten1/linku/cache'
%set_env TRANSFORMERS_CACHE $CACHE_DIR
%set_env HF_MODULES_CACHE $CACHE_DIR
%set_env HF_DATASETS_CACHE $CACHE_DIR

env: TRANSFORMERS_CACHE=/mount/arbeitsdaten31/studenten1/linku/cache
env: HF_MODULES_CACHE=/mount/arbeitsdaten31/studenten1/linku/cache
env: HF_DATASETS_CACHE=/mount/arbeitsdaten31/studenten1/linku/cache


### arguments.py

args_input.

In [3]:
args_input_ALstrategy = 'MarginSampling'
args_input_initseed = 100 # 1000
args_input_quota = 100 # 1000
args_input_batch = 35 # 128
args_input_dataset_name = 'SQuAD'
args_input_iteration = 1
args_input_model_batch = 8 # already add in arguments.py

### load dataset

In [4]:
squad = load_dataset(args_input_dataset_name.lower())
# squad["train"] = squad["train"].shuffle(42).select(range(2000))
squad["train"] = squad["train"].select(range(3000))
squad["validation"] = squad["validation"].select(range(1000))

Found cached dataset squad (/home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 117.08it/s]


Next we will preprocess the dataset (training and evaluation data).

In [5]:
def preprocess_training_features(examples):
    # keep ["offset_mapping"], for compute_metrics()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs["offset_mapping"]
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    example_ids = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        
        example_ids.append(examples["id"][sample_idx]) # newly added for used in unlabel data predict

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["example_id"] = example_ids
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
def preprocess_training_examples(examples):
    # no ['offset_mapping'], for .train() and .eval()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    example_ids = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        
        example_ids.append(examples["id"][sample_idx]) # newly added for used in unlabel data predict

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [8]:
max_length = 384
stride = 128

# load tokenizer for dataset preprocessing
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# preprocess data
train_dataset = squad["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=squad["train"].column_names,
)
train_features = squad["train"].map(
    preprocess_training_features,
    batched=True,
    remove_columns=squad["train"].column_names,
)
val_dataset = squad["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
)
val_features = squad["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
)

Loading cached processed dataset at /home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-d10ddabde563b597.arrow
Loading cached processed dataset at /home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-773e51dc4558cf67.arrow
Loading cached processed dataset at /home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-0f087a92e258c27b.arrow
Loading cached processed dataset at /home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-0f087a92e258c27b.arrow


In [9]:
train_dataset.set_format("torch")
train_features.set_format("torch")
val_dataset = val_dataset.remove_columns(["offset_mapping"])
val_dataset.set_format("torch")
val_features.set_format("torch")

## Evaluation

In [10]:
metric = evaluate.load("squad")

In [11]:
def compute_metrics(start_logits, end_logits, features, examples):
    
    example_to_features = collections.defaultdict(list)
    max_answer_length = 30
    n_best = 20
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [12]:
def get_pred(mod, dataloader, dev, feat, exa):
    mod.eval()
    start_logits = []
    end_logits = []
    # accelerator.print("Evaluation!")
    for batch in tqdm(dataloader):
        batch = {key: value.to(dev) for key, value in batch.items()}
        with torch.no_grad():
            outputs = mod(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(feat)]
    end_logits = end_logits[: len(feat)]

    return compute_metrics(start_logits, end_logits, feat, exa)

In [13]:
def get_prob(model, eval_dataloader, device, features, examples):
    model.eval()
    start_logits = []
    end_logits = []
    # accelerator.print("Evaluation!")
    for batch in tqdm(eval_dataloader):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(features)]
    end_logits = end_logits[: len(features)]

    prob_dict = {}
    example_to_features = collections.defaultdict(list)
    max_answer_length = 30
    n_best = 20 # TODO: if set n_best as 5, will it effect the time??
    
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    for example in tqdm(examples):
        example_id = example["id"]
        # context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append(start_logit[start_index] + end_logit[end_index])
        
        if len(answers) > 1:
            if example_to_features[example_id][0] not in prob_dict:
                prob_dict[example_to_features[example_id][0]] = softmax(answers)
            else:
                prob_dict[example_to_features[example_id][0]] += softmax(answers)
        elif example_to_features[example_id] != []:
            if example_to_features[example_id][0] not in prob_dict:
                prob_dict[example_to_features[example_id][0]] = np.array([0])
    
    return prob_dict
# move to evaluation.py

In [14]:
def get_prob_dropout(model, eval_dataloader, device, features, examples, n_drop=10):
    model.train()
    
    prob_dict = {}
    for_check = []
    
    for i in range(n_drop):
        
        start_logits = []
        end_logits = []
        # accelerator.print("Evaluation!")
        for batch in tqdm(eval_dataloader):
            batch = {key: value.to(device) for key, value in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            start_logits.append(outputs.start_logits.cpu().numpy())
            end_logits.append(outputs.end_logits.cpu().numpy())

        start_logits = np.concatenate(start_logits)
        end_logits = np.concatenate(end_logits)
        start_logits = start_logits[: len(features)]
        end_logits = end_logits[: len(features)]

        example_to_features = collections.defaultdict(list)
        max_answer_length = 30
        n_best = 20
            
        for idx, feature in enumerate(features):
            example_to_features[feature["example_id"]].append(idx)

        n = 0
        for example in tqdm(examples):
            example_id = example["id"]
            # context = example["context"]
            answers = []

            # Loop through all features associated with that example
            for feature_index in example_to_features[example_id]:
                start_logit = start_logits[feature_index]
                end_logit = end_logits[feature_index]
                offsets = features[feature_index]["offset_mapping"]

                start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
                end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        # Skip answers that are not fully in the context
                        if offsets[start_index] is None or offsets[end_index] is None:
                            continue
                        # Skip answers with a length that is either < 0 or > max_answer_length
                        if (
                            end_index < start_index
                            or end_index - start_index + 1 > max_answer_length
                        ):
                            continue

                        answers.append(start_logit[start_index] + end_logit[end_index])

            
            if 1 < len(answers) < 150:
                zero_list = [0] * (150 - len(answers))
                answers.extend(zero_list)
            else:
                answers[:150]

            if len(answers) > 1:
                if example_to_features[example_id][0] not in prob_dict:
                    prob_dict[example_to_features[example_id][0]] = softmax(answers)
                else:
                    prob_dict[example_to_features[example_id][0]] += softmax(answers)
            elif example_to_features[example_id] != []:
                if example_to_features[example_id][0] not in prob_dict:
                    prob_dict[example_to_features[example_id][0]] = np.array([0])
            # if n == 0 and len(softmax(answers)) > 1:
            #     for_check.append(answers[:5])
            #     n += 1      

    for key in prob_dict.keys():
        prob_dict[key] /= n_drop
    # return prob_dict, for_check
    return prob_dict
# move to evaluation.py

## utils.py

In [15]:
def get_unlabel_data(n_pool, labeled_idxs, train_dataset):
    unlabeled_idxs = np.arange(n_pool)[~labeled_idxs]
    unlabeled_data = train_dataset.select(indices=unlabeled_idxs)
    return unlabeled_idxs, unlabeled_data

# move to utils.py

In [16]:
def softmax(x): 
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)
# move to utils.py

## Query

In [17]:
def margin_sampling_query(n_pool, labeled_idxs, train_dataset, train_features, examples, model, device, n):
    # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    
    unlabeled_dataloader = DataLoader(
		unlabeled_data,
		shuffle=True,
		collate_fn=default_data_collator,
		batch_size=8,
	)
    print('GET PROB!!')
    print('num_dataset:', len(unlabeled_dataloader.dataset))
    # deepAL+: probs = self.predict_prob(unlabeled_data)
    prob_dict = get_prob(model, unlabeled_dataloader, device, unlabeled_features, examples)
    
    # deepAL+: probs_sorted, _ = probs.sort(descending=True)
    # deepAL+: uncertainties = probs_sorted[:, 0] - probs_sorted[:,1]
    uncertainties_dict = {}
    for idx, probs in prob_dict.items():
        if len(probs) > 1: # if prob_dict['probs'] is not 0
            sort_probs = np.sort(probs)[::-1] # This method returns a copy of the array, leaving the original array unchanged.
            uncertainties = sort_probs[0] - sort_probs[1]
            uncertainties_dict[idx] = uncertainties
        elif idx:
            uncertainties_dict[idx] = np.array([0])

    # deepAL+: return unlabeled_idxs[uncertainties.sort()[1][:n]] 
    sorted_uncertainties_list = sorted(uncertainties_dict.items(), key=lambda x: x[1], reverse=True)
    
    return unlabeled_idxs[[idx for (idx, uncertainties) in sorted_uncertainties_list[:n]]]

# main.py

### parameters

In [18]:
NUM_QUERY = args_input_batch
NUM_INIT_LB = args_input_initseed
NUM_ROUND = int(args_input_quota / args_input_batch)
DATA_NAME = args_input_dataset_name
STRATEGY_NAME = args_input_ALstrategy

### seed and device

In [19]:
SEED = 4666
# os.environ['TORCH_HOME']='./basicmodel'
os.environ["CUDA_VISIBLE_DEVICES"] = str(2)

# fix random seed
np.random.seed(SEED)
torch.manual_seed(SEED)
# torch.backends.cudnn.enabled  = True
# torch.backends.cudnn.benchmark= True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### start experiment

In [20]:
iteration = args_input_iteration
model_batch = args_input_model_batch
num_train_epochs = 3

all_acc = []
acq_time = []

# Change "fp16_training" to True to support automatic mixed precision training (fp16)	
fp16_training = False

if fp16_training:
    !pip install accelerate==0.2.0
    from accelerate import Accelerator
    accelerator = Accelerator(fp16=True)
    device = accelerator.device

In [21]:
def model_train(num_training_steps, num_train_epochs, dataloader, dev, mod, opt, lr_sche):
	print('TRAIN!!')
	print('num_dataset:', len(dataloader.dataset))
	progress_bar = tqdm(range(num_training_steps))
	for epoch in range(num_train_epochs):
		# Training
		mod.train()
		for step, batch in enumerate(dataloader):
			batch = {key: value.to(dev) for key, value in batch.items()}
			outputs = mod(**batch)
			loss = outputs.loss
			loss.backward()

			opt.step()
			lr_sche.step()
			opt.zero_grad()
			progress_bar.update(1)

In [22]:
# model_0 = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased").to(device)
# optimizer_0 = AdamW(model_0.parameters(), lr=1e-4)
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased").to(device)
optimizer = AdamW(model.parameters(), lr=1e-4)

start = datetime.datetime.now()

## generate initial labeled pool
n_pool = len(train_dataset)
labeled_idxs = np.zeros(n_pool, dtype=bool)

tmp_idxs = np.arange(n_pool)
np.random.shuffle(tmp_idxs)
labeled_idxs[tmp_idxs[:args_input_initseed]] = True

run_0_labeled_idxs = np.arange(n_pool)[labeled_idxs] 

## record acc performance 
acc = np.zeros(NUM_ROUND + 1) # build 3 runs + run_0 # origin 10 runs + run_0
acc_em = np.zeros(NUM_ROUND + 1)

## load the selected train data to DataLoader
train_dataloader_0 = DataLoader(
    train_dataset.select(indices=run_0_labeled_idxs),
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)

eval_dataloader = DataLoader(
    val_dataset, 
    collate_fn=default_data_collator, 
    batch_size=8
)

num_update_steps_per_epoch = len(train_dataloader_0)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# lr_scheduler_0 = get_scheduler(
#     "linear",
#     optimizer=optimizer_0,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )


## print info
print(DATA_NAME)
# print('RANDOM SEED {}'.format(SEED))
print(STRATEGY_NAME) # print(type(strategy).__name__)

## round 0 accuracy 
# model_train(num_training_steps, num_train_epochs, train_dataloader_0, device, model_0, optimizer_0, lr_scheduler_0)
model_train(num_training_steps, num_train_epochs, train_dataloader_0, device, model, optimizer, lr_scheduler)

acc[0] = get_pred(model, eval_dataloader, device, val_features, squad['validation'])['f1']
acc_em[0] = get_pred(model, eval_dataloader, device, val_features, squad['validation'])['exact_match']
print(acc, acc_em)
# acc[0] = get_pred(model_0, eval_dataloader, device, val_features, squad['validation'])['f1']

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

SQuAD
MarginSampling
TRAIN!!
num_dataset: 100


100%|██████████| 39/39 [00:19<00:00,  1.99it/s]
100%|██████████| 128/128 [00:24<00:00,  5.28it/s]
100%|██████████| 1000/1000 [00:09<00:00, 103.60it/s]
100%|██████████| 128/128 [00:24<00:00,  5.29it/s]
100%|██████████| 1000/1000 [00:09<00:00, 101.98it/s]


[16.34348008  0.          0.        ] [9.6 0.  0. ]


In [26]:
run_0_labeled_idxs

array([  25,  127,  187,  211,  222,  238,  239,  245,  273,  313,  331,
        349,  351,  368,  416,  441,  570,  576,  616,  629,  645,  689,
        692,  739,  750,  777,  778,  837,  850,  905,  947,  950,  951,
        981, 1069, 1073, 1081, 1098, 1113, 1132, 1182, 1190, 1226, 1233,
       1246, 1293, 1371, 1395, 1436, 1438, 1445, 1477, 1495, 1521, 1557,
       1595, 1600, 1640, 1798, 1810, 1836, 1840, 1860, 1882, 1915, 1973,
       1980, 1996, 2007, 2082, 2138, 2185, 2247, 2272, 2297, 2306, 2319,
       2323, 2374, 2398, 2423, 2430, 2475, 2506, 2527, 2533, 2555, 2580,
       2633, 2688, 2714, 2747, 2792, 2828, 2836, 2887, 2947, 2978, 2985,
       2990])

In [27]:
rd = 1
# model_rd = AutoModelForQuestionAnswering.from_pretrained(model_saved_dir_rd).to(device)
# optimizer_rd = AdamW(model_rd.parameters(), lr=1e-4)

# lr_scheduler_rd = get_scheduler(
#     "linear",
#     optimizer=optimizer_rd,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )

## query
# q_idxs = margin_sampling_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], model_rd, device, NUM_QUERY)
q_idxs = margin_sampling_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], model, device, NUM_QUERY)

labeled_idxs[q_idxs] = True
run_rd_labeled_idxs = np.arange(n_pool)[labeled_idxs]

## load the selected train data to DataLoader
train_dataloader_rd = DataLoader(
    train_dataset.select(indices=run_rd_labeled_idxs),
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)

num_update_steps_per_epoch = len(train_dataloader_rd)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

print('train_dataloader_rd len:', len(train_dataloader_rd.dataset))
## train
# model_train(num_training_steps, num_train_epochs, train_dataloader_rd, device, model_rd, optimizer_rd, lr_scheduler_rd)
print('TRAIN!!')
print('num_dataset:', len(train_dataloader_rd.dataset))
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader_rd):
        batch = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

## round rd accuracy
# acc[rd] = get_pred(model_rd, eval_dataloader, device, val_features, squad['validation'])['f1']
# acc[rd] = get_pred(model, eval_dataloader, device, val_features, squad['validation'])['f1']
# acc_em[rd] = get_pred(model, eval_dataloader, device, val_features, squad['validation'])['exact_match']
model.eval()
start_logits = []
end_logits = []
# accelerator.print("Evaluation!")
for batch in tqdm(eval_dataloader):
    batch = {key: value.to(device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    start_logits.append(outputs.start_logits.cpu().numpy())
    end_logits.append(outputs.end_logits.cpu().numpy())

start_logits = np.concatenate(start_logits)
end_logits = np.concatenate(end_logits)
start_logits = start_logits[: len(val_features)]
end_logits = end_logits[: len(val_features)]

acc[rd] = compute_metrics(start_logits, end_logits, val_features, squad['validation'])['f1']
acc_em[rd] = compute_metrics(start_logits, end_logits, val_features, squad['validation'])['exact_match']
print(acc, acc_em)
print('testing accuracy {}'.format(acc[rd]))
print('\n')

torch.cuda.empty_cache()

GET PROB!!
num_dataset: 2939


  0%|          | 0/368 [00:00<?, ?it/s]

100%|██████████| 368/368 [01:09<00:00,  5.26it/s]
100%|██████████| 3000/3000 [00:56<00:00, 52.90it/s]


train_dataloader_rd len: 170
TRAIN!!
num_dataset: 170


100%|██████████| 128/128 [00:24<00:00,  5.19it/s]
100%|██████████| 1000/1000 [00:10<00:00, 99.52it/s]
100%|██████████| 1000/1000 [00:10<00:00, 95.25it/s]


[16.34348008 16.34348008  0.        ] [9.6 9.6 0. ]
testing accuracy 16.34348007927272




In [25]:
run_rd_labeled_idxs

array([  25,   40,   57,  127,  135,  187,  211,  222,  238,  239,  245,
        273,  313,  331,  349,  351,  368,  416,  423,  441,  570,  576,
        616,  629,  645,  689,  692,  739,  740,  750,  777,  778,  837,
        850,  905,  924,  947,  950,  951,  958,  981,  996, 1069, 1073,
       1081, 1098, 1113, 1132, 1182, 1190, 1214, 1226, 1233, 1246, 1293,
       1329, 1360, 1371, 1395, 1409, 1436, 1438, 1445, 1460, 1477, 1495,
       1499, 1521, 1550, 1557, 1595, 1599, 1600, 1612, 1629, 1640, 1652,
       1798, 1810, 1830, 1836, 1840, 1860, 1882, 1915, 1960, 1973, 1980,
       1996, 2007, 2082, 2111, 2129, 2138, 2185, 2247, 2272, 2297, 2306,
       2319, 2323, 2374, 2380, 2391, 2398, 2423, 2430, 2475, 2506, 2527,
       2533, 2555, 2558, 2560, 2580, 2609, 2615, 2633, 2688, 2714, 2747,
       2792, 2821, 2822, 2828, 2836, 2887, 2903, 2941, 2947, 2971, 2978,
       2985, 2990, 3020])

In [24]:
rd = 2
# model_rd = AutoModelForQuestionAnswering.from_pretrained(model_saved_dir_rd).to(device)
# optimizer_rd = AdamW(model_rd.parameters(), lr=1e-4)

# lr_scheduler_rd = get_scheduler(
#     "linear",
#     optimizer=optimizer_rd,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )

## query
# q_idxs = margin_sampling_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], model_rd, device, NUM_QUERY)
q_idxs = margin_sampling_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], model, device, NUM_QUERY)

labeled_idxs[q_idxs] = True
run_rd_labeled_idxs = np.arange(n_pool)[labeled_idxs]

## load the selected train data to DataLoader
train_dataloader_rd = DataLoader(
    train_dataset.select(indices=run_rd_labeled_idxs),
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)

num_update_steps_per_epoch = len(train_dataloader_rd)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

print('train_dataloader_rd len:', len(train_dataloader_rd.dataset))
## train
# model_train(num_training_steps, num_train_epochs, train_dataloader_rd, device, model_rd, optimizer_rd, lr_scheduler_rd)
model_train(num_training_steps, num_train_epochs, train_dataloader_rd, device, model, optimizer, lr_scheduler)

## round rd accuracy
# acc[rd] = get_pred(model_rd, eval_dataloader, device, val_features, squad['validation'])['f1']
acc[rd] = get_pred(model, eval_dataloader, device, val_features, squad['validation'])['f1']
acc_em[rd] = get_pred(model, eval_dataloader, device, val_features, squad['validation'])['exact_match']
print(acc, acc_em)
print('testing accuracy {}'.format(acc[rd]))
print('\n')

torch.cuda.empty_cache()

GET PROB!!
num_dataset: 2939


 21%|██        | 78/368 [00:15<00:56,  5.15it/s]


KeyboardInterrupt: 

## query workspace

In [None]:
# get unlable data
unlabeled_idxs = np.arange(n_pool)[~labeled_idxs]
unlabeled_data = train_dataset.select(indices=unlabeled_idxs)
len(unlabeled_idxs)

### test: query 5 data from 20 unlabeled_data

In [None]:
# smaller data
unlabeled_idxs_20 = unlabeled_idxs[20:40]
unlabeled_data_20 = train_dataset.select(unlabeled_idxs_20)
unlabeled_feature_20 = train_features.select(unlabeled_idxs_20)
len(unlabeled_data_20)

In [None]:
unlabeled_idxs_20

In [None]:
unlabeled_dataloader = DataLoader(
		unlabeled_data_20,
		shuffle=True,
		collate_fn=default_data_collator,
		batch_size=8,
	)
len(unlabeled_dataloader.dataset)

In [None]:
probs_list_dict_20 = get_prob(model, unlabeled_dataloader, device, unlabeled_feature_20, squad['train'])
# len(probs_list_dict_20)

In [None]:
probs_list_dict_20

In [None]:
uncertainties_dict = {}
for idx, probs in probs_list_dict_20.items():
    if len(probs) > 1: # if prob_dict['probs'] is not 0
        sort_probs = np.sort(probs)[::-1] # This method returns a copy of the array, leaving the original array unchanged.
        uncertainties = sort_probs[0] - sort_probs[1]
        uncertainties_dict[idx] = uncertainties
    elif idx:
        uncertainties_dict[idx] = np.array([0])
print('sort_probs:\n', sort_probs)
print('uncertainties_dict:\n', uncertainties_dict)
# deepAL+: return unlabeled_idxs[uncertainties.sort()[1][:n]] 
sorted_uncertainties_list = sorted(uncertainties_dict.items(), key=lambda x: x[1], reverse=True)
    
unlabeled_idxs[[idx for (idx, uncertainties) in sorted_uncertainties_list[:5]]]

In [None]:
probs_list_dict_20_dropout = get_prob_dropout(model, unlabeled_dataloader, device, unlabeled_data_20, squad['train'])

In [None]:
for_check
# the prediction are the same

In [None]:
probs_list_dict_20_dropout

In [None]:
probs_list_dict_20

In [None]:
confidence_list_dict = []
for d in probs_list_dict_20:
    if len(d['probs']) > 1: # if prob_dict['probs'] is not 0
        confidence = max(d['probs'])
        confidence_list_dict.append(
            {'idx': d['idx'], 
                'confidence': confidence}
                )
    elif d['idx']:
        confidence_list_dict.append(
            {'idx': d['idx'], 
                'confidence': np.array([0])}
                )
# deepAL+: return unlabeled_idxs[uncertainties.sort()[1][:n]]
sorted_confidence_dict = sorted(confidence_list_dict, key=lambda d: d['confidence'])   
unlabeled_idxs_20[[confidence_dict['idx'][0] for confidence_dict in sorted_confidence_dict[:5]]]

In [None]:
q_idxs_20 = get_entropy(probs_list_dict_20, 5)
q_idxs_20

In [None]:
preds, _, _ = trainer_qs.predict(unlabeled_data_20)

In [None]:
start_logits, end_logits = preds
start_logits

In [None]:
# predict with unlable data
preds, _, _ = trainer_qs.predict(unlabeled_data_20)

In [None]:
start_logits, end_logits = preds
start_logits

In [None]:
q_idxs = get_entropy(probs_list_dict, NUM_QUERY)
q_idxs