# Question Answering Model 
## no trainer

- dataset
- torch
- transformers
- transformers[torch]
- evaluate

import packages

In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    default_data_collator,
    get_scheduler,
    AutoModelForQuestionAnswering,
    BertConfig
)
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch

import evaluate
import collections
from tqdm.auto import tqdm
import numpy as np
from sklearn.cluster import KMeans

import os
import re
import datetime

Set cache directory.

In [2]:
CACHE_DIR='/mount/arbeitsdaten31/studenten1/linku/cache'
%set_env TRANSFORMERS_CACHE $CACHE_DIR
%set_env HF_MODULES_CACHE $CACHE_DIR
%set_env HF_DATASETS_CACHE $CACHE_DIR

env: TRANSFORMERS_CACHE=/mount/arbeitsdaten31/studenten1/linku/cache
env: HF_MODULES_CACHE=/mount/arbeitsdaten31/studenten1/linku/cache
env: HF_DATASETS_CACHE=/mount/arbeitsdaten31/studenten1/linku/cache


### arguments.py

args_input.

In [3]:
args_input_ALstrategy = 'KMeansSampling'
args_input_initseed = 100 # 1000
args_input_quota = 100 # 1000
args_input_batch = 35 # 128
args_input_dataset_name = 'SQuAD'
args_input_iteration = 1
args_input_model_batch = 8 # already add in arguments.py
args_input_max_length = 384

stride = 128

in main.py

In [4]:
MAX_LENGTH = args_input_max_length
NUM_QUERY = args_input_batch
NUM_INIT_LB = args_input_initseed
NUM_ROUND = int(args_input_quota / args_input_batch)
DATA_NAME = args_input_dataset_name
STRATEGY_NAME = args_input_ALstrategy

In [5]:
model_dir = '/mount/arbeitsdaten31/studenten1/linku/models'
pretrain_model_dir = model_dir + '/' + 'SQuAD_100_Bert'

### load dataset

In [6]:
squad = load_dataset(args_input_dataset_name.lower())
# squad["train"] = squad["train"].shuffle(42).select(range(2000))
squad["train"] = squad["train"].select(range(4000))
squad["validation"] = squad["validation"].select(range(1500))

Found cached dataset squad (/home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

Next we will preprocess the dataset (training and evaluation data).

In [7]:
def preprocess_training_features(examples):
    # keep ["offset_mapping"], for compute_metrics()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs["offset_mapping"]
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    example_ids = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        
        example_ids.append(examples["id"][sample_idx]) # newly added for used in unlabel data predict

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["example_id"] = example_ids
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [8]:
def preprocess_training_examples(examples):
    # no ['offset_mapping'], for .train() and .eval()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    example_ids = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        
        example_ids.append(examples["id"][sample_idx]) # newly added for used in unlabel data predict

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [9]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [10]:
# load tokenizer for dataset preprocessing
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# preprocess data
train_dataset = squad["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=squad["train"].column_names,
)
train_features = squad["train"].map(
    preprocess_training_features,
    batched=True,
    remove_columns=squad["train"].column_names,
)
val_dataset = squad["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
)
val_features = squad["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-9456706e0dd13065.arrow


In [11]:
train_dataset.set_format("torch")
train_features.set_format("torch")
val_dataset = val_dataset.remove_columns(["offset_mapping"])
val_dataset.set_format("torch")
val_features.set_format("torch")

## model.py

In [12]:
def to_train(num_train_epochs, train_dataloader, device, model, optimizer, lr_scheduler, record_loss=False):
	print('Num of train dataset:', len(train_dataloader.dataset))
	for epoch in range(num_train_epochs):
		model.train()
		for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
			batch = {key: value.to(device) for key, value in batch.items()}
			outputs = model(**batch)
			loss = outputs.loss
			loss.backward()

			optimizer.step()
			lr_scheduler.step()
			optimizer.zero_grad()

		if record_loss:
			print('Train Epoch: {}\tLoss: {:.6f}'.format(epoch, loss.item()))

	model_to_save = model.module if hasattr(model, 'module') else model 
	model_to_save.save_pretrained(model_dir)
	print('TRAIN done!')

In [13]:
metric = evaluate.load("squad")

In [14]:
def compute_metrics(start_logits, end_logits, features, examples):
    
    example_to_features = collections.defaultdict(list)
    max_answer_length = 30
    n_best = 20
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples, desc="Computing metrics"):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [15]:
def get_pred(eval_dataloader, device, features, examples, record_loss=False, rd_0=False):
    if rd_0:
        config = BertConfig.from_pretrained(pretrain_model_dir, output_hidden_states=True)
    else:
        config = BertConfig.from_pretrained(model_dir, output_hidden_states=True)
    model = AutoModelForQuestionAnswering.from_config(config).to(device)
    
    test_loss = []
    model.eval()
    start_logits = []
    end_logits = []
    for batch in tqdm(eval_dataloader, desc="Evaluating_pred"):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            print(outputs)
            print(outputs.loss)
            test_loss.append(outputs.loss)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(features)]
    end_logits = end_logits[: len(features)]

    if record_loss:
        test_loss /= len(eval_dataloader.dataset)
    print('\nTest set: Average loss: {:.4f}\n'.format(test_loss))

    return compute_metrics(start_logits, end_logits, features, examples)

In [16]:
def get_prob(eval_dataloader, device, features, examples):
    model = AutoModelForQuestionAnswering.from_pretrained(model_dir).to(device)

    model.eval()
    start_logits = []
    end_logits = []

    for batch in tqdm(eval_dataloader, desc="Evaluating_prob"):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(features)]
    end_logits = end_logits[: len(features)]

    prob_dict = {}
    example_to_features = collections.defaultdict(list)
    max_answer_length = 30
    n_best = 20 # TODO: if set n_best as 5, will it effect the time??
    
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    for example in tqdm(examples, desc="Computing metrics"):
        example_id = example["id"]
        # context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append(start_logit[start_index] + end_logit[end_index])
        
            if len(answers) > 1:
                prob_dict[feature_index] = softmax(answers)
            elif example_to_features[example_id] != []:
                prob_dict[feature_index] = np.array([0])
    
    return prob_dict

In [17]:
def get_prob_dropout(eval_dataloader, device, features, examples, n_drop=10):
    # deepAL+: self.clf.train()
    model = AutoModelForQuestionAnswering.from_pretrained(model_dir).to(device)
    model.train()
    # deepAL+: probs = torch.zeros([len(data), len(np.unique(data.Y))])
    # deepAL+: loader = DataLoader(data, shuffle=False, **self.params['loader_te_args'])
    prob_dict = {}
    # deepAL+: for i in range(n_drop):
    # deepAL+:     with torch.no_grad():
    # deepAL+:         for x, y, idxs in loader:
    # deepAL+:             x, y = x.to(self.device), y.to(self.device)
    # deepAL+:             out, e1 = self.clf(x)
    # deepAL+:             prob = F.softmax(out, dim=1)
    # deepAL+:             probs[idxs] += prob.cpu()
    for i in range(n_drop):
        start_logits = []
        end_logits = []
        for batch in tqdm(eval_dataloader, desc="Evaluating_prob_dropout"):
            batch = {key: value.to(device) for key, value in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            start_logits.append(outputs.start_logits.cpu().numpy())
            end_logits.append(outputs.end_logits.cpu().numpy())

        start_logits = np.concatenate(start_logits)
        end_logits = np.concatenate(end_logits)
        start_logits = start_logits[: len(features)]
        end_logits = end_logits[: len(features)]

        example_to_features = collections.defaultdict(list)
        max_answer_length = 30
        n_best = 20
            
        for idx, feature in enumerate(features):
            example_to_features[feature["example_id"]].append(idx)

        n = 0
        for example in tqdm(examples):
            example_id = example["id"]
            answers = []

            # Loop through all features associated with that example
            for feature_index in example_to_features[example_id]:
                start_logit = start_logits[feature_index]
                end_logit = end_logits[feature_index]
                offsets = features[feature_index]["offset_mapping"]

                start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
                end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        # Skip answers that are not fully in the context
                        if offsets[start_index] is None or offsets[end_index] is None:
                            continue
                        # Skip answers with a length that is either < 0 or > max_answer_length
                        if (
                            end_index < start_index
                            or end_index - start_index + 1 > max_answer_length
                        ):
                            continue

                        answers.append(start_logit[start_index] + end_logit[end_index])

            if 1 < len(answers) < 200: # pad to same numbers of possible answers
                zero_list = [0] * (200 - len(answers))
                answers.extend(zero_list)
            elif len(answers) >= 200:
                answers = answers[:200]

            if len(answers) > 1:
                if feature_index not in prob_dict:
                    prob_dict[feature_index] = softmax(answers)
                else:
                    prob_dict[feature_index] += softmax(answers)
            elif example_to_features[example_id] != []:
                if feature_index not in prob_dict:
                    prob_dict[feature_index] = np.array([0])   

    for key in prob_dict.keys():
        prob_dict[key] /= n_drop

    return prob_dict

In [18]:
def get_prob_dropout_split(eval_dataloader, device, features, examples, n_drop=10):
    ## use tensor to save the answers
    
    # deepAL+: self.clf.train()
    model = AutoModelForQuestionAnswering.from_pretrained(model_dir).to(device)
    model.train()
    # deepAL+: probs = torch.zeros([len(data), len(np.unique(data.Y))])
    # deepAL+: loader = DataLoader(data, shuffle=False, **self.params['loader_te_args'])
    probs = torch.zeros([n_drop, len(eval_dataloader.dataset), 200])
    for_check = []
    # deepAL+: for i in range(n_drop):
    # deepAL+:     with torch.no_grad():
    # deepAL+:         for x, y, idxs in loader:
    # deepAL+:             x, y = x.to(self.device), y.to(self.device)
    # deepAL+:             out, e1 = self.clf(x)
    # deepAL+:             prob = F.softmax(out, dim=1)
    # deepAL+:             probs[i][idxs] += F.softmax(out, dim=1).cpu()
    for i in range(n_drop):
        prob_dict = {}
        start_logits = []
        end_logits = []
        for batch in tqdm(eval_dataloader, desc="Evaluating_prob_dropout"):
            batch = {key: value.to(device) for key, value in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            start_logits.append(outputs.start_logits.cpu().numpy())
            end_logits.append(outputs.end_logits.cpu().numpy())

        start_logits = np.concatenate(start_logits)
        end_logits = np.concatenate(end_logits)
        start_logits = start_logits[: len(features)]
        end_logits = end_logits[: len(features)]

        example_to_features = collections.defaultdict(list)
        max_answer_length = 30
        n_best = 20
            
        for idx, feature in enumerate(features):
            example_to_features[feature["example_id"]].append(idx)

        n = 0
        for example in tqdm(examples, desc="Computing metrics"):
            example_id = example["id"]
            # context = example["context"]
            answers = []

            # Loop through all features associated with that example
            for feature_index in example_to_features[example_id]:
                start_logit = start_logits[feature_index]
                end_logit = end_logits[feature_index]
                offsets = features[feature_index]["offset_mapping"]

                start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
                end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        # Skip answers that are not fully in the context
                        if offsets[start_index] is None or offsets[end_index] is None:
                            continue
                        # Skip answers with a length that is either < 0 or > max_answer_length
                        if (
                            end_index < start_index
                            or end_index - start_index + 1 > max_answer_length
                        ):
                            continue

                        answers.append(start_logit[start_index] + end_logit[end_index])

            
                if 1 < len(answers) < 200: # pad to same numbers of possible answers
                    zero_list = [0] * (200 - len(answers))
                    answers.extend(zero_list)
                elif len(answers) >= 200:
                    answers = answers[:200]

                probs[i][feature_index] += torch.tensor(softmax(answers))

            # if n == 0 and len(softmax(answers)) > 1:
            #     for_check.append(answers[:5])
            #     n += 1 

    # return prob_dict, for_check
    return probs

In [59]:
def get_embeddings(eval_dataloader, device, features, examples, rd):
    if rd == 1:
        config = BertConfig.from_pretrained(pretrain_model_dir, output_hidden_states=True)
    else: 
        config = BertConfig.from_pretrained(model_dir, output_hidden_states=True)
    model = AutoModelForQuestionAnswering.from_config(config).to(device)

    model.eval()
    embeddings = torch.zeros([len(eval_dataloader.dataset), model.config.to_dict()['hidden_size']])
    idxs_start = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating_prob"):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            # print('len_output:', len(outputs)) # 4
            # print('outputs:', outputs) # (loss, start_logits, end_logits, hidden_states)

        hidden_states = outputs.hidden_states
        # print('len_hidden_states:', len(hidden_states)) # 13 # each one has: (batch_size, sequence_length, hidden_size)
        # # hidden_states[0] -> last hidden states
        # print('len_hidden_states[0]:', len(hidden_states[0])) # 8, 8, 4
        # print('len_hidden_states[0][0]:', len(hidden_states[0][0])) # 384, 384, 384 # tokens in each sequence
        # print('len_hidden_states[0][0][0]:', len(hidden_states[0][0][0])) # 768, 768, 768 # number of hidden units
        # print('hidden_states:', hidden_states) 

        # TODO: Question!!!!! 
        embedding_of_last_layer = hidden_states[-1][:, 0, :] # [:, 0, :] -> to get [cls], but all the same
        print(embedding_of_last_layer)
        idxs_end = idxs_start + len(hidden_states[-1])
        # print(idxs_start)
        # print(idxs_end)
        embeddings[idxs_start:idxs_end] = embedding_of_last_layer.cpu()
        idxs_start = idxs_end
        
    return embeddings 

In [60]:
embeddings = get_embeddings(unlabeled_dataloader, device, unlabeled_feature_20, squad['train'], rd=1)

Evaluating_prob:   0%|          | 0/13 [00:00<?, ?it/s]

tensor([[ 0.3821, -0.2329, -0.7528,  ...,  0.3398,  0.3708, -0.3504],
        [ 0.3757, -0.2713, -0.7949,  ...,  0.3613,  0.3396, -0.3583],
        [ 0.2734, -0.4379, -0.8023,  ...,  0.3172,  0.3971, -0.3816],
        ...,
        [ 0.3346, -0.3376, -0.9554,  ...,  0.3716,  0.3823, -0.4001],
        [ 0.3462, -0.1903, -0.8239,  ...,  0.3848,  0.4839, -0.4290],
        [ 0.3775, -0.2474, -0.7431,  ...,  0.3860,  0.3522, -0.3633]],
       device='cuda:0')
tensor([[ 0.2751, -0.2796, -0.7901,  ...,  0.4045,  0.4772, -0.2615],
        [ 0.3696, -0.2389, -0.8716,  ...,  0.4472,  0.4582, -0.4561],
        [ 0.3333, -0.1796, -0.8043,  ...,  0.2796,  0.3625, -0.3216],
        ...,
        [ 0.3022, -0.2155, -0.8702,  ...,  0.2496,  0.4065, -0.3365],
        [ 0.2407, -0.1008, -0.9183,  ...,  0.3682,  0.5604, -0.2810],
        [ 0.3983, -0.4316, -0.7011,  ...,  0.5084,  0.3698, -0.3399]],
       device='cuda:0')
tensor([[ 0.2109, -0.2321, -0.9000,  ...,  0.3921,  0.5575, -0.2895],
        [ 0.32

## utils.py

In [20]:
def get_unlabel_data(n_pool, labeled_idxs, train_dataset):
    unlabeled_idxs = np.arange(n_pool)[~labeled_idxs]
    unlabeled_data = train_dataset.select(indices=unlabeled_idxs)
    return unlabeled_idxs, unlabeled_data

In [21]:
def softmax(x): 
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

## Query

In [22]:
def random_sampling_query(labeled_idxs, n, rd):
    return np.random.choice(np.where(labeled_idxs==0)[0], n, replace=False)

In [23]:
def margin_sampling_query(n_pool, labeled_idxs, train_dataset, train_features, examples, model, device, n, rd):
    # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
		unlabeled_data,
		shuffle=True,
		collate_fn=default_data_collator,
		batch_size=8,
	)
    # TODO: print for recording
    print('Margin querying starts!')
    # deepAL+: probs = self.predict_prob(unlabeled_data)
    prob_dict = get_prob(unlabeled_dataloader, device, unlabeled_features, examples)
    # TODO: print for recording
    print('Got probability!')
    # deepAL+: probs_sorted, _ = probs.sort(descending=True)
    # deepAL+: uncertainties = probs_sorted[:, 0] - probs_sorted[:,1]
    uncertainties_dict = {}
    for idx, probs in prob_dict.items():
        if len(probs) > 1: # if prob_dict['probs'] is not 0
            sort_probs = np.sort(probs)[::-1] # This method returns a copy of the array, leaving the original array unchanged.
            uncertainties_dict[idx] = sort_probs[0] - sort_probs[1]
        elif idx:
            uncertainties_dict[idx] = np.array([0])

    # deepAL+: return unlabeled_idxs[uncertainties.sort()[1][:n]] 
    sorted_uncertainties_list = sorted(uncertainties_dict.items(), key=lambda x: x[1], reverse=True)
    
    return unlabeled_idxs[[idx for (idx, uncertainties) in sorted_uncertainties_list[:n]]]

In [24]:
def least_confidence_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n, rd):
    # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
		unlabeled_data,
		shuffle=True,
		collate_fn=default_data_collator,
		batch_size=8,
	)
    # TODO: print for recording
    print('LC querying starts!')
    # deepAL+: probs = self.predict_prob(unlabeled_data)
    prob_dict = get_prob(unlabeled_dataloader, device, unlabeled_features, examples)
    # TODO: print for recording
    print('Got probability!')
    # deepAL+: uncertainties = probs.max(1)[0]
    confidence_dict = {}
    for idx, probs in prob_dict.items():
        if len(probs) > 1: # if prob_dict['probs'] is not 0
            confidence_dict[idx] = max(probs)
        elif idx:
            confidence_dict[idx] = np.array([0])

    # deepAL+: return unlabeled_idxs[uncertainties.sort()[1][:n]]
    sorted_confidence_list = sorted(confidence_dict.items, key=lambda x: x[1])
    return unlabeled_idxs[[idx for (idx, confidence) in sorted_confidence_list[:n]]]

In [25]:
def var_ratio_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n, rd):
    # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
		unlabeled_data,
		shuffle=True,
		collate_fn=default_data_collator,
		batch_size=8,
	)
    # TODO: print for recording
    print('Var Ratio querying starts!')
    # deepAL+: probs = self.predict_prob(unlabeled_data)
    prob_dict = get_prob(unlabeled_dataloader, device, unlabeled_features, examples)
    # TODO: print for recording
    print('Got probability!')
    # deepAL+: preds = torch.max(probs, 1)[0]
    # deepAL+: uncertainties = 1.0 - preds
    confidence_dict = {}
    for idx, probs in prob_dict.items():
        if len(probs) > 1: # if prob_dict['probs'] is not 0
            confidence_dict[idx] = 1.0 - max(probs)
        elif idx:
            confidence_dict[idx] = np.array([0])

    # deepAL+: return unlabeled_idxs[uncertainties.sort(descending=True)[1][:n]]
    sorted_confidence_list = sorted(confidence_dict.items, key=lambda x: x[1], reverse=True)
    return unlabeled_idxs[[idx for (idx, confidence) in sorted_confidence_list[:n]]]
# comment for the same query as LC

In [26]:
def entropy_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n, rd):
    # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
		unlabeled_data,
		shuffle=True,
		collate_fn=default_data_collator,
		batch_size=8,
	)
    # deepAL+: probs = self.predict_prob(unlabeled_data)
    # TODO: print for recording
    print('Entropy querying starts!')
    prob_dict = get_prob(unlabeled_dataloader, device, unlabeled_features, examples)
    # TODO: print for recording
    print('Got probability!')
    # deepAL+: log_probs = torch.log(probs)
    # deepAL+: uncertainties = (probs*log_probs).sum(1)
    entropy_dict = {}
    for idx, probs in prob_dict.items():
        if len(probs) > 1: # if prob_dict['probs'] is not 0
            log_probs = np.log(probs)
            entropy_dict[idx] = (probs*log_probs).sum()
        elif idx:
            entropy_dict[idx] = np.array([0])
    # deepAL+: return unlabeled_idxs[uncertainties.sort()[1][:n]]
    sorted_entropy_list = sorted(entropy_dict.items(), key=lambda x: x[1])
    return unlabeled_idxs[[idx for (idx, entropy) in sorted_entropy_list[:n]]]

In [27]:
def margin_sampling_dropout_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n, rd):
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
		unlabeled_data,
		shuffle=True,
		collate_fn=default_data_collator,
		batch_size=8,
	)
    # TODO: print for recording
    print('Margin dropout querying starts!')
    prob_dict = get_prob_dropout(unlabeled_dataloader, device, unlabeled_features, examples)
    # TODO: print for recording
    print('Got probability!')
    uncertainties_dict = {}
    for idx, probs in prob_dict.items():
        if len(probs) > 1: # if prob_dict['probs'] is not 0
            sort_probs = np.sort(probs)[::-1] # This method returns a copy of the array, leaving the original array unchanged.
            uncertainties_dict[idx] = sort_probs[0] - sort_probs[1]
        elif idx:
            uncertainties_dict[idx] = np.array([0])

    sorted_uncertainties_list = sorted(uncertainties_dict.items(), key=lambda x: x[1], reverse=True)
    return unlabeled_idxs[[idx for (idx, uncertainties) in sorted_uncertainties_list[:n]]]

In [28]:
def mean_std_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n, rd):
    # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
  		unlabeled_data,
      shuffle=True,
      collate_fn=default_data_collator,
      batch_size=8,
    )
    # TODO: print for recording
    print('Mean STD querying starts!')
    # deepAL+: probs = self.predict_prob_dropout_split(unlabeled_data, n_drop=self.n_drop).numpy()
    probs = get_prob_dropout_split(unlabeled_dataloader, device, unlabeled_features, examples).numpy()
    # TODO: print for recording
    print('Got probability!')
    # deepAL+: sigma_c = np.std(probs, axis=0)
    sigma_c = np.std(probs, axis=0)
    # deepAL+: uncertainties = torch.from_numpy(np.mean(sigma_c, axis=-1))
    uncertainties = torch.from_numpy(np.mean(sigma_c, axis=-1)) # use tensor.sort() will sort the data and produce sorted indexes
    # deepAL+: return unlabeled_idxs[uncertainties.sort(descending=True)[1][:n]] # [1]: to get sorted data's indexes
    return unlabeled_idxs[uncertainties.sort(descending=True)[1][:n]]

In [29]:
def bayesian_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n, rd):
    # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
      unlabeled_data,
      shuffle=True,
      collate_fn=default_data_collator,
      batch_size=8,
    )
    # deepAL+: probs = self.predict_prob_dropout_split(unlabeled_data, n_drop=self.n_drop)
    probs = get_prob_dropout_split(unlabeled_dataloader, device, unlabeled_features, examples)
    # deepAL+: pb = probs.mean(0)
    probs_mean = probs.mean(0)
    # deepAL+: entropy1 = (-pb*torch.log(pb)).sum(1)
    entropy1 = (-probs_mean*torch.log(probs_mean)).sum(1)
    # deepAL+: entropy2 = (-probs*torch.log(probs)).sum(2).mean(0)
    entropy2 = (-probs*torch.log(probs)).sum(2).mean(0)
    # deepAL+: uncertainties = entropy2 - entropy1
    uncertainties = entropy2 - entropy1
    # later on, we can use batch
    # deepAL+: return unlabeled_idxs[uncertainties.sort()[1][:n]]
    return unlabeled_idxs[uncertainties.sort()[1][:n]]

In [30]:
def bayesian_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n, rd):
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
      unlabeled_data,
      shuffle=True,
      collate_fn=default_data_collator,
      batch_size=8,
    )
    # TODO: print for recording
    print('BALD querying starts!')
    probs = get_prob_dropout_split(unlabeled_dataloader, device, unlabeled_features, examples)
    # TODO: print for recording
    print('Got probability!')
    probs_mean = probs.mean(0)
    entropy1 = (-probs_mean*torch.log(probs_mean)).sum(1)
    entropy2 = (-probs*torch.log(probs)).sum(2).mean(0)
    uncertainties = entropy2 - entropy1
    # later on, we can use batch
    return unlabeled_idxs[uncertainties.sort()[1][:n]]

In [31]:
def mean_std_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n, rd):
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
  		unlabeled_data,
      shuffle=True,
      collate_fn=default_data_collator,
      batch_size=8,
    )
    # TODO: print for recording
    print('Mean STD querying starts!')
    probs = get_prob_dropout_split(unlabeled_dataloader, device, unlabeled_features, examples).numpy()
    # TODO: print for recording
    print('Got probability!')
    sigma_c = np.std(probs, axis=0)
    uncertainties = torch.from_numpy(np.mean(sigma_c, axis=-1)) # use tensor.sort() will sort the data and produce sorted indexes
    return unlabeled_idxs[uncertainties.sort(descending=True)[1][:n]]

In [32]:
def kmeans_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n, rd):
    # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
      unlabeled_data,
      shuffle=True,
      collate_fn=default_data_collator,
      batch_size=8,
    )
    # deepAL+: embeddings = get_embeddings(unlabeled_data)
    embeddings = get_embeddings(unlabeled_dataloader, device, unlabeled_features, examples, rd=1)
    # deepAL+: embeddings = embeddings.numpy()
    embeddings = embeddings.numpy()
    # print(embeddings.shape)
    # deepAL+: cluster_learner = KMeans(n_clusters=n)
    cluster_learner = KMeans(n_clusters=n)
    # deepAL+: cluster_learner.fit(embeddings)
    cluster_learner.fit(embeddings)
    # deepAL+: cluster_idxs = cluster_learner.predict(embeddings)
    cluster_idxs = cluster_learner.predict(embeddings)
    # deepAL+: centers = cluster_learner.cluster_centers_[cluster_idxs]
    centers = cluster_learner.cluster_centers_[cluster_idxs]
    # deepAL+: dis = (embeddings - centers)**2
    dis = (embeddings - centers)**2
    # deepAL+: dis = dis.sum(axis=1)
    dis = dis.sum(axis=1)
    # deepAL+: q_idxs = np.array([np.arange(embeddings.shape[0])[cluster_idxs==i][dis[cluster_idxs==i].argmin()] for i in range(n)])
    q_idxs = np.array([np.arange(embeddings.shape[0])[cluster_idxs==i][dis[cluster_idxs==i].argmin()] for i in range(n)])

    # deepAL+: return unlabeled_idxs[q_idxs]
    return unlabeled_idxs[q_idxs]
    
    

# main.py

### seed and device

In [33]:
SEED = 4666
# os.environ['TORCH_HOME']='./basicmodel'
os.environ["CUDA_VISIBLE_DEVICES"] = str(3)

# fix random seed
np.random.seed(SEED)
torch.manual_seed(SEED)
# torch.backends.cudnn.enabled  = True
# torch.backends.cudnn.benchmark= True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### start experiment

In [34]:
iteration = args_input_iteration
model_batch = args_input_model_batch
num_train_epochs = 3

all_acc = []
acq_time = []

# Change "fp16_training" to True to support automatic mixed precision training (fp16)	
fp16_training = False

if fp16_training:
    !pip install accelerate==0.2.0
    from accelerate import Accelerator
    accelerator = Accelerator(fp16=True)
    device = accelerator.device

In [35]:
# repeate # iteration trials
while (iteration > 0): 
	iteration = iteration - 1

	start = datetime.datetime.now()

	## generate initial labeled pool
	n_pool = len(train_dataset)
	labeled_idxs = np.zeros(n_pool, dtype=bool)

	tmp_idxs = np.arange(n_pool)
	np.random.shuffle(tmp_idxs)
	labeled_idxs[tmp_idxs[:NUM_INIT_LB]] = True

	run_0_labeled_idxs = np.arange(n_pool)[labeled_idxs]

	## record acc performance 
	acc = np.zeros(NUM_ROUND + 1) # quota/batch runs + run_0

	## data
	# eval_dataloader = DataLoader(
	# 	val_dataset, 
	# 	collate_fn=default_data_collator, 
	# 	batch_size=8
	# )

	## print info
	print(DATA_NAME)
	print(STRATEGY_NAME)
	
	## round 0 accuracy
	# acc[0] = get_pred(eval_dataloader, device, val_features, squad['validation'], rd_0=True)['f1']
	# acc[0] = 77.96450701 # init=4000
	acc[0] = 13.8252 # init=100

	print('Round 0\ntesting accuracy {}'.format(acc[0]))
	print('\n')
	
	# ## round 1 to rd
	# for rd in range(1, NUM_ROUND+1):
	# 	print('Round {}'.format(rd))

		# ## query
		# if STRATEGY_NAME == 'RandomSampling':
		# 	q_idxs = random_sampling_query(labeled_idxs, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'MarginSampling':
		# 	q_idxs = margin_sampling_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'LeastConfidence':
		# 	q_idxs = least_confidence_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'EntropySampling':
		# 	q_idxs = entropy_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'MarginSamplingDropout':
		# 	q_idxs = margin_sampling_dropout_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'LeastConfidenceDropout':
		# 	q_idxs = least_confidence_dropout_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'EntropySamplingDropout':
		# 	q_idxs = entropy_dropout_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'VarRatio':
		# 	q_idxs = var_ratio_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'KMeansSampling':
		# 	q_idxs = KMeans_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'KCenterGreedy':
		# 	q_idxs = kcenter_query()
		# elif STRATEGY_NAME == 'KCenterGreedyPCA': # not sure
		# 	q_idxs = 
		# elif STRATEGY_NAME == 'BALDDropout':
		# 	q_idxs = bayesian_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'MeanSTD':
		# 	q_idxs = mean_std_query(n_pool, labeled_idxs, train_dataset, train_features, squad['train'], device, NUM_QUERY, rd)
		# elif STRATEGY_NAME == 'BadgeSampling':
		# 	q_idxs = badge_query()
		# elif STRATEGY_NAME == 'LossPredictionLoss':
		# 	# different net!
		# 	q_idxs = loss_prediction_query()
		# elif STRATEGY_NAME == 'CEALSampling':
		# 	# why use 'CEALSampling' in STRATEGY_NAME
		# 	q_idxs = ceal_query()
		# else:
		# 	raise NotImplementedError

	# 	## update
	# 	labeled_idxs[q_idxs] = True
	# 	run_rd_labeled_idxs = np.arange(n_pool)[labeled_idxs]

	# 	train_dataloader_rd = DataLoader(
	# 		train_dataset.select(indices=run_rd_labeled_idxs),
	# 		shuffle=True,
	# 		collate_fn=default_data_collator,
	# 		batch_size=8,
	# 	)

	# 	num_update_steps_per_epoch_rd = len(train_dataloader_rd)
	# 	num_training_steps_rd = num_train_epochs * num_update_steps_per_epoch_rd

	# 	lr_scheduler_rd = get_scheduler(
	# 		"linear",
	# 		optimizer=optimizer,
	# 		num_warmup_steps=0,
	# 		num_training_steps=num_training_steps_rd,
	# 	)

	# 	model_rd = AutoModelForQuestionAnswering.from_pretrained(model_dir).to(device)
	# 	optimizer_rd = AdamW(model_rd.parameters(), lr=1e-4)

	# 	## train
	# 	to_train(num_train_epochs, train_dataloader_rd, device, model_rd, optimizer_rd, lr_scheduler_rd)

	# 	## round rd accuracy
	# 	acc[rd] = get_pred(eval_dataloader, device, val_features, squad['validation'])['f1']
	# 	print('testing accuracy {}'.format(acc[rd]))
	# 	print('\n')

	# 	torch.cuda.empty_cache()
	
	# ## print results
	# print('SEED {}'.format(SEED))
	# print(STRATEGY_NAME)
	# print(acc)
	# all_acc.append(acc)
	
	# ## save model and record acq time
	# timestamp = re.sub('\.[0-9]*','_',str(datetime.datetime.now())).replace(" ", "_").replace("-", "").replace(":","")
	# final_model_dir = model_dir + '/' + timestamp + DATA_NAME+ '_'  + STRATEGY_NAME + '_' + str(NUM_QUERY) + '_' + str(NUM_INIT_LB) +  '_' + str(args_input.quota)
	# os.makedirs(final_model_dir, exist_ok=True)
	# end = datetime.datetime.now()
	# acq_time.append(round(float((end-start).seconds), 3))

	# final_model = AutoModelForQuestionAnswering.from_pretrained(model_dir).to(device)
	# model_to_save = final_model.module if hasattr(final_model, 'module') else final_model 
	# model_to_save.save_pretrained(final_model_dir)

SQuAD
KMeansSampling
Round 0
testing accuracy 13.8252




In [36]:
all_acc

[]

## query workspace

In [37]:
# # get unlable data
# unlabeled_idxs = np.arange(n_pool)[~labeled_idxs]
# unlabeled_data = train_dataset.select(indices=unlabeled_idxs)
# len(unlabeled_idxs)

In [38]:
# get unlable data
unlabeled_idxs = np.arange(n_pool)[~labeled_idxs]
# smaller data
unlabeled_idxs_20 = unlabeled_idxs[20:40]
unlabeled_data_20 = train_dataset.select(unlabeled_idxs_20)
len(unlabeled_data_20)

20

In [39]:
unlabeled_features_20 = train_features.select(unlabeled_idxs_20)
unlabeled_dataloader_20 = DataLoader(
		unlabeled_data_20,
		shuffle=False,
		collate_fn=default_data_collator,
		batch_size=8,
	)

In [40]:
get_prob(unlabeled_dataloader_20, device, unlabeled_features_20, squad['train'])

Evaluating_prob:   0%|          | 0/3 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/4000 [00:00<?, ?it/s]

{0: array([5.92328131e-01, 8.99790507e-03, 3.19611863e-03, 1.26474025e-03,
        1.10692240e-03, 8.35207582e-04, 6.53406081e-04, 1.70150568e-04,
        3.31909984e-01, 5.04196156e-03, 4.77753766e-03, 7.08695035e-04,
        6.20262115e-04, 4.68007172e-04, 3.66135006e-04, 1.06273947e-04,
        9.53436174e-05, 2.27802270e-03, 8.09169607e-04, 3.20197549e-04,
        2.11451639e-04, 4.07111421e-02, 6.18432765e-04, 2.19671623e-04,
        8.69265423e-05, 7.60796029e-05, 5.74044352e-05, 4.49090876e-05,
        1.30352637e-05, 1.16945785e-05, 5.13553778e-06, 4.90996581e-06,
        4.42525698e-06, 2.85630108e-06, 1.15006953e-03, 1.74704201e-05,
        6.20561241e-06, 2.45563024e-06, 2.14921101e-06, 1.62164599e-06,
        1.26865859e-06, 3.30365992e-07, 8.86360613e-06, 3.14841304e-06,
        1.24586359e-06, 8.22741754e-07, 2.99569976e-04, 4.55069085e-06,
        1.61643788e-06, 6.39642565e-07, 5.59826503e-07, 4.22406345e-07,
        3.30460210e-07, 9.59190842e-08, 8.60536602e-08, 3.812

### test: query 5 data from 20 unlabeled_data

In [41]:
from sklearn.cluster import KMeans
# def kmeans_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n, rd):
#     # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
#     unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
#     unlabeled_features = train_features.select(unlabeled_idxs)
#     unlabeled_dataloader = DataLoader(
#       unlabeled_data,
#       shuffle=True,
#       collate_fn=default_data_collator,
#       batch_size=8,
#     )
n = 5

In [42]:
# smaller data
unlabeled_idxs_20 = unlabeled_idxs[20:120]
unlabeled_data_20 = train_dataset.select(unlabeled_idxs_20)
unlabeled_feature_20 = train_features.select(unlabeled_idxs_20)
len(unlabeled_data_20)

100

In [43]:
unlabeled_idxs_20

array([ 21,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,
        35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,
        61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,
        74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  85,  86,  87,
        88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100,
       101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
       114, 115, 116, 117, 118, 119, 120, 121, 122])

In [44]:
unlabeled_dataloader = DataLoader(
		unlabeled_data_20,
		shuffle=True,
		collate_fn=default_data_collator,
		batch_size=8,
	)
len(unlabeled_dataloader.dataset)

100

In [56]:
    # # deepAL+: embeddings = get_embeddings(unlabeled_data)
    # embeddings = get_embeddings(unlabeled_dataloader, device, unlabeled_features, examples, rd=1)
embeddings = get_embeddings(unlabeled_dataloader, device, unlabeled_feature_20, squad['train'], rd=1)

Evaluating_prob:   0%|          | 0/13 [00:00<?, ?it/s]

AttributeError: 'QuestionAnsweringModelOutput' object has no attribute 'last_hidden_state'

In [46]:
print(embeddings)

tensor([[-0.9169,  2.2937,  0.6952,  ..., -0.0484,  0.1575,  0.0433],
        [-0.9169,  2.2937,  0.6952,  ..., -0.0484,  0.1575,  0.0433],
        [-0.9169,  2.2937,  0.6952,  ..., -0.0484,  0.1575,  0.0433],
        ...,
        [-0.9169,  2.2937,  0.6952,  ..., -0.0484,  0.1575,  0.0433],
        [-0.9169,  2.2937,  0.6952,  ..., -0.0484,  0.1575,  0.0433],
        [-0.9169,  2.2937,  0.6952,  ..., -0.0484,  0.1575,  0.0433]])


In [47]:
    # deepAL+: embeddings = embeddings.numpy()
# embeddings = embeddings.numpy()
print(embeddings.shape)
    # deepAL+: cluster_learner = KMeans(n_clusters=n)
cluster_learner = KMeans(n_clusters=n)
    # deepAL+: cluster_learner.fit(embeddings)
cluster_learner.fit(embeddings)

torch.Size([100, 768])


  super()._check_params_vs_input(X, default_n_init=10)


  return fit_method(estimator, *args, **kwargs)


In [48]:
    # deepAL+: cluster_idxs = cluster_learner.predict(embeddings)
cluster_idxs = cluster_learner.predict(embeddings)
    # deepAL+: centers = cluster_learner.cluster_centers_[cluster_idxs]
centers = cluster_learner.cluster_centers_[cluster_idxs]
centers # len = 20

array([[-0.9169113 ,  2.29374051,  0.69522274, ..., -0.04836455,
         0.15746017,  0.04329722],
       [-0.9169113 ,  2.29374051,  0.69522274, ..., -0.04836455,
         0.15746017,  0.04329722],
       [-0.9169113 ,  2.29374051,  0.69522274, ..., -0.04836455,
         0.15746017,  0.04329722],
       ...,
       [-0.9169113 ,  2.29374051,  0.69522274, ..., -0.04836455,
         0.15746017,  0.04329722],
       [-0.9169113 ,  2.29374051,  0.69522274, ..., -0.04836455,
         0.15746017,  0.04329722],
       [-0.9169113 ,  2.29374051,  0.69522274, ..., -0.04836455,
         0.15746017,  0.04329722]])

In [49]:
cluster_idxs

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [50]:
    # deepAL+: dis = (embeddings - centers)**2
dis = (embeddings - centers)**2
dis

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

In [51]:
    # deepAL+: dis = dis.sum(axis=1)
dis = dis.sum(axis=1)
    # deepAL+: q_idxs = np.array([np.arange(embeddings.shape[0])[cluster_idxs==i][dis[cluster_idxs==i].argmin()] for i in range(n)])
q_idxs = np.array([np.arange(embeddings.shape[0])[cluster_idxs==i][dis[cluster_idxs==i].argmin()] for i in range(n)])

    # deepAL+: return unlabeled_idxs[q_idxs]
print(unlabeled_idxs[q_idxs])

IndexError: argmin(): Expected reduction dim to be specified for input.numel() == 0.