# Question Answering Model 
## no trainer

- dataset
- torch
- transformers
- transformers[torch]
- evaluate

import packages

In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    default_data_collator,
    get_scheduler,
    AutoModelForQuestionAnswering,
    BertConfig
)
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch

import evaluate
import collections
from tqdm.auto import tqdm
import numpy as np
from sklearn.cluster import KMeans

import os
import re
import datetime

from torch.autograd import Variable
import torch.nn.functional as F
from copy import deepcopy

Set cache directory.

In [2]:
CACHE_DIR='/mount/arbeitsdaten31/studenten1/linku/.cache'
%set_env TRANSFORMERS_CACHE $CACHE_DIR
%set_env HF_MODULES_CACHE $CACHE_DIR
%set_env HF_DATASETS_CACHE $CACHE_DIR

env: TRANSFORMERS_CACHE=/mount/arbeitsdaten31/studenten1/linku/.cache
env: HF_MODULES_CACHE=/mount/arbeitsdaten31/studenten1/linku/.cache
env: HF_DATASETS_CACHE=/mount/arbeitsdaten31/studenten1/linku/.cache


### arguments.py

args_input.

In [3]:
args_input_ALstrategy = 'RandomSampling'
args_input_initseed = 600 # 1000
args_input_quota = 600 # 1000
args_input_batch = 200 # 128
args_input_dataset_name = 'SQuAD'
args_input_iteration = 1
args_input_model_batch = 8 # already add in arguments.py
args_input_max_length = 384

stride = 128

in main.py

In [4]:
MAX_LENGTH = args_input_max_length
NUM_QUERY = args_input_batch
NUM_INIT_LB = args_input_initseed
NUM_ROUND = int(args_input_quota / args_input_batch)
DATA_NAME = args_input_dataset_name
STRATEGY_NAME = args_input_ALstrategy

### load dataset

In [8]:
squad = load_dataset(args_input_dataset_name.lower())
# squad["train"] = squad["train"].shuffle(42).select(range(2000))
squad["train"] = squad["train"].select(range(5000))
squad["validation"] = squad["validation"].select(range(1500))
# squad["train"] = squad["train"].select(range(670, 680))

Found cached dataset squad (/home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
len(squad["train"])

5000

The function for preprocessing the dataset (training and evaluation data).

In [7]:
def preprocess_training_features(examples):
    # keep ["offset_mapping"], for compute_metrics()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs["offset_mapping"]
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    example_ids = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        
        example_ids.append(examples["id"][sample_idx]) # newly added for used in unlabel data predict

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["example_id"] = example_ids
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
def preprocess_training_examples(examples):
    # no ['offset_mapping'], for .train() and .eval()
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    example_ids = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        
        example_ids.append(examples["id"][sample_idx]) # newly added for used in unlabel data predict

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [11]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [13]:
# load tokenizer for dataset preprocessing
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# preprocess data
train_dataset = squad["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=squad["train"].column_names,
)
train_features = squad["train"].map(
    preprocess_training_features,
    batched=True,
    remove_columns=squad["train"].column_names,
)
val_dataset = squad["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
)
val_features = squad["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
)

Loading cached processed dataset at /home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-907c66fab175d3c8.arrow


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-9456706e0dd13065.arrow
Loading cached processed dataset at /home/users1/linku/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-9456706e0dd13065.arrow


In [14]:
train_dataset.set_format("torch")
train_features.set_format("torch")
val_dataset = val_dataset.remove_columns(["offset_mapping"])
val_dataset.set_format("torch")
val_features.set_format("torch")

In [15]:
len(train_dataset)

5094

## model.py

In [16]:
def to_train(num_train_epochs, train_dataloader, device, model, optimizer, lr_scheduler, record_loss=False):
	print('Training was performed using the sum of {} initial data and {} query data, i.e. {} data.'.format(NUM_INIT_LB, NUM_QUERY, len(train_dataloader.dataset)))
	for epoch in range(num_train_epochs):
		model.train()
		for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
			batch = {key: value.to(device) for key, value in batch.items()}
			outputs = model(**batch)
			loss = outputs.loss
			loss.backward()

			optimizer.step()
			lr_scheduler.step()
			optimizer.zero_grad()

		if record_loss:
			print('Train Epoch: {}\tLoss: {:.6f}'.format(epoch, loss.item()))

	model_to_save = model.module if hasattr(model, 'module') else model 
	model_to_save.save_pretrained(strategy_model_dir)
	print('TRAIN done!')

In [17]:
metric = evaluate.load("squad")

In [18]:
def get_pred(dataloader, device, features, examples, lowRes=False):
    if lowRes:
        model = AutoModelForQuestionAnswering.from_pretrained(pretrain_model_dir).to(device)
    else:
        model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)
    
    model.eval()
    start_logits = []
    end_logits = []

    for batch in tqdm(dataloader, desc="Evaluating_pred"):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(features)]
    end_logits = end_logits[: len(features)]

    return compute_metrics(start_logits, end_logits, features, examples)


In [19]:
def get_prob(dataloader, device, features, examples, lowRes=False):
    if lowRes:
        model = AutoModelForQuestionAnswering.from_pretrained(pretrain_model_dir).to(device)
    else:
        model = AutoModelForQuestionAnswering.from_pretrained(strategy_model_dir).to(device)

    model.eval()
    start_logits = []
    end_logits = []

    for batch in tqdm(dataloader, desc="Evaluating_prob"):
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(features)]
    end_logits = end_logits[: len(features)]

    prob_dict = {}
    example_to_features = collections.defaultdict(list)
    max_answer_length = 30
    n_best = 20
    
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    for example in tqdm(examples):
        example_id = example["id"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append(start_logit[start_index] + end_logit[end_index])
        
            if len(answers) > 1:
                prob_dict[feature_index] = softmax(answers)
            elif example_to_features[example_id] != []:
                prob_dict[feature_index] = np.array([0])
    
    return prob_dict

## utils.py

In [20]:
def get_unlabel_data(n_pool, labeled_idxs, train_dataset):
    unlabeled_idxs = np.arange(n_pool)[~labeled_idxs]
    unlabeled_data = train_dataset.select(indices=unlabeled_idxs)
    return unlabeled_idxs, unlabeled_data

In [21]:
def softmax(x): 
    """Compute softmax values for each sets of scores in x."""
    softmax_num = np.exp(x) / np.sum(np.exp(x), axis=0)
    return np.round(softmax_num, decimals=4)

## query.py

In [22]:
def random_sampling_query(labeled_idxs, n):
    return np.random.choice(np.where(labeled_idxs==0)[0], n, replace=False)

In [23]:
def least_confidence_query(n_pool, labeled_idxs, train_dataset, train_features, examples, device, n):
    # deepAL+: unlabeled_idxs, unlabeled_data = self.dataset.get_unlabeled_data()
    unlabeled_idxs, unlabeled_data = get_unlabel_data(n_pool, labeled_idxs, train_dataset)
    unlabeled_features = train_features.select(unlabeled_idxs)
    unlabeled_dataloader = DataLoader(
		unlabeled_data,
		shuffle=False,
		collate_fn=default_data_collator,
		batch_size=8,
	)
    # TODO: print for recording
    print('LC querying starts!')
    # deepAL+: probs = self.predict_prob(unlabeled_data)
    prob_dict = get_prob(unlabeled_dataloader, device, unlabeled_features, examples)
    # TODO: print for recording
    print('Got probability!')
    # deepAL+: uncertainties = probs.max(1)[0]
    confidence_dict = {}
    for idx, probs in prob_dict.items():
        if len(probs) > 1: # if prob_dict['probs'] is not 0
            confidence_dict[idx] = max(probs)
        elif idx:
            confidence_dict[idx] = np.array([0])

    # deepAL+: return unlabeled_idxs[uncertainties.sort()[1][:n]]
    sorted_confidence_list = sorted(confidence_dict.items, key=lambda x: x[1])
    return unlabeled_idxs[[idx for (idx, confidence) in sorted_confidence_list[:n]]]

# main.py

### seed and device

In [24]:
SEED = 1127
# os.environ['TORCH_HOME']='./basicmodel'
os.environ["CUDA_VISIBLE_DEVICES"] = str(5)

# fix random seed
np.random.seed(SEED)
torch.manual_seed(SEED)
# torch.backends.cudnn.enabled  = True
# torch.backends.cudnn.benchmark= True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### start experiment

In [25]:
iteration = args_input_iteration
model_batch = args_input_model_batch
num_train_epochs = 3

all_acc = []
acq_time = []

# Change "fp16_training" to True to support automatic mixed precision training (fp16)	
# fp16_training = False

# if fp16_training:
#     !pip install accelerate==0.2.0
#     from accelerate import Accelerator
#     accelerator = Accelerator(fp16=True)
#     device = accelerator.device

In [29]:
## from main.py 114-126
## no iteration setting
## try with only first iterateion

## generate initial labeled pool
n_pool = len(train_dataset)
labeled_idxs = np.zeros(n_pool, dtype=bool)

tmp_idxs = np.arange(n_pool)
np.random.shuffle(tmp_idxs)

difference = 0
num_set_ex_id = 0

while num_set_ex_id != NUM_INIT_LB:        
    labeled_idxs[tmp_idxs[:NUM_INIT_LB + difference]] = True
    run_0_labeled_idxs = np.arange(n_pool)[labeled_idxs]

    run_0_samples = train_features.select(indices=run_0_labeled_idxs)
    num_set_ex_id = len(set(run_0_samples['example_id']))
    print('num_set_ex_id:', num_set_ex_id)

    difference = NUM_INIT_LB - num_set_ex_id
    print('difference:', difference)

num_set_ex_id: 599
difference: 1
num_set_ex_id: 600
difference: 0


In [30]:
len(run_0_labeled_idxs)

601

In [31]:
## record acc performance 
acc = np.zeros(NUM_ROUND + 1) # quota/batch runs + run_0
acc_em = np.zeros(NUM_ROUND + 1)

## data
train_dataloader = DataLoader(
	train_dataset.select(indices=run_0_labeled_idxs),
	shuffle=True,
	collate_fn=default_data_collator,
	batch_size=8,
)

eval_dataloader = DataLoader(
	val_dataset, 
	collate_fn=default_data_collator, 
	batch_size=8
)

## print info
print(DATA_NAME)
print(STRATEGY_NAME)

## round 0 accuracy
# acc[0] = get_pred(eval_dataloader, device, val_features, squad['validation'], rd_0=True)['f1']
# acc[0] = 77.96450701 # init=4000
acc[0] = 13.8252 # init=100

print('Round 0\ntesting accuracy {}'.format(acc[0]))
print('\n')

SQuAD
RandomSampling
Round 0
testing accuracy 13.8252




# query workspace

### test: query 5 data from 20 unlabeled_data

In [43]:
# get unlable data
unlabeled_idxs = np.arange(n_pool)[~labeled_idxs]
# smaller data
unlabeled_idxs_20 = unlabeled_idxs[20:40]
unlabeled_data_20 = train_dataset.select(unlabeled_idxs_20)
len(unlabeled_data_20)

20

In [44]:
unlabeled_features_20 = train_features.select(unlabeled_idxs_20)
unlabeled_dataloader_20 = DataLoader(
		unlabeled_data_20,
		shuffle=False,
		collate_fn=default_data_collator,
		batch_size=8,
	)

In [46]:
dataloader = unlabeled_dataloader_20
features = unlabeled_features_20
examples = squad['train']
rd = 1

In [102]:
get_prob_dict = get_prob(unlabeled_dataloader_20, device, unlabeled_features_20, squad['train'], rd)

Evaluating_prob:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

In [103]:
get_prob_2_dict = get_prob_2(dataloader, device, features, examples, rd)

Evaluating_prob:   0%|          | 0/3 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/4000 [00:00<?, ?it/s]

In [107]:
for k in get_prob_dict.keys():
    print(np.array_equal(get_prob_dict[k], get_prob_2_dict[k]))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [104]:
get_prob_dict[16]

array([0.0932, 0.0314, 0.0264, 0.0238, 0.0128, 0.0081, 0.0064, 0.0706,
       0.0238, 0.02  , 0.0097, 0.0062, 0.0051, 0.0048, 0.0585, 0.0304,
       0.0149, 0.008 , 0.0064, 0.0049, 0.1007, 0.0215, 0.0082, 0.0072,
       0.0052, 0.0582, 0.0124, 0.0047, 0.003 , 0.054 , 0.0115, 0.0044,
       0.0039, 0.0028, 0.0037, 0.0288, 0.0087, 0.0061, 0.0023, 0.0021,
       0.0018, 0.0015, 0.0014, 0.0273, 0.0058, 0.0022, 0.002 , 0.0014,
       0.0134, 0.0045, 0.0038, 0.0034, 0.0018, 0.0012, 0.0009, 0.0045,
       0.0017, 0.0086, 0.0029, 0.0024, 0.0022, 0.0012, 0.0008, 0.0148,
       0.0044, 0.0031, 0.0012, 0.0011, 0.0008, 0.0007, 0.0082, 0.0028,
       0.0023, 0.0021, 0.0011, 0.0009, 0.0017, 0.0008, 0.0006, 0.0005,
       0.0005, 0.0027, 0.001 , 0.007 , 0.0024, 0.002 , 0.0016, 0.0006,
       0.0006, 0.0005, 0.0005, 0.0056, 0.0019, 0.0016, 0.0014, 0.0008,
       0.0005, 0.005 , 0.0017, 0.0014, 0.0007, 0.0004, 0.0004, 0.0003],
      dtype=float32)

In [105]:
get_prob_2_dict[16]

array([0.0932, 0.0314, 0.0264, 0.0238, 0.0128, 0.0081, 0.0064, 0.0706,
       0.0238, 0.02  , 0.0097, 0.0062, 0.0051, 0.0048, 0.0585, 0.0304,
       0.0149, 0.008 , 0.0064, 0.0049, 0.1007, 0.0215, 0.0082, 0.0072,
       0.0052, 0.0582, 0.0124, 0.0047, 0.003 , 0.054 , 0.0115, 0.0044,
       0.0039, 0.0028, 0.0037, 0.0288, 0.0087, 0.0061, 0.0023, 0.0021,
       0.0018, 0.0015, 0.0014, 0.0273, 0.0058, 0.0022, 0.002 , 0.0014,
       0.0134, 0.0045, 0.0038, 0.0034, 0.0018, 0.0012, 0.0009, 0.0045,
       0.0017, 0.0086, 0.0029, 0.0024, 0.0022, 0.0012, 0.0008, 0.0148,
       0.0044, 0.0031, 0.0012, 0.0011, 0.0008, 0.0007, 0.0082, 0.0028,
       0.0023, 0.0021, 0.0011, 0.0009, 0.0017, 0.0008, 0.0006, 0.0005,
       0.0005, 0.0027, 0.001 , 0.007 , 0.0024, 0.002 , 0.0016, 0.0006,
       0.0006, 0.0005, 0.0005, 0.0056, 0.0019, 0.0016, 0.0014, 0.0008,
       0.0005, 0.005 , 0.0017, 0.0014, 0.0007, 0.0004, 0.0004, 0.0003],
      dtype=float32)

In [194]:
from torch.autograd import Variable
import torch.nn.functional as F
from copy import deepcopy
# prob_dict = get_prob(unlabeled_dataloader_20, device, unlabeled_features_20, squad['train'], rd)
dataloader = unlabeled_dataloader_20
features = unlabeled_features_20
examples = squad['train']
rd = 1

if rd == 1:
    config = BertConfig.from_pretrained(pretrain_model_dir, output_hidden_states=True)
else: 
    config = BertConfig.from_pretrained(model_dir, output_hidden_states=True)
model = AutoModelForQuestionAnswering.from_config(config).to(device)
model.eval()

# deepAL+: nLab = self.params['num_class']
nLab = 200
embDim = model.config.to_dict()['hidden_size']
embeddings = np.zeros([len(dataloader.dataset), embDim * nLab])

prob_dict = []
data_len_start = 0

with torch.no_grad():
    for batch in tqdm(dataloader, desc="Evaluating_prob"):
        batch = {key: Variable(value.to(device)) for key, value in batch.items()}
            
        # deepAL+: out, e1 = self.clf(x)
        outputs = model(**batch)
        # deepAL+: e1 = e1.data.cpu().numpy()
        hidden_states = outputs.hidden_states
        embedding_of_last_layer = hidden_states[-2][:, 0, :]
        embedding_of_last_layer = embedding_of_last_layer.data.cpu().numpy()

        # idxs_end = idxs_start + len(hidden_states[0])
        # # deepAL+: embeddings[idxs] = e1.cpu()
        # embeddings[idxs_start:idxs_end] = embedding_of_last_layer.cpu()
        # idxs_start = idxs_end

        # matually create features batch
        data_len_batch = len(outputs.start_logits)
        data_len_end = data_len_start + data_len_batch
        batch_idx = list(range(data_len_start, data_len_end))
        batch_feat = unlabeled_features_20.select(batch_idx)
        data_len_start = data_len_end

        # deepAL+: batchProbs = F.softmax(out, dim=1).data.cpu().numpy()
        # deepAL+: maxInds = np.argmax(batchProbs, 1)
        out = logits_to_prob(outputs.start_logits.cpu().numpy(), outputs.end_logits.cpu().numpy(), batch_feat, batch_idx, examples, num_classes=True)
        batchProbs = F.softmax(out, dim=1).data.cpu().numpy()
        maxInds = np.argmax(batchProbs, 1)

        for j in range(data_len_batch):
            for c in range(nLab):
                if c == maxInds[j]:
                    embeddings[batch_idx[j]][embDim * c : embDim * (c+1)] = deepcopy(embedding_of_last_layer[j]) * (1 - batchProbs[j][c]) * -1.0
                else:
                    embeddings[batch_idx[j]][embDim * c : embDim * (c+1)] = deepcopy(embedding_of_last_layer[j]) * (-1 * batchProbs[j][c]) * -1.0
        # prob_dict.append(maxInds)


Evaluating_prob:   0%|          | 0/3 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing metrics:   0%|          | 0/4000 [00:00<?, ?it/s]

In [196]:
embeddings.shape

(20, 153600)

In [198]:
# chosen = init_centers(gradEmbedding, n)
chosen = init_centers(embeddings, 5)
unlabeled_idxs[chosen]

#Samps	Total Distance
1	719.9505246232893
2	575.2702332799374
3	502.7837294584695
4	480.8306760112471


array([ 4, 12, 11,  1,  3])

In [111]:
int(100 / 30)

3