In [1]:
import random

from datasets import interleave_datasets, load_dataset
import torch
import transformers

from life_after_bert import evaluate_encoder, MCDataset

In [2]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint name handed to the transformers `from_pretrained` loaders.
model_name = "roberta-large"
# Passed to MCDataset as the number of answer choices per question.
num_choices = 2

# The preprocessing functions in this notebook shuffle the answer choices with
# `random.shuffle`; seeding the global RNG here makes the resulting choice
# order (and answer indices) reproducible after Restart Kernel -> Run All.
SEED = 42
random.seed(SEED)

In [3]:
# Tokenizer and masked-LM head are both loaded from the same checkpoint name
# so that their vocabularies match. The two loads are independent of each other.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)

In [4]:
# Fetch the CPRAG configuration and keep only its "test" split.
cprag_splits = load_dataset("KevinZ/psycholinguistic_eval", "CPRAG")
dataset = cprag_splits["test"]

def preprocess_CPRAG(example):
    """Convert one CPRAG record into a masked question with shuffled choices.

    Args:
        example: Mapping with the keys "context_s1", "context_s2",
            "expected", "within_category" and "between_category".

    Returns:
        Dict with "questions" (both context sentences followed by "[MASK]"),
        "choices" (the three candidates in random order) and "answer_ids"
        (position of the expected completion inside "choices").
    """
    first_sentence = example["context_s1"]
    second_sentence = example["context_s2"]
    prompt = f'{first_sentence} {second_sentence} [MASK]'

    expected = example["expected"]
    candidates = [expected, example["within_category"], example["between_category"]]
    # Shuffle in place so the correct answer does not always sit at index 0.
    random.shuffle(candidates)

    return {
        "questions": prompt,
        "choices": candidates,
        "answer_ids": candidates.index(expected),
    }

# Apply the CPRAG preprocessing to every record, then wrap the resulting
# columns in an MCDataset for evaluation.
# NOTE(review): preprocess_CPRAG emits three choices per example while
# num_choices (2) is passed here; MCDataset's use of this argument is not
# visible in this notebook -- confirm that the mismatch is intended.
cprag_examples = dataset.map(preprocess_CPRAG)
dataset = MCDataset(
    cprag_examples["questions"],
    cprag_examples["choices"],
    cprag_examples["answer_ids"],
    num_choices,
    tokenizer,
    max_length=36,
)



  0%|          | 0/1 [00:00<?, ?it/s]



In [5]:
# evaluate_encoder returns the sensitivity score plus a 3-tuple that is
# unpacked into answers, predictions and top-k predictions (k = 5 here).
sensitivity, eval_outputs = evaluate_encoder(
    model, tokenizer, dataset, device=device, output_topk=5
)
all_answers, all_preds, all_top_preds = eval_outputs
f"CPRAG Sensitivity: {sensitivity}"

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

'CPRAG Sensitivity: 0.8529411764705882'

In [6]:
# For every example, pick the entry of its choice row that sits at the gold
# answer index (gather along dim 1 with one index per row).
answer_columns = dataset.answer_ids.unsqueeze(1)
tokenized_answers = torch.stack(dataset.choice_ids).gather(1, answer_columns)

# An example counts as a hit when its gold entry occurs in the matching row of
# all_top_preds (presumably the top-5 predicted token ids -- confirm).
num_correct = sum(
    1
    for row, gold in enumerate(tokenized_answers)
    if gold in all_top_preds[row]
)

f"CPRAG Top 5 Accuracy: {num_correct / len(tokenized_answers)}"

'CPRAG Top 5 Accuracy: 0.6470588235294118'

In [7]:
# Every ROLE example is padded to this many choices.
role_num_choices = 10


def preprocess_ROLE(ex):
    """Build the masked question and a fixed-width choice list for one ROLE record.

    The "expected" field holds one or more answers separated by "|". The list
    of answers is right-padded with the tokenizer's mask token so that every
    example ends up with the same number of choices. "answer_ids" is always 0.
    """
    expected_answers = ex["expected"].split("|")
    padding = [tokenizer.mask_token] * (role_num_choices - len(expected_answers))
    return {
        "questions": f"{ex['context']} [MASK]",
        "choices": expected_answers + padding,
        "answer_ids": 0,
    }


role_splits = load_dataset("KevinZ/psycholinguistic_eval", "ROLE")
role_examples = role_splits["test"].map(preprocess_ROLE)

dataset = MCDataset(
    role_examples["questions"],
    role_examples["choices"],
    role_examples["answer_ids"],
    role_num_choices,
    tokenizer,
    max_length=36,
)



  0%|          | 0/1 [00:00<?, ?it/s]



In [8]:
# Only the top-k predictions are used for ROLE in the following cell; the
# sensitivity value is still bound but not displayed.
sensitivity, eval_outputs = evaluate_encoder(
    model, tokenizer, dataset, device=device, output_topk=5
)
all_answers, all_preds, all_top_preds = eval_outputs

Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]

In [9]:
# ROLE examples may have several acceptable answers, padded with the mask token
# up to a fixed width. An example counts as correct when at least one *real*
# answer appears in the matching row of all_top_preds.
num_correct = 0

for i, tokenized_answer_tensor in enumerate(dataset.choice_ids):
    # Drop the mask-token padding so it can never be matched against the
    # predictions. The original computed this filtered list but then tested the
    # unfiltered tensor, leaving the padding ids in the comparison.
    # NOTE(review): assumes padded choices are stored as tokenizer.mask_token_id
    # in dataset.choice_ids -- confirm against MCDataset.
    tokenized_answers = [
        token_id
        for token_id in tokenized_answer_tensor.tolist()
        if token_id != tokenizer.mask_token_id
    ]
    if not set(tokenized_answers).isdisjoint(all_top_preds[i].tolist()):
        num_correct += 1

f"ROLE Top 5 Accuracy: {num_correct / len(dataset.choice_ids)}"

'ROLE Top 5 Accuracy: 0.4659090909090909'

In [10]:
# Fetch the NEG-NAT configuration and keep only its "test" split.
negnat_splits = load_dataset("KevinZ/psycholinguistic_eval", "NEG-NAT")
dataset = negnat_splits["test"]



  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
def preprocess_NEGNAT_aff(example):
    """Build the affirmative NEG-NAT item from one record.

    Args:
        example: Mapping with the keys "context_aff", "target_aff" and
            "target_neg".

    Returns:
        Dict with "questions" (the affirmative context followed by "[MASK]"),
        "choices" (both targets in random order) and "answer_ids" (position
        of the affirmative target inside "choices").
    """
    context = example["context_aff"]
    prompt = f'{context} [MASK]'

    correct = example["target_aff"]
    candidates = [correct, example["target_neg"]]
    # Shuffle in place so the correct target is not always listed first.
    random.shuffle(candidates)

    return {
        "questions": prompt,
        "choices": candidates,
        "answer_ids": candidates.index(correct),
    }

def preprocess_NEGNAT_neg(example):
    """Build the negated NEG-NAT item from one record.

    Args:
        example: Mapping with the keys "context_neg", "target_aff" and
            "target_neg".

    Returns:
        Dict with "questions" (the negated context followed by "[MASK]"),
        "choices" (both targets in random order) and "answer_ids" (position
        of the negated target inside "choices").
    """
    context = example["context_neg"]
    prompt = f'{context} [MASK]'

    correct = example["target_neg"]
    candidates = [example["target_aff"], correct]
    # Shuffle in place so the correct target is not always listed last.
    random.shuffle(candidates)

    return {
        "questions": prompt,
        "choices": candidates,
        "answer_ids": candidates.index(correct),
    }

# Each raw record yields two items: one affirmative and one negated. The two
# mapped datasets are interleaved into a single dataset before wrapping.
negnat_aff = dataset.map(preprocess_NEGNAT_aff)
negnat_neg = dataset.map(preprocess_NEGNAT_neg)
negnat_examples = interleave_datasets([negnat_aff, negnat_neg])

dataset = MCDataset(
    negnat_examples["questions"],
    negnat_examples["choices"],
    negnat_examples["answer_ids"],
    num_choices,
    tokenizer,
    max_length=36,
)



In [12]:
# evaluate_encoder returns the sensitivity score plus a 3-tuple that is
# unpacked into answers, predictions and top-k predictions (k = 5 here).
sensitivity, eval_outputs = evaluate_encoder(
    model, tokenizer, dataset, device=device, output_topk=5
)
all_answers, all_preds, all_top_preds = eval_outputs
f"NEG-NAT Sensitivity: {sensitivity}"

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

'NEG-NAT Sensitivity: 0.65625'

In [13]:
# For every example, pick the entry of its choice row that sits at the gold
# answer index (gather along dim 1 with one index per row).
answer_columns = dataset.answer_ids.unsqueeze(1)
tokenized_answers = torch.stack(dataset.choice_ids).gather(1, answer_columns)

# An example counts as a hit when its gold entry occurs in the matching row of
# all_top_preds (presumably the top-5 predicted token ids -- confirm).
num_correct = sum(
    1
    for row, gold in enumerate(tokenized_answers)
    if gold in all_top_preds[row]
)

f"NEG-NAT Top 5 Accuracy: {num_correct / len(tokenized_answers)}"

'NEG-NAT Top 5 Accuracy: 0.46875'

In [14]:
# Fetch the NEG-SIMP configuration and keep only its "test" split.
negsimp_splits = load_dataset("KevinZ/psycholinguistic_eval", "NEG-SIMP")
dataset = negsimp_splits["test"]



  0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
def preprocess_NEGSIMP_aff(example):
    """Build the affirmative NEG-SIMP item from one record.

    The last seven characters of "context_aff" are dropped (presumably a
    trailing article placeholder -- TODO confirm against the dataset) and
    replaced by "a" or "an", chosen from the first letter of the affirmative
    target (lowercase vowels only).

    Args:
        example: Mapping with the keys "context_aff", "target_aff" and
            "target_neg".

    Returns:
        Dict with "questions" (trimmed context, article, then "[MASK]"),
        "choices" (both targets in random order) and "answer_ids" (position
        of the affirmative target inside "choices").
    """
    stem = example["context_aff"][:-7]
    correct = example["target_aff"]
    article = "an" if correct[0] in "aeiou" else "a"
    prompt = f'{stem} {article} [MASK]'

    candidates = [correct, example["target_neg"]]
    # Shuffle in place so the correct target is not always listed first.
    random.shuffle(candidates)

    return {
        "questions": prompt,
        "choices": candidates,
        "answer_ids": candidates.index(correct),
    }

def preprocess_NEGSIMP_neg(example):
    """Build the negated NEG-SIMP item from one record.

    The last seven characters of "context_neg" are dropped (presumably a
    trailing article placeholder -- TODO confirm against the dataset) and
    replaced by "a" or "an", chosen from the first letter of the negated
    target (lowercase vowels only).

    Args:
        example: Mapping with the keys "context_neg", "target_aff" and
            "target_neg".

    Returns:
        Dict with "questions" (trimmed context, article, then "[MASK]"),
        "choices" (both targets in random order) and "answer_ids" (position
        of the negated target inside "choices").
    """
    stem = example["context_neg"][:-7]
    correct = example["target_neg"]
    article = "an" if correct[0] in "aeiou" else "a"
    prompt = f'{stem} {article} [MASK]'

    candidates = [example["target_aff"], correct]
    # Shuffle in place so the correct target is not always listed last.
    random.shuffle(candidates)

    return {
        "questions": prompt,
        "choices": candidates,
        "answer_ids": candidates.index(correct),
    }

# Each raw record yields two items: one affirmative and one negated. The two
# mapped datasets are interleaved into a single dataset before wrapping.
negsimp_aff = dataset.map(preprocess_NEGSIMP_aff)
negsimp_neg = dataset.map(preprocess_NEGSIMP_neg)
negsimp_examples = interleave_datasets([negsimp_aff, negsimp_neg])

dataset = MCDataset(
    negsimp_examples["questions"],
    negsimp_examples["choices"],
    negsimp_examples["answer_ids"],
    num_choices,
    tokenizer,
    max_length=36,
)



In [16]:
# evaluate_encoder returns the sensitivity score plus a 3-tuple that is
# unpacked into answers, predictions and top-k predictions (k = 5 here).
sensitivity, eval_outputs = evaluate_encoder(
    model, tokenizer, dataset, device=device, output_topk=5
)
all_answers, all_preds, all_top_preds = eval_outputs
f"NEG-SIMP Sensitivity: {sensitivity}"

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

'NEG-SIMP Sensitivity: 0.5833333333333334'

In [17]:
# For every example, pick the entry of its choice row that sits at the gold
# answer index (gather along dim 1 with one index per row).
answer_columns = dataset.answer_ids.unsqueeze(1)
tokenized_answers = torch.stack(dataset.choice_ids).gather(1, answer_columns)

# An example counts as a hit when its gold entry occurs in the matching row of
# all_top_preds (presumably the top-5 predicted token ids -- confirm).
num_correct = sum(
    1
    for row, gold in enumerate(tokenized_answers)
    if gold in all_top_preds[row]
)

f"NEG-SIMP Top 5 Accuracy: {num_correct / len(tokenized_answers)}"

'NEG-SIMP Top 5 Accuracy: 0.4444444444444444'