# Importing the Dataset

In [146]:
%pip install datasets transformers --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
from datasets import load_dataset
from typing import Literal

cbt_dataset_id = "cam-cst/cbt"
# Which CBT configuration to evaluate on; must be one of the literals below.
cbt_dataset_name: Literal["CN", "NE", "P", "V"] = "CN"

# Load the test split of the selected CBT configuration.
dataset = load_dataset(path=cbt_dataset_id, name=cbt_dataset_name, split="test")

# Load Winogrande dataset
# dataset = load_dataset("automated-research-group/winogrande", split="validation")

In [257]:
# Preview the dataset: the bare expression displays its summary (feature names and row count).
dataset

Dataset({
    features: ['sentences', 'question', 'answer', 'options'],
    num_rows: 2500
})

In [205]:
# Preview a single entry: display the first example so its fields
# (sentences, question, answer, options) can be inspected.
dataset[0]

{'sentences': ['-LRB- 3 -RRB- Hollenmadchen .',
  "` But where is he to find the Witch-maiden ? '",
  'said the first bird .',
  '` She has no settled dwelling , but is here to-day and gone to-morrow .',
  "He might as well try to catch the wind . '",
  "The other replied , ' I do not know , certainly , where she is at present , but in three nights from now she will come to the spring to wash her face , as she does every month when the moon is full , in order that she may never grow old nor wrinkled , but may always keep the bloom of youth . '",
  "` Well , ' said the first bird , ` the spring is not far from here .",
  "Shall we go and see how it is she does it ? '",
  "` Willingly , if you like , ' said the other .",
  'The youth immediately resolved to follow the birds to the spring , only two things made him uneasy : first , lest he might be asleep when the birds went , and secondly , lest he might lose sight of them , since he had not wings to carry him along so swiftly .',
  'He 

In [271]:
# Prepare Dataset with QA format and tokenization
from transformers import AutoTokenizer
from typing import Dict, List, Any

# Load the GPT-2 tokenizer; it is shared by tokenize_fn and the generation cells further down.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token id as the padding id
# (presumably because the GPT-2 tokenizer defines no pad token of its own — TODO confirm).
tokenizer.pad_token_id = tokenizer.eos_token_id

# ASCII code point of capital "A"; option number i is labelled chr(CAP_A_ASCII + i).
CAP_A_ASCII = 65


def get_full_text(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Render one dataset example as a multiple-choice prompt.

    Args:
        entry: Example with the keys ``"sentences"`` (list of context
            sentences), ``"question"`` (string in which the blank is written
            as ``XXXXX``) and ``"options"`` (list of candidate answers).

    Returns:
        A dict with the single key ``"full_text_input"`` holding the prompt::

            Context: <sentences joined by single spaces>
            Question: <question with the blank shown as "_">
            A. <option 0>
            B. <option 1>
            ...
            Answer: 
    """
    # Each sentence is a separate string, so join with a space; an empty
    # separator would fuse the end of one sentence with the start of the next.
    context = " ".join(entry["sentences"])
    question = entry["question"].replace("XXXXX", "_")
    # Label the options A., B., C., ... in their original order.
    choices = "\n".join(
        f"{chr(CAP_A_ASCII + index)}. {choice}"
        for index, choice in enumerate(entry["options"])
    )
    full_text_input = f"Context: {context}\nQuestion: {question}\n{choices}\nAnswer: "
    return {"full_text_input": full_text_input}
def tokenize_fn(entry: Dict[str, List]) -> Dict[str, List]:
    """Encode an example's prompt text with the module-level ``tokenizer``.

    Reads ``entry["full_text_input"]`` and returns the output of
    ``tokenizer.encode`` for it under the key ``"tokenized_input"``.
    """
    prompt_text = entry["full_text_input"]
    return {"tokenized_input": tokenizer.encode(prompt_text)}

In [None]:
# Run this cell only when using the CBT dataset (the one loaded by default above).
# It adds the two columns the cells below read:
#   - "full_text_input": the multiple-choice prompt built by get_full_text
#   - "tokenized_input": the token ids of that prompt, produced by tokenize_fn
# Without it a fresh Restart & Run All fails, because neither column exists.
dataset = dataset.map(get_full_text, batched=False)
dataset = dataset.map(tokenize_fn, batched=False)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

# Running the tests on the Benchmark Model

In [None]:
from transformers import AutoModelForCausalLM, pipeline

# Load the model under test. It is loaded through AutoModelForCausalLM, so any
# causal language-model checkpoint name can be substituted here.
# NOTE: the tokenizer is created separately with a hard-coded "gpt2" name, so
# keep the two in sync when changing model_name.
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
import torch
from random import randint

# Text-generation pipeline limited to a single new token per prompt.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# Pick one example at random as a smoke test of the pipeline.
random_test_case = dataset[randint(0, len(dataset) - 1)]

# Name of the column that holds the prompt text for the dataset in use.
field = "full_text_input" # For CBT-CN dataset
# field = "request" # For Winogrande dataset

with torch.no_grad():
    # Index with `field` rather than a hard-coded column name, so the lookup
    # follows the dataset selected above ("request" does not exist for CBT).
    output = pipe(random_test_case[field])


# Output Benchmark Logits with GPT2

## Ignore the code below this. This section needs to be changed when we find a working implementation.

In [None]:
import os
import warnings
from tqdm import tqdm

# Silence noisy output during the evaluation loop below.
warnings.filterwarnings("ignore")  # Suppress all other warnings
# NOTE(review): transformers has already been imported by earlier cells; confirm the
# library still honours this variable when it is set after import.
os.environ["TRANSFORMERS_VERBOSITY"] = "error"  # Suppress transformer warnings

# Evaluate the model on the first NUM_TESTS examples.
# `true` counts the examples whose last generated token matches the answer;
# it is read together with NUM_TESTS by the accuracy cell below.
true = 0
NUM_TESTS = 1000
generated_sequences = []  # one 1-D tensor per example, concatenated once after the loop
for i in tqdm(range(NUM_TESTS)):
    entry = dataset[i]  # fetch the row once instead of once per field
    # tokenize_fn stores the ids as a plain list, so convert before calling
    # tensor methods; as_tensor also accepts a value that is already a tensor.
    inp = torch.as_tensor(entry["tokenized_input"])
    ans = entry["answer"]

    with torch.no_grad():
        output = model.generate(
            input_ids=inp.unsqueeze(0),
            max_new_tokens=2,
            pad_token_id=tokenizer.eos_token_id,
        )

    generated_sequences.append(output[0])
    # NOTE(review): the prompt lists lettered options, but the last generated
    # token is compared with the answer word itself — confirm this is the
    # intended scoring rule.
    if tokenizer.decode(output[0][-1]).strip() == ans:
        true += 1

# Despite the name, this holds the token ids returned by `generate`, not logits.
# The empty float tensor keeps the result dtype of the original incremental
# concatenation and makes the call valid even when NUM_TESTS is 0.
output_logits = torch.cat([torch.Tensor([])] + generated_sequences)

In [None]:
# Report the fraction of the NUM_TESTS evaluated examples that were counted as correct.
# The format spec is ".2f" (no space flag), so no extra space follows "Accuracy:".
print(f"Accuracy: {true / NUM_TESTS:.2f}")