In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, GPT2LMHeadModel, GPT2DoubleHeadsModel

from tqdm.autonotebook import tqdm

from src.settings import MODELS_DIR
from src.data.utils.ARC_utils import construct_ARC_prompt
from src.data.encoders.ARC_encoder import ARCInputsEncoder

In [3]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda', index=0)

In [4]:
MODEL = "gpt2"
DATASET = "ai2_arc"
SUBSET = "ARC-Challenge"

In [5]:
dataset = load_dataset(DATASET, SUBSET, split="test")
dataset

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 1172
})

In [6]:
dataset[0]

{'id': 'Mercury_7175875',
 'question': 'An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?',
 'choices': {'text': ['Planetary density will decrease.',
   'Planetary years will become longer.',
   'Planetary days will become shorter.',
   'Planetary gravity will become stronger.'],
  'label': ['A', 'B', 'C', 'D']},
 'answerKey': 'C'}

In [7]:
model = AutoModelForCausalLM.from_pretrained(MODEL).to(DEVICE)
# model = GPT2DoubleHeadsModel.from_pretrained(MODEL).to(DEVICE)

# Padding to right instead of left, because:
# https://huggingface.co/docs/transformers/model_doc/gpt2#:~:text=GPT%2D2%20is%20a%20model%20with%20absolute%20position%20embeddings%20so%20it%E2%80%99s%20usually%20advised%20to%20pad%20the%20inputs%20on%20the%20right%20rather%20than%20the%20left.
tokenizer = AutoTokenizer.from_pretrained(MODEL, padding_side="right")
tokenizer.pad_token = tokenizer.eos_token

In [8]:
tokenizer.pad_token

'<|endoftext|>'

In [9]:
prompt = construct_ARC_prompt(
    question=dataset[0]["question"],
    options=dataset[0]["choices"]["text"],
    enum_chars=dataset[0]["choices"]["label"]
)
print(prompt)

Answer the following question. Pick one from the given options, A, B, C or D?

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?
Options:
	A. Planetary density will decrease.
	B. Planetary years will become longer.
	C. Planetary days will become shorter.
	D. Planetary gravity will become stronger.

Answer: 


In [10]:
loader_columns = [
    'datasets_idx',
    'input_ids',
    'token_type_ids',
    'attention_mask',
    'start_positions',
    'end_positions',
    'labels'
]
columns_to_ignore = [c for c in dataset.column_names if c not in loader_columns]
columns_to_ignore

['id', 'question', 'choices', 'answerKey']

In [11]:
encoder = ARCInputsEncoder(tokenizer=tokenizer, max_seq_length=384)

dataset_transformed = dataset.map(
    encoder,
    batched=True,
    remove_columns=columns_to_ignore,
)

In [12]:
BATCH_SIZE = 32

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
test_dl = DataLoader(
    dataset_transformed, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator, pin_memory=True
)

In [13]:
next(iter(test_dl)).keys()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


dict_keys(['input_ids', 'attention_mask', 'labels'])

In [14]:
with torch.no_grad():
    for batch in tqdm(test_dl):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        predictions = model.generate(
            input_ids=batch["input_ids"],
            max_length=encoder.max_seq_length,
            attention_mask=batch["attention_mask"],
        )
        decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        for i, pred in enumerate(decoded_predictions):
            if i == 4:
                break
            print("-" * 80, "\n", pred)

        break

  0%|          | 0/37 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


-------------------------------------------------------------------------------- 
 Answer the following question. Pick one from the given options, A, B, C or D?

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?

Options:
	A. Planetary density will decrease.
	B. Planetary years will become longer.
	C. Planetary days will become shorter.
	D. Planetary gravity will become stronger.

The correct answer is The planet rotates faster after a meteorite impact.

Answer: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?

Option: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?

Answer: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotat