In [1]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, GPT2LMHeadModel, GPT2DoubleHeadsModel
from transformers import pipeline

import numpy as np
from tqdm.autonotebook import tqdm
from typing import Dict, Any, List, Tuple, Optional
from dataclasses import dataclass

from src.settings import MODELS_DIR

In [2]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda', index=0)

In [3]:
MODEL = "gpt2"
DATASET = "ai2_arc"
SUBSET = "ARC-Challenge"

In [4]:
dataset = load_dataset(DATASET, SUBSET, split="test")
dataset

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 1172
})

In [5]:
dataset[0]

{'id': 'Mercury_7175875',
 'question': 'An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?',
 'choices': {'text': ['Planetary density will decrease.',
   'Planetary years will become longer.',
   'Planetary days will become shorter.',
   'Planetary gravity will become stronger.'],
  'label': ['A', 'B', 'C', 'D']},
 'answerKey': 'C'}

In [6]:
# model = AutoModelForCausalLM.from_pretrained(MODEL).to(DEVICE)
model = GPT2DoubleHeadsModel.from_pretrained(MODEL).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

Some weights of GPT2DoubleHeadsModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['multiple_choice_head.summary.bias', 'multiple_choice_head.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def constrruct_prompt(question: str, options: List[str], enum_lettters="ABCD"):
    instructions = (
        f"Answer the following question. Pick one from the given options ({', '.join(enum_lettters[:-1])} or {enum_lettters[-1]})."
    )
    prompt = instructions + "\n\n" + "Question: {question}\n\nOptions:\n\t{options}\n\nThe correct answer is "
    options_str = "\n\t".join(f"{letter}. {choice}" for letter, choice in zip(enum_lettters, options))
    return prompt.format(question=question, options=options_str)

prompt = constrruct_prompt(
    question=dataset[0]["question"],
    options=dataset[0]["choices"]["text"],
    enum_lettters=dataset[0]["choices"]["label"]
)
print(prompt)

Answer the following question. Pick one from the given options (A, B, C or D).

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?

Options:
	A. Planetary density will decrease.
	B. Planetary years will become longer.
	C. Planetary days will become shorter.
	D. Planetary gravity will become stronger.

The correct answer is 


In [8]:
def preprocess_arc_batch(examples: Dict[str, Any]) -> Tuple[List[str], List[str]]:
    questions = examples["question"]
    choices = examples["choices"]
    choices_text_batch, choices_labels_batch = zip(*[(choice["text"], choice["label"]) for choice in choices])

    inputs = [
        constrruct_prompt(question, options, option_labels)
        for question, options, option_labels in zip(questions, choices_text_batch, choices_labels_batch)
    ]
    targets = examples["answerKey"]
    return inputs, targets


@dataclass
class InputsEncoder:
    tokenizer: AutoTokenizer
    max_seq_length: int

    def convert_to_features_train(
        self,
        example_batch: Dict[str, Any],
        indices: Optional[List[int]] = None
    ) -> Any:
        inputs, text_target = preprocess_arc_batch(example_batch)

        model_inputs = self.tokenizer(
            inputs, text_target=text_target, max_length=self.max_seq_length, truncation=True
        )
        return model_inputs

    def __call__(
        self,
        example_batch: Dict[str, Any],
        indices: Optional[List[int]] = None
    ) -> Any:
        return self.convert_to_features_train(
            example_batch=example_batch, indices=indices
        )

In [9]:
loader_columns = [
    'datasets_idx',
    'input_ids',
    'token_type_ids',
    'attention_mask',
    'start_positions',
    'end_positions',
    'labels'
]
columns_to_ignore = [c for c in dataset.column_names if c not in loader_columns]
columns_to_ignore

['id', 'question', 'choices', 'answerKey']

In [10]:
encoder = InputsEncoder(tokenizer=tokenizer, max_seq_length=384)

dataset_transformed = dataset.map(
    encoder,
    batched=True,
    remove_columns=columns_to_ignore,
)

Map:   0%|          | 0/1172 [00:00<?, ? examples/s]

In [11]:
BATCH_SIZE = 32

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
test_dl = DataLoader(
    dataset_transformed, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator, pin_memory=True
)

In [12]:
next(iter(test_dl)).keys()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


dict_keys(['input_ids', 'attention_mask', 'labels'])

In [13]:

with torch.no_grad():
    for batch in tqdm(test_dl):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        predictions = model.generate(
            input_ids=batch["input_ids"],
            max_length=encoder.max_seq_length,
            attention_mask=batch["attention_mask"],
        )
        decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        for i, pred in enumerate(decoded_predictions):
            if i == 4:
                break
            print("-" * 80, "\n", pred)

        break

  0%|          | 0/37 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


-------------------------------------------------------------------------------- 
 Answer the following question. Pick one from the given options (A, B, C or D).

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?

Options:
	A. Planetary density will decrease.
	B. Planetary years will become longer.
	C. Planetary days will become shorter.
	D. Planetary gravity will become stronger.

The correct answer is ____.

Answer: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?

Option:

IreA. The planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?

Option:

IreB. The planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?

Option:

IreC. The planet rotates faster after a meteorite impact. Which is t