In [1]:
# Reload the IPython autoreload extension and switch it to mode 2, so that
# imported modules (e.g. the local `src` package) are re-imported before each
# cell executes and edits are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, GPT2LMHeadModel, GPT2DoubleHeadsModel

from tqdm.autonotebook import tqdm

from src.settings import MODELS_DIR
from src.data.utils.ARC_utils import construct_ARC_prompt
from src.data.encoders.ARC_encoder import ARCInputsEncoder

In [3]:
# Run on the first CUDA device when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")
DEVICE

device(type='cuda', index=0)

In [4]:
# Evaluation data: dataset id and the subset (configuration) name passed to load_dataset.
DATASET, SUBSET = "ai2_arc", "ARC-Challenge"
# Identifier of the checkpoint loaded by from_pretrained for both model and tokenizer.
MODEL = "TheBloke/phi-2-GPTQ"

In [5]:
# Fetch only the "test" split of the configured subset; the bare name below
# displays the dataset summary (features and row count).
dataset = load_dataset(path=DATASET, name=SUBSET, split="test")
dataset

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 1172
})

In [6]:
# Display the first test example to see the record layout
# (id, question, choices with text/label lists, answerKey).
dataset[0]

{'id': 'Mercury_7175875',
 'question': 'An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?',
 'choices': {'text': ['Planetary density will decrease.',
   'Planetary years will become longer.',
   'Planetary days will become shorter.',
   'Planetary gravity will become stronger.'],
  'label': ['A', 'B', 'C', 'D']},
 'answerKey': 'C'}

In [7]:
# Load the causal LM named by MODEL and place it on DEVICE.
# NOTE(review): trust_remote_code=True allows transformers to execute Python code
# shipped in the model repository, and revision="main" is a branch name rather
# than a fixed commit, so the code that runs can change between executions —
# consider pinning a specific commit hash.
model = AutoModelForCausalLM.from_pretrained(MODEL,
    device_map=DEVICE,
    trust_remote_code=True,
    revision="main"
)

# Load the matching (fast) tokenizer and reuse the end-of-sequence token as the
# padding token.
# NOTE(review): if batched generation is added later, decoder-only models are
# usually padded on the left — confirm tokenizer.padding_side before batching.
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

CUDA extension not installed.
CUDA extension not installed.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Display the padding token to confirm it was set from the end-of-sequence token.
tokenizer.pad_token

'<|endoftext|>'

In [9]:
# Build the prompt for the first test example with the project's ARC helper,
# indexing the dataset row once instead of three times.
arc_example = dataset[0]
arc_choices = arc_example["choices"]
prompt = construct_ARC_prompt(
    question=arc_example["question"],
    options=arc_choices["text"],
    enum_chars=arc_choices["label"],
)
print(prompt)

Pick one from the given options, A, B, C or D?

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?
Options:
	A. Planetary density will decrease.
	B. Planetary years will become longer.
	C. Planetary days will become shorter.
	D. Planetary gravity will become stronger.

Answer: 


In [10]:
# loader_columns = [
#     'datasets_idx',
#     'input_ids',
#     'token_type_ids',
#     'attention_mask',
#     'start_positions',
#     'end_positions',
#     'labels'
# ]
# columns_to_ignore = [c for c in dataset.column_names if c not in loader_columns]
# columns_to_ignore

In [11]:
# encoder = ARCInputsEncoder(tokenizer=tokenizer, max_seq_length=384)

# dataset_transformed = dataset.map(
#     encoder,
#     batched=True,
#     remove_columns=columns_to_ignore,
# )

In [12]:
# BATCH_SIZE = 32

# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=False,
# )
# test_dl = DataLoader(
#     dataset_transformed, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator, pin_memory=True
# )

In [13]:
# next(iter(test_dl)).keys()

In [14]:
# next(iter(test_dl)).input_ids

In [15]:
# Hand-written prompt for the first test example. This rebinds `prompt`,
# replacing the value built earlier with construct_ARC_prompt.
# It prepends "Answer the following question." to the instruction line and ends
# with "Answer:" followed by a newline, so the model continues on a fresh line.
# NOTE(review): the helper-built prompt printed earlier appears to end with
# "Answer: " instead — confirm which ending is intended for evaluation.
prompt = """\
Answer the following question. Pick one from the given options, A, B, C or D?

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?
Options:
	A. Planetary density will decrease.
	B. Planetary years will become longer.
	C. Planetary days will become shorter.
	D. Planetary gravity will become stronger.

Answer:
"""

In [16]:
# Tokenise the prompt into PyTorch tensors (return_tensors="pt"); the bare name
# below displays the resulting input_ids and attention_mask.
encoded_prompt = tokenizer(prompt, return_tensors="pt")
encoded_prompt

{'input_ids': tensor([[33706,   262,  1708,  1808,    13, 12346,   530,   422,   262,  1813,
          3689,    11,   317,    11,   347,    11,   327,   393,   360,    30,
           198,   198, 24361,    25,  1052, 47603, 34526,   326,   257,  5440,
          5724,   689,  5443,   706,   257, 19999,   578,  2928,    13,  9022,
           318,   262,   749,  1884,  1245,   286,   428,  2620,   287, 13179,
            30,   198, 29046,    25,   198,   197,    32,    13, 43800, 12109,
           481, 10070,    13,   198,   197,    33,    13, 43800,   812,   481,
          1716,  2392,    13,   198,   197,    34,    13, 43800,  1528,   481,
          1716, 12238,    13,   198,   197,    35,    13, 43800, 13522,   481,
          1716,  7387,    13,   198,   198, 33706,    25,   198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 

In [17]:
# Sample one continuation of the hand-written prompt.
# Seed the torch RNGs first: do_sample=True draws random tokens, so without a
# seed the output differs on every Restart & Run All.
torch.manual_seed(0)

pred = model.generate(
    inputs=encoded_prompt.input_ids.to(DEVICE),
    # Pass the tokenizer's mask explicitly. pad_token was set to eos_token, so
    # generate() cannot reliably infer padding positions from the ids alone.
    attention_mask=encoded_prompt.attention_mask.to(DEVICE),
    # Declare the padding id so generate() does not have to fall back to
    # eos_token_id (and warn about it) for open-ended generation.
    pad_token_id=tokenizer.pad_token_id,
    temperature=0.7,
    do_sample=True,
    top_p=0.95,
    top_k=40,
    max_new_tokens=512
)
# The returned ids contain the prompt tokens followed by the generated tokens.
pred



tensor([[33706,   262,  1708,  1808,    13, 12346,   530,   422,   262,  1813,
          3689,    11,   317,    11,   347,    11,   327,   393,   360,    30,
           198,   198, 24361,    25,  1052, 47603, 34526,   326,   257,  5440,
          5724,   689,  5443,   706,   257, 19999,   578,  2928,    13,  9022,
           318,   262,   749,  1884,  1245,   286,   428,  2620,   287, 13179,
            30,   198, 29046,    25,   198,   197,    32,    13, 43800, 12109,
           481, 10070,    13,   198,   197,    33,    13, 43800,   812,   481,
          1716,  2392,    13,   198,   197,    34,    13, 43800,  1528,   481,
          1716, 12238,    13,   198,   197,    35,    13, 43800, 13522,   481,
          1716,  7387,    13,   198,   198, 33706,    25,   198,   198,    33,
            13, 43800,   812,   481,  1716,  2392,    13,   198,   198,  3109,
         11578,   341,    25,   198,   198, 19722,   317,   318,   407,   257,
         12219,  7664,   355,   262, 12109,   286,  

In [18]:
# Convert the generated token ids back to text, dropping special tokens
# such as the end-of-sequence marker.
decoded_pred = tokenizer.batch_decode(pred, skip_special_tokens=True)
# `pred` still includes the prompt tokens, so the printed text starts with the
# prompt and is followed by the model's continuation.
print(decoded_pred[0])

Answer the following question. Pick one from the given options, A, B, C or D?

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?
Options:
	A. Planetary density will decrease.
	B. Planetary years will become longer.
	C. Planetary days will become shorter.
	D. Planetary gravity will become stronger.

Answer:

B. Planetary years will become longer.

Explanation:

Option A is not a logical conclusion as the density of a planet does not have a direct relationship with its rotation. Option C is a direct opposite of the given statement as it contradicts the concept of a planet's rotation. Option D is not directly related to the given statement. Option B is the only logical answer as the rotation of a planet affects the length of its day. When a planet rotates faster, it takes less time to complete one rotation, making the days shorter. Since the planet's year is a multiple of the number of days

In [19]:
# with torch.no_grad():
#     for batch in tqdm(test_dl):
#         batch = {k: v.to(DEVICE) for k, v in batch.items()}
#         # predictions = model.generate(
#         #     input_ids=batch["input_ids"],
#         #     max_length=encoder.max_seq_length,
#         #     attention_mask=batch["attention_mask"],
#         # )
#         predictions = model.generate(
#             inputs=batch["input_ids"],
#             temperature=0.7,
#             do_sample=True,
#             top_p=0.95,
#             top_k=40,
#             max_new_tokens=512
#         )
#         decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#         for i, pred in enumerate(decoded_predictions):
#             if i == 4:
#                 break
#             print("-" * 80, "\n", pred)

#         break