In [2]:
import os
cache_dir = "/scratches/dialfs/alta/hln35/.cache"
os.environ['TRANSFORMERS_CACHE'] = '/scratches/dialfs/alta/hln35/.cache'

In [3]:
model_small = "google/flan-t5-small"

In [4]:
from datasets import load_dataset

data_points = load_dataset("ai2_arc", "ARC-Easy", cache_dir=cache_dir)

In [5]:
data_points

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 2251
    })
    test: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 2376
    })
    validation: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 570
    })
})

In [6]:
data_points["test"][0]

{'id': 'Mercury_417466',
 'question': 'Which statement best explains why photosynthesis is the foundation of most food webs?',
 'choices': {'text': ['Sunlight is the source of energy for nearly all ecosystems.',
   'Most ecosystems are found on land instead of in water.',
   'Carbon dioxide is more available than other gases.',
   'The producers in all ecosystems are plants.'],
  'label': ['A', 'B', 'C', 'D']},
 'answerKey': 'A'}

In [23]:
data_points = data_points.filter(lambda x: len(x['choices']['label']) == 4)

Filter:   0%|          | 0/2251 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2376 [00:00<?, ? examples/s]

Filter:   0%|          | 0/570 [00:00<?, ? examples/s]

In [24]:
data_points

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 2241
    })
    test: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 2365
    })
    validation: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 567
    })
})

In [28]:
index_to_ans = {0: "A", 1: "B", 2: "C", 3: "D"}
ans_to_index = {"A" : "0", "B" : "1", "C" : "2", "D": "3"}
ans_id_dict = {71: "A", 272: "B", 205: "C", 309: "D"}

In [25]:
prefix = "context: for each questions there are a few choices of answer. answer this question by choosing the best choice either A, B, C, or D: "


def preprocess_function(data_points):
    inputs = []
    for i in range(len(data_points["question"])):
        if len(data_points["choices"][i]["label"]) != 4:
            continue
        labels = [index_to_ans[int(t)-1] if t.isdigit() else t for t in data_points["choices"][i]["label"]]
        q = data_points["question"][i]
        choices = ""
        choice = ""
        for t in range(len(labels)):
            choices += labels[t] + " " + data_points["choices"][i]["text"][t] + ". "
            
        text = prefix + q + "Choices: " + choices
        inputs.append(text)
    model_inputs = tokenizer(inputs, truncation=True)
    
    return model_inputs

In [9]:
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForQuestionAnswering, AutoModel, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(model_small)
model = AutoModelForSeq2SeqLM.from_pretrained(model_small)
model1 = AutoModel.from_pretrained(model_small)
model2 = AutoModelForQuestionAnswering.from_pretrained(model_small)



Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
import json, re
import numpy as np

In [26]:
tokenized_datasets = data_points.map(preprocess_function, batched=True)

Map:   0%|          | 0/2241 [00:00<?, ? examples/s]

Map:   0%|          | 0/2365 [00:00<?, ? examples/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

In [33]:
test_input_ids = tokenized_datasets["test"]["input_ids"]
model_outputs = []
results = {}
for i in range(0, len(test_input_ids)):
        test_tensor = torch.tensor([test_input_ids[i]])
        preds = model(input_ids=test_tensor, decoder_input_ids=torch.tensor([[model.config.decoder_start_token_id]]))      
        preds_prob = []
        for t in ans_id_dict.keys():
            preds_prob.append(preds.logits[...,t][0][0].item())
            
        model_outputs.append(index_to_ans[np.argmax(preds_prob)])         

# with open("QA_large_model_probability_output.txt", "w") as fp:
#     json.dump(large_model_outputs, fp)

In [34]:
labels = tokenized_datasets["test"]['answerKey']

result = 0
for i in range(min(len(model_outputs), len(labels))):
    if model_outputs[i] == labels[i] or ans_to_index[model_outputs[i]] == labels[i]:
        result += 1
result

749

In [36]:
model_output_set = set(model_outputs)
model_output_set

{'A', 'B', 'C', 'D'}

In [38]:
print(f"Total points are {len(data_points['test'])}. Number of correct answers is {result}.")

Total points are 2365. Number of correct answers is 749.


In [39]:
model_large = "google/flan-t5-large"
tokenizer_large = AutoTokenizer.from_pretrained(model_large)
model_large = AutoModelForSeq2SeqLM.from_pretrained(model_large)

In [40]:
model_large_outputs = []
for i in range(0, len(test_input_ids)):
        test_tensor = torch.tensor([test_input_ids[i]])
        preds = model_large(input_ids=test_tensor, decoder_input_ids=torch.tensor([[model.config.decoder_start_token_id]]))      
        preds_prob = []
        for t in ans_id_dict.keys():
            preds_prob.append(preds.logits[...,t][0][0].item())
            
        model_large_outputs.append(index_to_ans[np.argmax(preds_prob)])    

labels = tokenized_datasets["test"]['answerKey']

result_large = 0
for i in range(min(len(model_large_outputs), len(labels))):
    if model_large_outputs[i] == labels[i] or ans_to_index[model_large_outputs[i]] == labels[i]:
        result_large += 1
result_large

1697

In [41]:
print(f"Total points are {len(data_points['test'])}. Number of correct answers is {result_large}.")

Total points are 2365. Number of correct answers is 1697.
