In [1]:
import re

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, GPT2Model, GPT2Tokenizer, GPT2LMHeadModel

In [2]:
def extract_data(dataset, mode):
    """Pull the four columns this notebook works with out of one split.

    Args:
        dataset: Mapping from split name to a split object that supports
            lookup by column name (here, the result of ``load_dataset``).
        mode: Name of the split to read, e.g. ``'train'``.

    Returns:
        A 4-tuple ``(options, answer, article, question)`` containing the
        matching columns of the selected split, in that order.
    """
    split = dataset[mode]
    # The order here defines the order of the returned tuple.
    wanted_columns = ('options', 'answer', 'article', 'question')
    return tuple(split[column] for column in wanted_columns)


In [3]:
def answer_engineering(answer, options):
    """Turn answer letters into the text of the option they point at.

    Args:
        answer: Sequence of answer labels, each one of ``'A'`` to ``'E'``.
        options: Sequence parallel to ``answer``; item ``i`` holds the
            candidate option texts for example ``i``.

    Returns:
        A list whose ``i``-th item is the option text selected by
        ``answer[i]``.

    Raises:
        KeyError: If a label is not one of ``'A'`` to ``'E'``.
        IndexError: If a label points past the end of its option list, or
            ``options`` has fewer entries than ``answer``.

    Side effects:
        Prints the number of answers that were resolved.
    """
    # 'A' -> 0, 'B' -> 1, ... 'E' -> 4
    letter_to_position = dict(zip('ABCDE', range(5)))
    resolved = [
        options[row][letter_to_position[letter]]
        for row, letter in enumerate(answer)
    ]
    print(f'Total length of the dataset: {len(resolved)}')
    return resolved

In [4]:
def prompt_generation(article, question):
    """Build one text prompt per (passage, question) pair.

    Each prompt consists of a fixed instruction line, the passage, the
    question prefixed with ``Question: `` and a trailing ``Answer: `` cue
    left open for the model to complete.

    Args:
        article: Iterable of passage strings.
        question: Iterable of question strings, paired positionally with
            ``article``.

    Returns:
        A list of prompt strings. Pairing uses ``zip``, so the result is as
        long as the shorter of the two inputs.
    """
    instruction = 'Answer the following question based on the passage - \n'
    prompts = []
    for passage, query in zip(article, question):
        body = instruction + passage + '\n'
        body = body + 'Question: ' + query + '\n'
        prompts.append(body + 'Answer: ')
    return prompts

In [5]:
def get_y_hat(qn, ans):
    """Build the reference completion for a single question.

    Cloze-style questions mark the gap with underscores. For those, the
    answer is written into the gap so the reference reads as a full
    sentence; for ordinary questions the answer is returned unchanged.

    Each blank is treated as one unit: a run of consecutive underscores
    (e.g. ``___``), together with any whitespace directly in front of it, is
    replaced by a single space followed by ``ans``. This avoids repeating
    the answer once per underscore and avoids a double space when the
    question already has a space before the blank. A blank at the very
    start of the question gets no leading space.

    Args:
        qn: Question text.
        ans: Text of the correct answer.

    Returns:
        ``qn`` with every blank filled by ``ans`` if ``qn`` contains an
        underscore, otherwise ``ans``.
    """
    if '_' not in qn:
        return ans

    def _fill(match):
        # A callable replacement keeps backslashes in ``ans`` literal
        # instead of having them parsed as regex group references.
        return ans if match.start() == 0 else ' ' + ans

    return re.sub(r'\s*_+', _fill, qn)

def y_hat_generation(questions, answers):
    """Build the reference completion for every question.

    Args:
        questions: Iterable of question strings.
        answers: Sequence indexable by position; ``answers[i]`` is the
            correct answer text for the ``i``-th question.

    Returns:
        A list with one ``get_y_hat`` result per question, in input order.
    """
    references = []
    for position, question_text in enumerate(questions):
        references.append(get_y_hat(question_text, answers[position]))
    return references

In [6]:
# Load the 'all' configuration of the RACE dataset from the Hugging Face Hub.
# Integrity checks are skipped on purpose; `verification_mode="no_checks"` is
# the replacement for the deprecated `ignore_verifications=True` flag.
dataset = load_dataset("ehovy/race", 'all', verification_mode="no_checks")

Found cached dataset parquet (/home/harsh/.cache/huggingface/datasets/ehovy___parquet/all-53c1c61cb0110145/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# For each split, in order: pull the raw columns, resolve the answer letter to
# its option text, build the model prompt, and build the reference completion.

# Train split
train_options, train_answer, train_article, train_question = extract_data(dataset, 'train')
train_correct_answer = answer_engineering(train_answer, train_options)
train_prompt = prompt_generation(train_article, train_question)
train_y_hat = y_hat_generation(train_question, train_correct_answer)

# Test split
test_options, test_answer, test_article, test_question = extract_data(dataset, 'test')
test_correct_answer = answer_engineering(test_answer, test_options)
test_prompt = prompt_generation(test_article, test_question)
test_y_hat = y_hat_generation(test_question, test_correct_answer)

# Validation split
validation_options, validation_answer, validation_article, validation_question = extract_data(dataset, 'validation')
validation_correct_answer = answer_engineering(validation_answer, validation_options)
validation_prompt = prompt_generation(validation_article, validation_question)
validation_y_hat = y_hat_generation(validation_question, validation_correct_answer)

Total length of the dataset: 175732
Total length of the dataset: 9868
Total length of the dataset: 9774


In [8]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [9]:
# Checkpoint under evaluation. The tokenizer class has to match the model
# family, so change it together with `model_name`.
model_name = 'vicgalle/gpt2-alpaca-gpt4'
# Tokenizers have no device; the encoded tensors are moved to `device` where
# they are created, so no `device` argument is passed here.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# The end-of-sequence token doubles as the padding token for generation.
model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id).to(device)

In [10]:
# Pick a single training example (by index) to inspect by hand.
ex = 1
sequence, question, y_hat = train_prompt[ex], train_question[ex], train_y_hat[ex]

In [16]:
# Sampling is stochastic; seed it so re-running the cell gives the same text.
torch.manual_seed(0)

inputs = tokenizer.encode(sequence, return_tensors='pt').to(device)
prompt_length = inputs.shape[-1]

# Generate up to 50 tokens beyond the prompt, whatever the prompt length.
outputs = model.generate(inputs, max_new_tokens=50, do_sample=True, num_beams=2, no_repeat_ngram_size=2, early_stopping=True)

# Full decoded sequence (prompt + continuation), kept for inspection.
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# The returned sequence starts with the prompt tokens, so the prediction is
# whatever follows them. Slicing by token position stays correct even when the
# model itself writes another "\nAnswer:" inside its continuation, which
# splitting the decoded text on that marker would not.
y_pred = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [17]:
# Show the question, the reference completion and the model output, one per line.
print(f"Question: {question}", f"Correct: {y_hat}", f"Generated: {y_pred}", sep="\n")

Question: Many graduates today turn to cosmetic surgery to_.
Correct: Many graduates today turn to cosmetic surgery to get an advantage over others in job-hunting.
Generated:   It is not uncommon for graduates to choose to undergo cosmetic surgeries, often with the help of a family member. Some people may also opt for surgery on their own, without the knowledge or consent of their parents or guardians. The choice is up to the
