In [16]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, GPT2Model, GPT2Tokenizer, GPT2LMHeadModel

In [2]:
def extract_data(dataset, mode):
  """Pull the four parallel columns of one split out of the dataset.

  Args:
    dataset: mapping from split name to a mapping of column name -> values.
    mode: key of the split to read (the notebook uses 'train', 'test'
      and 'validation').

  Returns:
    Tuple ``(options, answer, article, question)`` holding whatever is stored
    under the 'options', 'answer', 'article' and 'question' keys of the split.
  """
  split = dataset[mode]
  # Order matters: callers unpack the result positionally.
  columns = ('options', 'answer', 'article', 'question')
  return tuple(split[column] for column in columns)


In [3]:
def answer_engineering(answer, options):
  """Translate answer letters into the text of the chosen option.

  Args:
    answer: sequence of option letters ('A'-'E'), one per example.
    options: sequence aligned with ``answer``; ``options[i]`` holds the
      candidate answers of example ``i`` in letter order.

  Returns:
    List with, for each example, the entry of ``options[i]`` selected by
    ``answer[i]``.

  Raises:
    KeyError: if a letter is not one of 'A'-'E'.
    IndexError: if the letter points past the end of ``options[i]`` or
      ``options`` is shorter than ``answer``.

  Side effect: prints the number of resolved answers.
  """
  # 'A' -> 0, 'B' -> 1, ... 'E' -> 4
  letter_to_position = dict(zip('ABCDE', range(5)))
  resolved = [
      options[row][letter_to_position[letter]]
      for row, letter in enumerate(answer)
  ]
  print(f'Total length of the dataset: {len(resolved)}')
  return resolved

In [6]:
def prompt_generation(article, question):
  """Build one question-answering prompt per (passage, question) pair.

  Args:
    article: iterable of passage strings.
    question: iterable of question strings, paired positionally with
      ``article`` (pairing stops at the shorter of the two).

  Returns:
    List of prompt strings made of an instruction line, the passage, a
    'Question: ' line and a trailing 'Answer: ' cue for the model to complete.
  """
  header = 'Answer the following question based on the passage - \n'
  complete_prompt = []
  for passage, query in zip(article, question):
    prompt = header + passage + '\n' + 'Question: ' + query + '\n' + 'Answer: '
    complete_prompt.append(prompt)
  return complete_prompt

In [8]:
# Loading dataset
# `ignore_verifications=True` was deprecated in `datasets` 2.9.1 and removed in 3.0;
# `verification_mode="no_checks"` is its replacement and likewise skips the
# checksum / split-size verification of the downloaded files.
dataset = load_dataset("ehovy/race", 'all', verification_mode="no_checks")

# Extracting the dataset specifics
# Each call returns the (options, answer, article, question) columns of one split.
train_options, train_answer, train_article, train_question = extract_data(dataset, 'train')
test_options, test_answer, test_article, test_question = extract_data(dataset, 'test')
validation_options, validation_answer, validation_article, validation_question = extract_data(dataset, 'validation')

# Getting the correct answers
# Maps each answer letter to the text of the option it selects (prints the split size).
train_correct_answer = answer_engineering(train_answer, train_options)
test_correct_answer = answer_engineering(test_answer, test_options)
validation_correct_answer = answer_engineering(validation_answer, validation_options)

# Creating the prompts
# One prompt string per (article, question) pair, ending in 'Answer: '.
train_prompt = prompt_generation(train_article, train_question)
test_prompt = prompt_generation(test_article, test_question)
validation_prompt = prompt_generation(validation_article, validation_question)

Found cached dataset parquet (/home/harsh/.cache/huggingface/datasets/ehovy___parquet/all-53c1c61cb0110145/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

Total length of the dataset: 175732
Total length of the dataset: 9868
Total length of the dataset: 9774


In [9]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"
device

'cuda:0'

In [10]:
# Display the first training prompt to eyeball the template built by prompt_generation.
train_prompt[0]

'Answer the following question based on the passage - \nLast week I talked with some of my students about what they wanted to do after they graduated, and what kind of job prospects  they thought they had.\nGiven that I teach students who are training to be doctors, I was surprised do find that most thought that they would not be able to get the jobs they wanted without "outside help". "What kind of help is that?" I asked, expecting them to tell me that they would need a   or family friend to help them out.\n"Surgery ," one replied.\nI was pretty alarmed by that response. It seems that the graduates of today are increasingly willing to go under the knife to get ahead of others when it comes to getting a job .\nOne girl told me that she was considering surgery to increase her height. "They break your legs, put in special extending screws, and slowly expand the gap between the two ends of the bone as it re-grows, you can get at least 5 cm taller!"\nAt that point, I was shocked. I am shor

In [23]:
# Hugging Face Hub checkpoint used for generation.
model_name = 'vicgalle/gpt2-alpaca-gpt4'
# Tokenizers produce plain ids on the CPU and have no `device` option, so none is
# passed here; tensors are moved to `device` at encode time and only the model is
# placed on the device.
tokenizer = GPT2Tokenizer.from_pretrained(model_name) # need to change tokenizer based off model
# Reuse the EOS id as the pad id so generate() has a padding token to work with.
model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id).to(device)

In [28]:
# Take the first training example: `sequence` is the prompt fed to the model and
# `y_hat` is its reference (correct) answer text, not a model prediction.
sequence, y_hat = train_prompt[0], train_correct_answer[0]

In [30]:
# Number of tokens the model may append to the prompt.
max_new_tokens = 10

# Sampling is enabled below, so fix the RNG to make this cell reproducible on re-run.
torch.manual_seed(0)

inputs = tokenizer.encode(sequence, return_tensors='pt').to(device)
# GPT-2 has a fixed number of position embeddings; a longer prompt would fail inside
# generate(). Keep the tail of the prompt (question + 'Answer: ' cue) and leave room
# for the new tokens. Prompts that already fit are unchanged.
max_prompt_tokens = model.config.n_positions - max_new_tokens
inputs = inputs[:, -max_prompt_tokens:]
# Single unpadded sequence -> every position is a real token. Passing the mask
# explicitly matters because pad_token_id == eos_token_id, so generate() cannot infer it.
attention_mask = torch.ones_like(inputs)

outputs = model.generate(
    inputs,
    attention_mask=attention_mask,
    max_new_tokens=max_new_tokens,  # same budget as the former max_length=len(prompt)+10
    do_sample=True,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)
# Full decoded text (prompt + continuation), kept for inspection.
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Decode only the newly generated tokens instead of splitting the text on
# '\nAnswer:' (which breaks if the model itself emits that marker), and strip the
# surrounding whitespace so the prediction is directly comparable to the reference.
prompt_length = inputs.shape[1]
y_pred = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

In [32]:
# Show the reference answer next to the model's generation for a quick manual comparison.
print("Correct: {}\nGenerated: {}".format(y_hat, y_pred))

Correct: teacher
Generated:   The author is not a doctor. He/She
