In [70]:
import torch
from transformers import AutoTokenizer, GPT2Tokenizer, GPT2LMHeadModel, T5Tokenizer, T5ForConditionalGeneration, pipeline, set_seed, PhrasalConstraint

class BaseLM(torch.nn.Module):
    """Wrapper around a pretrained GPT-2 or T5 checkpoint for text generation.

    For ``model="gpt2"`` the ``gpt2-xl`` checkpoint is loaded and a
    ``text-generation`` pipeline is built on top of it; for ``model="t5"`` the
    ``t5-small`` checkpoint is loaded and no pipeline is created.

    Args:
        model: Which model family to load, ``"gpt2"`` or ``"t5"``.
        seed: Value passed to ``transformers.set_seed``.
        max_len: Maximum number of new tokens to generate per call.
        num_returns: Number of sequences requested from the generator/model.
        num_beams: Beam width used for constrained generation.

    Raises:
        ValueError: If ``model`` is neither ``"gpt2"`` nor ``"t5"``.
    """

    def __init__(self, model:str = "gpt2", seed:int = 0, max_len:int = 15, num_returns:int = 1, num_beams:int = 5):
        super(BaseLM, self).__init__()
        if model == "gpt2":
            self.tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
            self.model = GPT2LMHeadModel.from_pretrained('gpt2-xl', pad_token_id=self.tokenizer.eos_token_id)
            # Build the pipeline around the already-loaded model and tokenizer
            # instead of passing the checkpoint name, which would load a second
            # full copy of the gpt2-xl weights.
            self.generator = pipeline('text-generation', model=self.model, tokenizer=self.tokenizer)
        elif model == "t5":
            self.tokenizer = T5Tokenizer.from_pretrained("t5-small")
            self.model = T5ForConditionalGeneration.from_pretrained("t5-small")
            self.generator = None
        else:
            raise ValueError(f"Model type ' {model} ' not supported. [BaseLM __init__()]")

        set_seed(seed)
        self.max_len = max_len
        self.beams = num_beams
        self.model_type = model
        self.num_returns = num_returns

    def decode(self, text:str, constrained:bool = False, concepts:list = None, use_beam=True):
        """Generate a continuation of ``text``.

        Args:
            text: Prompt to continue.
            constrained: When True, force every phrase in ``concepts`` to
                appear in the output via constrained beam search.
            concepts: Phrases that must appear when ``constrained`` is True.
                Defaults to no phrases.
            use_beam: Forwarded as ``do_sample`` for *unconstrained*
                generation (kept as-is for backward compatibility, despite the
                name). Ignored for constrained generation, which always runs
                beam search without sampling.

        Returns:
            The pipeline's raw output when a pipeline exists and
            ``constrained`` is False; otherwise the first generated sequence
            decoded to a string.

        Raises:
            ValueError: If ``constrained`` is True but ``concepts`` is empty.
        """
        # Avoid a shared mutable default; copy so the caller's list is untouched.
        concepts = [] if concepts is None else list(concepts)

        if constrained and not concepts:
            raise ValueError("Constrained decoding requires at least one concept. [BaseLM decode()]")

        if self.generator is not None:
            if not constrained:
                return self.generator(text, max_new_tokens=self.max_len, num_return_sequences=self.num_returns, do_sample=use_beam)

            print("Cannot perform constrained generation with generator. Generating manually.")

        inputs = self.tokenizer(text, return_tensors="pt")

        if constrained:
            constraints = [
                PhrasalConstraint(
                    self.tokenizer(token, add_special_tokens=False).input_ids
                )
                for token in concepts
            ]

            output = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                constraints=constraints,
                num_beams=self.beams,
                num_return_sequences=self.num_returns,
                no_repeat_ngram_size=1,
                remove_invalid_values=True,
                # Honour the configured length budget here as well; previously
                # only the unconstrained path applied max_len.
                max_new_tokens=self.max_len,
                # Constrained beam search cannot be combined with sampling.
                do_sample=False
            )
        else:
            output = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=self.max_len,
                do_sample=use_beam
            )

        # Only the first returned sequence is decoded, even if num_returns > 1.
        output_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
        return output_text

# lm = BaseLM(model="gpt2", max_len=100)
# print(lm.decode("What is the third planet from the sun?", constrained=True, concepts=["planet", "third", "sun"]))

In [2]:
import ast
import json

import numpy as np
import pandas as pd

In [3]:
# Instantiate the GPT-2 wrapper defined above.
gpt2 = BaseLM(model='gpt2')

# Read the development split. The CSV was written with its index as an
# unnamed leading column, which carries no information and is discarded.
dev = pd.read_csv('DEVsplit.csv').drop(columns=['Unnamed: 0'])

In [77]:
# Checkpoint used for the prompting experiments in the cells below.
GPT2_CHECKPOINT = "gpt2-medium"

tokenizer = AutoTokenizer.from_pretrained(GPT2_CHECKPOINT)
# The tokenizer's EOS token id is reused as the model's padding id.
model = GPT2LMHeadModel.from_pretrained(
    GPT2_CHECKPOINT,
    pad_token_id=tokenizer.eos_token_id,
)

Downloading (…)lve/main/config.json: 100%|██████████| 718/718 [00:00<00:00, 538kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 9.56MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 9.44MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 11.5MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.52G/1.52G [01:02<00:00, 24.5MB/s]
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 81.1kB/s]


In [90]:
# Standard few-shot prompting: six solved examples, each laid out as
# "Question / Choices / Answer", concatenated into one prefix string.
_FEW_SHOT_EXAMPLES = [
    (
        "The only baggage the woman checked was a drawstring bag, where was she heading with it?",
        "garbage can, military, jewelry store, safe, airport",
        "airport",
    ),
    (
        "Sammy wanted to go to where the people were. Where might he go?",
        "race track, populated areas, the desert, apartment, roadblock",
        "populated areas",
    ),
    (
        "To locate a choker not located in a jewelry box or boutique where would you go?",
        "jewelry store, neck, jewelry box, boutique, bedroom",
        "jewelry store",
    ),
    (
        "Google Maps and other highway and street GPS services have replaced what?",
        "united states, mexico, countryside, atlas, oceans",
        "atlas",
    ),
    (
        "The fox walked from the city into the forest, what was it looking for?",
        "pretty flowers, hen house, natural habitat, storybook, dense forest",
        "natural habitat",
    ),
    (
        "What home entertainment equipment requires cable?",
        "radio shack, substation, cabinet, television, desk",
        "television",
    ),
]

example_prompt = "".join(
    f"Question: {question}\nChoices: {choices}\nAnswer: {answer}\n"
    for question, choices, answer in _FEW_SHOT_EXAMPLES
)

In [1]:
# Number of dev questions to send to the model. The loop used to stop after
# the first question through an unconditional `break` (which also made the
# later `if i == 10` limit unreachable); raise this value to evaluate more rows.
MAX_QUESTIONS = 1

# Upper bound on the number of tokens generated after the prompt.
MAX_ANSWER_TOKENS = 50


def parse_choices(choices_str):
    """Parse the serialized ``question.choices`` cell into a list of dicts.

    The column holds a single-quoted, Python-repr style list of
    ``{'label': ..., 'text': ...}`` dicts. ``ast.literal_eval`` reads that
    directly, including texts that contain apostrophes or double quotes, so no
    manual quote rewriting is needed. Strict JSON input is accepted as a
    fallback.

    Raises:
        ValueError: If the string is neither a Python literal nor valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    try:
        return ast.literal_eval(choices_str)
    except (ValueError, SyntaxError):
        return json.loads(choices_str)


def build_prompt(question_stem, choices, few_shot=""):
    """Return the few-shot prefix followed by one unanswered question.

    The appended question looks like this (labels are deliberately omitted):

        Question: <stem>
        Choices: <text 1>, <text 2>, ..., <text n>
        Answer:
    """
    choice_texts = ", ".join(choice['text'] for choice in choices)
    return f"{few_shot}Question: {question_stem}\nChoices: {choice_texts}\nAnswer:"


def extract_answer(completion):
    """Return the first non-blank line of the generated continuation, stripped."""
    for line in completion.splitlines():
        if line.strip():
            return line.strip()
    return ''


# Keep track of model's answers (one slot per dev row; unanswered rows stay '')
answers = ['' for _ in range(dev.shape[0])]

# Query the model for each of its answers
for i, (_, row) in enumerate(dev.head(MAX_QUESTIONS).iterrows()):
    prompt = build_prompt(
        row['question.stem'],
        parse_choices(row['question.choices']),
        few_shot=example_prompt,
    )

    print('-'*100)
    print(prompt)

    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    attention_mask = torch.ones_like(input_ids)

    output_ids = model.generate(input_ids,
                                attention_mask=attention_mask,
                                # Pad with EOS, matching how the model was
                                # loaded, rather than the tokenizer's own
                                # pad_token_id.
                                pad_token_id=tokenizer.eos_token_id,
                                min_length=1,
                                max_length=input_ids.shape[1] + MAX_ANSWER_TOKENS,
                                num_beams=10,
                                top_k=100,
                                top_p=0.90,
                                early_stopping=True,
                                do_sample=True)

    # Skip the first input_ids.shape[1] tokens of the returned sequence (the
    # prompt, echoed back by generate for this decoder-only model) so that only
    # the newly generated text is decoded.
    completion = tokenizer.decode(output_ids[0][input_ids.shape[1]:],
                                  skip_special_tokens=True)
    answer = extract_answer(completion)
    answers[i] = answer

    print('-'*100)
    print(completion)
    print(f"Extracted answer: {answer}")
    print()

NameError: name 'dev' is not defined