In [1]:
from pathlib import Path

import sys
import os

# Resolve the sibling ``scripts`` directory (one level above the current
# working directory) so the project-local ``utils`` module can be imported.
scripts_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'scripts'))

# Only register the directory once, so re-running this cell does not keep
# growing ``sys.path``.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from utils import *

In [2]:
# Directory of the sample conversation used by this notebook.
candor_convo_path = '../data/candor/sample/0020a0c5-1658-4747-99c1-2839e736b481/'

# Checkpoint directory for every (context_direction, context_size) combination.
models = dict([
    (('left', 'sentence'), '../models/gpt2/left_sentence/checkpoint-76066'),
    (('right', 'sentence'), '../models/gpt2/right_sentence/checkpoint-76066'),
    (('bidi', 'sentence'), '../models/gpt2/bidi_sentence/checkpoint-76066'),
    (('left', 'bigram'), '../models/gpt2/left_bigram/checkpoint-63819'),
    (('right', 'bigram'), '../models/gpt2/right_bigram/checkpoint-63819'),
    (('bidi', 'bigram'), '../models/gpt2/bidi_bigram/checkpoint-100000'),
])

# Pick the variant to load; both values are also passed to the tokenizer later.
context_direction = 'left'
context_size = 'sentence'
model_dir = models[context_direction, context_size]

In [3]:
# Promote the conversation directory from a plain string to a ``Path``.
candor_convo_path = Path(candor_convo_path)

# Load the fine-tuned checkpoint selected in the configuration cell, then the
# matching tokenizer (both helpers come from the project-local ``utils``).
model = load_pretrained_model(model_dir)
tokenizer = load_pretrained_tokenizer(
    'gpt2',
    add_prefix_space=True,
    context_direction=context_direction,
    context_size=context_size,
)

# Size the embedding matrix to the tokenizer's vocabulary; as the last
# expression of the cell, the resulting embedding layer is displayed.
model.resize_token_embeddings(len(tokenizer))

Loading pretrained model from ../models/gpt2/left_sentence/checkpoint-76066...
...done

Loading pretrained tokenizer from gpt2...
Vocabulary size: 50257
Max Model Input Sizes: 1024
Special tokens: ['[BOS]', '[EOS]', '<|endoftext|>']
...done



Embedding(50259, 768)

In [40]:
# Prompt as a list of words, led by the '[BOS]' special token.
input_text = ['[BOS]', *"The best thing".split(' ')]

# The prompt is already split into words, hence ``is_split_into_words=True``;
# PyTorch tensors are requested so the result can be fed to the model.
inputs = tokenizer(input_text, return_tensors='pt', is_split_into_words=True)
inputs

{'input_ids': tensor([[50257,   383,  1266,  1517]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [41]:
# Default (non-sampling) continuation of the prompt, capped at 50 tokens in
# total (prompt included).
#
# ``attention_mask`` and ``pad_token_id`` are passed explicitly: without them
# ``generate`` warns that results may be unreliable and silently falls back to
# using the model's EOS id as the pad id. Here we hand over the mask the
# tokenizer already produced and pin the pad id to that same fallback value,
# so the generated tokens are unchanged but the call is explicit and quiet.
# NOTE(review): the tokenizer also defines an '[EOS]' special token; confirm
# whether generation should stop on that token instead of the model default.
outputs = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    pad_token_id=model.generation_config.eos_token_id,
    max_length=50,
    num_return_sequences=1,
)
print(outputs)

# Keep special tokens in the decoded text so the '[BOS]' prefix stays visible.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
generated_text

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257,   383,  1266,  1517,   546,   340,   318,   326,   262,   661,
           508,   389,   287,   262,  2422,   389,   407,  1016,   284,   307,
          1498,   284,   466,   326,    13,   366,   290,   366,   383,    12,
            54,   861, 16288,   366,   318,  1016,   284,   307,   257,   845,
           922,   530,    13,   366,   290,   366,   383,   968,  1971,  3782]])


'[BOS] The best thing about it is that the people who are in the military are not going to be able to do that. " and " The-Wertheimer " is going to be a very good one. " and " The New York Times'

In [42]:
from transformers import set_seed

In [43]:
# Fix the random seed so the sampled generation below is repeatable when this
# cell is run immediately before it.
set_seed(seed=2)

In [44]:
# Sampled continuation of the prompt: up to 40 new tokens, drawing each token
# from the 50 most likely candidates (top-k sampling). Temperature is not
# overridden.
#
# ``attention_mask`` and ``pad_token_id`` are passed explicitly: without them
# ``generate`` warns that results may be unreliable and silently falls back to
# using the model's EOS id as the pad id. The mask comes straight from the
# tokenizer output and the pad id is pinned to that same fallback value, so
# sampling behaviour is unchanged.
outputs = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    pad_token_id=model.generation_config.eos_token_id,
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
)

# Strip special tokens (e.g. '[BOS]') from the decoded text.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
generated_text

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' The best thing I can do to help ourselves, the most difficult thing to do is to get out there and get an investigation into the case. " I\'m, I\'ve been looking for some time with any evidence'