In [1]:
import sys
from transformers import AutoTokenizer, AutoModelWithLMHead, top_k_top_p_filtering, GPT2Model, GPT2Config, GPT2Tokenizer
from torchvision  import torch
from torch import nn

In [11]:
# Import Hugging Face transformer models if not already installed

if 'transformers' not in sys.modules:
    !pip install transformers;
    import transformers

[This notebook](https://github.com/huggingface/notebooks/blob/master/transformers_doc/task_summary.ipynb), in the section 'Masked Language Models', has a really helpful guide.

## Investigating the Process of Using the BERT Model

In [18]:
# AutoTokenizer is a method included in the transformers module
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# More info about autoModelWithLMHead https://huggingface.c/transformers/v3.0.2/model_doc/auto.html#automodelwithlmhead
bert_model = AutoModelWithLMHead.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
# We write a sentence with a masked word
# The masked word, for BERT, has to be [MASK]
# This can be found by running 'bert_tokenizer.mask_token'

sentence = "The doctor walked across the hospital on [MASK] own"

In [42]:
model_input = bert_tokenizer.encode(sentence, return_tensors="pt")
model_input

tensor([[ 101, 1996, 3460, 2939, 2408, 1996, 2902, 2006,  103, 2219,  102]])

In [43]:
mask_token_index = torch.where(model_input == bert_tokenizer.mask_token_id)[1]

In [47]:
bert_token_logits = bert_model(model_input)[0]

In [51]:
bert_mask_token_logits = bert_token_logits[0, mask_token_index, :]
bert_mask_token_logits

In [54]:
bert_top_token = torch.topk(bert_mask_token_logits, 1, dim=1).indices[0].tolist()

In [59]:
print(sentence.replace(bert_tokenizer.mask_token, bert_tokenizer.decode([bert_top_token[0]])))

The doctor walked across the hospital on his own


## Automating the use of the BERT Model

In [60]:
def find_masked_token_using_bert(sentence):
    
    # Creating the Model
    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    bert_model = AutoModelWithLMHead.from_pretrained("bert-base-uncased")
    
    # Passing the sentence to the BERT Tokenizer
    model_input = bert_tokenizer.encode(sentence, return_tensors="pt")
    
    # Processing the input
    mask_token_index = torch.where(model_input == bert_tokenizer.mask_token_id)[1]
    bert_token_logits = bert_model(model_input)[0]
    bert_mask_token_logits = bert_token_logits[0, mask_token_index, :]
    
    # Finding the top most likely token
    bert_top_token = torch.topk(bert_mask_token_logits, 1, dim=1).indices[0].tolist()
    
    return sentence.replace(bert_tokenizer.mask_token, bert_tokenizer.decode([bert_top_token[0]]))

In [61]:
find_masked_token_using_bert(sentence)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'The doctor walked across the hospital on his own'

This method is not efficient as the bert model is being created for each sentence that will be passed, this might work better if the function only accounts for sentence-specific processing.

In [64]:
def find_masked_token(sentence, tokenizer, model):
    
    # Passing the sentence to the BERT Tokenizer
    model_input = tokenizer.encode(sentence, return_tensors="pt")
    
    # Processing the input
    mask_token_index = torch.where(model_input == tokenizer.mask_token_id)[1]
    token_logits = model(model_input)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]
    
    # Finding the top most likely token
    top_token = torch.topk(mask_token_logits, 1, dim=1).indices[0].tolist()
    
    return sentence.replace(tokenizer.mask_token, tokenizer.decode([top_token[0]]))

In [62]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelWithLMHead.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [66]:
sentences = [
    'The doctor walked across the hospital on [MASK] own',
    'The nurse walked across the hospital on [MASK] own',
    'The doctor walked across the maternity hospital on [MASK] own',
    'The nurse walked across the battlefield on [MASK] own',
]

In [68]:
for sentence in sentences:
    print(find_masked_token(sentence, tokenizer, model))

The doctor walked across the hospital on his own
The nurse walked across the hospital on her own
The doctor walked across the maternity hospital on his own
The nurse walked across the battlefield on her own


## Investigating Usage of GPT-2 Model

In [69]:
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2") 

HBox(children=(IntProgress(value=0, description='Downloading', max=665, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=1042301, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=1355256, style=ProgressStyle(description_wi…




In [70]:
gpt2_model = AutoModelWithLMHead.from_pretrained("gpt2")

HBox(children=(IntProgress(value=0, description='Downloading', max=548118077, style=ProgressStyle(description_…




In [75]:
gpt2_tokenizer

PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'mask_token': '[MASK]'})

In [76]:
find_masked_token("I want a big", gpt2_tokenizer, gpt2_model)

IndexError: index 0 is out of bounds for dimension 0 with size 0

In [159]:
sequence = "I want to buy butter and"

In [160]:
# Passing the sentence to the GPT Tokenizer
model_input = tokenizer(sequence, return_tensors="pt")

In [161]:
input_ids = model_input["input_ids"]

In [162]:
# get logits of last hidden state
next_token_logits = gpt2_model(**model_input).logits[:, -1, :]

In [163]:
# filter
filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)

In [164]:
# sample
probs = nn.functional.softmax(filtered_next_token_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)

generated = torch.cat([input_ids, next_token], dim=-1)

resulting_string = tokenizer.decode(generated.tolist()[0])
print(resulting_string)

[CLS] i want to buy butter and [SEP] [PAD]


## Retrying GPT-2 Model

In [166]:
# Initializing a GPT2 configuration
configuration = GPT2Config()

# Initializing a model from the configuration
model = GPT2Model(configuration)

# Accessing the model configuration
configuration = model.config

In [169]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [170]:
model = GPT2Model.from_pretrained('gpt2')

In [175]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

In [176]:
logits = outputs.logits

AttributeError: 'BaseModelOutputWithPastAndCrossAttentions' object has no attribute 'logits'

## Attempt 3
[Useful link](https://huggingface.co/gpt2)

In [177]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, I'm writing a new language for you. But first, I'd like to tell you about the language itself"},
 {'generated_text': "Hello, I'm a language model, and I'm trying to be as expressive as possible. In order to be expressive, it is necessary to know"},
 {'generated_text': "Hello, I'm a language model, so I don't get much of a license anymore, but I'm probably more familiar with other languages on that"},
 {'generated_text': "Hello, I'm a language model, a functional model... It's not me, it's me!\n\nI won't bore you with how"},
 {'generated_text': "Hello, I'm a language model, not an object model.\n\nIn a nutshell, I need to give language model a set of properties that"}]

In [178]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [185]:
tokenizer.decode(output[0][0][0])

TypeError: sequence item 64: expected str instance, NoneType found

# Again

In [2]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer, tf_top_k_top_p_filtering
import tensorflow as tf

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = TFAutoModelForCausalLM.from_pretrained("gpt2")

sequence = f"Hugging Face is based in DUMBO, New York City, and"

inputs = tokenizer(sequence, return_tensors="tf")
input_ids = inputs["input_ids"]

# get logits of last hidden state
next_token_logits = model(**inputs).logits[:, -1, :]

# filter
filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)

# sample
next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1)

generated = tf.concat([input_ids, next_token], axis=1)

resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
print(resulting_string)

HBox(children=(IntProgress(value=0, description='Downloading', max=497933648, style=ProgressStyle(description_…




All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Hugging Face is based in DUMBO, New York City, and is


In [8]:
def find_masked_token_gpt(sentence, tokenizer, model):
    
    inputs = tokenizer(sentence, return_tensors="tf")
    input_ids = inputs["input_ids"]

    # get logits of last hidden state
    next_token_logits = model(**inputs).logits[:, -1, :]

    # filter
    filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)

    # sample
    next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1)

    generated = tf.concat([input_ids, next_token], axis=1)

    resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
    return resulting_string

In [18]:
find_masked_token_gpt("I will buy dogs and", tokenizer, model)

'I will buy dogs and cats'