In [1]:
%%capture
pip install transformers

In [2]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import numpy as np

# BERT MLM

In [3]:
bert = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")
bert.eval()
tokenizer  = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [4]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [5]:
masked_text = "paris is the [MASK] of france."
tokenized_masked_text = tokenizer(masked_text, return_tensors="pt")["input_ids"]
print("Text input:")
print(tokenizer.decode(tokenized_masked_text[0], skip_special_tokens=False))

Text input:
[CLS] paris is the [MASK] of france. [SEP]


In [6]:
tokenizer.mask_token_id

103

In [7]:
tokenized_masked_text

tensor([[ 101, 3000, 2003, 1996,  103, 1997, 2605, 1012,  102]])

In [8]:
output = bert(tokenized_masked_text)
logits = output.logits


In [9]:
logits.shape

torch.Size([1, 9, 30522])

In [10]:
def print_argtopk(logits, tokenizer, id, k=1):
    topk = torch.topk(logits, axis=-1, k=k, sorted=True)
    indices = topk.indices
    values = topk.values

    for k in range(k):
        print(tokenizer.decode(indices[0, id:id+1, k]), np.round(values[0, id, k].item(), 3))

In [11]:
print_argtopk(logits.detach(), tokenizer, 4, k=5)

capital 17.351
birthplace 11.668
northernmost 10.505
centre 10.466
southernmost 10.214


# BERT Next sentence prediction

In [12]:
from transformers import BertForNextSentencePrediction

In [13]:
bert = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
bert.eval();


In [14]:
s1 = "paris is the capital of france."
s2 = "the malayan tiger is native to peninsular malaysia."

encoded_sentences = tokenizer(s1, s2, return_token_type_ids=True, return_tensors="pt")
print(encoded_sentences)

{'input_ids': tensor([[  101,  3000,  2003,  1996,  3007,  1997,  2605,  1012,   102,  1996,
         19979,  2078,  6816,  2003,  3128,  2000, 22682,  6027,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [15]:
out = bert(**encoded_sentences)

In [16]:
out.logits

tensor([[-3.4455,  6.6671]], grad_fn=<AddmmBackward0>)

In [17]:
s1 = "the dog is barking."
s2 = "he saw a cat."

encoded_sentences = tokenizer(s1, s2, return_token_type_ids=True, return_tensors="pt")
out = bert(**encoded_sentences)
out.logits


tensor([[ 3.8689, -2.8081]], grad_fn=<AddmmBackward0>)

# GPT-2

In [18]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tabulate import tabulate

In [19]:
gpt = GPT2LMHeadModel.from_pretrained("gpt2-medium")
gpt.eval();

In [20]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

In [21]:
text = "Paris is the capital of"
tokenized_text = tokenizer.encode(text)
print(tokenizer.decode(tokenized_text, add_special_tokens=True))
print(tokenizer.tokenize(text))

Paris is the capital of
['Paris', 'Ġis', 'Ġthe', 'Ġcapital', 'Ġof']


In [22]:
text = "Paris is the capital of"
tokenized_text = tokenizer.encode(text)
input_tensor = torch.tensor([tokenizer.bos_token_id] + tokenized_text)[None, :]
with torch.no_grad():
    out = gpt(input_tensor)
logits = out.logits.detach().numpy()
logits_sorted = np.argsort(logits[0], axis=-1)
top_5_logits = logits_sorted[:, -5:][:, ::-1]


In [23]:
logits.shape

(1, 6, 50257)

In [24]:
top_5_logits.shape

(6, 5)

In [25]:
tokenizer.batch_decode(top_5_logits)

['TheAThisInIt',
 ' (, Saint:-',
 ' the a set to home',
 ' capital most world city latest',
 ' of city and, for',
 ' France the Europe French Paris']

In [26]:
list_tokens = []
input_tokens = [tokenizer.bos_token] + tokenizer.tokenize(text)
for i in range(top_5_logits.shape[0]):
    tokens = [input_tokens[i]] + [tokenizer.decode(tok) for tok in top_5_logits[i]]
    tokens = [t.replace("Ġ", " ") for t in tokens]
    list_tokens.append(tokens)


In [27]:
table = np.array(list_tokens)
print("Encoded inputs:")
print(tabulate(table, headers=["Input tokens"] + [f"Top {i}" for i in range(1, 6)], tablefmt="fancy_grid"))

Encoded inputs:
╒════════════════╤═════════╤═════════╤═════════╤═════════╤═════════╕
│ Input tokens   │ Top 1   │ Top 2   │ Top 3   │ Top 4   │ Top 5   │
╞════════════════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ <|endoftext|>  │ The     │ A       │ This    │ In      │ It      │
├────────────────┼─────────┼─────────┼─────────┼─────────┼─────────┤
│ Paris          │ (       │ ,       │ Saint   │ :       │ -       │
├────────────────┼─────────┼─────────┼─────────┼─────────┼─────────┤
│ is             │ the     │ a       │ set     │ to      │ home    │
├────────────────┼─────────┼─────────┼─────────┼─────────┼─────────┤
│ the            │ capital │ most    │ world   │ city    │ latest  │
├────────────────┼─────────┼─────────┼─────────┼─────────┼─────────┤
│ capital        │ of      │ city    │ and     │ ,       │ for     │
├────────────────┼─────────┼─────────┼─────────┼─────────┼─────────┤
│ of             │ France  │ the     │ Europe  │ French  │ Paris   │
╘════════════════╧

## Encoder-decoder

In [28]:
from transformers import BartForConditionalGeneration, BartTokenizer

In [29]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")



In [30]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [31]:
text = "A mysterious <mask> is located in Britany."
tokenized_text = tokenizer.encode(text, return_tensors="pt")
print(tokenizer.decode(tokenized_text[0]))


<s>A mysterious<mask> is located in Britany.</s>


In [32]:
model.eval();

In [33]:
generated_ids = model.generate(tokenized_text, do_sample=True, max_new_tokens=20)
print(tokenizer.decode(generated_ids[0]))

</s><s>A mysterious lake in France is located in Britany.</s>


## Inference

### Greedy decoding

In [34]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [35]:
gpt = GPT2LMHeadModel.from_pretrained("gpt2-large")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
gpt.eval();

In [36]:
text = "Paris is the capital of"
tokenized_text = tokenizer.encode(text)
input_tensor = torch.tensor([tokenizer.bos_token_id] + tokenized_text)[None, :]

In [37]:
output_tokens = []
gpt.eval()
input_sentence = text
print("Input text:", input_sentence)
input_ids = input_tensor
print("Input tensor:", input_tensor)
with torch.no_grad():
    for i in range(10):
        logits = gpt(input_ids).logits
        next_token = logits[0, -1].argmax()
        input_ids = torch.cat((input_ids, torch.tensor([next_token])[None, :]), dim=-1)
        print("Current text:", tokenizer.decode(input_ids[0]))

Input text: Paris is the capital of
Input tensor: tensor([[50256, 40313,   318,   262,  3139,   286]])
Current text: <|endoftext|>Paris is the capital of France
Current text: <|endoftext|>Paris is the capital of France,
Current text: <|endoftext|>Paris is the capital of France, and
Current text: <|endoftext|>Paris is the capital of France, and the
Current text: <|endoftext|>Paris is the capital of France, and the capital
Current text: <|endoftext|>Paris is the capital of France, and the capital of
Current text: <|endoftext|>Paris is the capital of France, and the capital of the
Current text: <|endoftext|>Paris is the capital of France, and the capital of the European
Current text: <|endoftext|>Paris is the capital of France, and the capital of the European Union
Current text: <|endoftext|>Paris is the capital of France, and the capital of the European Union.


In [38]:
generation_output = gpt.generate(input_ids=input_tensor, do_sample=False, num_beams=1, max_new_tokens=10)
tokenizer.decode(generation_output[0])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'<|endoftext|>Paris is the capital of France, and the capital of the European Union.'