# Basic BERT operations


In [1]:
!pip3 -q install datasets transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/471.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m471.0/471.6 kB[0m [31m22.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import transformers
import datasets
import torch

In [3]:
tokenizer=transformers.AutoTokenizer.from_pretrained("bert-base-cased") #you can also use the trusty "TurkuNLP/bert-base-finnish-cased-v1"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [4]:
# We will be running the model directly, so let's use return_tensors="pt" to get torch tensors rather than Python lists
texts=["Dogs like to [MASK] cats. They taste good.","Bad joke!"]
t=tokenizer(texts,padding=True, truncation=True, return_tensors="pt")
print("Input ids",t["input_ids"])
print("Token type ids",t["token_type_ids"])
print("Attention mask",t["attention_mask"])

Input ids tensor([[  101, 16406,  1176,  1106,   103, 11771,   119,  1220,  5080,  1363,
           119,   102],
        [  101,  6304,  8155,   106,   102,     0,     0,     0,     0,     0,
             0,     0]])
Token type ids tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Attention mask tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])


In [5]:
# This is what the first sequence looks like
tokenizer.decode(t["input_ids"][0])

'[CLS] Dogs like to [MASK] cats. They taste good. [SEP]'

# BERT: bare model
* How to use the bare model
* What does it give us?

In [6]:
bert=transformers.AutoModel.from_pretrained("bert-base-cased") #"TurkuNLP/bert-base-finnish-cased-v1" if you run this in Finnish


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

* in torch the model's forward() function tends to be mapped to `__call__()` i.e. it is used when you call the model as if it were a function


In [7]:
bert_out=bert(
    input_ids=t["input_ids"],
    attention_mask=t["attention_mask"],
    token_type_ids=t["token_type_ids"])
#an easy way to say the above would be bert(**t)


that's it, this is how you call BERT, now let's see what it gave us (not hard to figure out it is really a dictionary)

In [8]:
bert_out.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

* last_hidden_state: the last layer of the encoder
* pooler_output: the `tanh` layer on top of `[CLS]`

In [9]:
# Before you run this, stop to think:
# What will the shape be? How many dimensions? 1? 2? 3? more? And their approximate sizes?
# make a guess, see if it matches
bert_out.last_hidden_state.shape

torch.Size([2, 12, 768])

In [10]:
# And here? What will the shape be?
bert_out.pooler_output.shape

torch.Size([2, 768])

# BERT: masked language modelling output

* Not much we can do with the above
* But BERT is trained to predict masked words, let's try!

In [11]:
# Have a look at HuggingFace automodels documentation to see what types of automodels there are
bert=transformers.AutoModelForPreTraining.from_pretrained("bert-base-cased")

In [12]:
# Tell the model it is not really being trained (disables dropout for example)
# I do not think this is needed but am playing it safe, the docs say it is put to eval mode upon load: https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained.config
bert=bert.eval()

Now we can again run the model, and we will see the output is quite different!

In [13]:
bert_out=bert(**t)
bert_out.keys()

odict_keys(['prediction_logits', 'seq_relationship_logits'])

In [14]:
# What are these? https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#transformers.BertForPreTraining
#What do you think these shapes will be?
print("Logits",bert_out["prediction_logits"].shape)
print("Seq relationship logits",bert_out["seq_relationship_logits"].shape)

Logits torch.Size([2, 12, 28996])
Seq relationship logits torch.Size([2, 2])


In [15]:
#cross-check
tokenizer.vocab_size

28996

...now let's see how well this works for the masked word prediction...
* we need to find the most likely predicted words
* which can be achieved by arg-sorting the predictions and picking top N words
* this is easy and we have done this kind of stuff before
* now let's try straight in torch without a roundtrip to numpy

In [16]:
predictions = bert_out["prediction_logits"]
print(predictions.shape)
top20=torch.argsort(predictions,dim=2,descending=True)[:,:,:20] #why dim=2? what does [:,:,:20] do?
print(top20)

torch.Size([2, 12, 28996])
tensor([[[  119,   117,   107,   114,  1103,  1105,   136,  1104,  1106,   118,
           1107,  1116,   170,   112,  1108,   113,   146,  1122,  1115,   188],
         [  119,   107,   117,  1103,   132,   114,  1105,  1104,  1106,   136,
            112,   118,   170,  1107,   146,  1108,   113,  1109,  1112,   188],
         [ 1176,  1567,  3851,  4819,  1328,  9353,  2409,  6613,  5548,  3097,
           2037,  3940,  1920, 13054,  1329,  1132,  7871, 20662,  1215,  7407],
         [ 1106, 27629,  1128,  1152,  1103,  1195,   146,  1122,  1136,   170,
           1115,  1706,  1105,  1143,   117,  1184,  6513,  1315,   189,  1505],
         [ 3940,  9839,  1138, 11109,  1267,  8263,  2824,  2311, 13671,  1505,
           1712,  1243,  4877,  4176,  1129,  2147, 19676,  1176,  3963,  3644],
         [11771,  5855, 17408,  3551,  1172, 23463,  6363,  1122,  8892,  1234,
          14986, 11260, 12237, 25164,  1128,  4067, 21235, 13475,  1152,   117],
       

In [17]:
print(texts[0])

print("Guesses:",tokenizer.decode(top20[0,4]))

Dogs like to [MASK] cats. They taste good.
Guesses: eat chase have pet see hunt watch kill scare play keep get feed ride be fight lick like catch avoid


# ...in one block...

In [18]:
texts=["Dogs like to [MASK] cats. They are cute."]
t=tokenizer(texts,padding=True, truncation=True, return_tensors="pt")
bert_out=bert(**t)
top20=torch.argsort(bert_out["prediction_logits"],dim=2,descending=True)[:,:,:20]
print("Guesses:",tokenizer.decode(top20[0,4]))

Guesses: have eat chase see pet keep play watch get scare be hunt ride like visit kill feed fight lick catch


In [19]:
print(t)
print(tokenizer.mask_token_id)

{'input_ids': tensor([[  101, 16406,  1176,  1106,   103, 11771,   119,  1220,  1132, 10509,
           119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
103


# TASKS

As an exercise, you can try to solve the following:

1. How good is BERT at the masked language modelling (MLM) task? Feed random texts e.g. from the IMDB dataset, mask a random token at a time, and check: did BERT predict it correctly?
2. If you did (1), can you answer did BERT predict it correctly in top-5?
3. Try can you do better. Make yourself a program which picks random texts from one of the datasets we used in this course and produces two files: one with segments of texts with one [MASK] and one with the correct answers. Then try to guess the words without looking at the latter file and then compare your answers with the correct ones. How well did you do?


In [20]:
dataset = datasets.load_dataset('imdb')

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [43]:
import random

random.seed(1337)

tuples = []

mask = "[MASK]"

for i in range(100):
  random_text = random.choice(dataset["train"]["text"])
  sentences_of_text = random_text.split(".")
  first_sentence = sentences_of_text[0]
  first_as_list = first_sentence.split(" ")

  if len(first_as_list) <= 4:
    continue
  random_integer = random.randint(4,len(first_as_list)-1)
  saved_word = first_as_list[random_integer]
  first_as_list[random_integer] = mask
  masked_sentence = " ".join(first_as_list)

  t=tokenizer(masked_sentence,padding=True, truncation=True, return_tensors="pt")
  bert_out=bert(**t)
  top20=torch.argsort(bert_out["prediction_logits"],dim=2,descending=True)[:,:,:20]
  guesses = tokenizer.decode(top20[0,4])
  tuples.append((guesses, saved_word))



In [50]:
top_five = 0
top_twenty = 0

for t in tuples:
  guesses = t[0].split(" ")
  if len(guesses) >= 5:
    topfive = guesses[:5]
    for g in guesses:
      stringword = str(g)
      stringtuple = str(t[1])
      if stringword.casefold() == stringtuple.casefold():
        top_twenty += 1
    for word in topfive:
      stringword = str(word)
      stringtuple = str(t[1])
      if stringword.casefold() == stringtuple.casefold():
        top_five += 1



print("TOP-5:")
print(f"for {len(tuples)} tries, there is {top_five} matches in top 5, with predicted word and masked")
print(f'So thats {(top_five*100) / len(tuples)}%....', end="\n\n")

print("TOP-20:")
print(f"for {len(tuples)} tries, there is {top_twenty} matches in top 20, with predicted word and masked")
print(f'So thats {(top_twenty*100) / len(tuples)}%....', end="\n\n")






TOP-5:
for 96 tries, there is 5 matches in top 5, with predicted word and masked
So thats 5.208333333333333%....

TOP-20:
for 96 tries, there is 9 matches in top 20, with predicted word and masked
So thats 9.375%....

