In [1]:
!pip install transformers datasets
!pip install transformers
!pip install 'transformers[torch]'
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hC

In [2]:

import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [3]:
def predict_masked_tokens_bench(text, tokenizer, model, top_k=3):
    input_ids = tokenizer.encode(text, return_tensors="pt")
    mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1]

    with torch.no_grad():
        output = model(input_ids)
    logits = output.logits

    predictions = []
    for mask_position in mask_positions:
        probs = logits[0, mask_position].softmax(dim=-1)
        top_k_values, top_k_indices = torch.topk(probs, top_k)
        top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_indices)

        mask_predictions = []
        for i in range(top_k):
            mask_predictions.append({
                "score": top_k_values[i].item(),
                "token": top_k_indices[i].item(),
                "token_str": top_k_tokens[i]
            })
        predictions.append(mask_predictions)

    return predictions

In [4]:
text = "1801—I have just returned from a [MASK] to my landlord—the solitary neighbour that I shall be troubled [MASK]." #original should be "visit" and "with"
predicted = predict_masked_tokens_bench(text, tokenizer, model, top_k=5)
predicted

[[{'score': 0.7276092171669006, 'token': 3942, 'token_str': 'visit'},
  {'score': 0.1526014655828476, 'token': 3661, 'token_str': 'letter'},
  {'score': 0.015420506708323956, 'token': 2655, 'token_str': 'call'},
  {'score': 0.012914762832224369, 'token': 4471, 'token_str': 'message'},
  {'score': 0.011289449408650398, 'token': 2831, 'token_str': 'talk'}],
 [{'score': 0.44526180624961853, 'token': 2011, 'token_str': 'by'},
  {'score': 0.1372787207365036, 'token': 2055, 'token_str': 'about'},
  {'score': 0.13569167256355286, 'token': 2007, 'token_str': 'with'},
  {'score': 0.030658947303891182, 'token': 1997, 'token_str': 'of'},
  {'score': 0.02405373379588127, 'token': 2013, 'token_str': 'from'}]]

In [13]:
for pred in predicted:
  for tok in pred:
    print(tok)

{'score': 0.7276092171669006, 'token': 3942, 'token_str': 'visit'}
{'score': 0.1526014655828476, 'token': 3661, 'token_str': 'letter'}
{'score': 0.015420506708323956, 'token': 2655, 'token_str': 'call'}
{'score': 0.012914762832224369, 'token': 4471, 'token_str': 'message'}
{'score': 0.011289449408650398, 'token': 2831, 'token_str': 'talk'}
{'score': 0.44526180624961853, 'token': 2011, 'token_str': 'by'}
{'score': 0.1372787207365036, 'token': 2055, 'token_str': 'about'}
{'score': 0.13569167256355286, 'token': 2007, 'token_str': 'with'}
{'score': 0.030658947303891182, 'token': 1997, 'token_str': 'of'}
{'score': 0.02405373379588127, 'token': 2013, 'token_str': 'from'}


In [16]:
import re

replaced_text = re.sub(r"\[MASK\]", "{}", text)
replaced_text = replaced_text.format(*[[tok['token_str'] for tok in pred] for pred in predicted])
replaced_text

"1801—I have just returned from a ['visit', 'letter', 'call', 'message', 'talk'] to my landlord—the solitary neighbour that I shall be troubled ['by', 'about', 'with', 'of', 'from']."

In [None]:
from itertools import product

combinations = list(product(*predicted))
combinations

[({'score': 0.7276092171669006, 'token': 3942, 'token_str': 'visit'},
  {'score': 0.44526180624961853, 'token': 2011, 'token_str': 'by'}),
 ({'score': 0.7276092171669006, 'token': 3942, 'token_str': 'visit'},
  {'score': 0.1372787207365036, 'token': 2055, 'token_str': 'about'}),
 ({'score': 0.7276092171669006, 'token': 3942, 'token_str': 'visit'},
  {'score': 0.13569167256355286, 'token': 2007, 'token_str': 'with'}),
 ({'score': 0.7276092171669006, 'token': 3942, 'token_str': 'visit'},
  {'score': 0.030658947303891182, 'token': 1997, 'token_str': 'of'}),
 ({'score': 0.7276092171669006, 'token': 3942, 'token_str': 'visit'},
  {'score': 0.02405373379588127, 'token': 2013, 'token_str': 'from'}),
 ({'score': 0.1526014655828476, 'token': 3661, 'token_str': 'letter'},
  {'score': 0.44526180624961853, 'token': 2011, 'token_str': 'by'}),
 ({'score': 0.1526014655828476, 'token': 3661, 'token_str': 'letter'},
  {'score': 0.1372787207365036, 'token': 2055, 'token_str': 'about'}),
 ({'score': 0.15

In [None]:
replaced_texts = []
for combination in combinations:
    replaced_text = re.sub(r"\[MASK\]", "{}", text)
    replaced_text = replaced_text.format(*[pred["token_str"] for pred in combination])
    
    score = 1
    for pred in combination:
        score *= pred["score"]

    replaced_texts.append({
        "sequence": replaced_text,
        "score": score,
        # "tokens": combination
    })
replaced_texts

[{'sequence': '1801—I have just returned from a visit to my landlord—the solitary neighbour that I shall be troubled by.',
  'score': 0.3239765942796051},
 {'sequence': '1801—I have just returned from a visit to my landlord—the solitary neighbour that I shall be troubled about.',
  'score': 0.09988526252876095},
 {'sequence': '1801—I have just returned from a visit to my landlord—the solitary neighbour that I shall be troubled with.',
  'score': 0.0987305116500341},
 {'sequence': '1801—I have just returned from a visit to my landlord—the solitary neighbour that I shall be troubled of.',
  'score': 0.02230773264694552},
 {'sequence': '1801—I have just returned from a visit to my landlord—the solitary neighbour that I shall be troubled from.',
  'score': 0.017501718417162193},
 {'sequence': '1801—I have just returned from a letter to my landlord—the solitary neighbour that I shall be troubled by.',
  'score': 0.06794760420175772},
 {'sequence': '1801—I have just returned from a letter to