In [1]:
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
# Load the encoder and its tokenizer from one shared pretrained checkpoint id.
checkpoint = "google-bert/bert-base-uncased"
model = BertModel.from_pretrained(checkpoint)
tokenizer = BertTokenizer.from_pretrained(checkpoint)



In [45]:
# Display the module tree of the loaded model.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [22]:
# Tokenize a short sentence containing a [MASK] placeholder and show its ids.
inputs = tokenizer("i'll [MASK].", return_tensors='pt')
# Row 0 is the only sequence in the batch. Tensor.tolist() returns plain Python
# ints directly, so no detach/cpu/numpy round trip is needed and the resulting
# list does not hold numpy scalar objects.
token_ids = inputs['input_ids'][0].tolist()
token_ids

[101, 1045, 1005, 2222, 103, 1012, 102]

In [31]:
# Map a single vocabulary id back to its token string.
tokenizer.convert_ids_to_tokens(2222)

'll'

In [41]:
# convert_ids_to_tokens accepts a whole list of ids, so the ids are converted
# in a single call rather than one at a time in a loop.
outs = tokenizer.convert_ids_to_tokens(token_ids)
outs

['[CLS]', 'i', "'", 'll', '[MASK]', '.', '[SEP]']

In [13]:
# Build a tokenizer from a local vocabulary file instead of a pretrained checkpoint.
tokenizer_scratch = BertTokenizer('./vocab.txt')
# Look up the id of a single word; presumably a word missing from vocab.txt
# maps to the unknown-token id — TODO confirm against the vocab file.
tokenizer_scratch.convert_tokens_to_ids('mankodiya')

1

In [14]:
# Encode one sentence and run it through BERT to get per-token hidden states.
inputs = tokenizer("I'm, my dog is cute?", return_tensors="pt")
# Inference only: skip autograd bookkeeping, matching the masked-LM cell that
# also wraps its forward pass in torch.no_grad().
with torch.no_grad():
    outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state

In [15]:
# Show the token string for id 102 next to the full encoding of the sentence.
tokenizer.convert_ids_to_tokens(102), inputs

('[SEP]',
 {'input_ids': tensor([[  101,  1045,  1005,  1049,  1010,  2026,  3899,  2003, 10140,  1029,
            102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])})

In [16]:
# Check which token a single id from the encoded sentence corresponds to.
tokenizer.convert_ids_to_tokens(1049)

'm'

In [17]:
# Inspect the shape of the last-layer hidden states.
last_hidden_states.shape

torch.Size([1, 11, 768])

In [18]:
# Encode a hyphenated phrase without tensor conversion and show the token for id 102.
tokenizer("pluri-potent cells"), tokenizer.convert_ids_to_tokens(102)

({'input_ids': [101, 20228, 9496, 1011, 16834, 4442, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]},
 '[SEP]')

In [19]:
# Display the model's module tree.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [20]:
import os
from language import normalize_string

In [21]:
# Read the tab-separated corpus; every line becomes one list of normalized fields.
with open('./eng-fra.txt', encoding='utf-8') as corpus_file:
    lines = corpus_file.readlines()

pairs = [list(map(normalize_string, line.split('\t'))) for line in lines]

In [22]:
# NOTE(review): this displays the whole `pairs` list; a slice such as pairs[:10]
# would keep the recorded notebook output short.
pairs

[['go.', 'va !'],
 ['run!', 'cours\u202f!'],
 ['run!', 'courez\u202f!'],
 ['wow!', 'ca alors\u202f!'],
 ['fire!', 'au feu !'],
 ['help!', "a l'aide\u202f!"],
 ['jump.', 'saute.'],
 ['stop!', 'ca suffit\u202f!'],
 ['stop!', 'stop\u202f!'],
 ['stop!', 'arrete-toi !'],
 ['wait!', 'attends !'],
 ['wait!', 'attendez !'],
 ['i see.', 'je comprends.'],
 ['i try.', "j'essaye."],
 ['i won!', "j'ai gagne !"],
 ['i won!', "je l'ai emporte !"],
 ['oh no!', 'oh non !'],
 ['attack!', 'attaque !'],
 ['attack!', 'attaquez !'],
 ['cheers!', 'sante !'],
 ['cheers!', 'a votre sante !'],
 ['cheers!', 'merci !'],
 ['get up.', 'leve-toi.'],
 ['got it!', "j'ai pige !"],
 ['got it!', 'compris !'],
 ['got it?', 'pige\u202f?'],
 ['got it?', 'compris\u202f?'],
 ['got it?', "t'as capte\u202f?"],
 ['hop in.', 'monte.'],
 ['hop in.', 'montez.'],
 ['hug me.', 'serre-moi dans tes bras !'],
 ['hug me.', 'serrez-moi dans vos bras !'],
 ['i fell.', 'je suis tombee.'],
 ['i fell.', 'je suis tombe.'],
 ['i know.', 'je sai

In [23]:
# Neighbouring entries can share the same English sentence (with different French translations).
pairs[1][0], pairs[2][0]

('run!', 'run!')

In [24]:
import pandas as pd

In [25]:
# Two-column frame with one row per sentence pair.
df_pairs = pd.DataFrame(pairs, columns=['english', 'french'])

In [26]:
# For each language: unique sentences in first-occurrence order, joined one per line.
english_sent, french_sent = (
    '\n'.join(df_pairs[column].drop_duplicates().tolist())
    for column in ('english', 'french')
)

In [27]:
# Persist the de-duplicated sentences, one per line, for each language.
# encoding='utf-8' mirrors how eng-fra.txt was read: the French side contains
# non-ASCII characters (e.g. U+202F), which a platform-default codec may fail
# to encode.
# Each *_sent value is already a single newline-joined string, so write() is the
# right call; writelines() would iterate over it character by character.
with open('./english.txt', 'w', encoding='utf-8') as f:
    f.write(english_sent)

with open('./french.txt', 'w', encoding='utf-8') as f:
    f.write(french_sent)

In [28]:
from language import Tokenizer

In [29]:
# NOTE(review): this rebinds `tokenizer` from the Hugging Face BERT tokenizer to
# the project-local language.Tokenizer; cells that expect the BERT tokenizer
# must re-create it first.
tokenizer = Tokenizer()

In [2]:
from transformers import AutoTokenizer, BertForMaskedLM
import torch

# Masked-LM demo: let BERT fill in the [MASK] slot of a short prompt.
checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertForMaskedLM.from_pretrained(checkpoint)

inputs = tokenizer("take [MASK].", return_tensors="pt")

# Forward pass without gradient tracking; only the vocabulary logits are kept.
with torch.no_grad():
    logits = model(**inputs).logits

# Positions in the first (only) sequence whose id equals the mask-token id.
is_mask = inputs.input_ids[0] == tokenizer.mask_token_id
(mask_token_index,) = is_mask.nonzero(as_tuple=True)

# Highest-scoring vocabulary id at each masked position, decoded back to text.
predicted_token_id = logits[0, mask_token_index].argmax(dim=-1)
tokenizer.decode(predicted_token_id)

# labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
# # mask labels of non-[MASK] tokens
# labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)

# outputs = model(**inputs, labels=labels)
# round(outputs.loss.item(), 2)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'care'

In [53]:
# Inspect the intermediate values of the masked-LM prediction in a single tuple.
# NOTE(review): the recorded output below belongs to a different prompt than
# "take [MASK]." (nine token ids, decoded prediction 'paris'), so it was
# presumably produced by an out-of-order run; re-run after the prediction cell.
logits.shape, mask_token_index, inputs, predicted_token_id, tokenizer.decode(predicted_token_id)

(torch.Size([1, 9, 30522]),
 tensor([6]),
 {'input_ids': tensor([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 tensor([3000]),
 'paris')

In [47]:
# Display the module tree of the masked-LM model.
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [3]:
import torch

In [12]:
# Ten random integers drawn from [1, 10) as a small 1-D tensor to experiment with.
# NOTE(review): the name `random` shadows the stdlib `random` module, which a
# later cell imports under the same name; a distinct name would avoid the clash.
random = torch.randint(low=1, high=10, size=(10, ))
random

tensor([8, 7, 8, 1, 4, 8, 2, 2, 7, 5])

In [14]:
# Tensor with the same shape as `random`, filled with the constant 0.15.
torch.full(random.shape, .15)

tensor([0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,
        0.1500])

In [25]:
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Initialize the tokenizer and masked-LM model from the pretrained uncased BERT checkpoint.
# NOTE(review): other cells spell the checkpoint id "google-bert/bert-base-uncased";
# presumably both ids resolve to the same weights — confirm and use one spelling.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

def mask_tokens(input_ids, tokenizer, mlm_probability=0.15):
    """
    Prepare masked inputs and labels for masked language modeling (MLM).

    Each non-special token is selected with probability ``mlm_probability``.
    Of the selected tokens, 80% are replaced by the mask token, 10% by a
    random vocabulary id, and 10% are left unchanged.

    Args:
        input_ids (torch.Tensor): 2-D integer tensor of token ids
            (batch, seq_len). It is not modified; a copy is masked instead.
        tokenizer (PreTrainedTokenizer): Hugging Face tokenizer. Must provide
            ``get_special_tokens_mask``, ``mask_token``,
            ``convert_tokens_to_ids`` and ``__len__``.
        mlm_probability (float): Probability of selecting a token (15% default)
    Returns:
        Tuple of (input_ids, labels): ``input_ids`` is a masked copy of the
        input, and ``labels`` holds the original id at every selected position
        and -100 everywhere else.
    """
    labels = input_ids.clone()
    # Work on a copy so the caller's tensor keeps its original ids.
    input_ids = input_ids.clone()
    # Create every helper tensor on the same device as the ids so boolean
    # indexing also works when the batch does not live on the CPU.
    device = labels.device

    # Selection probability per position; special tokens are never selected.
    probability_matrix = torch.full(labels.shape, mlm_probability, device=device)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(
        torch.tensor(special_tokens_mask, dtype=torch.bool, device=device), value=0.0
    )
    masked_indices = torch.bernoulli(probability_matrix).bool()

    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # Replace 80% of the selected tokens with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8, device=device)).bool() & masked_indices
    input_ids[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Replace 10% of the selected tokens with random tokens. This draw only
    # takes effect on the 20% not replaced above, so it must succeed with
    # probability 0.5 (0.5 * 20% = 10%); a 0.1 draw would randomize only ~2%.
    indices_random = (
        torch.bernoulli(torch.full(labels.shape, 0.5, device=device)).bool()
        & masked_indices
        & ~indices_replaced
    )
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long, device=device)
    input_ids[indices_random] = random_words[indices_random]

    # The remaining 10% of the selected tokens are left unchanged
    return input_ids, labels

# Example usage with a sentence
# Tokenize a single sentence into PyTorch tensors and keep only the token-id tensor.
sentence = "The quick brown fox jumps over the lazy dog"
inputs = tokenizer(sentence, return_tensors='pt')
input_ids = inputs['input_ids']

# # Apply the masking
# masked_input_ids, labels = mask_tokens(input_ids, tokenizer)

# # Forward pass through the model
# outputs = model(input_ids=masked_input_ids, labels=labels)
# loss = outputs.loss
# logits = outputs.logits

# print(f"Loss: {loss}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
# NOTE(review): in the cells visible here `masked_input_ids` is only assigned
# inside commented-out code (the mask_tokens example call), so this cell
# presumably raises NameError on a fresh kernel unless that call is restored.
masked_input_ids

tensor([[  101,  1996,  4248,  2829,  4419,   103,  2058,  1996, 13971,  3899,
           102]])

In [27]:
# Step-by-step walkthrough of mask_tokens: start from a copy of the ids and a
# uniform 15% selection probability for every position.
labels = input_ids.clone()
probability_matrix = torch.full(labels.shape, 0.15)
labels, probability_matrix

(tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
            102]]),
 tensor([[0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,
          0.1500, 0.1500]]))

In [33]:
# One 0/1 list per sequence, with 1 marking special-token positions; the ids are
# passed as already containing their special tokens.
special_tokens_mask = [
    tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
    for val in labels.tolist()
]
special_tokens_mask

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]

In [34]:
# Zero the selection probability at special-token positions (in place), then
# draw one Bernoulli sample per position to decide which tokens are selected.
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()
masked_indices

tensor([[False, False, False, False, False, False, False, False, False,  True,
         False]])

In [35]:
# Keep the original id only at selected positions; every other label becomes -100.
labels[~masked_indices] = -100
labels

tensor([[-100, -100, -100, -100, -100, -100, -100, -100, -100, 3899, -100]])

In [40]:
# Of the selected positions, choose roughly 80% and overwrite their ids with the mask-token id.
# NOTE(review): this writes into `input_ids` in place, so re-running the cell
# keeps altering the same tensor; the displayed tuple also lists
# `indices_replaced` twice.
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
input_ids[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
indices_replaced, masked_indices, indices_replaced

(tensor([[False, False, False, False, False, False, False, False, False, False,
          False]]),
 tensor([[False, False, False, False, False, False, False, False, False,  True,
          False]]),
 tensor([[False, False, False, False, False, False, False, False, False, False,
          False]]))

In [49]:
# Of the selected positions that were NOT turned into [MASK] (about 20% of
# them), pick half for random replacement: 0.5 * 20% = 10% of all selected
# tokens. A 0.1 draw here would randomize only about 2%.
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
# One candidate replacement id per position, drawn uniformly from the vocabulary.
random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
input_ids[indices_random] = random_words[indices_random]
random_words, len(tokenizer), input_ids[indices_random]

(tensor([[11394,  5532, 12200, 26389,  8699, 15066, 15568, 10737, 11721, 15001,
           3006]]),
 30522,
 tensor([], dtype=torch.int64))

In [45]:
# Show which positions were chosen for random replacement.
indices_random

tensor([[False, False, False, False, False, False, False, False, False, False,
         False]])

In [50]:
# Look up the vocabulary ids for a list of tokens.
tokenizer.convert_tokens_to_ids(['sign', 'here', 'back'])

[3696, 2182, 2067]