In [1]:
from transformers import BertTokenizer
import torch
from torch.nn import functional as F

# Device used by every `.to(DEIVCE)` call in this notebook. Prefer the first
# GPU, but fall back to the CPU so the cells still run on a machine without
# CUDA. (The name is a misspelling of "DEVICE"; it is kept because other cells
# refer to it by this exact name.)
DEIVCE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the pretrained BertTokenizer for the 'bert-base-uncased' checkpoint.
# It is only used by the encoding demo in the next cell; later cells rebind
# `tokenizer` to a CharTokenizer instance.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [2]:
# Tokenize two dummy strings as one padded batch of PyTorch tensors to inspect
# the encoding layout (input_ids / token_type_ids / attention_mask).
# The result is named `encoded` rather than `input` so the builtin `input()`
# is not shadowed for the rest of the kernel session.
encoded = tokenizer(['adf df ad fdf d', 'adf dafa'], padding=True, return_tensors='pt')
# Last expression of the cell, so its value is rendered as the cell output.
encoded.to('cpu')

{'input_ids': tensor([[  101,  4748,  2546,  1040,  2546,  4748,  1042, 20952,  1040,   102],
        [  101,  4748,  2546,  4830,  7011,   102,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}

# Training

## Trainer & Model definition

In [None]:
from char_mlm import CharTokenizer
from transformers import Trainer, BertForMaskedLM, BertConfig, TrainingArguments
import os
from datetime import datetime

# Each run writes into its own timestamped directory under ./models.
MODEL_DIR = os.path.join('./models', datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))

tokenizer = CharTokenizer()

# Vocabulary size = 128 plus the number of entries in SPECIAL_TOKENS_ATTRIBUTES.
# NOTE(review): the 128 presumably covers the 7-bit ASCII range, and
# SPECIAL_TOKENS_ATTRIBUTES looks like a list of attribute *names* rather than
# the tokenizer's actual special tokens -- confirm this matches the real
# vocabulary size of CharTokenizer.
model_config = BertConfig(vocab_size=128 + len(tokenizer.SPECIAL_TOKENS_ATTRIBUTES))

# Fresh, randomly initialised masked-LM model built from the config above.
model = BertForMaskedLM(config=model_config)

training_args = TrainingArguments(output_dir=MODEL_DIR)

# NOTE(review): no train_dataset / data_collator is passed here -- confirm
# where the training data is meant to come from before calling train().
trainer = Trainer(model=model, args=training_args)


## Run training

In [None]:
# Start training with the Trainer built in the previous cell.
# NOTE(review): that Trainer is constructed without a `train_dataset`;
# confirm a dataset is supplied before running this cell.
trainer.train()

# Prediction

In [7]:
from transformers import BertForMaskedLM
from custom_tokenizers import CharTokenizer

# Rebind `tokenizer` to a CharTokenizer for the prediction code below.
# NOTE(review): this cell imports CharTokenizer from `custom_tokenizers`,
# while the training cell imports the same name from `char_mlm` -- confirm
# which module is the current one.
tokenizer = CharTokenizer()

# Device for the tokenized inputs and the model in this cell. Prefer the first
# GPU, but fall back to the CPU so prediction still runs without CUDA.
# (The misspelt name is kept because the statements below use it.)
DEIVCE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def mask_nonascii(text: str, mask_token: str = '[MASK]') -> str:
    """Replace every non-ASCII character of ``text`` with ``mask_token``.

    Args:
        text: Input string; an empty string yields an empty string.
        mask_token: Replacement inserted for each character whose code point
            is 128 or higher. Defaults to ``'[MASK]'``; pass the tokenizer's
            own mask token when it differs from that literal.

    Returns:
        A new string in which ASCII characters are kept unchanged and each
        non-ASCII character is replaced by one ``mask_token``.
    """
    return ''.join(char if ord(char) < 128 else mask_token for char in text)


def mask_idx(text: str, idx: int, mask_token: str = '[MASK]') -> str:
    """Return ``text`` with the character at position ``idx`` replaced by ``mask_token``.

    Args:
        text: Input string.
        idx: Position of the character to mask. Negative values count from
            the end, as with normal sequence indexing.
        mask_token: Replacement for the selected character. Defaults to
            ``'[MASK]'``; pass the tokenizer's own mask token when it differs
            from that literal.

    Returns:
        A new string with exactly one character replaced.

    Raises:
        IndexError: If ``idx`` is out of range for ``text`` (including any
            index into an empty string).
    """
    # Work on a separate list of characters instead of rebinding the
    # str-annotated parameter to a list.
    chars = list(text)
    chars[idx] = mask_token
    return ''.join(chars)


sents = ['hello']
# Mask the character at index 1 of every sentence.
masked_sents = [mask_idx(s, 1) for s in sents]

# prediction
inputs = tokenizer(masked_sents).to(DEIVCE)
# NOTE(review): `labels` is never passed to the model call below, so
# `outputs.loss` is presumably None. Pass `labels=labels` if a loss is wanted.
labels = tokenizer(sents).to(DEIVCE)["input_ids"]

# NOTE(review): this loads the pretrained 'bert-base-uncased' checkpoint while
# the inputs come from CharTokenizer; the two vocabularies presumably differ,
# which would explain meaningless predictions. Confirm whether the model
# trained above (saved under MODEL_DIR) should be loaded here instead.
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(DEIVCE)

# Pure inference: skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
loss = outputs.loss
logits = outputs.logits

# Probabilities over the vocabulary for every position of every sentence.
# This does not depend on the loop variable, so compute it once.
softmax = F.softmax(logits, dim=-1)

for i in range(len(sents)):
    # Positions of the mask token within sentence i.
    mask_index = torch.where(inputs["input_ids"][i] == tokenizer.mask_token_id)[0]
    if mask_index.numel() == 0:
        # No mask token was found in this sentence; there is nothing to predict.
        continue
    # Read the predictions of sentence i (the original always read sentence 0).
    mask_word = softmax[i, mask_index, :]
    # Ids of the five most probable tokens for the first masked position.
    top_5 = torch.topk(mask_word, 5, dim=1)[1][0]
    for token in top_5:
        word = tokenizer.decode([token])
        new_sentences = masked_sents[i].replace(tokenizer.mask_token, word)
        print(new_sentences)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


h̠llo
h͍llo
hܖllo
h܈llo
ḩllo
