In [None]:
from transformers import BertTokenizer
import torch
from torch.nn import functional as F

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU.
# NOTE: the spelling `DEIVCE` (sic) is kept because other cells refer to it.
DEIVCE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize a toy batch (padded to equal length, returned as PyTorch tensors).
# The result is not called `input`, which would shadow the builtin `input()`
# for the rest of the kernel session.
encoded_batch = tokenizer(['adf df ad fdf d', 'adf dafa'], padding=True, return_tensors='pt')
# Last expression of the cell: displays the encoding after moving it to the CPU.
encoded_batch.to('cpu')

# Training

## module test

In [None]:
from char_mlm import CharMLMDataset, CharTokenizer

# Smoke test: build a CharMLMDataset from two tiny masked/label pairs and
# print what it produced.
test = CharMLMDataset(
    label_texts=['test', 'hello'],
    masked_texts=['t[MASK]st', 'hel[MASK]o'],
)

print(test.batch_encoding)
# Decode the first example's model inputs, then its labels, back to text.
for field in ('input_ids', 'labels'):
    print(test.tokenizer.decode(test[0][field]))


## loading dataset

In [None]:
from char_mlm import CharMLMDataset
import pandas as pd
from torch.utils.data.dataset import random_split
from random import randrange


def mask_idx(text: str, idx: int, mask_token: str = '[MASK]') -> str:
    """Return ``text`` with the character at position ``idx`` replaced by ``mask_token``.

    Args:
        text: The string to mask.
        idx: Index of the character to replace. Negative indices count from
            the end, as with normal Python list indexing.
        mask_token: Replacement text inserted in place of the character.
            Defaults to ``'[MASK]'``.

    Returns:
        A new string; ``text`` itself is not modified.

    Raises:
        IndexError: If ``idx`` is out of range for ``text`` (always the case
            for an empty string).
    """
    # Work on a list of characters so a single position can be swapped out.
    chars = list(text)
    chars[idx] = mask_token
    return ''.join(chars)


# Load the sentence column (spelled `setence` in the CSV) and keep at most
# 100000 usable sentences. Rows that are not non-empty strings (e.g. missing
# values, which pandas yields as float NaN) are skipped: they would crash
# len() / randrange(0) below.
raw_sents = pd.read_csv('./Data/en_setence.csv').setence.to_list()
sents = [s for s in raw_sents if isinstance(s, str) and s][:100000]
# Mask exactly one randomly chosen character per sentence.
sents_masked = [mask_idx(s, randrange(len(s))) for s in sents]

data = CharMLMDataset(sents_masked, sents)

# Split 4/6 train, 1/6 dev, 1/6 test; the remainder goes to the test split so
# the three sizes always sum to len(data).
sixth, remainder = divmod(len(data), 6)
train, dev, test = random_split(data, [sixth * 4, sixth, sixth + remainder])
# Keep the Subset objects themselves. `Subset.dataset` is the *full* underlying
# dataset, so using it would make all three names point at the same data and
# silently discard the split.
train_dataset, dev_dataset, test_dataset = train, dev, test


## Trainer & Model definition

In [None]:
from transformers import Trainer, BertForMaskedLM, BertConfig, TrainingArguments
import os
from datetime import datetime

# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Each run gets its own timestamped output directory under ./models.
MODEL_DIR = os.path.join(
    './models', datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
)

# Randomly initialised BERT masked-LM (trained from scratch, not from a
# checkpoint); all config values are library defaults except the maximum
# number of positions.
model_config = BertConfig(
    max_position_embeddings=1024,
)
model = BertForMaskedLM(model_config)

# Log and evaluate once per epoch; TensorBoard logs live inside MODEL_DIR.
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    num_train_epochs=100,
    logging_dir=os.path.join(MODEL_DIR, 'tensorboard'),
    logging_strategy='epoch',
    evaluation_strategy='epoch'
)

# Pass the Subset objects returned by random_split. `train.dataset` and
# `dev.dataset` are both the full underlying dataset, which would make the
# model train and evaluate on exactly the same (entire) data.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train,
    eval_dataset=dev,
)


## train

In [None]:
# Run the training loop on the `trainer` object built in the previous cell.
trainer.train()
# Persist the trained model.
# NOTE(review): presumably written to the Trainer's output_dir (MODEL_DIR) — confirm.
trainer.save_model()

# prediction

In [None]:
from transformers import BertForMaskedLM
from custom_tokenizers import CharTokenizer

# Project tokenizer from `custom_tokenizers`; this rebinds the notebook-level
# name `tokenizer`.
tokenizer = CharTokenizer()

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
# so the prediction cell also runs on machines without a GPU.
# NOTE: the spelling `DEIVCE` (sic) is kept because the code below refers to it.
DEIVCE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def mask_idx(text: str, idx: int, mask_token: str = '[MASK]') -> str:
    """Return ``text`` with the character at position ``idx`` replaced by ``mask_token``.

    Args:
        text: The string to mask.
        idx: Index of the character to replace. Negative indices count from
            the end, as with normal Python list indexing.
        mask_token: Replacement text inserted in place of the character.
            Defaults to ``'[MASK]'``.

    Returns:
        A new string; ``text`` itself is not modified.

    Raises:
        IndexError: If ``idx`` is out of range for ``text`` (always the case
            for an empty string).
    """
    # Work on a list of characters so a single position can be swapped out.
    chars = list(text)
    chars[idx] = mask_token
    return ''.join(chars)


sents = ['hello']
masked_sents = [mask_idx(s, 1) for s in sents]

# Encode the masked inputs and the unmasked target sentences.
inputs = tokenizer(masked_sents).to(DEIVCE)
labels = tokenizer(sents).to(DEIVCE)["input_ids"]

# NOTE(review): this loads the stock 'bert-base-uncased' weights while the
# inputs come from CharTokenizer. If the intent is to evaluate the model
# trained above, load it from its saved directory instead — confirm.
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(DEIVCE)
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
# NOTE(review): `labels` is never passed to the model, so `outputs.loss` is
# presumably None here — pass `labels=labels` if the loss is wanted.
loss = outputs.loss
logits = outputs.logits

# Probabilities over the vocabulary for every position; computed once because
# it does not depend on the sentence being inspected.
softmax = F.softmax(logits, dim=-1)

for i, masked_sent in enumerate(masked_sents):
    # Positions of the mask token in the i-th sentence.
    mask_index = torch.where(inputs["input_ids"][i] == tokenizer.mask_token_id)[0]
    if mask_index.numel() == 0:
        # Nothing to predict for a sentence without a mask token.
        continue
    # Index row `i` (not a fixed row 0) so every sentence uses its own
    # predictions.
    mask_word = softmax[i, mask_index, :]
    # Top candidates for the first masked position (at most 5, fewer only if
    # the vocabulary is smaller than that).
    top_5 = torch.topk(mask_word, min(5, mask_word.size(-1)), dim=1)[1][0]
    for token in top_5:
        word = tokenizer.decode([token])
        new_sentences = masked_sent.replace(tokenizer.mask_token, word)
        print(new_sentences)
