In [None]:
import torch
from transformers import PreTrainedTokenizerFast

class Dataset(torch.utils.data.IterableDataset):
    """Streaming masked-language-modelling dataset backed by a text file.

    Every line of ``filepath`` is tokenized, padded/truncated to
    ``max_length`` tokens and randomly masked. Each yielded sample holds
    ``input_ids`` (masked), ``attention_mask`` and ``labels`` (the token IDs
    before masking), all without a leading batch dimension.

    Args:
        filepath: Path to a UTF-8 text file; each line is one sample.
        tokenizer_name: Name or path handed to
            ``PreTrainedTokenizerFast.from_pretrained``.
        max_length: Length every sample is padded/truncated to.
        mask_prob: Probability with which each eligible token is replaced
            by the mask token.
    """

    def __init__(self, filepath: str, tokenizer_name: str,
                 max_length: int = 512, mask_prob: float = 0.15):
        self.filepath = filepath
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_name)
        self.max_length = max_length
        self.mask_prob = mask_prob
        # special-token IDs, looked up once so masking can reuse them
        self.t = {
            'cls': self.tokenizer.cls_token_id,
            'pad': self.tokenizer.pad_token_id,
            'sep': self.tokenizer.sep_token_id,
            'unk': self.tokenizer.unk_token_id,
            'mask': self.tokenizer.mask_token_id
        }

    def preprocess(self, text):
        """Tokenize one line of text and turn it into a training sample.

        Args:
            text: Raw text of a single sample.

        Returns:
            The tokenizer output with ``input_ids`` masked, plus ``labels``
            holding the unmasked token IDs; the batch dimension of size one
            added by the tokenizer is stripped from every entry set here.
        """
        inputs = self.tokenizer(
            text, max_length=self.max_length, padding='max_length',
            truncation=True, return_tensors='pt'
        )
        # clone the token IDs before masking; the clone becomes the target.
        # NOTE(review): labels cover every position (padding included), so the
        # loss is not restricted to masked tokens -- confirm this is intended.
        inputs['labels'] = inputs['input_ids'].detach().clone()[0]
        # mask input tokens (mask() modifies the tensor in place)
        inputs['input_ids'] = self.mask(inputs['input_ids'])[0]
        inputs['attention_mask'] = inputs['attention_mask'][0]
        return inputs

    def mask(self, input_ids):
        """Randomly replace tokens with the mask token, in place.

        CLS, PAD, SEP and UNK tokens are never replaced.

        Args:
            input_ids: Integer tensor of token IDs; it is modified in place.

        Returns:
            The same ``input_ids`` tensor after masking.
        """
        # one uniform random draw per token position
        rand = torch.rand(input_ids.shape)
        # a position is masked when its draw falls below mask_prob and the
        # token is not one of the protected special tokens
        mask_arr = (
            (rand < self.mask_prob)
            & (input_ids != self.t['cls'])
            & (input_ids != self.t['pad'])
            & (input_ids != self.t['sep'])
            & (input_ids != self.t['unk'])
        )
        # boolean-index assignment replaces the per-row Python loop
        input_ids[mask_arr] = self.t['mask']
        return input_ids

    def __iter__(self):
        """Yield one preprocessed sample per line of ``self.filepath``."""
        # the context manager closes the file once iteration finishes or the
        # generator is discarded, instead of leaking the handle
        with open(self.filepath, encoding='utf-8') as lines:
            for line in lines:
                yield self.preprocess(line)

In [None]:
# text file streamed line by line by Dataset
train_file = '../data/dv-corpus-clean-unique-2m.txt'

# the tokenizer is loaded from 'bert-base-dv'
dataset = Dataset(filepath=train_file, tokenizer_name='bert-base-dv')

In [None]:
# number of samples collated into each training batch
batch_size = 32

# wrap the streaming dataset so samples arrive in batches of batch_size
loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size)

In [None]:
from transformers import BertConfig

# architecture hyperparameters, collected in one place before building the config
bert_settings = {
    # NOTE(review): presumably has to match the tokenizer's vocabulary size -- confirm
    'vocab_size': 20_000,
    'max_position_embeddings': 514,
    'hidden_size': 768,
    'num_attention_heads': 12,
    'num_hidden_layers': 6,
    'type_vocab_size': 1,
}
config = BertConfig(**bert_settings)

In [None]:
from transformers import BertForMaskedLM

# build the masked-LM model from the config defined above
model = BertForMaskedLM(config=config)

Set up CPU/GPU usage

In [None]:
# prefer the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# move the model to the selected device; assigning the result (Module.to
# returns the module itself) keeps the notebook from echoing the full model
# repr, which the original worked around with print('')
model = model.to(device)

In [None]:
# count the lines (= samples) in the training file; the context manager closes
# the file even on error, and an empty file yields 0 instead of a NameError
with open(train_file, encoding='utf-8') as dv_read:
    num_samples = sum(1 for _ in dv_read)

In [None]:
# display the number of lines counted in the training file
num_samples

In [None]:
from transformers.optimization import get_linear_schedule_with_warmup

# put the model into training mode
model.train()
# torch.optim.AdamW replaces the deprecated transformers.AdamW;
# eps=1e-6 keeps the default the transformers implementation used
optim = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-6,
    weight_decay=0.1
)

epochs = 2

# the DataLoader also yields the final partial batch, so round up
steps_per_epoch = -(-num_samples // batch_size)
total_steps = steps_per_epoch * epochs
# set up warmup for the first ~10% of steps
warmup_steps = int(0.1 * total_steps)
# num_training_steps is the TOTAL number of optimizer steps (warmup included);
# passing total_steps - warmup_steps would drive the learning rate to zero
# before training ends
scheduler = get_linear_schedule_with_warmup(
    optim, num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

In [None]:
#writer = torch.utils.tensorboard.SummaryWriter()

**Note:** when implementing NSP (next sentence prediction), just shuffle the current Dhivehi data into a new 'shuffled' file and pick non-following sentences from that file.

In [None]:
from tqdm.auto import tqdm

# running count of optimizer steps across all epochs
step = 0
# the DataLoader also yields the final partial batch, so round up
steps_per_epoch = -(-num_samples // batch_size)

for epoch in range(epochs):
    # set up the loop with tqdm and the dataloader
    loop = tqdm(loader, leave=True, total=steps_per_epoch,
                desc=f'Epoch {epoch}')
    for batch in loop:
        # reset gradients accumulated in the previous step
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # forward pass; passing labels makes the model return the loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # take loss for tensorboard
        #writer.add_scalar('Loss/train', loss, step)
        # backpropagate: compute gradients for every parameter that needs them
        loss.backward()
        # update parameters
        optim.step()
        # update learning rate scheduler
        scheduler.step()
        step += 1
        # show the current loss on the tqdm progress bar
        loop.set_postfix(loss=loss.item())

In [None]:
# save the trained model under 'bert-base-dv', the same name the tokenizer
# was loaded from when the dataset was created
model.save_pretrained('bert-base-dv')

In [None]:
# display which device (CUDA or CPU) was selected
device