First we will initialize our filiBERTo tokenizer like before, but within a `Dataset` object so that we will be ready to begin training our new model.

In [90]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import torch
from pathlib import Path
from tqdm.auto import tqdm
import random

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding masked-language-modelling samples from text files.

    Samples are the newline-separated lines of the files
    `../../data/text/oscar_it/text_{i}.txt`. Every file except the last is
    expected to hold the same number of lines as the first one; that count is
    used to translate a flat sample index into a (file, line) pair.

    Each item is a dict with:
        input_ids: token ids with a random subset replaced by `MASK_ID`
        attention_mask: the attention mask produced by the tokenizer
        labels: the original, unmasked token ids
    """

    # Token ids that `mlm` treats specially. These are the values this code has
    # always hard-coded; they are assumed to match the saved filiBERTo vocab
    # (3 presumably being the mask token) -- TODO confirm against the vocab file.
    PAD_ID = 0
    CLS_ID = 1
    SEP_ID = 2
    MASK_ID = 3
    # probability with which an eligible token is replaced by MASK_ID
    MASK_PROB = .15

    # lazily filled caches; the class-level defaults keep every method usable
    # even on an instance whose __init__ has not run
    _length = None
    _cached_path = None
    _cached_lines = None

    def __init__(self):
        super().__init__()
        # initialize the tokenizer using the tokenizer we initialized and saved to file
        self.tokenizer = ByteLevelBPETokenizer(
            './filiberto/filiberto-vocab.json',
            './filiberto/filiberto-merges.txt'
        )
        # set [CLS] and [SEP] to be added to start-end of sequences
        self.tokenizer._tokenizer.post_processor = BertProcessing(
            ('[SEP]', self.tokenizer.token_to_id('[SEP]')),
            ('[CLS]', self.tokenizer.token_to_id('[CLS]'))
        )
        # truncate anything more than 512 tokens in length
        self.tokenizer.enable_truncation(max_length=512)
        # and enable padding to 512 too
        self.tokenizer.enable_padding(length=512, pad_token='[PAD]')

        paths = [str(x) for x in Path('../../data/text/oscar_it').glob('**/*.txt')]
        # rebuild the paths in numeric order (the glob order above would give
        # text_999.txt as the final file)
        self.paths = [f'../../data/text/oscar_it/text_{i}.txt' for i in range(len(paths))]
        # the line count of the first file is the 'expected' size of every
        # file except the last
        self.file_size = len(self._read_lines(self.paths[0]))

    def _read_lines(self, path):
        """Return the lines of `path`, re-reading only when the path changes."""
        if path != self._cached_path:
            with open(path, 'r', encoding='utf-8') as fp:
                self._cached_lines = fp.read().split('\n')
            self._cached_path = path
        return self._cached_lines

    def __len__(self):
        """Return the total number of samples across all files.

        Every file but the last contributes `file_size` samples; the last
        file contributes its actual line count. The result is computed once
        and cached, so repeated calls do not touch the disk again.
        """
        if self._length is None:
            last_size = len(self._read_lines(self.paths[-1]))
            self._length = self.file_size * (len(self.paths) - 1) + last_size
        return self._length

    def __getitem__(self, i):
        """Tokenize sample `i` and return its masked inputs, mask and labels.

        Raises:
            RuntimeError: if the token ids cannot be converted to a tensor;
                the message carries the offending encoding and the original
                error is chained as the cause.
        """
        # get the file number and sample number based on i
        file_i, sample_i = self.get_loc(i)
        # load the file (served from the one-file cache when possible)
        lines = self._read_lines(self.paths[file_i])
        # extract and encode the required sample
        sample = self.tokenizer.encode(lines[sample_i])
        # convert tokens to tensor
        try:
            targets = torch.tensor(sample.ids)
        except RuntimeError as err:
            raise RuntimeError(f"{sample=}") from err
        # create attention mask tensor
        mask = torch.tensor(sample.attention_mask)
        # mask ~15% of tokens to create inputs; work on a copy so that the
        # labels keep the original ids
        input_ids = self.mlm(targets.detach().clone())
        # return dictionary of input_ids, attention_mask, and labels
        return {'input_ids': input_ids, 'attention_mask': mask, 'labels': targets}

    def get_loc(self, i):
        """Map flat sample index `i` to `(file_num, sample_num)`."""
        # integer divmod avoids the float round-trip of int(i / size)
        return divmod(i, self.file_size)

    def mlm(self, tensor):
        """Replace a random ~15% of the non-special tokens with `MASK_ID`.

        The tensor is modified in place and also returned. Positions holding
        `PAD_ID`, `CLS_ID` or `SEP_ID` are never masked. Works for tensors of
        any shape because the selection is applied as a boolean mask.
        """
        # one uniform random draw per token, on the same device as the tokens
        rand = torch.rand(tensor.shape, device=tensor.device)
        # eligible positions: random draw below the threshold and not a
        # special token
        mask_arr = (
            (rand < self.MASK_PROB)
            & (tensor != self.PAD_ID)
            & (tensor != self.CLS_ID)
            & (tensor != self.SEP_ID)
        )
        # mask tensor and return
        tensor[mask_arr] = self.MASK_ID
        return tensor

Next we initialize our `Dataset`.

In [91]:
# build the dataset: this loads the saved tokenizer and reads the first text
# file to learn the per-file sample count
dataset = Dataset()

And initialize the dataloader, which will load the data into the model during training.

In [92]:
# serve the dataset in batches of 16, drawing the samples in shuffled order
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
)

Next we move on to building our model. We first need to create a BERT config object, which describes the features we want to initialize our BERT model with.

In [6]:
from transformers import BertConfig

# Configuration the BERT model is built from; any option not listed here
# keeps the library default.
# NOTE(review): token ids as large as 48933 appear in output further down this
# notebook -- confirm that vocab_size really covers the saved tokenizer's vocab.
config = BertConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab set in previous notebook
    hidden_size=768,
    num_hidden_layers=12,
    pad_token_id=0  # id 0 is the one the dataset's masking code treats as [PAD]
)

Then we import and initialize a BERT model with a language modeling head.

In [7]:
# BertForMaskedLM scores every position against the label at that same
# position, which is the objective the masked inputs built by `Dataset.mlm`
# call for. BertLMHeadModel is the causal (next-token) head: it shifts the
# labels by one position and expects `is_decoder=True`, so it would train the
# wrong objective on this data.
from transformers import BertForMaskedLM

model = BertForMaskedLM(config)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


And now we move on to training. First we set up GPU/CPU usage.

In [9]:
# prefer the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# relocate the model onto the chosen device
model.to(device)

Activate the training mode of our model, and initialize our optimizer (Adam with weight decay, which reduces the chance of overfitting).

In [11]:
# activate training mode
model.train()
# initialize optimizer. `transformers.AdamW` is deprecated in favour of the
# PyTorch implementation, so use `torch.optim.AdamW` directly. `eps` and
# `weight_decay` are pinned to the values the transformers version defaulted
# to, which keeps the update rule unchanged; raise `weight_decay` above 0 to
# actually apply decay.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

Now we move on to the training loop.

In [93]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # wrap the dataloader in a progress bar that stays on screen afterwards
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # clear the gradients left over from the previous step
        optim.zero_grad()
        # move the three tensors needed for training onto the device
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )
        # forward pass; labels are passed in and the loss is read off the output
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # backpropagate, then apply the parameter update
        loss.backward()
        optim.step()
        # report the epoch and the current loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0:   0%|          | 12/1782631 [00:37<1561:27:53,  3.15s/it, loss=6.26]


KeyboardInterrupt: 

https://huggingface.co/blog/how-to-train

In [14]:
# length of the progress-bar-wrapped loader, i.e. the number of batches in one epoch
len(loop)

1782631

In [23]:
# NOTE(review): the `Dataset` class defined in this notebook sets no `examples`
# attribute, so this cell presumably dates from an earlier dataset
# implementation -- confirm before re-running
dataset.examples[10_000]

IndexError: list index out of range

In [24]:
# NOTE(review): relies on an `examples` attribute that the `Dataset` class
# defined in this notebook does not set -- presumably left over from an earlier
# dataset implementation; confirm before re-running
dataset.examples[9999]

[1,
 5401,
 3580,
 9800,
 500,
 2163,
 16,
 20411,
 18,
 2174,
 300,
 403,
 330,
 7787,
 280,
 582,
 3756,
 534,
 8423,
 1091,
 904,
 9178,
 16,
 1035,
 7085,
 275,
 44820,
 532,
 311,
 48933,
 280,
 480,
 4144,
 1419,
 306,
 1056,
 267,
 4667,
 295,
 477,
 36444,
 30,
 478,
 22318,
 4028,
 585,
 26067,
 292,
 1139,
 17694,
 5220,
 5007,
 11765,
 275,
 11085,
 441,
 1117,
 75,
 16,
 336,
 662,
 287,
 40167,
 280,
 336,
 776,
 807,
 42220,
 13407,
 376,
 1309,
 18,
 18466,
 2490,
 23791,
 338,
 330,
 6709,
 275,
 567,
 1630,
 35,
 1041,
 4330,
 18,
 458,
 591,
 926,
 3136,
 77,
 35,
 15908,
 380,
 3571,
 4960,
 2807,
 1279,
 856,
 16,
 1046,
 478,
 31378,
 12137,
 271,
 27541,
 528,
 27337,
 267,
 10041,
 275,
 2947,
 316,
 352,
 2785,
 901,
 305,
 1647,
 280,
 316,
 470,
 7510,
 379,
 878,
 7900,
 16,
 280,
 3903,
 891,
 379,
 1091,
 303,
 875,
 316,
 12215,
 397,
 18,
 8936,
 459,
 287,
 1535,
 4733,
 316,
 1526,
 4787,
 682,
 3247,
 682,
 853,
 338,
 330,
 6639,
 292,
 1630,
 18,
 12