First we will initialize our filiBERTo tokenizer like before, and use it to encode our data.

In [1]:
import random
from pathlib import Path

import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
from transformers import RobertaTokenizer, RobertaForMaskedLM

In [2]:
# load the tokenizer that was previously saved to the 'filiberto' directory;
# `model_max_length` is the supported name of the deprecated `max_len` kwarg
tokenizer = RobertaTokenizer.from_pretrained('filiberto', model_max_length=512)
# load the model from the same directory (the last cell of this notebook
# saves the trained model back to it)
model = RobertaForMaskedLM.from_pretrained('filiberto')

And now we load our PyTorch tensors.

In [3]:
# read the three pre-built tensors from disk, in the same order as the names
# on the left-hand side
input_ids, mask, labels = (
    torch.load(tensor_path)
    for tensor_path in (
        './filiberto_training/input_ids.pt',
        './filiberto_training/attention_mask.pt',
        './filiberto_training/labels.pt',
    )
)

In [4]:
# show the first 20 token ids of the first sample
input_ids[0, :20]

tensor([    0,   696, 17974,  1298,  7479,   292,  1081,     4, 11222,  7427,
          787,   742,  2014,   280,    11, 10012,  3686,  1314,  2192,  1616])

In [5]:
# bundle the three tensors under the keyword names the model's forward pass
# is called with in the training loop
encodings = dict(input_ids=input_ids, attention_mask=mask, labels=labels)

In [6]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of tensors.

    `encodings` maps a name to an indexable tensor; the entry under
    'input_ids' determines the dataset length via its first dimension.
    """

    def __init__(self, encodings):
        # keep a reference to the mapping (no copy is made)
        self.encodings = encodings

    def __len__(self):
        # number of samples == size of the first dimension of 'input_ids'
        token_ids = self.encodings['input_ids']
        return token_ids.shape[0]

    def __getitem__(self, i):
        # build one sample: the i-th entry of every tensor, under the same key
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = values[i]
        return sample

Next we initialize our `Dataset`.

In [7]:
# wrap the encodings dict so samples can be fetched by index
dataset = Dataset(encodings=encodings)

And initialize the dataloader, which will load the data into the model during training.

In [8]:
# serve the dataset in shuffled batches of 16 samples
loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=16,
    shuffle=True,
)

And now we move on to training. First we set up GPU/CPU usage.

In [9]:
# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# move the model over to the selected device
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

Activate the training mode of our model, and initialize our optimizer (AdamW, i.e. Adam with weight decay, which can reduce the chance of overfitting).

In [10]:
# activate training mode
model.train()
# initialize optimizer. `transformers.AdamW` is deprecated and no longer
# shipped with recent transformers releases, so the PyTorch implementation is
# used instead. eps=1e-6 and weight_decay=0.0 are the defaults of the
# transformers optimizer this replaces, so the update rule is unchanged;
# raise weight_decay (PyTorch's own default is 0.01) to actually apply decay.
optim = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)

In [11]:
# initialize the tensorboard writer object that the training loop logs the
# loss to. SummaryWriter is imported by name in the first cell because
# `import torch` alone does not load the `torch.utils.tensorboard` submodule.
writer = SummaryWriter()

Now we move on to the training loop.

In [12]:
# NOTE: `tqdm` comes from the `tqdm.auto` import in the first cell; it is not
# re-imported here so that import is not shadowed.

epochs = 1  # trained for 4 in total
step = 0  # running batch counter, used as the global step for tensorboard

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    # the description only depends on the epoch, so set it once per epoch
    loop.set_description(f'Epoch {epoch}')
    for batch in loop:
        # reset the gradients left over from the previous step
        optim.zero_grad()
        # move the batch tensors to the training device; batch-specific names
        # keep the notebook-level `input_ids` / `labels` tensors (loaded from
        # disk earlier) from being overwritten by a single batch
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        # forward pass; labels are passed so the output carries the loss
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask,
                        labels=batch_labels)
        # extract loss
        loss = outputs.loss
        # calculate gradients for every parameter that needs a grad update
        loss.backward()
        # update parameters
        optim.step()
        # read the loss once as a Python float and reuse it for logging
        loss_value = loss.item()
        # take loss for tensorboard
        writer.add_scalar('Loss/train', loss_value, step)
        # print relevant info to progress bar
        loop.set_postfix(loss=loss_value)
        step += 1

Epoch 0: 100%|██████████| 125000/125000 [14:02:25<00:00,  2.47it/s, loss=0.258]


In [14]:
# write the trained model back to the directory it was loaded from
model.save_pretrained(save_directory='./filiberto')