First we will initialize our filiBERTo tokenizer like before, and use it to encode our data.

In [1]:
from transformers import RobertaTokenizer
import torch
from pathlib import Path
from tqdm.auto import tqdm
import random

In [2]:
# initialize the tokenizer using the tokenizer we initialized and saved to file
tokenizer = RobertaTokenizer.from_pretrained('filiberto', max_len=512)

In [3]:
def mlm(tensor):
    # create random array of floats with equal dims to tensor
    rand = torch.rand(tensor.shape)
    # mask random 15% where token is not 0 <s>, 1 <pad>, or 2 <s/>
    mask_arr = (rand < .15) * (tensor != 0) * (tensor != 1) * (tensor != 2)
    # loop through each row in tensor (cannot do in parallel)
    for i in range(tensor.shape[0]):
        # get indices of mask positions from mask array
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        # mask tensor
        tensor[i, selection] = 3
    return tensor

In [4]:
paths = [str(x) for x in Path('../../data/text/oscar_it').glob('**/*.txt')]
# initialize lists of tensors
input_ids = []
mask = []
labels = []
# open all files, encode and add to single dataset
for path in tqdm(paths[:50]):
    # :50
    # open the file and split into list by newline characters
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    # encode
    sample = tokenizer(lines, max_length=512, truncation=True, padding='max_length')
    # convert tokens to tensor
    labels.append(torch.tensor(sample.input_ids))
    # create attention mask tensor
    mask.append(torch.tensor(sample.attention_mask))
    # mask ~15% of tokens to create inputs
    input_ids.append(mlm(labels[-1].detach().clone()))
# convert lists of tensors into tensors
input_ids = torch.cat(input_ids)
mask = torch.cat(mask)
labels = torch.cat(labels)

100%|██████████| 50/50 [31:12<00:00, 37.44s/it]


We have 500000 tokenized sequences, each containing 512 tokens:

In [5]:
input_ids.shape

torch.Size([500000, 512])

In [6]:
input_ids[0]

tensor([    0,   693, 18623,  1358,  7752,     3,  1056,   280,  7405,  6321,
          776,   726,  2145,   280,    11, 10205,     3,  1266,  1810,  1197,
          604,  1142, 10293,    30,   552,   267,  1340,     3,   385,  3375,
            3,  9777,  5942,   376, 25475,  2870,  1201,   391,  2691,   421,
        17927, 16996,   739,   305,   306,     3,   376,  7950, 17824,   980,
          435, 18388,  1475,   275,  2597,   391,    37, 24909,   739,  2689,
        27869,   275,     3,   625,   770, 13459,   483,     3,   275,     3,
          532,    18,   680,     3, 24138,   376,  7752, 17630, 18623,  1134,
         8882,   269,   431,   287, 12450,   483,  8041,  6056,   275,  5286,
           18, 11755,   367,     3,  6161,   317,     3,   570,  1594, 13181,
           18,   458,    16,   372,     3,  2150, 12054,    16,     3,   317,
         6122,  5324,  3329,   570,  1594, 13181,     3,     3,    18,   763,
        12656,  6323,  2484,  6544,     3,   469,  9106,    18, 

We can see the special tokens here, `0` is our **<s\>** token, `2` our **<s\\>** token, `3` our **<mask\>** token, and at the end we have two `1` - or **<pad\>** - tokens:

In [7]:
encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}

In [8]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key, tensor in self.encodings.items()}

Next we initialize our `Dataset`.

In [9]:
dataset = Dataset(encodings)

And initialize the dataloader, which will load the data into the model during training.

In [10]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

And move onto building our model, we first need to create a RoBERTa config object, which will describe which features we want to initialize our RoBERTa model with.

In [11]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab set in previous notebook
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

Then we import and initialize a RoBERTa model with a language modeling head.

In [12]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config)

And now we move onto training. First we setup GPU/CPU usage.

In [13]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

Activate the training mode of our model, and initialize our optimizer (Adam with weighted decay - reduces chance of overfitting).

In [15]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-5)

In [16]:
# optionally, if using tensorboard, initialize writer object
writer = torch.utils.tensorboard.SummaryWriter()

Now we move onto the training loop.

In [17]:
from tqdm import tqdm  # for our progress bar

epochs = 2
step = 0

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # take loss for tensorboard
        writer.add_scalar('Loss/train', loss, step)
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        step += 1

Epoch 0:   4%|▎         | 66/1875 [11:51<5:25:14, 10.79s/it, loss=5.04]


KeyboardInterrupt: 

In [None]:
model.save_pretrained('./filiberto')