This notebook demonstrates the most important steps how to use **RoBERTa** model on **IstVoices_text** dataset. Covered topics are:
- Convert the raw data (images, annotations) into a *general* dataset  
- Prepare the *general* Istvoices dataset for the model
- Fine-tune the model on the *prepared* dataset
- Evaluate the model quantitatively

### Convert the raw data (images, annotations) into a *general* dataset

In [7]:
import datasets

dataset = datasets.load_dataset(r'istvoices_text_dataset.py', cache_dir=r'C:\Users\Habram\.cache')

Downloading and preparing dataset istvoices_text_dataset/default to C:/Users/Habram/.cache/istvoices_text_dataset/default/0.0.0/c5c3b25c6be4732e17121d07eb0006adccb33ce9c6363020451113361923e1ab...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset istvoices_text_dataset downloaded and prepared to C:/Users/Habram/.cache/istvoices_text_dataset/default/0.0.0/c5c3b25c6be4732e17121d07eb0006adccb33ce9c6363020451113361923e1ab. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
print(dataset)
print(dataset['train'][0]['tokens'])
print(dataset['train'][0]['ner_tags'])

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 150
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 50
    })
})
['Paiva', 'e', 'Filhos', 'Alameda', 'Júlia', 'Soares', '682', 'Alameda', 'Júlia', 'Soares', '3311-428', 'Vale', 'de', 'Cambra', '3311-428', 'Portugal', 'Tax', 'ID:', 'PT403497395', 'Matos', 'Avenida', 'de', 'Sousa', '980', '2429-530', 'Penafiel', 'Portugal', 'Tax', 'ID:', 'PT570406524', 'Banco', 'Santander', 'Totta', 'Swift:', 'MDPZPTBG', 'IBAN:', 'PT10203468057322023270670', 'Invoice', 'number:', '290471902', 'Period:', '01.07.2021', '-', '31.07.2021', 'Invoice', 'date:', '31.07.2021', 'Our', 'reference:', '966432533', 'ferreirarodrigo@example.net', 'Description', 'Quantity', 'Brutto', 'Professional', 'software', '22', 'EUR', '2974.1', 'Gas', '5', 'EUR', '1800.4', 'Deposit', '16', 'EUR', '1004.9', 'Premium', 'cost', '11', 'EUR', '5557.7', 'Washing', 'machine', '8', 'EUR', '4812.0', 'Printe

### Prepare the *general* Istvoices dataset for the model

In [9]:
from transformers import RobertaForTokenClassification, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [10]:
def add_encodings(example):
    """Processing the example

    Args:
        example (dict): The dataset example.

    Returns:
        dict: The dictionary containing the following updates:
            - input_ids: The list of input ids of the tokens.
            - attention_mask: The attention mask list.
            - ner_tags: The updated ner_tags.

    """
    # get the encodings of the tokens. The tokens are already split, that is why we must add is_split_into_words=True
    encodings = tokenizer(example['tokens'], truncation=True, padding='max_length', is_split_into_words=True)
    # extend the ner_tags so that it matches the max_length of the input_ids
    labels = example['ner_tags'] + [0] * (tokenizer.model_max_length - len(example['ner_tags']))
    # return the encodings and the extended ner_tags
    return { **encodings, 'labels': labels }

dataset = dataset.map(add_encodings)

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [13]:
print(dataset)
print(dataset['train'][0]['tokens'])
print(dataset['train'][0]['ner_tags'])
print(dataset['train'][0]['input_ids'])
print(dataset['train'][0]['attention_mask'])
print(dataset['train'][0]['labels'])

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 150
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
})
['Paiva', 'e', 'Filhos', 'Alameda', 'Júlia', 'Soares', '682', 'Alameda', 'Júlia', 'Soares', '3311-428', 'Vale', 'de', 'Cambra', '3311-428', 'Portugal', 'Tax', 'ID:', 'PT403497395', 'Matos', 'Avenida', 'de', 'Sousa', '980', '2429-530', 'Penafiel', 'Portugal', 'Tax', 'ID:', 'PT570406524', 'Banco', 'Santander', 'Totta', 'Swift:', 'MDPZPTBG', 'IBAN:', 'PT10203468057322023270670', 'Invoice', 'number:', '290471902', 'Period:', '01.07.2021', '-', '31.07.2021', 'Invoice', 'date:', '31.07.2021', 'Our', 'reference:', '966432533', 'ferreirarodrigo@example.net', 'Description', 'Quantity', 'Brutto', 'Professional', 'software', '22', 'EUR', '2974.1', 'Gas', '5', 'EUR', '1800.4', 'Deposit', '16', 'EUR', '1004.9', 'Premium', 

In [15]:
# format the datasets so that we return only 'input_ids', 'attention_mask' and 'labels'
# making it easier to train and validate the model
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [17]:
# get the NER labels and create two dictionaries for accessing their ids
labels = dataset['train'].features['ner_tags'].feature
label2id = { k: labels.str2int(k) for k in labels.names }
id2label = { v: k for k, v in label2id.items() }
id2label

{0: 'Other',
 1: 'R_Name',
 2: 'R_Street',
 3: 'R_HouseNumber',
 4: 'R_ZIP',
 5: 'R_City',
 6: 'R_Country',
 7: 'R_VAT',
 8: 'S_Name',
 9: 'S_Street',
 10: 'S_HouseNumber',
 11: 'S_ZIP',
 12: 'S_City',
 13: 'S_Country',
 14: 'S_VAT',
 15: 'S_Bank',
 16: 'S_BIC',
 17: 'S_IBAN',
 18: 'S_Tel',
 19: 'S_Email',
 20: 'I_Number',
 21: 'I_Date',
 22: 'I_DueDate',
 23: 'I_Amount',
 24: 'I_Currency'}

### Fine-tune the model on the *prepared* dataset

In [18]:
# initialize the model and provide the 'num_labels' used to create the classification layer
model = RobertaForTokenClassification.from_pretrained('roberta-base', id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
# set the model in 'train' mode and send it to the device
model.train().to(device)
# initialize the Adam optimizer (used for training/updating the model)
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5)

In [23]:
# set the number of epochs
n_epochs = 3
# batch the train data so that each batch contains 4 examples (using 'batch_size')
train_data = torch.utils.data.DataLoader(dataset['train'], batch_size=4)

In [26]:
import tqdm
tqdmn = tqdm.notebook.tqdm

for i, batch in enumerate(tqdmn(train_data)):
    # move the batch tensors to the same device as the
    batch = { k: v.to(device) for k, v in batch.items() }
    break
batch

  0%|          | 0/38 [00:00<?, ?it/s]

{'input_ids': tensor([[    0,  5476,  7222,  ...,     1,     1,     1],
         [    0,  7056,     6,  ...,     1,     1,     1],
         [    0,  1811,  4306,  ...,     1,     1,     1],
         [    0, 25893,   241,  ...,     1,     1,     1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 2,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [27]:
train_loss = []
# iterate through the data 'n_epochs' times
for epoch in tqdmn(range(n_epochs)):
    current_loss = 0
    # iterate through each batch of the train data
    for i, batch in enumerate(tqdmn(train_data)):
        # move the batch tensors to the same device as the
        batch = { k: v.to(device) for k, v in batch.items() }
        # send 'input_ids', 'attention_mask' and 'labels' to the model
        outputs = model(**batch)
        # the outputs are of shape (loss, logits)
        loss = outputs[0]
        # with the .backward method it calculates all
        # of  the gradients used for autograd
        loss.backward()
        # NOTE: if we append `loss` (a tensor) we will force the GPU to save
        # the loss into its memory, potentially filling it up. To avoid this
        # we rather store its float value, which can be accessed through the
        # `.item` method
        current_loss += loss.item()
        if i % 8 == 0 and i > 0:
            # update the model using the optimizer
            optimizer.step()
            # once we update the model we set the gradients to zero
            optimizer.zero_grad()
            # store the loss value for visualization
            train_loss.append(current_loss / 32)
            current_loss = 0
    # update the model one last time for this epoch
    optimizer.step()
    optimizer.zero_grad()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

KeyboardInterrupt: 