### Load and analyze the dataset

In [1]:
from datasets import load_dataset

# Load the custom dataset (or generate it if it does not exist in the cache yet)
#   path:      location of the dataset loader script
#   cache_dir: location of the cache folder, where the dataset will be saved
# NOTE(review): both paths are absolute and machine-specific; they must be adapted
# before the notebook can run anywhere else.
dataset = load_dataset(
    path=r'C:\Users\Habram\Documents\thesis-masters\IstVoices_de\istvoices_dataset_de.py',
    cache_dir=r'C:\Users\Habram\.cache',
)

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset istvoices_dataset_de (C:/Users/Habram/.cache/istvoices_dataset_de/default/0.0.0/1f9f58bf326613f8f47333b34087510d0faf21ae9d46bc895f367a6c1810165e)
100%|██████████| 2/2 [00:00<00:00, 114.40it/s]


In [2]:
# Show the available splits together with their feature names and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'bboxes', 'ner_tags', 'image'],
        num_rows: 150
    })
    test: Dataset({
        features: ['id', 'tokens', 'bboxes', 'ner_tags', 'image'],
        num_rows: 50
    })
})

In [3]:
# Upload of the dataset to the Hugging Face Hub, left disabled (uncomment to run):
#dataset.push_to_hub('ihabram/istvoices_de')

In [4]:
# Inspect the feature schema (column types and NER tag names) of the train split
dataset['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'bboxes': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-R_NAME', 'I-R_NAME', 'B-R_STREET', 'I-R_STREET', 'B-R_HOUSENUMBER', 'I-R_HOUSENUMBER', 'B-R_ZIP', 'I-R_ZIP', 'B-R_CITY', 'I-R_CITY', 'B-R_COUNTRY', 'I-R_COUNTRY', 'B-S_NAME', 'I-S_NAME', 'B-S_STREET', 'I-S_STREET', 'B-S_HOUSENUMBER', 'I-S_HOUSENUMBER', 'B-S_ZIP', 'I-S_ZIP', 'B-S_CITY', 'I-S_CITY', 'B-S_COUNTRY', 'I-S_COUNTRY', 'B-S_BANK', 'I-S_BANK', 'B-S_IBAN', 'I-S_IBAN', 'B-I_NUMBER', 'I-I_NUMBER', 'B-I_DATE', 'I-I_DATE', 'B-I_AMOUNT', 'I-I_AMOUNT'], id=None), length=-1, id=None),
 'image': Image(decode=True, id=None)}

### Prepare dataset for the model

In [5]:
from datasets.features import ClassLabel

# Schema and column names of the training split
features = dataset["train"].features
column_names = dataset["train"].column_names

# Names of the dataset columns consumed during preprocessing
image_column_name, text_column_name = "image", "tokens"
boxes_column_name, label_column_name = "bboxes", "ner_tags"

# Two-way mapping between integer class IDs and NER tag names
label_list = features[label_column_name].feature.names
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in enumerate(label_list)}

In [6]:
from transformers import LayoutLMv2ImageProcessor, LayoutXLMTokenizer
from transformers import LayoutLMv2ForTokenClassification

# Single source of truth for the checkpoint shared by the tokenizer and the model
MODEL_CHECKPOINT = "microsoft/layoutxlm-base"

# OCR is switched off in the image processor: the dataset already provides
# the tokens and their bounding boxes
image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
tokenizer = LayoutXLMTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Load the pre-trained model which will be fine-tuned; the label mappings
# are stored in the model config
model = LayoutLMv2ForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at microsoft/layoutxlm-base were not used when initializing LayoutLMv2ForTokenClassification: ['layoutlmv2.visual.backbone.bottom_up.res2.2.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.20.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res5.0.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.0.shortcut.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.13.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res3.2.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.11.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.7.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res2.1.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.9.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res2.1.conv2.norm.num_batches_tracked', 'lay

In [8]:
# Define a function that takes examples from the general-purpose dataset and processes them
# with the model's tokenizer and image processor.
# The result matches the input format expected by the model.

def prepare_examples(examples):
    """Encode dataset examples into the inputs used by the model.

    The words, bounding boxes and word-level labels are passed to the
    tokenizer (truncated and padded to 512 tokens), and the images are
    passed to the image processor.

    Args:
        examples: Mapping that holds the image, token, bounding-box and
            label columns, looked up through the ``*_column_name`` globals.

    Returns:
        The tokenizer encoding, extended with an ``'image'`` entry that
        contains the pixel values produced by the image processor.

    NOTE(review): the ``.map`` calls in this notebook do not pass
    ``batched=True``, so this presumably receives one example at a time and
    ``return_tensors='pt'`` adds a leading axis of size 1 -- confirm.
    """
    encoding = tokenizer(
        examples[text_column_name],
        boxes=examples[boxes_column_name],
        word_labels=examples[label_column_name],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors='pt',
    )
    # Attach the processed image under the 'image' key next to the text features
    encoding['image'] = image_processor(
        examples[image_column_name], return_tensors='pt'
    ).pixel_values

    return encoding

# Convert both splits of the general-purpose dataset into model-specific datasets.
# The original columns are removed, so only the encoded fields remain.
train_dataset, test_dataset = (
    dataset[split].map(prepare_examples, remove_columns=column_names)
    for split in ("train", "test")
)

Loading cached processed dataset at C:\Users\Habram\.cache\istvoices_dataset_de\default\0.0.0\1f9f58bf326613f8f47333b34087510d0faf21ae9d46bc895f367a6c1810165e\cache-3b056bad6373e850.arrow
                                                           

In [17]:
# Make indexing into train_dataset return PyTorch tensors (modifies the dataset in place)
train_dataset.set_format("torch")

In [37]:
import torch

# Forward pass over the first training example only, without gradient tracking.
# Hidden states are not requested: only the logits are kept, so asking the model
# to return every layer's hidden states would be discarded work.
with torch.no_grad():
    outputs = model(**train_dataset[0]).logits

In [55]:
# Reference label IDs of the example that was fed to the model
labels = train_dataset[0]['labels']
# Predicted class ID = index of the highest logit along the last axis
predictions = outputs.argmax(dim=-1)

In [56]:
# Pair each predicted ID with its reference label ID and drop every position
# whose label is -100, so the filter is applied only once
kept_pairs = [
    [(p, l) for p, l in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

# Translate the remaining IDs into tag names
true_predictions = [[label_list[p] for p, _ in pairs] for pairs in kept_pairs]
true_labels = [[label_list[l] for _, l in pairs] for pairs in kept_pairs]