### Load and analyze the dataset

In [1]:
from datasets import load_dataset

# Load the custom dataset (or it if it does not exist)
# Param 1: location of the dataset loader script
# Param 2: location of cache folder, where the dataset will be saved
dataset = load_dataset(r'C:\Users\Habram\Documents\thesis-masters\IstVoices_de\istvoices_dataset_de.py',
                       cache_dir=r'C:\Users\Habram\.cache')

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset istvoices_dataset_de (C:/Users/Habram/.cache/istvoices_dataset_de/default/0.0.0/1f9f58bf326613f8f47333b34087510d0faf21ae9d46bc895f367a6c1810165e)
100%|██████████| 2/2 [00:00<00:00, 62.56it/s]


### Load the model, tokenizer, image processor

In [2]:
from transformers import LayoutLMv2ImageProcessor, LayoutXLMTokenizer
from transformers import LayoutLMv2Model
import torch

image_processor = LayoutLMv2ImageProcessor(apply_ocr=False)
tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")

# Load the pre-trained model which will be fine-tuned
model = LayoutLMv2Model.from_pretrained("microsoft/layoutxlm-base")

Some weights of the model checkpoint at microsoft/layoutxlm-base were not used when initializing LayoutLMv2Model: ['layoutlmv2.visual.backbone.bottom_up.res4.15.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.0.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.0.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.10.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res2.0.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.6.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res5.1.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.6.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.2.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.7.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.21.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backb

### Extract the embeddings (output of the LayoutXLM model)

In [3]:
from dataclasses import dataclass

@dataclass
class DataCollatorForTokenClassification:
    feature_extractor: LayoutLMv2ImageProcessor
    tokenizer: LayoutXLMTokenizer

    def __call__(self, examples):
        tokens = []
        bboxes = []
        ner_tags = []
        images = []

        for ex in examples:
            images.append(ex['image'])

        img_features = image_processor(images, return_tensors='pt').pixel_values

        for ex in examples:
            tokens.append(ex['tokens'])
            bboxes.append(ex['bboxes'])
            ner_tags.append(ex['ner_tags'])

        encoding = tokenizer(tokens, 
                     boxes=bboxes, 
                     word_labels=ner_tags, 
                     truncation=True, 
                     padding="max_length", 
                     max_length=512,
                     return_tensors='pt'
                     )

        encoding['image'] = img_features

        return encoding
    
data_collator = DataCollatorForTokenClassification(
    image_processor,
    tokenizer,
)

from torch.utils.data import DataLoader

dataloader = DataLoader(dataset['train'], 
                        batch_size=1, 
                        collate_fn=data_collator)

In [13]:
import numpy as np

def group_subword_embeddings(tokens, embeddings):
    groups = []
    current_group = []
    for idx, token in enumerate(tokens):
        # !Not an underscore! _▁, but a special character
        if token.startswith('▁'):
            if len(current_group) > 0:
                groups.append(current_group)
            current_group = [idx]
        elif token == '<s>' or token == '</s>' or token == '<pad>':
            continue
        else:
            current_group.append(idx)
    groups.append(current_group)

    grouped_embeddings = []
    grouped_tokens = []
    for group in groups:
        grouped_tokens.append(''.join([tokens[i].lstrip('▁') for i in group]))
        grouped_embeddings.append(np.array(embeddings[group[0]: group[-1]+1].mean(axis=0)))

    return grouped_tokens, np.array(grouped_embeddings)

In [23]:
from tqdm import tqdm

size_of_dataset = 150

iterator = iter(dataloader)

all_labels = []
all_embeddings = []
all_tokens = []

# Iterate over the dataset
for iteration in tqdm(range(size_of_dataset)):
    # Preprocess one example
    example = next(iterator)

    # Remove the labels from the example + remove the -100 elements
    labels = example.pop('labels')
    labels = np.array(labels)
    labels = np.delete(labels, np.where(labels == -100))
    labels = torch.tensor(labels)

    # Perform a forward-pass on the XLM model
    with torch.no_grad():
        output = model(**example)

    # Get the output embeddings of the XLM model
    xlm_embeddings = output['last_hidden_state'][0]

    # Get the input IDs
    input_ids = example["input_ids"]
    
    # Convert the input IDs to sub-words
    tokens = tokenizer.convert_ids_to_tokens(*input_ids)

    # Group the sub-words and embeddings
    joined_tokens, joined_embeddings = group_subword_embeddings(tokens, xlm_embeddings)

    all_labels.append(labels)
    all_embeddings.append(joined_embeddings)
    all_tokens.append(joined_tokens)

100%|██████████| 150/150 [03:20<00:00,  1.33s/it]


In [25]:
print(all_embeddings[11].shape,
      len(all_tokens[11]),
      all_labels[11].size())

(143, 768) 143 torch.Size([143])


In [28]:
train_data = {
    'embeddings': all_embeddings,
    'tokens':     all_tokens,
    'labels':     all_labels
}

AttributeError: 'list' object has no attribute 'tolist'

In [27]:
import json

filename = 'training_data.json'
with open(filename, 'w') as file_object:
    json.dump(train_data, file_object)

TypeError: Object of type ndarray is not JSON serializable

In [None]:
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

import torch.nn as nn
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(768, 35)

    def forward(self, x):
        x = self.fc(x)
        return x
    
net = Net()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [None]:
optimizer.zero_grad()
classifier_input = torch.Tensor(joined_embeddings)
classifier_output = net(classifier_input)

In [None]:
classifier_output.size()

In [None]:
target = encoded_input['labels']
target = np.array(target)
target = np.delete(target, np.where(target == -100))
target = torch.tensor(target)

In [None]:
loss = criterion(classifier_output, target)

In [None]:
loss.backward()
optimizer.step()

In [None]:
loss