In [1]:
import os
# Expose only GPU index 0 to CUDA. Set here, ahead of the torch import further down.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
from transformers import AutoProcessor
# apply_ocr=False: words and boxes are supplied by the caller (see make_dataset) instead of
# being produced by the processor's own OCR step.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import sys
sys.path.append('../dataset')
import get_funsd


In [4]:
# Build the project's FUNSD wrapper from the ../dataset/ directory.
# NOTE(review): split_generators() presumably populates the train/test splits indexed below —
# confirm against get_funsd.
funsd = get_funsd.CustomFunsdDataset('../dataset/')
funsd.split_generators()

In [5]:
funsd

CustomFunsdDataset:
DatasetDict({
    train: Dataset({features: ['id', 'tokens', 'ner_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: 149}),
    test: Dataset({features: ['id', 'tokens', 'ner_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: 50})
})

In [6]:
# Integer class id -> IOB tag name; ids follow the order of this list.
label_map = dict(enumerate([
    'O',
    'B-HEADER', 'I-HEADER',
    'B-QUESTION', 'I-QUESTION',
    'B-ANSWER', 'I-ANSWER',
]))

In [7]:
funsd['train'][0].keys()

dict_keys(['id', 'tokens', 'ner_boxes', 'bboxes', 'ner_tags', 'line_ids', 'linkings', 'image', 'image_name'])

In [8]:
from datasets import Dataset, DatasetDict

# Columns copied from the custom FUNSD splits into plain Hugging Face datasets.
FUNSD_COLUMNS = [
    "id", "tokens", "ner_boxes", "bboxes", "ner_tags",
    "line_ids", "linkings", "image", "image_name",
]


def build_split(split_name):
    """Copy one split of `funsd` into a `datasets.Dataset`.

    Args:
        split_name: key of the split inside `funsd` ("train" or "test").

    Returns:
        A Dataset with one column per name in FUNSD_COLUMNS.
    """
    # Walk the split once and reuse the entries for every column, instead of
    # re-iterating the whole split separately for each of the nine columns.
    entries = list(funsd[split_name])
    return Dataset.from_dict(
        {column: [entry[column] for entry in entries] for column in FUNSD_COLUMNS}
    )


funsd_train_dataset = build_split("train")
funsd_test_dataset = build_split("test")

# Bundle both splits so later cells can address them as dataset["train"] / dataset["test"].
dataset = DatasetDict({
    "train": funsd_train_dataset,
    "test": funsd_test_dataset,
})


In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_boxes', 'bboxes', 'ner_tags', 'line_ids', 'linkings', 'image', 'image_name'],
        num_rows: 149
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_boxes', 'bboxes', 'ner_tags', 'line_ids', 'linkings', 'image', 'image_name'],
        num_rows: 50
    })
})

In [10]:
import torch

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device  # echo the selected device as the cell output

device(type='cpu')

In [11]:
def unnormalize_box(bbox, width, height):
    """Scale a box from the 0-1000 grid to a `width` x `height` image.

    Args:
        bbox: sequence whose first four items are x1, y1, x2, y2 on the 0-1000 grid.
        width: target image width (scales the x coordinates).
        height: target image height (scales the y coordinates).

    Returns:
        List of four scaled coordinates [x1, y1, x2, y2].
    """
    # x coordinates sit at even positions, y coordinates at odd positions.
    extents = (width, height, width, height)
    return [extents[k] * (bbox[k] / 1000) for k in range(4)]

def normalize_box(bbox, width, height):
    """Scale a box from a `width` x `height` image onto the 0-1000 grid.

    Args:
        bbox: sequence whose first four items are x1, y1, x2, y2 in image units.
        width: image width (divides the x coordinates).
        height: image height (divides the y coordinates).

    Returns:
        List of four floats [x1, y1, x2, y2]; values are not rounded or clamped.
    """
    x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
    scaled_x1 = x1 / width * 1000
    scaled_y1 = y1 / height * 1000
    scaled_x2 = x2 / width * 1000
    scaled_y2 = y2 / height * 1000
    return [scaled_x1, scaled_y1, scaled_x2, scaled_y2]


In [12]:
# Sanity check: un-normalizing and re-normalizing the first box should give the box back.
example = dataset['train'][0]
first_box = example['bboxes'][0]
page_width, page_height = example['image'].width, example['image'].height
round_trip = normalize_box(
    unnormalize_box(first_box, page_width, page_height), page_width, page_height
)
round_trip, first_box

([292.0, 91.0, 376.0, 175.0], [292, 91, 376, 175])

In [13]:

import numpy as np
def make_dataset(documents):
    """Encode a batch of raw documents with the module-level `processor`.

    Args:
        documents: batch mapping with the columns 'image', 'tokens', 'bboxes'
            and 'ner_tags', each holding one entry per document.

    Returns:
        The processor's encoding, with the 'offset_mapping' entry removed.
    """
    page_images = documents['image']

    # Every token box is scaled by the size of the page it belongs to.
    scaled_boxes = [
        [
            normalize_box(box, page_images[doc_idx].width, page_images[doc_idx].height)
            for box in doc_boxes
        ]
        for doc_idx, doc_boxes in enumerate(documents['bboxes'])
    ]

    encoding = processor(
        [page.convert('RGB') for page in page_images],
        documents['tokens'],
        boxes=scaled_boxes,
        word_labels=documents['ner_tags'],
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,
    )

    # The offsets are requested above but are not part of the returned columns.
    encoding.pop('offset_mapping')
    return encoding

In [14]:
# Raw column names; passed as remove_columns to map() so only the encoded columns remain.
cols = funsd_train_dataset.column_names



Standardization: Many popular image classification models, like AlexNet, ResNet, and Vision Transformers, originally used 224x224 as a default input size. It has since become a standard, simplifying the process of reusing pre-trained models and ensuring consistency across various tasks.

Computational Efficiency: A size of 224x224 provides a good balance between image detail and computational cost. If images were too large, the model would require significantly more memory and processing power, but if they were too small, important details could be lost. 224x224 strikes a balance where enough spatial information is preserved while keeping the model efficient.

Training and Pre-training: Many vision backbones are pre-trained on large datasets such as ImageNet, where images are typically resized to 224x224. LayoutLMv3 itself is pre-trained on scanned document images rather than ImageNet, but it also uses 224x224 inputs. Using the same input size during fine-tuning keeps the inputs compatible with the pre-trained weights and architecture, so transfer learning works as intended.

The resize to 224x224 is done automatically by the LayoutLMv3 processor, so no manual image resizing is needed in this notebook.

Bounding boxes: Stay on a 0-1000 scale to maintain relative spatial positioning of text or entities in the document.
Images: Resized to 224x224 pixels to meet the input size requirement of the visual part of the model.

In [15]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

# Schema of the encoded examples returned by make_dataset.
features = Features(dict(
    pixel_values=Array3D(dtype="float32", shape=(3, 224, 224)),
    input_ids=Sequence(Value("int64")),
    attention_mask=Sequence(Value("int64")),
    bbox=Array2D(dtype="int64", shape=(512, 4)),
    labels=Sequence(Value("int64")),
))

# Encode the training split batch-wise and drop the raw columns.
train_dataset = dataset["train"].map(make_dataset, batched=True, remove_columns=cols, features=features)

Map: 100%|██████████| 149/149 [00:03<00:00, 49.15 examples/s]


In [16]:
# Encode the test split with the same function, column removal and schema as the training split.
test_dataset = dataset["test"].map(make_dataset, batched=True, remove_columns=cols, features=features)

Map: 100%|██████████| 50/50 [00:00<00:00, 50.90 examples/s]


In [17]:
print(processor.tokenizer.decode(train_dataset[0]["input_ids"]))

<s> R&D : Suggestion: Date: Licensee Yes No 597005708 R&D QUALITY IMPROVEMENT SUGGESTION/ SOLUTION FORM Name / Phone Ext. : M. Hamann P. Harper, P. Martinez 9/ 3/ 92 R&D Group: J. S. Wigand Supervisor / Manager Discontinue coal retention analyses on licensee submitted product samples (Note : Coal Retention testing is not performed by most licensees. Other B&W physical measurements as ends stability and inspection for soft spots in ciparettes are thought to be sufficient measures to assure cigarette physical integrity. The proposed action will increase laboratory productivity . ) Suggested Solutions (s) : Delete coal retention from the list of standard analyses performed on licensee submitted product samples. Special requests for coal retention testing could still be submitted on an exception basis. Have you contacted your Manager/ Supervisor? Manager Comments: Manager, please contact suggester and forward comments to the Quality Council. qip . wp</s><pad><pad><pad><pad><pad><pad><pad><

In [18]:
# Switch both encoded splits to the "torch" output format.
for encoded_split in (train_dataset, test_dataset):
    encoded_split.set_format("torch")

In [19]:
# Rebinds `example` to the first *encoded* training example and prints each field's shape.
example = train_dataset[0]
for field_name, tensor in example.items():
    print(field_name, tensor.shape)


pixel_values torch.Size([3, 224, 224])
input_ids torch.Size([512])
attention_mask torch.Size([512])
bbox torch.Size([512, 4])
labels torch.Size([512])


In [20]:
# Preview the first 11 token/label pairs of the first encoded training example.
# Labels equal to -100 are the positions compute_metrics filters out.
first_encoded = train_dataset[0]  # fetch the row once instead of once per column
for token_id, label in zip(first_encoded["input_ids"][:11], first_encoded["labels"][:11]):
    print(processor.tokenizer.decode([token_id]), label.item())

<s> -100
 R 0
& -100
D -100
 : 3
 Suggest 3
ion -100
: -100
 Date 3
: -100
 License 5


In [21]:
from evaluate import load 
metric = load("seqeval")  # scorer used by compute_metrics (metric.compute on tag-name sequences)


In [22]:
# Tag names ordered by class id. Derived from label_map so the id -> name mapping is
# defined in exactly one place and the two cannot drift apart.
label_list = [label_map[class_id] for class_id in sorted(label_map)]

In [23]:
import numpy as np

# When True, compute_metrics returns every seqeval entry (nested dicts flattened);
# when False, only the four overall scores.
return_entity_level_metrics = False


def compute_metrics(p):
    """Score token-classification predictions with the module-level `metric`.

    Args:
        p: pair (predictions, labels); predictions carry per-class scores on
            axis 2, labels hold class ids with -100 at positions to ignore.

    Returns:
        Dict with precision / recall / f1 / accuracy, or, when
        `return_entity_level_metrics` is set, the full result with nested
        dictionaries flattened into "<key>_<subkey>" entries.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions whose gold label is not the ignore index.
    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Flatten one level of nesting so every value sits under a single string key.
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            final_results.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            final_results[key] = value
    return final_results

In [24]:
# Tag name -> class id (inverse of label_map); passed to the model as label2id.
label_map_reversed = dict(zip(label_map.values(), label_map.keys()))


In [31]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers.data.data_collator import default_data_collator
import optuna
import os
from transformers import LayoutLMv3ForTokenClassification
from transformers.trainer_callback import TrainerCallback


# Per-trial record keyed by Optuna trial number; filled in by objective().
hyperparameter_dict = {}

def objective(trial):
    """Optuna objective: fine-tune LayoutLMv3 once and return the hold-out F1.

    Samples dropout, learning rate and weight decay from `trial`, trains on the
    first 119 encoded training examples, evaluates on the remaining ones, and
    stores the run's results in the module-level `hyperparameter_dict` under
    `trial.number`.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The "eval_f1" value of the final evaluation (the quantity being maximized).
    """
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-4, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-2, log=True)
    batch_size = 2

    # The dropout values must be in the config *when the model is built*: assigning
    # them to model.config afterwards does not change the already-created dropout
    # layers, so the sampled value would never be used.
    model = LayoutLMv3ForTokenClassification.from_pretrained(
        "microsoft/layoutlmv3-base",
        id2label=label_map,
        label2id=label_map_reversed,
        hidden_dropout_prob=dropout,
        attention_probs_dropout_prob=dropout,
    ).to(device)

    # One directory per trial so checkpoints of different trials do not overwrite
    # (or, with save_total_limit=1, delete) each other.
    trial_output_dir = f'./results/trial_{trial.number}'
    training_args = TrainingArguments(
        output_dir=trial_output_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,  # a single epoch keeps each trial short; raise for a real search
        weight_decay=weight_decay,
        logging_steps=10,
        evaluation_strategy="epoch",
        metric_for_best_model="eval_f1",
        load_best_model_at_end=True,
        save_strategy="epoch",
        save_total_limit=1
    )

    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=5)

    # First 119 encoded training examples are used for fitting, the rest for validation.
    n_fit = 119
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset.select(range(n_fit)),
        eval_dataset=train_dataset.select(range(n_fit, len(train_dataset))),
        tokenizer=processor,
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

    train_result = trainer.train()

    # Collect one record per evaluation logged during training, paired with the most
    # recently logged training loss. Done before the extra evaluate() call below so
    # that call's log entry is not counted as an additional epoch.
    epoch_metrics = []
    latest_train_loss = None
    for log_entry in trainer.state.log_history:
        if "loss" in log_entry:
            latest_train_loss = log_entry["loss"]
        if "eval_loss" in log_entry:
            record = dict(log_entry)
            if latest_train_loss is not None:
                record["loss"] = latest_train_loss
            epoch_metrics.append(record)

    eval_results = trainer.evaluate()

    hyperparameter_dict[trial.number] = {
        "final_eval_results": eval_results,
        "training_history": train_result.metrics,
        "epoch_metrics": epoch_metrics,
        "hyperparameters": {
            "dropout": dropout,
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "batch_size": batch_size,
        }
    }

    return eval_results["eval_f1"]

In [32]:
# Run the optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1)  # Adjust number of trials as needed

# After optimization, report what was recorded for each trial
for trial_number, trial_data in hyperparameter_dict.items():
    print(f"Trial {trial_number}:")
    print("Hyperparameters:", trial_data["hyperparameters"])
    print("Final Eval F1:", trial_data["final_eval_results"]["eval_f1"])
    print("Epoch-wise metrics:")
    # A trial record may lack "epoch_metrics"; fall back to an empty list
    # instead of aborting the whole report with a KeyError.
    for epoch, metrics in enumerate(trial_data.get("epoch_metrics", [])):
        print(f"  Epoch {epoch + 1}:")
        print(f"    Train Loss: {metrics.get('loss', 'N/A')}")
        print(f"    Eval Loss: {metrics.get('eval_loss', 'N/A')}")
        print(f"    Eval F1: {metrics.get('eval_f1', 'N/A')}")
    print("\n")

[I 2024-10-15 15:56:36,172] A new study created in memory with name: no-name-89e850e4-2bfa-4f4d-8075-2ca6ad951351
Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[W 2024-10-15 15:56:38,332] Trial 0 failed with parameters: {'dropout': 0.32637578131819545, 'learning_rate': 3.415774190323937e-05, 'weight_decay': 4.516419320497657e-05} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\PX\Master\διπλωμαιτκ\venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\PX\AppData\Local\Temp\ipykernel_17428\4215495837.py", line 53, in objective
    train_result = trainer.train()
             

KeyboardInterrupt: 

In [192]:
import matplotlib.pyplot as plt

def plot_trial_metrics(trial_number, step_interval=100):
    """Plot the recorded train loss, eval loss and eval F1 of one trial.

    Values are read from the trial's "training_history" entry in
    `hyperparameter_dict`; a metric missing there is looked up in
    "final_eval_results". Each metric may be a list of values or a single
    number (a single number is drawn as one point). Metrics that are not
    recorded at all are skipped instead of raising.

    Args:
        trial_number: key of the trial inside `hyperparameter_dict`.
        step_interval: spacing assumed between consecutive recorded values on
            the x axis (the x positions are 0, step_interval, 2*step_interval, ...).
    """
    trial_data = hyperparameter_dict[trial_number]
    history = trial_data["training_history"]
    final_eval = trial_data.get("final_eval_results", {})

    def as_series(key):
        # Normalize "missing", "single number" and "list of numbers" to a list.
        value = history.get(key, final_eval.get(key))
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    fig, ax = plt.subplots(figsize=(12, 6))
    curves = (
        ("train_loss", 'Train Loss'),
        ("eval_loss", 'Eval Loss'),
        ("eval_f1", 'Eval F1'),
    )
    for key, curve_label in curves:
        values = as_series(key)
        if not values:
            continue
        steps = list(range(0, len(values) * step_interval, step_interval))
        # Markers keep one-point series visible (a one-point line draws nothing).
        ax.plot(steps, values, marker='o', label=curve_label)

    ax.set_xlabel('Steps')
    ax.set_ylabel('Value')
    ax.set_title(f'Metrics for Trial {trial_number}')
    ax.legend()
    plt.show()

# After your Optuna study is complete:
for trial_number in hyperparameter_dict.keys():
    plot_trial_metrics(trial_number)

In [30]:
from transformers import AutoModelForTokenClassification

# Forward slashes only: the previous ".\l..." spelling contained an invalid "\l" escape
# sequence and a Windows-only separator. The directory name itself is left untouched
# because it has to match the checkpoint folder on disk.
model = AutoModelForTokenClassification.from_pretrained("./laoutlm3_output/checkpoint-100")

In [33]:
def getf1(example):
    """Count correctly predicted labels for one raw (un-encoded) example.

    Despite its name, this function does not compute an F1 score: it returns
    the counts needed for token-level accuracy. Only positions whose label is
    not -100 are counted, and inputs are truncated by the processor, so words
    beyond the truncation limit are not scored.

    Args:
        example: raw example with the keys "image", "tokens", "bboxes" and "ner_tags".

    Returns:
        Tuple (correct_predictions, all_tokens): number of scored positions where
        the predicted class equals the gold class, and number of scored positions.
    """
    image = example["image"].convert("RGB")
    words = example["tokens"]

    # Plain Python ints rather than tensors for the processor inputs; int() applies
    # the same truncation the former torch.long conversion did.
    normalized_boxes = [
        [int(coord) for coord in normalize_box(box, image.width, image.height)]
        for box in example["bboxes"]
    ]
    word_labels = [int(tag) for tag in example["ner_tags"]]

    encoding = processor(image, words, boxes=normalized_boxes, word_labels=word_labels,
                         truncation=True, padding="max_length", return_tensors="pt")
    # Keep the inputs on the same device as the model.
    encoding = encoding.to(model.device)

    with torch.no_grad():
        outputs = model(**encoding)

    # Batch of one: take row 0 of the predictions and of the labels.
    predictions = outputs.logits.argmax(-1)[0].tolist()
    labels = encoding["labels"][0].tolist()

    scored = [(pred, gold) for pred, gold in zip(predictions, labels) if gold != -100]
    correct_predictions = sum(pred == gold for pred, gold in scored)
    all_tokens = len(scored)

    return correct_predictions, all_tokens


In [None]:
# Sum the per-document (correct, total) counts returned by getf1 over the raw test split.
per_document = [getf1(funsd_test_dataset[idx]) for idx in range(len(funsd_test_dataset))]
correct = sum(hits for hits, _ in per_document)
all_tokens = sum(total for _, total in per_document)


In [None]:
correct/all_tokens

In [None]:
from PIL import ImageDraw, ImageFont

# NOTE(review): `image` is not assigned at notebook level anywhere above this cell in the
# visible source (only inside getf1, as a local) — this relies on leftover kernel state; confirm.
draw = ImageDraw.Draw(image)

# Font used for the text labels drawn in the loops below.
font = ImageFont.load_default()

def iob_to_label(label):
    """Drop the first two characters of a tag (e.g. "B-") and return the rest.

    Returns 'other' when nothing is left after the cut, as for the tag "O".
    """
    return label[2:] or 'other'

# Outline / text colour per lower-cased entity type.
label2color = {'question':'blue', 'answer':'green', 'header':'orange', 'other':'violet'}

# Draw each predicted label with its box on the image.
# NOTE(review): `true_predictions` and `true_boxes` are not defined at notebook level in the
# visible source (getf1 only has a local `true_predictions`) — this loop relies on leftover
# kernel state; confirm where they should come from.
for prediction, box in zip(true_predictions,true_boxes ):
    predicted_label = iob_to_label(prediction).lower()
    draw.rectangle(box, outline=label2color[predicted_label])
    draw.text((box[0] + 10, box[1] - 10), text=predicted_label, fill=label2color[predicted_label], font=font)

image

In [None]:
# Draw the ground-truth labels of `example` on its page image.
# NOTE(review): on a top-to-bottom run `example` was last rebound to train_dataset[0], an
# encoded example whose fields are pixel_values / input_ids / attention_mask / bbox / labels,
# so the "image", "tokens", "bboxes" and "ner_tags" lookups below need a raw example
# (e.g. one taken from `dataset`) — confirm which document is intended.
image = example["image"]
image = image.convert("RGB")

draw = ImageDraw.Draw(image)

for word, box, label in zip(example['tokens'], example['bboxes'], example['ner_tags']):
  # Map the integer tag to its name, strip the IOB prefix, and colour by entity type.
  actual_label = iob_to_label(label_map[label]).lower()
  draw.rectangle(box, outline=label2color[actual_label], width=2)
  draw.text((box[0] + 10, box[1] - 10), actual_label, fill=label2color[actual_label], font=font)

image

In [None]:
# TODO: put an image from epant here