In [1]:
from datasets import load_dataset 

# FUNSD packaged for LayoutLMv3; this dataset uses the new Image feature :)
# The "train" and "test" splits of the returned object are the ones consumed
# by the preprocessing `map` calls further down.
dataset = load_dataset("nielsr/funsd-layoutlmv3")

from transformers import AutoProcessor

# we'll use the Auto API here - it will load LayoutLMv3Processor behind the scenes,
# based on the checkpoint we provide from the hub.
# apply_ocr=False because the words and their boxes are passed in explicitly
# from the dataset columns when the processor is called in `prepare_examples`.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

Found cached dataset funsd-layoutlmv3 (C:/Users/Habram/.cache/huggingface/datasets/nielsr___funsd-layoutlmv3/funsd/1.0.0/0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from datasets.features import ClassLabel

# Schema and column names of the raw training split. `column_names` is later
# handed to `map(remove_columns=...)` so that only the processor outputs remain.
# NOTE: `features` is rebound further down in this cell to the schema of the
# encoded dataset; from that point on it no longer refers to the raw schema.
features = dataset["train"].features
column_names = dataset["train"].column_names
# Names of the raw columns read by `prepare_examples`.
image_column_name = "image"
text_column_name = "tokens"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"

# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
# unique labels.
def get_label_list(labels):
    """Collect every distinct label that occurs in a collection of label sequences.

    Args:
        labels: iterable of per-example label sequences (each itself iterable).

    Returns:
        A list holding each distinct label once, sorted in ascending order.
    """
    distinct = {tag for sequence in labels for tag in sequence}
    return sorted(distinct)

# Resolve the label vocabulary: take the names straight from the ClassLabel
# feature when the column is typed that way (the stored labels are already
# ints, so no conversion is needed); otherwise scan the training split for the
# distinct label values.
label_list = (
    features[label_column_name].feature.names
    if isinstance(features[label_column_name].feature, ClassLabel)
    else get_label_list(dataset["train"][label_column_name])
)
# Both lookup tables are derived from the position of each label in `label_list`.
id2label = dict(enumerate(label_list))
label2id = {name: index for index, name in enumerate(label_list)}
num_labels = len(label_list)

def prepare_examples(examples):
  """Encode a batch of raw examples with the module-level `processor`.

  Args:
      examples: batch mapping that provides the image, token, bounding-box and
          label columns named by the `*_column_name` variables.

  Returns:
      Whatever the processor returns for the batch, with truncation enabled
      and every sequence padded to the maximum length.
  """
  return processor(
      examples[image_column_name],
      examples[text_column_name],
      boxes=examples[boxes_column_name],
      word_labels=examples[label_column_name],
      truncation=True,
      padding="max_length",
  )

from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

# we need to define custom features for `set_format` (used later on) to work properly.
# This rebinds `features`, replacing the raw-dataset schema read at the top of the
# cell with the schema of the encoded examples produced by `prepare_examples`.
features = Features({
    # image tensor: 3 channels x 224 x 224
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    # one 4-value box per position; 512 presumably matches the padded sequence
    # length produced by padding="max_length" - TODO confirm against the processor
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(feature=Value(dtype='int64')),
})

# Encode both splits with identical settings: batched preprocessing, the raw
# columns dropped, and the encoded schema declared in `features` applied.
# "train" is processed first, then "test".
train_dataset, eval_dataset = (
    dataset[split].map(
        prepare_examples,
        batched=True,
        remove_columns=column_names,
        features=features,
    )
    for split in ("train", "test")
)

Loading cached processed dataset at C:\Users\Habram\.cache\huggingface\datasets\nielsr___funsd-layoutlmv3\funsd\1.0.0\0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9\cache-b8cd7aaa41f679ea.arrow
Loading cached processed dataset at C:\Users\Habram\.cache\huggingface\datasets\nielsr___funsd-layoutlmv3\funsd\1.0.0\0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9\cache-d5c4c7cf6684c166.arrow


In [14]:
import numpy as np
from seqeval.metrics import classification_report
from datasets import load_metric

# seqeval scorer used by `compute_metrics`.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets` releases
# in favor of `evaluate.load("seqeval")` - confirm against the installed version
# before upgrading.
metric = load_metric("seqeval")

# True: `compute_metrics` returns the flattened per-entity scores plus the
# overall_* scores. False: it returns only precision / recall / f1 / accuracy.
return_entity_level_metrics = True

def compute_metrics(p):
    """Score token-classification predictions with the module-level seqeval `metric`.

    Args:
        p: pair `(predictions, labels)`. `predictions` holds per-token class
            scores and is reduced with an argmax over axis 2; `labels` holds
            the reference label ids, where -100 marks positions to ignore.

    Returns:
        When `return_entity_level_metrics` is true, the metric results with any
        nested dictionaries flattened into "<key>_<subkey>" entries. Otherwise
        a dictionary with "precision", "recall", "f1" and "accuracy" taken
        from the overall_* results.
    """
    scores, references = p
    predicted_ids = np.argmax(scores, axis=2)

    # Remove ignored index (special tokens) and map the surviving ids to label names.
    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, references):
        kept = [
            (predicted_id, reference_id)
            for predicted_id, reference_id in zip(predicted_row, reference_row)
            if reference_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[reference_id] for _, reference_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Unpack nested dictionaries into flat "<key>_<subkey>" entries.
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            final_results.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            final_results[key] = value
    return final_results

In [15]:
from transformers import AutoModelForTokenClassification

# Load fine-tuned weights from a local checkpoint directory. The path sits under
# the "test" output_dir used by the TrainingArguments in this notebook, so it is
# presumably the step-1000 checkpoint of an earlier training run - the directory
# must already exist on disk for this cell to work on a fresh kernel.
model = AutoModelForTokenClassification.from_pretrained("test/checkpoint-1000")

In [16]:
from transformers import TrainingArguments, Trainer

# Training / evaluation configuration.
# `compute_metrics` reports the F1 score under "overall_f1" when
# `return_entity_level_metrics` is True and under "f1" otherwise. The metric used
# to pick the best checkpoint has to follow that switch: a hard-coded "f1" is
# absent from the metrics in entity-level mode, so best-model selection
# (load_best_model_at_end=True) would fail on the missing key during training.
training_args = TrainingArguments(output_dir="test",
                                  max_steps=1000,
                                  per_device_train_batch_size=2,
                                  per_device_eval_batch_size=2,
                                  learning_rate=1e-5,
                                  evaluation_strategy="steps",
                                  eval_steps=100,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="overall_f1" if return_entity_level_metrics else "f1")

In [17]:
from transformers.data.data_collator import default_data_collator

# Initialize our Trainer.
# The default collator is used as-is: `prepare_examples` already pads every
# example with padding="max_length", so no dynamic padding is requested here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,  # the LayoutLMv3 processor is passed in the tokenizer slot
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

In [26]:
# Run inference over the encoded test split. Element 0 of the result holds the
# per-token class scores, which the next cell reduces with an argmax.
result = trainer.predict(eval_dataset)



  0%|          | 0/25 [00:00<?, ?it/s]

[[[ 0.5227469  -0.61665034 -0.72801656 ... -0.5851758  -0.59295017
    0.1891223 ]
  [-1.2408422   1.1296552  -2.1369545  ... -1.2010559  -1.888513
   -2.1990263 ]
  [-1.301788   -1.0314339  -1.0290934  ...  4.3501344  -2.3781564
   -1.5551789 ]
  ...
  [ 0.514364   -0.6048404  -0.7255876  ... -0.5872829  -0.5809066
    0.18480977]
  [ 0.514364   -0.6048404  -0.7255876  ... -0.5872829  -0.5809066
    0.18480977]
  [ 0.514364   -0.6048404  -0.7255876  ... -0.5872829  -0.5809066
    0.18480977]]

 [[ 0.48502555 -0.49840876 -0.8495505  ... -0.44594455 -0.5907527
    0.22827671]
  [-0.5603218   1.0881749  -2.4192579  ... -1.04953    -1.636013
   -2.1455333 ]
  [-0.23497693 -1.0392225  -1.1466985  ...  4.1440706  -2.9664743
   -1.4228077 ]
  ...
  [ 0.482246   -0.49599653 -0.84252983 ... -0.45163774 -0.58099794
    0.23516463]
  [ 0.482246   -0.49599653 -0.84252983 ... -0.45163774 -0.58099794
    0.23516463]
  [ 0.482246   -0.49599653 -0.84252983 ... -0.45163774 -0.58099794
    0.23516463]]

In [27]:
# Predicted class id per token: argmax over the class axis of the raw scores.
predictions = np.argmax(result.predictions, axis=2)

In [28]:
# Sanity check: one row per evaluated example, one predicted id per token position.
predictions.shape

(50, 512)

In [9]:
# Evaluate on the Trainer's eval_dataset; the returned dict carries the loss,
# the `compute_metrics` scores and runtime statistics, all prefixed with "eval_".
trainer.evaluate()

  0%|          | 0/25 [00:00<?, ?it/s]

Hello!
<transformers.trainer_utils.EvalPrediction object at 0x000001921DEFD6A0>


{'eval_loss': 0.5202368497848511,
 'eval_ANSWER_precision': 0.892271662763466,
 'eval_ANSWER_recall': 0.9326805385556916,
 'eval_ANSWER_f1': 0.9120287253141832,
 'eval_ANSWER_number': 817,
 'eval_HEADER_precision': 0.6972477064220184,
 'eval_HEADER_recall': 0.6386554621848739,
 'eval_HEADER_f1': 0.6666666666666666,
 'eval_HEADER_number': 119,
 'eval_QUESTION_precision': 0.910665451230629,
 'eval_QUESTION_recall': 0.9275766016713092,
 'eval_QUESTION_f1': 0.9190432382704692,
 'eval_QUESTION_number': 1077,
 'eval_overall_precision': 0.891747572815534,
 'eval_overall_recall': 0.912568306010929,
 'eval_overall_f1': 0.9020378099680825,
 'eval_overall_accuracy': 0.8655651967193629,
 'eval_runtime': 10.5125,
 'eval_samples_per_second': 4.756,
 'eval_steps_per_second': 2.378}