# LayoutLMv3 for receipt information understanding

## Table of Contents

1. [Fine Tuning the LayoutLMv3 for CORD Dataset](#fine-tuning-the-layoutlmv3-for-cord-dataset)
2. [Load the Fine Tuned Model for Test Dataset](#load-the-fine-tuned-model-for-test-dataset)
3. [Detect total amount in a New Receipt Dataset](#detect-total-amount-in-a-new-dataset)

## Fine Tuning the LayoutLMv3 for CORD Dataset

In [1]:
from datasets import load_dataset
from transformers import AutoProcessor
from datasets.features import ClassLabel
from datasets import Features, Sequence, Value, Array2D, Array3D
from datasets import load_metric
import numpy as np
from transformers import LayoutLMv3ForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers.data.data_collator import default_data_collator

In [2]:
# Load the CORD receipt dataset from the Hugging Face Hub repository "hcsun/cord".
# NOTE(review): trust_remote_code=True presumably allows the repository's own loading
# script to run locally — confirm the repository is trusted before executing this cell.
dataset = load_dataset("hcsun/cord", trust_remote_code=True)

In [3]:
# Display the loaded DatasetDict: its splits, their columns and their row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image'],
        num_rows: 100
    })
})

In [4]:
# Inspect the column schema of the training split (including the NER tag names)
dataset["train"].features

{'id': Value(dtype='string', id=None),
 'words': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'bboxes': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-MENU.NM', 'B-MENU.NUM', 'B-MENU.UNITPRICE', 'B-MENU.CNT', 'B-MENU.DISCOUNTPRICE', 'B-MENU.PRICE', 'B-MENU.ITEMSUBTOTAL', 'B-MENU.VATYN', 'B-MENU.ETC', 'B-MENU.SUB_NM', 'B-MENU.SUB_UNITPRICE', 'B-MENU.SUB_CNT', 'B-MENU.SUB_PRICE', 'B-MENU.SUB_ETC', 'B-VOID_MENU.NM', 'B-VOID_MENU.PRICE', 'B-SUB_TOTAL.SUBTOTAL_PRICE', 'B-SUB_TOTAL.DISCOUNT_PRICE', 'B-SUB_TOTAL.SERVICE_PRICE', 'B-SUB_TOTAL.OTHERSVC_PRICE', 'B-SUB_TOTAL.TAX_PRICE', 'B-SUB_TOTAL.ETC', 'B-TOTAL.TOTAL_PRICE', 'B-TOTAL.TOTAL_ETC', 'B-TOTAL.CASHPRICE', 'B-TOTAL.CHANGEPRICE', 'B-TOTAL.CREDITCARDPRICE', 'B-TOTAL.EMONEYPRICE', 'B-TOTAL.MENUTYPE_CNT', 'B-TOTAL.MENUQTY_CNT', 'I-MENU.NM', 'I-MENU.NUM', 'I-MENU.UNITPRICE', 'I-MENU.CNT', 'I-MENU.DIS

In [5]:
# Load the LayoutLMv3 processor from the "microsoft/layoutlmv3-base" checkpoint.
# apply_ocr=False: the dataset already provides `words` and `bboxes` columns, which are
# passed to the processor explicitly, so the processor is told not to run OCR itself.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)



### Prepare the data for the NER task and create mappings between label names and IDs

In [6]:
# Column schema of the raw training split; used to look up the label names.
features = dataset["train"].features
# Name of the column that holds the per-word NER tags.
label_column_name = "ner_tags"

def get_label_list(labels):
    """Collect the distinct labels found in a list of label sequences.

    Args:
        labels: Iterable of iterables of labels (one inner iterable per example).

    Returns:
        A sorted list containing every distinct label exactly once.
    """
    unique_labels = set()
    for sequence in labels:
        unique_labels.update(sequence)
    return sorted(unique_labels)

# Resolve the label names: use the names stored in the ClassLabel feature when the
# column has one, otherwise collect the distinct labels seen in the training split.
label_feature = features[label_column_name].feature
if isinstance(label_feature, ClassLabel):
    label_list = label_feature.names
else:
    label_list = get_label_list(dataset["train"][label_column_name])

# Build the two lookup tables (id -> label, label -> id) and count the labels
id2label = dict(enumerate(label_list))
label2id = {label: index for index, label in enumerate(label_list)}
num_labels = len(label_list)

In [7]:
# Show the label names and the id -> label mapping
print(label_list)
print(id2label)

['O', 'B-MENU.NM', 'B-MENU.NUM', 'B-MENU.UNITPRICE', 'B-MENU.CNT', 'B-MENU.DISCOUNTPRICE', 'B-MENU.PRICE', 'B-MENU.ITEMSUBTOTAL', 'B-MENU.VATYN', 'B-MENU.ETC', 'B-MENU.SUB_NM', 'B-MENU.SUB_UNITPRICE', 'B-MENU.SUB_CNT', 'B-MENU.SUB_PRICE', 'B-MENU.SUB_ETC', 'B-VOID_MENU.NM', 'B-VOID_MENU.PRICE', 'B-SUB_TOTAL.SUBTOTAL_PRICE', 'B-SUB_TOTAL.DISCOUNT_PRICE', 'B-SUB_TOTAL.SERVICE_PRICE', 'B-SUB_TOTAL.OTHERSVC_PRICE', 'B-SUB_TOTAL.TAX_PRICE', 'B-SUB_TOTAL.ETC', 'B-TOTAL.TOTAL_PRICE', 'B-TOTAL.TOTAL_ETC', 'B-TOTAL.CASHPRICE', 'B-TOTAL.CHANGEPRICE', 'B-TOTAL.CREDITCARDPRICE', 'B-TOTAL.EMONEYPRICE', 'B-TOTAL.MENUTYPE_CNT', 'B-TOTAL.MENUQTY_CNT', 'I-MENU.NM', 'I-MENU.NUM', 'I-MENU.UNITPRICE', 'I-MENU.CNT', 'I-MENU.DISCOUNTPRICE', 'I-MENU.PRICE', 'I-MENU.ITEMSUBTOTAL', 'I-MENU.VATYN', 'I-MENU.ETC', 'I-MENU.SUB_NM', 'I-MENU.SUB_UNITPRICE', 'I-MENU.SUB_CNT', 'I-MENU.SUB_PRICE', 'I-MENU.SUB_ETC', 'I-VOID_MENU.NM', 'I-VOID_MENU.PRICE', 'I-SUB_TOTAL.SUBTOTAL_PRICE', 'I-SUB_TOTAL.DISCOUNT_PRICE', 'I-SUB

In [8]:
# Schema of the encoded examples that will be fed to the model.
# NOTE: this rebinds `features`, which until now held the raw dataset schema.
features = Features(
    {
        "pixel_values": Array3D(dtype="float32", shape=(3, 224, 224)),
        "input_ids": Sequence(feature=Value(dtype="int64")),
        "attention_mask": Sequence(feature=Value(dtype="int64")),
        "bbox": Array2D(dtype="int64", shape=(512, 4)),
        "labels": Sequence(feature=Value(dtype="int64")),
    }
)

### Prepare training and evaluation datasets

In [9]:
def prepare_examples(examples):
    """Encode a batch of raw examples with the LayoutLMv3 processor.

    Args:
        examples: Batch mapping that provides the 'image', 'words' and 'bboxes'
            columns plus the label column named by ``label_column_name``.

    Returns:
        The encoding returned by ``processor``, called with truncation enabled
        and padding set to "max_length".
    """
    return processor(
        examples['image'],
        examples['words'],
        boxes=examples['bboxes'],
        word_labels=examples[label_column_name],
        truncation=True,
        padding="max_length",
    )

def prepare_dataset(dataset, batched=True, remove_columns=dataset["train"].column_names, features=features):
    """Encode a dataset split by mapping ``prepare_examples`` over it.

    Args:
        dataset: Object exposing ``.map`` (here: one split of the loaded dataset).
        batched: Forwarded to ``dataset.map``.
        remove_columns: Columns dropped from the mapped result. The default is
            evaluated once, when this function is defined, from the training
            split of the module-level ``dataset``.
        features: Schema forwarded to ``dataset.map``. The default is the
            module-level ``features`` object as it exists at definition time.

    Returns:
        Whatever ``dataset.map`` returns for the given arguments.
    """
    map_options = {
        "batched": batched,
        "remove_columns": remove_columns,
        "features": features,
    }
    encoded = dataset.map(prepare_examples, **map_options)
    return encoded

# Encode the splits used for training and for evaluation.
# NOTE(review): the dataset also has a "validation" split that is not used here; the
# "test" split serves as the evaluation set. Confirm this is intended — evaluating
# on the test split during training lets it influence model selection.
train_dataset = prepare_dataset(dataset["train"])
eval_dataset = prepare_dataset(dataset["test"])

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
# Set the output format of the train_dataset to PyTorch tensors.
# Only train_dataset is converted in this cell; eval_dataset is not touched here.
train_dataset.set_format("torch")

In [11]:
def flatten_dict(d):
    """Flatten a nested dictionary into a single level.

    Keys of nested dictionaries are joined to their parent key with an
    underscore ('parent_child'). A nested dictionary that is empty contributes
    no entries.

    Args:
        d: Dictionary whose values may themselves be dictionaries.

    Returns:
        A new flat dictionary; ``d`` itself is not modified.
    """
    flat = {}
    for key, value in d.items():
        if isinstance(value, dict):
            # Flatten the child first, then prefix each of its keys with the parent key.
            for sub_key, sub_value in flatten_dict(value).items():
                flat[key + '_' + sub_key] = sub_value
        else:
            flat[key] = value
    return flat

In [12]:
# seqeval metric is used for sequence labeling evaluation.
# NOTE(review): this cell's output echoes a warning for this call — presumably the
# `load_metric` deprecation notice. Consider migrating to the `evaluate` library;
# confirm against the installed `datasets` version.
metric = load_metric("seqeval", trust_remote_code=True)

  metric = load_metric("seqeval", trust_remote_code=True)


In [13]:
def compute_metrics(p, return_entity_level_metrics=False):
    """Compute seqeval metrics for token-classification predictions.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced to class
            ids with ``argmax`` over axis 2; ``labels`` holds label ids, where
            ``-100`` marks positions that are ignored.
        return_entity_level_metrics: When true, return every seqeval result
            flattened to a single-level dict; otherwise return only the four
            overall scores.

    Returns:
        A dict of metric names to values.
    """
    scores, labels = p
    predicted_ids = np.argmax(scores, axis=2)

    # Drop ignored positions (label -100, special tokens) and map ids to label names
    true_predictions, true_labels = [], []
    for predicted_row, label_row in zip(predicted_ids, labels):
        kept = [
            (predicted_id, label_id)
            for predicted_id, label_id in zip(predicted_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[label_id] for _, label_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
    return flatten_dict(results)

In [14]:
# Load the pre-trained LayoutLMv3 weights with a token-classification head and attach
# the label mappings. `num_labels` is not passed explicitly, so the size of the head
# is presumably derived from `id2label` — TODO confirm.
# The classifier layers are newly initialised (see the warning in this cell's output)
# and must be trained before the model is used for prediction.
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", id2label=id2label, label2id=label2id)

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Log in to the Hugging Face Hub; presumably required because training is configured
# with push_to_hub=True.
# NOTE(review): this renders an interactive login widget, so an unattended
# "Restart & Run All" will wait here. The import also lives outside the import cell.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Training the Model

Set up and start the training process for the model:

1. Define the training arguments using the `TrainingArguments` class from Hugging Face's `transformers` library. These arguments include the output directory, maximum steps, batch sizes, learning rate, evaluation strategy, etc.

2. Initialize the `Trainer` with the model, the training arguments, the training and evaluation datasets, a function to compute metrics, etc.

3. Call `trainer.train()` to start the training process.

In [16]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="layoutlmv3-cord",
    max_steps=500,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    push_to_hub=True,
    # `hub_model_id` replaces the deprecated `push_to_hub_model_id`.
    hub_model_id="layoutlmv3-cord",
    learning_rate=2e-5,
    evaluation_strategy="steps",
    eval_steps=250,
    # Save a checkpoint at every evaluation step. `load_best_model_at_end` can only
    # choose among saved checkpoints, and with the default save interval only the
    # final step (500) would be saved, making the "best model" always the last one.
    save_strategy="steps",
    save_steps=250,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Effective train batch size per device: 5 examples x 2 accumulation steps.
    gradient_accumulation_steps=2,
    # Mixed-precision training; NOTE(review): presumably requires a CUDA device.
    fp16=True,
    logging_strategy="steps",
    logging_steps=100,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

  0%|          | 0/500 [00:00<?, ?it/s]

{'loss': 2.1415, 'learning_rate': 1.6040000000000002e-05, 'epoch': 1.25}
{'loss': 0.961, 'learning_rate': 1.204e-05, 'epoch': 2.5}


  0%|          | 0/20 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5548838973045349, 'eval_precision': 0.8511730205278593, 'eval_recall': 0.8690119760479041, 'eval_f1': 0.86, 'eval_accuracy': 0.8730899830220713, 'eval_runtime': 10.7584, 'eval_samples_per_second': 9.295, 'eval_steps_per_second': 1.859, 'epoch': 3.12}
{'loss': 0.5622, 'learning_rate': 8.040000000000001e-06, 'epoch': 3.75}
{'loss': 0.4423, 'learning_rate': 4.04e-06, 'epoch': 5.0}
{'loss': 0.3714, 'learning_rate': 4e-08, 'epoch': 6.25}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 0.3655731678009033, 'eval_precision': 0.8983050847457628, 'eval_recall': 0.9124251497005988, 'eval_f1': 0.9053100631266245, 'eval_accuracy': 0.9231748726655348, 'eval_runtime': 10.4896, 'eval_samples_per_second': 9.533, 'eval_steps_per_second': 1.907, 'epoch': 6.25}
{'train_runtime': 4118.615, 'train_samples_per_second': 1.214, 'train_steps_per_second': 0.121, 'train_loss': 0.8956798400878906, 'epoch': 6.25}


TrainOutput(global_step=500, training_loss=0.8956798400878906, metrics={'train_runtime': 4118.615, 'train_samples_per_second': 1.214, 'train_steps_per_second': 0.121, 'train_loss': 0.8956798400878906, 'epoch': 6.25})

In [17]:
# Evaluation results after 500 training steps, computed on the Trainer's eval_dataset
# with compute_metrics (overall precision, recall, F1 and accuracy).
trainer.evaluate()



  0%|          | 0/20 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3655731678009033,
 'eval_precision': 0.8983050847457628,
 'eval_recall': 0.9124251497005988,
 'eval_f1': 0.9053100631266245,
 'eval_accuracy': 0.9231748726655348,
 'eval_runtime': 10.8356,
 'eval_samples_per_second': 9.229,
 'eval_steps_per_second': 1.846,
 'epoch': 6.25}

In [18]:
# Optional: uncomment and fill in a local directory path to save the fine-tuned model.
# trainer.save_model(r"")