In [2]:
import os
from pathlib import Path
from io import BytesIO
import base64

# Hugging Face Inference API token, read from the environment (None when the variable is unset).
HF_TOKEN = os.environ.get("HUGGINGFACE_INFERENCE_TOKEN")
# Parent of the current working directory; presumably the repository root — verify the notebook's location.
REPO_PATH = Path(os.path.realpath(os.curdir)).parent

In [None]:
from PIL import Image

# Load the sample invoice and convert it to RGB in one step.
image = Image.open(REPO_PATH / ".private/invoice.png").convert("RGB")
# Show a 350x450 preview as the cell output; the result is not assigned, so `image` is left unchanged.
image.resize((350, 450))

# LayoutLM

In [43]:
# Use a pipeline as a high-level helper
from transformers import pipeline
from PIL import Image

# Re-open the invoice and serialise it to an in-memory PNG.
image = Image.open(REPO_PATH / ".private/invoice.png")
buffered = BytesIO()
image.save(buffered, format="PNG")
# The pipeline is fed the image as base64-encoded text.
img_str = str(base64.b64encode(buffered.getvalue()), "utf-8")

pipe = pipeline(task="document-question-answering", model="impira/layoutlm-invoices")
pipe(image=img_str, question="What is the invoice number?")

Some weights of the model checkpoint at impira/layoutlm-invoices were not used when initializing LayoutLMForQuestionAnswering: ['token_classifier_head.weight', 'token_classifier_head.bias']
- This IS expected if you are initializing LayoutLMForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.9999730587005615, 'answer': '18', 'start': 32, 'end': 32}]

: 

## Serverless inference from the Hub

In [8]:
import requests

API_URL = "https://api-inference.huggingface.co/models/impira/layoutlm-invoices"

# The access token must never be written into the notebook: it is taken from the
# HUGGINGFACE_INFERENCE_TOKEN environment variable (loaded into HF_TOKEN in the
# first cell). Any token that was previously committed here should be revoked.
if not HF_TOKEN:
    raise RuntimeError(
        "HUGGINGFACE_INFERENCE_TOKEN is not set; export it before running this cell."
    )
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

def query(payload):
    """Send a document-question-answering request to the hosted inference endpoint.

    Args:
        payload: Mapping with an ``"image"`` entry holding the path of the image
            file to send, plus any other fields to forward (e.g. ``"question"``).
            The mapping is not modified.

    Returns:
        The decoded JSON body of the HTTP response.
    """
    with open(payload["image"], "rb") as f:
        img = f.read()
    # Build a fresh request body instead of overwriting the caller's dict, so the
    # same payload (still holding the file path) can be passed again.
    body = {**payload, "image": base64.b64encode(img).decode("utf-8")}
    # A timeout prevents the cell from hanging forever if the endpoint stalls.
    response = requests.post(API_URL, headers=headers, json=body, timeout=60)
    return response.json()

# Ask the hosted model the same question as the local pipeline and display the answer.
output = query(
    dict(
        image=str(REPO_PATH / ".private/invoice.png"),
        question=" What is the invoice number?",
    )
)
output

[{'score': 0.9999761581420898, 'answer': '18', 'start': 32, 'end': 32}]

## Load model directly

In [3]:
from transformers import AutoModelForDocumentQuestionAnswering, LayoutLMv2Processor, AutoTokenizer

# Question-answering model weights come from the invoice-specific checkpoint.
model = AutoModelForDocumentQuestionAnswering.from_pretrained(
    pretrained_model_name_or_path="impira/layoutlm-invoices",
)
# NOTE(review): the processor comes from a different (LayoutLMv2) checkpoint than the model — confirm the tokenizers are compatible.
processor = LayoutLMv2Processor.from_pretrained(
    pretrained_model_name_or_path="microsoft/layoutlmv2-base-uncased",
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at impira/layoutlm-invoices were not used when initializing LayoutLMForQuestionAnswering: ['token_classifier_head.weight', 'token_classifier_head.bias']
- This IS expected if you are initializing LayoutLMForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
question = "What is the total gross amount?"

# Encode the page together with the question as PyTorch tensors.
encoding = processor(image, question, return_tensors="pt")
# Drop the "image" entry before the encoding is passed to the model
# (presumably because this model takes no image input — TODO confirm).
encoding.pop("image")
print(encoding.keys())
# Decode the token ids back to text to check what the processor extracted.
print(processor.tokenizer.decode(encoding["input_ids"].squeeze()))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox'])
[CLS] what is the total gross amount? [SEP] monsieur jeremy arancio ( el ) harness network, inc. 19 chemin des chardons 16 dunn street 18110 fussy ca 92677 laguna niguel fr us registration number : 904 429 024 vat number : fr77904429024 invoice number : 18 issued on : 10 / 23 / 2023 payment due date : 11 / 22 / 2023 nlp consulting the goal of this collaboration is to build a proof of concept for a matching tool between start - up founders and incubators. details qty unit price vat % total excl. vat matching tool ( proof of concept ) 1 €4, 750. 00 0 % €4, 750. 00 * search algorithm for incubators corresponding to start - ups ’ needs and requirements * project success if : - no hard mismatch - 3 out of 5 incubators corresponding to start - ups - open - code : easy - to - modify code for incubator filtering * delivery : - matching tool algorithm - docker + restful api - bitbucket repository total excl. vat €4, 750. 00 va

In [None]:
import torch
# Run the forward pass with gradient tracking disabled, then print the raw
# model output for inspection.
with torch.no_grad():
    prediction = model(**encoding)
print(prediction)

In [12]:
# Inspect the shape of the `end_logits` tensor on the model output.
prediction.end_logits.shape

torch.Size([1, 356])

# Donut

In [None]:
from transformers import pipeline

# Serialise the current `image` to an in-memory PNG and base64-encode it.
buffered = BytesIO()
image.save(buffered, format="PNG")
img_str = str(base64.b64encode(buffered.getvalue()), "utf-8")

# Image-to-text pipeline backed by the Donut invoice checkpoint.
pipe = pipeline(task="image-to-text", model="katanaml-org/invoices-donut-model-v1")
predictions = pipe(img_str)


# LayoutLMv3 

This model is fine-tuned on invoices and receipts. See the [GitHub repo](https://github.com/Theivaprakasham/layoutlmv3/tree/main) for details.

In [4]:
from transformers import AutoProcessor, AutoModelForTokenClassification

# Processor from the base LayoutLMv3 checkpoint, created with apply_ocr=True.
processor = AutoProcessor.from_pretrained(
    pretrained_model_name_or_path="microsoft/layoutlmv3-base",
    apply_ocr=True,
)
# Token-classification model from the invoice fine-tuned checkpoint.
model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path="Theivaprakasham/layoutlmv3-finetuned-invoice",
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def unnormalize_box(bbox, width, height):
    """Scale a box expressed on a 0-1000 grid to the given width and height.

    Args:
        bbox: Indexable whose first four items are the box coordinates; items
            0 and 2 are scaled by ``width``, items 1 and 3 by ``height``.
        width: Scale applied to the horizontal coordinates.
        height: Scale applied to the vertical coordinates.

    Returns:
        A list of four scaled coordinates, in the same order as the input.
    """
    # Horizontal and vertical coordinates alternate, so the scales do too.
    scales = (width, height, width, height)
    return [scale * (bbox[position] / 1000) for position, scale in enumerate(scales)]

In [6]:
from datasets import load_dataset

# The dataset is loaded to read the names of its NER tag feature.
dataset = load_dataset(path="darentang/generated", split="test")
labels = dataset.features["ner_tags"].feature.names

# Class index -> label name, in the order the dataset declares them.
id2label = dict(enumerate(labels))
print(id2label)

# Outline/text colour used for each label when drawing predictions.
label2color = {
    "B-ABN": "blue",
    "B-BILLER": "blue",
    "B-BILLER_ADDRESS": "green",
    "B-BILLER_POST_CODE": "black",
    "B-DUE_DATE": "blue",
    "B-GST": "green",
    "B-INVOICE_DATE": "violet",
    "B-INVOICE_NUMBER": "orange",
    "B-SUBTOTAL": "green",
    "B-TOTAL": "blue",
    "I-BILLER_ADDRESS": "blue",
    "O": "orange",
}

{0: 'O', 1: 'B-ABN', 2: 'B-BILLER', 3: 'B-BILLER_ADDRESS', 4: 'B-BILLER_POST_CODE', 5: 'B-DUE_DATE', 6: 'B-GST', 7: 'B-INVOICE_DATE', 8: 'B-INVOICE_NUMBER', 9: 'B-SUBTOTAL', 10: 'B-TOTAL', 11: 'I-BILLER_ADDRESS'}


In [7]:
def get_words_from_tokens(offset_mapping, tokens):
    """Recompose words from sub-word tokens.

    A token whose offset starts at 0 opens a new word; every following token
    with a non-zero start offset is appended to that word.

    Args:
        offset_mapping: Sized sequence of ``(start, end)`` offsets, one per token.
        tokens: Decoded token strings, indexed in parallel with ``offset_mapping``.

    Returns:
        The recomposed words, each stripped of surrounding whitespace. An empty
        ``offset_mapping`` yields an empty list.
    """
    # Guard the empty case explicitly: nothing to recompose.
    if len(offset_mapping) == 0:
        return []

    words = []
    current_word = ""
    for idx, token_mapping in enumerate(offset_mapping):
        # A start offset of 0 marks the first token of a new word, so flush the
        # word accumulated so far (if any) before starting the next one.
        if token_mapping[0] == 0 and current_word:
            words.append(current_word.strip())
            current_word = ""
        current_word += tokens[idx]

    # Flush the trailing word, stripped like all the others.
    words.append(current_word.strip())
    return words

In [8]:
import numpy as np
from PIL import ImageDraw, ImageFont

def process_image(image):
    """Run the token-classification model on a document image.

    Uses the module-level ``processor``, ``model`` and ``id2label``. Only tokens
    whose offset starts at 0 (the first token of each word) contribute a
    prediction and a box.

    Args:
        image: Image object exposing ``.size`` as ``(width, height)`` and
            accepted by ``processor``.

    Returns:
        A dict with three parallel lists:
        ``"words"`` (recomposed words), ``"predictions"`` (label names) and
        ``"boxes"`` (boxes rescaled with ``unnormalize_box`` to the image size).

    Raises:
        AssertionError: If the number of recomposed words differs from the
            number of kept predictions or boxes.
    """
    import torch  # local import so this cell does not depend on an earlier cell's import

    width, height = image.size

    # encode
    encoding = processor(image, truncation=True, return_offsets_mapping=True, return_tensors="pt")
    # offset_mapping is not a model input, so remove it before the forward pass
    offset_mapping = encoding.pop("offset_mapping").squeeze()

    # forward pass — inference only, so skip building the autograd graph
    with torch.no_grad():
        outputs = model(**encoding)

    # get predictions
    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    token_boxes = encoding.bbox.squeeze().tolist()

    # only keep non-subword predictions (tokens whose start offset is 0)
    is_word_start = [start == 0 for start, _end in offset_mapping.tolist()]
    true_predictions = [
        id2label[pred] for pred, keep in zip(predictions, is_word_start) if keep
    ]
    true_boxes = [
        unnormalize_box(box, width, height)
        for box, keep in zip(token_boxes, is_word_start)
        if keep
    ]

    # Extract each word from the encoding
    tokens = [processor.tokenizer.decode(token) for token in encoding.input_ids.squeeze()]
    words = get_words_from_tokens(offset_mapping, tokens)

    assert len(words) == len(true_predictions), "words and predictions are misaligned"
    assert len(words) == len(true_boxes), "words and boxes are misaligned"

    return {"words": words, "predictions": true_predictions, "boxes": true_boxes}

In [9]:
def draw_image(image, predictions, boxes):
    """Draw labelled boxes for the predictions on a copy of the image.

    Args:
        image: Image to annotate. It is left untouched; drawing happens on a
            copy so re-running the cell does not stack annotations and later
            processing still sees the clean page.
        predictions: Label names, each of which must be a key of the
            module-level ``label2color``.
        boxes: Boxes parallel to ``predictions``; items 0 and 1 of each box are
            used to position the label text.

    Returns:
        The annotated copy of ``image``.

    Raises:
        KeyError: If a prediction has no entry in ``label2color``.
    """
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    font = ImageFont.load_default()
    for prediction, box in zip(predictions, boxes):
        color = label2color[prediction]
        draw.rectangle(box, outline=color)
        # Offset the label from the box's first corner by +10 in x and -10 in y.
        draw.text((box[0] + 10, box[1] - 10), text=prediction, fill=color, font=font)
    return annotated

In [10]:
# Run the token-classification model on the invoice image.
output = process_image(image=image)

<class 'PIL.Image.Image'>



No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



In [None]:
# Render the predicted labels and boxes over the invoice as the cell output.
draw_image(image, output["predictions"], output["boxes"])