Set up environment

In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub without embedding a credential in
# the notebook: the token is taken from the HF_TOKEN environment variable.
# When the variable is unset or empty, `token=None` lets `login` fall back to
# its interactive prompt instead of submitting an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

In [2]:
import os

# Function to find the correct image path
def find_image_path(base_path, image_name):
    """Locate ``image_name`` somewhere under ``base_path``.

    Walks the directory tree rooted at ``base_path`` with ``os.walk`` and
    stops at the first directory whose file list contains ``image_name``.

    Args:
        base_path: Directory at which the recursive search starts.
        image_name: Exact file name (no directory part) to look for.

    Returns:
        The joined path of the first match, or ``None`` when no directory in
        the tree contains a file with that name.
    """
    hits = (
        os.path.join(folder, image_name)
        for folder, _subfolders, filenames in os.walk(base_path)
        if image_name in filenames
    )
    # The generator is lazy, so the walk ends as soon as one match is produced.
    return next(hits, None)

Load image

In [15]:
from datasets import load_dataset

# Fetch the invoices dataset from the Hugging Face Hub; later cells index its
# "test" split.
dataset = load_dataset(path="katanaml-org/invoices-donut-data-v1")

In [4]:
# Read the `image` field of test sample 10 and keep only its file-name part.
# NOTE(review): this assumes the field holds a path-like string - confirm.
image_path = dataset["test"][10]["image"]
image_name = os.path.split(image_path)[1]

In [5]:
# Project base directory: two levels above the current working directory.
# Adjust this as needed.
base_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

In [6]:
from PIL import Image
import requests
from io import BytesIO
import os

# Sample 10 of the test split supplies the image reference to resolve locally.
image_path_or_url = dataset["test"][10]["image"]

# Search root: two directory levels above the current working directory.
base_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Function to find the correct image path
def find_image_path(base_path, image_name):
    """Return the path of the first file named ``image_name`` under ``base_path``.

    The tree below ``base_path`` is traversed with ``os.walk``; the search
    ends at the first directory whose file list includes ``image_name``.

    Args:
        base_path: Root directory of the recursive search.
        image_name: File name to match exactly (no directory component).

    Returns:
        ``os.path.join(directory, image_name)`` for the first matching
        directory, or ``None`` if the name is not found anywhere in the tree.
    """
    candidates = (
        os.path.join(directory, image_name)
        for directory, _subdirs, names in os.walk(base_path)
        if image_name in names
    )
    # Pulling a single item keeps the traversal from continuing past the match.
    return next(candidates, None)

# Reduce the dataset's image reference to its bare file name.
image_name = os.path.basename(image_path_or_url)

# Search the project tree for a file with that name.
correct_image_path = find_image_path(base_path, image_name)

# Fail fast when the file is missing: later cells read `image`, and merely
# printing a message would leave it undefined (or stale from an earlier run).
if not correct_image_path:
    raise FileNotFoundError(f"Image not found: {image_name}")

image = Image.open(correct_image_path)

In [None]:
# Re-run the lookup and show the resolved path as the cell output
# (None when no file with that name was found under base_path).
correct_image_path = find_image_path(base_path, image_name)
correct_image_path

Load model and processor

In [None]:
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Open the invoice image located by the earlier search. A missing file is
# raised as an error rather than printed, because the cells that follow use
# `image` and would otherwise fail later or pick up a stale value.
if not correct_image_path:
    raise FileNotFoundError(f"No se pudo encontrar la imagen: {image_name}")

print(f"Imagen encontrada en: {correct_image_path}")

# Open the image
image = Image.open(correct_image_path)

Prepare using processor
We prepare the image for the model using DonutProcessor.

In [10]:
from transformers import DonutProcessor, VisionEncoderDecoderModel

# The processor and the model are both loaded from the same Hub repository id.
checkpoint_id = "katanaml-org/invoices-donut-model-v1"
processor = DonutProcessor.from_pretrained(checkpoint_id)
model = VisionEncoderDecoderModel.from_pretrained(checkpoint_id)

In [11]:
# Run the processor on the image, requesting PyTorch tensors ("pt"), and keep
# the resulting pixel tensor for the model.
processor_output = processor(image, return_tensors="pt")
pixel_values = processor_output.pixel_values
print(f"Shape of pixel_values: {pixel_values.shape}")


Shape of pixel_values: torch.Size([1, 3, 1280, 960])


Generate
Finally, we let the model autoregressively generate the structured data.

In [12]:
import torch
import json
import re

# Generate the structured output for the prepared image and print it as JSON.
# The task start token is tokenized without extra special tokens; its ids are
# passed to `generate` as the decoder's initial input.
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]

# Use the GPU when CUDA is available, otherwise the CPU, and move the model there.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Any exception raised below is reported with a printed message and not
# re-raised, so `outputs`, `sequence` and `json_output` may be left unset.
try:
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        # Cap the output length at the decoder's configured position-embedding limit.
        max_length=model.decoder.config.max_position_embeddings,
        # NOTE(review): `early_stopping` is combined with `num_beams=1` below;
        # it looks like a beam-search option - confirm it has any effect here.
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        # Forbid the tokenizer's unknown token in the generated sequence.
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
        # NOTE(review): only `outputs.sequences` is read in this cell; confirm
        # whether the scores are needed elsewhere.
        output_scores=True,
    )

    # Decode the first generated sequence, then strip the EOS and PAD token text.
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token

    # Convert to JSON; on failure fall back to printing the raw decoded sequence.
    try:
        # We can convert the generated sequence to JSON if required:
        json_output = processor.token2json(sequence)
        print("Structured Output:")
        print(json.dumps(json_output, indent=2))
    except Exception as e:
        print(f"Error converting to JSON: {e}")
        print("Raw sequence:")
        print(sequence)

except Exception as e:
    print(f"Error during generation: {e}")



Structured Output:
{
  "header": {
    "invoice_no": "48902311",
    "invoice_date": "03/31/2016",
    "seller": "Hill Group 47445 Tiffany Canyon Suite 530 Lake Carolyn, MN 88734",
    "client": "Calhoun PLC 692 Pittman Square Apt. 121 Victorfort, KY 65016",
    "seller_tax_id": "964-97-3541",
    "client_tax_id": "961-78-6129",
    "iban": "GB57H5XH62221465152779"
  },
  "items": [
    {
      "item_desc": "2,00",
      "item_net_price": "89,99",
      "item_net_worth": "179,98",
      "item_vat": "10%",
      "item_gross_worth": "197,98"
    },
    {
      "item_desc": "1,00",
      "item_net_price": "8,99",
      "item_net_worth": "8,99",
      "item_vat": "10%",
      "item_gross_worth": "9,89"
    },
    {
      "item_desc": "2,00",
      "item_net_price": "2,99",
      "item_net_worth": "5,98",
      "item_vat": "10%",
      "item_gross_worth": "6,58"
    },
    {
      "item_desc": "3,00",
      "item_net_price": "34,99",
      "item_net_worth": "104,97",
      "item_vat": "10%"