# Inference

In [1]:
from transformers import DonutProcessor, VisionEncoderDecoderModel

# The processor and the encoder-decoder weights are both loaded from the
# same checkpoint identifier, so name it once and reuse it.
checkpoint = "invoice-model"

processor = DonutProcessor.from_pretrained(checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(checkpoint)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at invoice-model and are newly initialized: ['decoder.lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Single Inference

In [None]:
import re
import torch
from PIL import Image

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Put the model in evaluation mode and move it to the selected device.
model.eval()
model.to(device)

# prepare encoder inputs
# The context manager closes the image file as soon as the pixels have been
# converted, instead of leaving the open handle to the garbage collector.
with Image.open("data/test/16.jpeg") as image:
    pixel_values = processor(image.convert("RGB"), return_tensors="pt").pixel_values
pixel_values = pixel_values.to(device)

# prepare decoder inputs: the task prompt is the first token fed to the decoder
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)

# autoregressively generate sequence
# NOTE: `early_stopping` is intentionally not passed. It only controls the
# stopping condition of beam-based decoding, and this call uses num_beams=1,
# so the flag had no effect (recent transformers versions warn about it).
outputs = model.generate(
    pixel_values,
    decoder_input_ids=decoder_input_ids,
    max_length=model.decoder.config.max_position_embeddings,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    num_beams=1,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],  # never emit the unknown token
    return_dict_in_generate=True,
)

# turn into JSON
seq = processor.batch_decode(outputs.sequences)[0]
# strip the end-of-sequence and padding markers left in the decoded text
seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
seq = processor.token2json(seq)
seq

{'address': 'KOTESHWOR, KTM, NEPAL',
 'vat_no': '300142084',
 'bill_no': 'SI537040-KOT-080/81',
 'invoice_date': '15/12/2080',
 'payment_mode': 'QRCODE',
 'gross_amount': '2505.00',
 'discount': '62.00',
 'net_amount': '2443.00',
 'total_qty': '11',
 'items': [{'particulars': 'FUCHENG STE',
   'item_qty': '1',
   'item_rate': '525.00',
   'amount': '525.00'},
  {'particulars': 'YERA GLASS',
   'item_qty': '1',
   'item_rate': '270.00',
   'amount': '270.00'},
  {'particulars': 'POT HOLDER',
   'item_qty': '1',
   'item_rate': '195.00',
   'amount': '195.00'},
  {'particulars': 'RELIABLe LU',
   'item_qty': '1',
   'item_rate': '620.00',
   'amount': '620.00'},
  {'particulars': 'HANHAN POT',
   'item_qty': '1',
   'item_rate': '145.00',
   'amount': '145.00'},
  {'particulars': 'UNION UG-40',
   'item_qty': '6',
   'item_rate': '125.00',
   'amount': '750.00'}]}

## Bulk Inference and Evaluation

In [3]:
import re
import json
import torch
from tqdm.auto import tqdm
import numpy as np

from donut import JSONParseEvaluator

from datasets import load_dataset

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Put the model in evaluation mode and move it to the selected device.
model.eval()
model.to(device)

output_list = []  # parsed prediction for every test sample, in dataset order
accs = []  # per-sample score returned by the evaluator, in dataset order

dataset = load_dataset("data", split="test")

# Loop-invariant setup: the decoder prompt and the evaluator do not depend on
# the sample, so they are built once instead of once per iteration.
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)
# NOTE(review): assumes JSONParseEvaluator keeps no state between cal_acc calls,
# so sharing a single instance across samples is safe — confirm.
evaluator = JSONParseEvaluator()

for sample in tqdm(dataset, total=len(dataset)):
    # prepare encoder inputs
    pixel_values = processor(sample["image"].convert("RGB"), return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # autoregressively generate sequence
    # NOTE: `early_stopping` is intentionally not passed. It only controls the
    # stopping condition of beam-based decoding, and this call uses
    # num_beams=1, so the flag had no effect.
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],  # never emit the unknown token
        return_dict_in_generate=True,
    )

    # turn into JSON
    seq = processor.batch_decode(outputs.sequences)[0]
    # strip the end-of-sequence and padding markers left in the decoded text
    seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
    seq = processor.token2json(seq)

    # score the prediction against the sample's reference parse
    ground_truth = json.loads(sample["ground_truth"])
    ground_truth = ground_truth["gt_parse"]
    score = evaluator.cal_acc(seq, ground_truth)

    accs.append(score)
    output_list.append(seq)

scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)}
print(scores, f"length : {len(accs)}")

Resolving data files:   0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'accuracies': [0.9911504424778761, 0.990909090909091, 0.9797979797979798, 0.9962546816479401, 0.992831541218638], 'mean_accuracy': 0.9901887472103048} length : 5


In [4]:
# Average the per-sample scores collected in `accs` and report the result.
mean_accuracy = np.mean(accs)
print("Mean accuracy:", mean_accuracy)

Mean accuracy: 0.9901887472103048


In [5]:
# Display every parsed prediction collected in `output_list` (one entry per evaluated sample).
output_list

[{'address': 'KOTESHWOR, KTM, NEPAL',
  'vat_no': '300142084',
  'bill_no': 'SI672395-KOT-080/81',
  'invoice_date': '13/02/2081',
  'payment_mode': 'QRCODE',
  'net_amount': '3445.00',
  'total_qty': '1',
  'items': {'particulars': 'DIRK & BROW',
   'item_qty': '1',
   'item_rate': '3445.00',
   'amount': '3445.00'}},
 {'address': 'KOTESHWOR, KTM, NEPAL',
  'vat_no': '300142084',
  'bill_no': 'SI672407-KOT-080/81',
  'invoice_date': '13/02/2081',
  'payment_mode': 'Cash',
  'net_amount': '2639.00',
  'total_qty': '1',
  'items': {'particulars': 'AIBOASS MEN',
   'item_qty': '1',
   'item_rate': '2639.00',
   'amount': '2639.00'}},
 {'address': 'BHAKTAPUR, NEPAL',
  'vat_no': '300142084',
  'bill_no': 'SI954816-BKT-080/81',
  'invoice_date': '15/02/2081',
  'payment_mode': 'Cash',
  'net_amount': '50.00',
  'total_qty': '1',
  'items': {'particulars': 'COCA COLA Z',
   'item_qty': '1',
   'item_rate': '50.00',
   'amount': '50.00'}},
 {'address': 'KOTESHWOR, KTM, NEPAL',
  'vat_no': '3