In [2]:
import torch
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel
from transformers.image_utils import load_image
from donut import DonutModel

# Single source of truth for the target device: GPU when available, CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Donut checkpoint stored under result/train_cord/test_experiment.
pretrained_model = DonutModel.from_pretrained("result/train_cord/test_experiment")

# Half precision and the device move are applied on the GPU path only; on CPU the
# model is left exactly as loaded.
if DEVICE == "cuda":
    pretrained_model.half()
    pretrained_model.to(DEVICE)

# Put the model in evaluation mode. As the cell's last expression, this also
# displays the module tree.
pretrained_model.eval()

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


DonutModel(
  (encoder): SwinEncoder(
    (model): SwinTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
        (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (layers): Sequential(
        (0): BasicLayer(
          dim=128, input_resolution=(320, 240), depth=2
          (blocks): ModuleList(
            (0): SwinTransformerBlock(
              (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                (qkv): Linear(in_features=128, out_features=384, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=128, out_features=128, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (softmax): Softmax(dim=-1)
              )
              (drop_path): Identity()
              (norm2): LayerNorm((128,), ep

In [3]:
import donut
# Shared Donut JSON evaluator; other cells call its `cal_acc` and `cal_f1` methods.
# NOTE(review): the name is misspelled ("evalutor") but it is referenced throughout
# the notebook, so it is intentionally left unchanged here.
json_evalutor = donut.JSONParseEvaluator()

def calc_val_edit(image, label, prompt="<s_synthetic_data2>"):
    """Run the model on one image and score its prediction against the label.

    Args:
        image: Input image, forwarded unchanged to ``pretrained_model.inference``.
        label: Ground truth, forwarded unchanged to ``json_evalutor.cal_acc``.
            NOTE(review): this notebook passes both ``ds["text"]`` values and
            dicts parsed from JSON files; confirm the evaluator treats both
            forms the way you expect.
        prompt: Task prompt handed to the model. Defaults to the prompt this
            notebook was originally hard-coded with, so existing two-argument
            calls behave exactly as before.

    Returns:
        A ``(score, output)`` tuple: the value returned by ``cal_acc`` and the
        first entry of the model's ``"predictions"`` list.
    """
    # The module-level `pretrained_model` and `json_evalutor` are used on purpose:
    # every evaluation cell shares the same model and evaluator instance.
    output = pretrained_model.inference(image=image, prompt=prompt)["predictions"][0]
    score = json_evalutor.cal_acc(output, label)
    return score, output

In [4]:
from datasets import load_dataset

# Load the first evaluation set; the scoring cells read its "train" split.
# NOTE(review): "test" is presumably a local dataset folder — confirm it is not
# resolved as a Hub dataset name in your environment.
dataset = load_dataset("test")
# Bare expression so the notebook shows the split / feature summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'image'],
        num_rows: 10
    })
})

In [5]:
import numpy

def evaluate_samples(samples):
    """Score every sample with ``calc_val_edit`` and collect the results.

    Args:
        samples: Iterable of mappings that provide an ``"image"`` entry (model
            input) and a ``"text"`` entry (ground truth).

    Returns:
        ``(accuracies, predictions, ground_truths)``: three parallel lists, one
        entry per sample, in iteration order.
    """
    accuracies, predictions, ground_truths = [], [], []
    for sample in samples:
        accuracy, prediction = calc_val_edit(sample["image"], sample["text"])
        accuracies.append(accuracy)
        predictions.append(prediction)
        ground_truths.append(sample["text"])
    return accuracies, predictions, ground_truths


# Per-sample scores get their own name so `scores` only ever holds the summary dict.
accuracies, predictions, ground_truths = evaluate_samples(dataset["train"])

scores = {
    # Mean of the per-sample `cal_acc` scores.
    "ted_accuracy": numpy.mean(accuracies),
    # Corpus-level F1 computed by the evaluator over all prediction/label pairs.
    "f1_accuracy": json_evalutor.cal_f1(predictions, ground_truths),
}
scores

{'ted_accuracy': 0.8092537603543895, 'f1_accuracy': 0.7076923076923077}

In [None]:
# Switch to the next evaluation set; the scoring cell that follows reads dataset["train"].
# NOTE(review): this cell was saved without an execution count — confirm it was the
# dataset in scope when the scores recorded below were produced.
dataset = load_dataset("test_set_3")

In [7]:
# Score every sample of the currently loaded dataset's "train" split.
ground_truths = []
evaluated = []  # (score, prediction) pairs, one per sample, in iteration order
for sample in dataset["train"]:
    expected = sample["text"]
    evaluated.append(calc_val_edit(sample["image"], expected))
    ground_truths.append(expected)

predictions = [predicted for _, predicted in evaluated]

# Summary: mean per-sample accuracy plus the evaluator's corpus-level F1.
scores = dict(
    ted_accuracy=numpy.mean([accuracy for accuracy, _ in evaluated]),
    f1_accuracy=json_evalutor.cal_f1(predictions, ground_truths),
)
scores

{'ted_accuracy': 0.3177371981700884, 'f1_accuracy': 0.2828282828282828}

In [8]:
# Load the evaluation set and score each sample of its "train" split.
dataset = load_dataset("test_set_4")

ground_truths = []
evaluated = []  # (score, prediction) pairs, one per sample, in iteration order
for sample in dataset["train"]:
    expected = sample["text"]
    evaluated.append(calc_val_edit(sample["image"], expected))
    ground_truths.append(expected)

predictions = [predicted for _, predicted in evaluated]

# Summary: mean per-sample accuracy plus the evaluator's corpus-level F1.
scores = dict(
    ted_accuracy=numpy.mean([accuracy for accuracy, _ in evaluated]),
    f1_accuracy=json_evalutor.cal_f1(predictions, ground_truths),
)
scores

Generating train split: 0 examples [00:00, ? examples/s]

{'ted_accuracy': 0.02168674698795181, 'f1_accuracy': 0.03550295857988166}

In [6]:
import os
import json
import extract_receipt as er
import numpy

# Evaluate a folder laid out as <base_dir>/receipts/<name>.jpg with a matching
# ground-truth file at <base_dir>/labels/<name>.json.
base_dir = 'testdata_with_labels_1'
acc_list = []
predictions = []
ground_truths = []

# sorted() makes the processing order reproducible; os.listdir order is arbitrary.
for img_name in sorted(os.listdir(f'{base_dir}/receipts')):
    # Match on the actual extension: a substring test would also accept names such
    # as "a.jpg.bak", and str.replace would strip every ".jpg" inside the name.
    file_num, extension = os.path.splitext(img_name)
    if extension != '.jpg':
        continue

    image = Image.open(f'{base_dir}/receipts/{img_name}').convert("RGB")
    try:
        image = er.extract_receipt(image)
    except Exception as exc:
        # Receipt extraction stays best-effort (fall back to the full image), but
        # the failure is reported instead of silently swallowed, and
        # KeyboardInterrupt / SystemExit are no longer caught.
        print(f'extract_receipt failed for {img_name}: {exc!r}; using the full image')

    # Binary mode lets json detect the UTF-8/16/32 encoding itself instead of
    # relying on the platform's default text encoding.
    with open(f'{base_dir}/labels/{file_num}.json', 'rb') as f:
        label = json.load(f)

    score, prediction = calc_val_edit(image, label)
    acc_list.append(score)
    predictions.append(prediction)
    ground_truths.append(label)

scores = {
    # Mean of the per-image `cal_acc` scores.
    "ted_accuracy": numpy.mean(acc_list),
    # Corpus-level F1 computed by the evaluator over all prediction/label pairs.
    "f1_accuracy": json_evalutor.cal_f1(predictions, ground_truths),
}
scores

{'ted_accuracy': 0.08902073311057182, 'f1_accuracy': 0.032093362509117436}