In [1]:
!pip install --upgrade transformers accelerate safetensors torch torchvision donut-python

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
import re
import json
import os
from tqdm import tqdm
import donut

# Target device string: "cuda" when torch reports an available CUDA device, else "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"




In [2]:
# Evaluator instance from the donut package; calc_val_edit calls its `cal_acc`
# method to score a predicted JSON object against a label.
# NOTE(review): the name is spelled "evalutor" (sic); later cells use this spelling.
json_evalutor = donut.JSONParseEvaluator()

def extract_json(text):
    """Extract the first JSON object embedded in ``text``.

    Decoding starts at the first ``{`` and stops at the end of that object, so
    anything after it (extra prose, a repeated object, stray braces) is ignored
    instead of making the parse fail.

    Args:
        text: String that may contain a JSON object somewhere inside it.

    Returns:
        The decoded object (a ``dict``), or ``None`` when the text contains no
        ``{...}`` span or the object starting at the first ``{`` is not valid JSON.
        A short diagnostic is printed in both failure cases.
    """
    start = text.find("{")
    # Same "looks like JSON" criterion as before: an opening brace followed,
    # somewhere later, by a closing brace.
    if start == -1 or text.find("}", start) == -1:
        print("No JSON found.")
        return None

    try:
        # raw_decode parses exactly one JSON value beginning at `start` and
        # tolerates trailing characters after it.
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        print("Found JSON-like text, but it couldn't be parsed.")
        return None
    return parsed

def calc_val_edit(image, label, model, processor, evaluator):
    """Generate a receipt extraction for one image and score it against ``label``.

    Args:
        image: Image handed to ``processor`` as the single entry of ``images``.
        label: Ground-truth object passed to ``evaluator.cal_acc`` as the reference.
        model: Model exposing ``generate`` and a ``device`` attribute.
        processor: Processor exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        evaluator: Object exposing ``cal_acc(pred, answer)``.

    Returns:
        Tuple ``(accuracy, pred_json, pred_text)`` where ``pred_text`` is the decoded
        model continuation (prompt tokens excluded), ``pred_json`` is the result of
        ``extract_json(pred_text)`` (``None`` if nothing parseable was found) and
        ``accuracy`` is whatever ``evaluator.cal_acc(pred_json, label)`` returns.
    """
    prompt = """
    Extract the following information from a receipt and output as JSON with the following attributes (no other attributes):
    - shop_name
    - date (format: dd.MM.yyyy)
    - total
    - products (list of objects with name, amount, and price)
    """

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt}
            ]
        },
    ]

    prompt_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # NOTE(review): truncation to 2048 tokens may also cut image placeholder tokens
    # for large images - confirm this limit is safe for the processor in use.
    inputs = processor(
        text=prompt_text,
        images=[image],
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    ).to(model.device)  # follow the model's own device rather than a global

    # Greedy decoding so repeated runs give the same output.
    generation_args = {
        "max_new_tokens": 1000,
        "do_sample": False
    }

    generated_ids = model.generate(**inputs, **generation_args)

    # generate() returns the prompt tokens followed by the continuation; drop the
    # prompt so the decoded text (and the JSON search) only sees the model's answer.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = generated_ids[:, prompt_length:]
    generated_texts = processor.batch_decode(new_token_ids, skip_special_tokens=True)
    pred_text = generated_texts[0]

    pred_json = extract_json(pred_text)
    accuracy = evaluator.cal_acc(pred_json, label)

    return accuracy, pred_json, pred_text



In [3]:
def _load_model_and_processor(model_id):
    """Load a Vision2Seq checkpoint in bfloat16 on DEVICE together with its processor.

    Args:
        model_id: Hub identifier or local directory accepted by ``from_pretrained``.

    Returns:
        Tuple ``(model, processor)``.
    """
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        attn_implementation="eager",  # explicitly disable flash attention
    ).to(DEVICE)
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor


# Base model
base_model, base_processor = _load_model_and_processor("HuggingFaceTB/SmolVLM-Instruct")

# Fine-tuned model
finetuned_model_path = "my_smolvlm_receipt_finetuned"
finetuned_model, finetuned_processor = _load_model_and_processor(finetuned_model_path)


In [4]:
from pathlib import Path
from tqdm import tqdm
import json
from PIL import Image
import donut

# Set up evaluator and validation dir
json_evalutor = donut.JSONParseEvaluator()
valid_dir = Path("dsp_valid_data")
skip_dirs = {"__MACOSX", ".ipynb_checkpoints"}
# results["base"] and results["finetuned"] are kept index-aligned: entry i of both
# lists refers to the same sample, stored as (name, accuracy, pred_json, label, text).
results = {"base": [], "finetuned": []}

# Counter for processed examples
count = 0
max_examples = 10

# iterdir() yields entries in arbitrary order; sorting makes the evaluated subset
# reproducible across runs and lets tqdm show a total.
for sample_dir in tqdm(sorted(valid_dir.iterdir()), desc="Evaluating"):
    if not sample_dir.is_dir() or sample_dir.name in skip_dirs:
        continue

    image_path = sample_dir / "preprocessed.jpg"
    label_path = sample_dir / "label.json"

    if not image_path.exists() or not label_path.exists():
        continue

    try:
        # Close the file handle as soon as the RGB copy has been made.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        with open(label_path, "r", encoding="utf-8") as f:
            label = json.load(f)

        # Base model evaluation
        acc_b, pred_json_b, text_b = calc_val_edit(image, label, base_model, base_processor, json_evalutor)

        # Fine-tuned model evaluation
        acc_f, pred_json_f, text_f = calc_val_edit(image, label, finetuned_model, finetuned_processor, json_evalutor)

    except Exception as e:
        # Best effort: report the failing sample and move on to the next one.
        print(f"Error in {sample_dir.name}: {e}")
        continue

    # Record both results only after both evaluations succeeded, so a failure in
    # the second model can never leave the two lists misaligned.
    results["base"].append((sample_dir.name, acc_b, pred_json_b, label, text_b))
    results["finetuned"].append((sample_dir.name, acc_f, pred_json_f, label, text_f))

    count += 1
    if count >= max_examples:
        break


Evaluating: 11it [06:57, 37.92s/it]


In [5]:
import json

output_path = "evaluation_results.jsonl"

# Pair base and fine-tuned entries by sample id instead of list position, so a
# missing fine-tuned entry cannot cause an IndexError or a silently mismatched pair.
# Each entry is (name, accuracy, pred_json, label, text).
finetuned_by_id = {entry[0]: entry for entry in results["finetuned"]}

written = 0
with open(output_path, "w", encoding="utf-8") as f:
    for sample_id, base_acc, base_pred, ground_truth, base_text in results["base"]:
        finetuned_entry = finetuned_by_id.get(sample_id)
        if finetuned_entry is None:
            print(f"Skipping {sample_id}: no fine-tuned result to pair with")
            continue

        _, ft_acc, ft_pred, _, ft_text = finetuned_entry
        record = {
            "id": sample_id,
            "ground_truth": ground_truth,
            "base_model": {
                "prediction": base_pred,
                "raw_text": base_text,
                "accuracy": base_acc
            },
            "finetuned_model": {
                "prediction": ft_pred,
                "raw_text": ft_text,
                "accuracy": ft_acc
            }
        }
        # One JSON document per line; keep non-ASCII characters readable.
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
        written += 1

print(f"✅ Saved results for {written} samples to '{output_path}'")


✅ Saved results for 10 samples to 'evaluation_results.jsonl'


In [6]:
# Write the complete `results` structure as one indented JSON document inside
# the "evaluation_results" directory (created if it does not exist yet).
output_dir = Path("evaluation_results")
output_dir.mkdir(exist_ok=True)
results_file = output_dir / "results_finetuned_vs_base.json"

with results_file.open("w", encoding="utf-8") as out:
    json.dump(results, out, indent=2, ensure_ascii=False)

print("Evaluation results saved to:", results_file)


Evaluation results saved to: evaluation_results/results_finetuned_vs_base.json
