In [1]:
from transformers import LayoutLMv3FeatureExtractor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import shutil
import sys
from typing import Dict

# Put the parent directory on the import path so the project package `src`
# can be imported from this notebook.
sys.path.append("../")

import pytesseract
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, LayoutLMv3FeatureExtractor, LayoutLMv3Processor, LayoutLMv3TokenizerFast, LiltModel

from src.models.docsblip import DocsBlip

# Locate the Tesseract binary without tying the notebook to one machine:
#   1. an explicit TESSERACT_CMD environment variable,
#   2. whatever `tesseract` resolves to on PATH,
#   3. the default Windows install location this notebook originally used.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or shutil.which("tesseract")
    or r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

In [3]:
# Project model, switched to evaluation mode for inference.
model = DocsBlip()
model.eval()

# Document side: image feature extractor (with OCR enabled) and the LiLT
# tokenizer, bundled into a single LayoutLMv3 processor.
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=True)
lilt_tokenizer = AutoTokenizer.from_pretrained(
    "SCUT-DLVCLab/lilt-roberta-en-base"
)
processor = LayoutLMv3Processor(feature_extractor, lilt_tokenizer)

# Text side: tokenizer used later for the instruction and for decoding the
# generated ids.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")



In [4]:
# Take the first document of the FUNSD test split.
dataset = load_dataset("nielsr/funsd", split="test")
sample = dataset[0]

image = sample["image"]
width, height = image.size

# Drop empty words together with their boxes. Filtering `words` alone and then
# truncating `bboxes` to the same length would shift every box that follows a
# dropped word onto the wrong word.
word_box_pairs = [
    (word, box)
    for word, box in zip(sample["words"], sample["bboxes"])
    if len(word) > 0
]
words = [word for word, _box in word_box_pairs]
bboxes = [box for _word, box in word_box_pairs]

In [21]:
# Convert the image to RGB mode; the next cell passes it to the processor.
image = image.convert("RGB")

In [22]:
# Run the LayoutLMv3 processor on the page image only (no words or boxes are
# passed; presumably they come from the feature extractor's OCR, which was
# enabled with apply_ocr=True). return_offsets_mapping=True requests the
# `offset_mapping` entry, and return_tensors="pt" asks for PyTorch tensors.
encoding = processor(image, return_offsets_mapping=True, return_tensors="pt")

In [27]:
# List the names of the fields the processor produced.
for field_name in encoding.keys():
    print(field_name)

input_ids
attention_mask
offset_mapping
bbox
pixel_values


In [28]:
# Inspect the shape of the image tensor returned by the processor.
encoding['pixel_values'].shape

torch.Size([1, 3, 224, 224])

In [25]:
def normalize_bbox(bbox, width, height):
    """Rescale a pixel-space box to the 0-1000 coordinate range used for LiLT.

    Args:
        bbox: Indexable with at least four numeric entries. Entries 0 and 2
            are divided by ``width``; entries 1 and 3 by ``height``.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        A list of four ints, each ``int(1000 * coordinate / extent)``
        (fractional parts are truncated by ``int``).
    """
    # Extent that each of the four coordinates is measured against, in order.
    extents = (width, height, width, height)
    return [int(1000 * bbox[idx] / extent) for idx, extent in enumerate(extents)]


def load_funsd_sample():
    """Load the first document of the FUNSD test split.

    Empty words are removed together with their bounding boxes, so the
    returned ``words`` and ``bboxes`` stay aligned index by index.

    Returns:
        tuple: ``(image, words, bboxes)`` where ``image`` is the sample's
        ``"image"`` entry, ``words`` is the list of non-empty words and
        ``bboxes`` is the list of boxes belonging to those words. Boxes are
        returned exactly as stored in the dataset.
    """
    dataset = load_dataset("nielsr/funsd", split="test")
    sample = dataset[0]

    image = sample["image"]

    # Filter words and boxes as pairs. Filtering only the words and then
    # slicing the boxes to the same length would attach the wrong box to every
    # word that follows a dropped (empty) one.
    # NOTE(review): boxes are not rescaled here; presumably this dataset
    # already stores them in the 0-1000 range — confirm before relying on it.
    kept_pairs = [
        (word, box)
        for word, box in zip(sample["words"], sample["bboxes"])
        if len(word) > 0
    ]
    words = [word for word, _box in kept_pairs]
    bboxes = [box for _word, box in kept_pairs]

    return image, words, bboxes

In [29]:
from transformers.modeling_outputs import BaseModelOutput

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# The inputs below are moved to `device`, so the model has to live there too;
# otherwise the forward pass fails with a device mismatch whenever CUDA is
# available. (Assumes DocsBlip is a torch.nn.Module, as its .eval() suggests.)
model.to(device)

# Tokenize the document words together with their bounding boxes.
image, words, bboxes = load_funsd_sample()
encoding = lilt_tokenizer(
    text=words,
    boxes=bboxes,
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

input_ids = encoding.input_ids.to(device)
attention_mask = encoding.attention_mask.to(device)
bbox_tensor = encoding.bbox.to(device)

# Encode the document and project the features for the decoder. Both steps run
# under no_grad: this is inference only, so no autograd graph is needed.
with torch.no_grad():
    encoder_outputs = model.encoder.model(
        input_ids=input_ids,
        bbox=bbox_tensor,
        attention_mask=attention_mask,
    )
    hidden_states = encoder_outputs.last_hidden_state
    hidden_proj = BaseModelOutput(last_hidden_state=model.adapter(hidden_states))

instr = "What is the total amount?"

instr_tokens = tokenizer(
    instr,
    return_tensors="pt",
).to(device)

# Beam search settings. `temperature` is intentionally not set: it only takes
# effect when sampling (do_sample=True); with beam search it is ignored and
# only triggers a warning.
# NOTE(review): assumes model.decoder.generate follows the Hugging Face
# `generate` API — confirm.
gen_kwargs = dict(
    max_new_tokens=30,
    num_beams=3,
)

# Generate tokens with a decoder given features
# NOTE(review): the document was padded to 512 tokens, but the attention mask
# passed here is the instruction's, not the document's. Confirm how
# model.decoder uses input_ids/attention_mask alongside encoder_outputs.
with torch.no_grad():
    outputs = model.decoder.generate(
        encoder_outputs=hidden_proj,
        input_ids=instr_tokens.input_ids,
        attention_mask=instr_tokens.attention_mask,
        **gen_kwargs,
    )

generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print("\nInstruction:", instr)
print("Answer:", generated[0])


Instruction: What is the total amount?
Answer:  Colonelshireshireshireadvertisementshireadvertisementadvertisementshireshire[/shireshire [/shireshire©shire[/ [/ Coloneladvertisement Colonelshireadvertisement © Colonelshire
