In [26]:
# Run the DocLayout-YOLO model over a folder of page images and store the layout-level bounding boxes (annotated images + JSON).
import os
import json
from PIL import Image, ImageDraw
from doclayout_yolo import YOLOv10

In [27]:
# Location of the DocLayout-YOLO checkpoint. The original hard-coded path is
# kept as the default; set the DOCLAYOUT_YOLO_WEIGHTS environment variable to
# use a different checkpoint without editing the notebook.
MODEL_PATH = os.environ.get(
    "DOCLAYOUT_YOLO_WEIGHTS",
    "/data/BADRI/FINAL/THESIS/GRVQA/gr-doc-vqa-grounding/model/doclayout_yolo.pt",
)

# Fail fast with an explicit message instead of whatever error the model
# loader would raise for a missing checkpoint.
if not os.path.isfile(MODEL_PATH):
    raise FileNotFoundError(f"DocLayout-YOLO weights not found: {MODEL_PATH}")

# Layout detector shared by the cells below.
model = YOLOv10(MODEL_PATH)

In [28]:
# Name of the image batch; it selects both the input folder and the output folder.
NAME = "multilingual_test_batch"

# Input images are read from IMG_DIR; results are written below OUT_DIR.
IMG_DIR = f"/data/BADRI/FINAL/THESIS/GRVQA/gr-doc-vqa-grounding/data/input/{NAME}/"
OUT_DIR = f"/data/BADRI/FINAL/THESIS/GRVQA/gr-doc-vqa-grounding/data/output/{NAME}/doclayoutyolo/"

# Annotated images and per-image JSON files go to separate sub-folders.
OUT_IMG_DIR = os.path.join(OUT_DIR, "images/")
OUT_JSON_DIR = os.path.join(OUT_DIR, "json/")

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race; creating the sub-folders also creates OUT_DIR itself.
os.makedirs(OUT_IMG_DIR, exist_ok=True)
os.makedirs(OUT_JSON_DIR, exist_ok=True)


# Maps the detector's class index (as a string key, "0" .. "10") to a
# human-readable layout label. The position of each label in the tuple is its
# class index.
layout_map = {
    str(class_index): label
    for class_index, label in enumerate((
        "Caption",
        "Footnote",
        "Formula",
        "List-item",
        "Page-footer",
        "Page-header",
        "Picture",
        "Section-header",
        "Table",
        "Text",
        "Title",
    ))
}


In [29]:
# Run layout detection on every image in IMG_DIR. For each image this writes:
#   * a copy with the kept boxes and their labels drawn on it -> OUT_IMG_DIR
#   * a JSON list of {"layout", "bbox", "confidence"} records  -> OUT_JSON_DIR
# Both files are written exactly once per image, even when nothing was kept,
# so every input image has a matching (possibly empty) JSON file.
for file in sorted(os.listdir(IMG_DIR)):  # sorted -> reproducible processing order
    IMG_PATH = os.path.join(IMG_DIR, file)

    # Skip sub-directories (e.g. .ipynb_checkpoints) that cannot be opened as images.
    if not os.path.isfile(IMG_PATH):
        continue

    det_res = model.predict(
        IMG_PATH,          # Image to predict
        imgsz=1024,        # Prediction image size
        conf=0.5,          # Confidence threshold
        device="cuda:0"    # Device to use (e.g., 'cuda:0' or 'cpu')
    )

    predictions = []
    blocks = det_res[0].boxes

    # Close the source file promptly and draw on an RGB copy, so the green
    # outline renders as green on greyscale/palette pages as well.
    with Image.open(IMG_PATH) as source_image:
        image = source_image.convert("RGB")
    draw = ImageDraw.Draw(image)

    for block in blocks:
        class_id = int(block.cls[0].item())
        # Class 0 ("Caption" in layout_map) is dropped, as in the original code.
        # NOTE(review): the reason for excluding captions is not visible here.
        if class_id == 0:
            continue

        # Box corners (x1, y1, x2, y2), rounded to two decimals.
        bbox = [round(coord, 2) for coord in block.xyxy[0].tolist()]
        layout = layout_map[str(class_id)]
        confidence = round(block.conf[0].item(), 2)

        predictions.append({
            "layout": layout,
            "bbox": bbox,
            "confidence": confidence
        })

        # Draw the bounding box and its layout label just above it; the label
        # is clamped to the top edge so it is not drawn off-canvas.
        draw.rectangle(bbox, outline='green', width=2)
        draw.text((bbox[0], max(bbox[1] - 10, 0)), layout, fill='green')

    # Save the image with bounding boxes (once per image).
    image.save(os.path.join(OUT_IMG_DIR, file))

    # Save the JSON file. splitext strips only the final extension, so names
    # such as "scan.v2.png" map to "scan.v2.json" instead of colliding on "scan.json".
    json_name = os.path.splitext(file)[0] + ".json"
    with open(os.path.join(OUT_JSON_DIR, json_name), "w", encoding="utf-8") as f:
        json.dump(predictions, f, indent=4, ensure_ascii=False)


    

    



image 1/1 /data/BADRI/FINAL/THESIS/GRVQA/gr-doc-vqa-grounding/data/input/multilingual_test_batch/fp_38.png: 1024x768 12 List-items, 4 Pictures, 2 Section-headers, 2 Texts, 90.6ms
Speed: 3.8ms preprocess, 90.6ms inference, 0.9ms postprocess per image at shape (1, 3, 1024, 768)

image 1/1 /data/BADRI/FINAL/THESIS/GRVQA/gr-doc-vqa-grounding/data/input/multilingual_test_batch/fp_77.png: 1024x672 4 List-items, 4 Section-headers, 2 Tables, 3 Texts, 29.3ms
Speed: 3.7ms preprocess, 29.3ms inference, 0.8ms postprocess per image at shape (1, 3, 1024, 672)

image 1/1 /data/BADRI/FINAL/THESIS/GRVQA/gr-doc-vqa-grounding/data/input/multilingual_test_batch/fp_65.png: 1024x800 1 Section-header, 9 Texts, 89.4ms
Speed: 4.0ms preprocess, 89.4ms inference, 0.8ms postprocess per image at shape (1, 3, 1024, 800)

image 1/1 /data/BADRI/FINAL/THESIS/GRVQA/gr-doc-vqa-grounding/data/input/multilingual_test_batch/12_217_page_1.png: 1024x736 7 List-items, 1 Section-header, 3 Texts, 29.2ms
Speed: 5.0ms preprocess

In [15]:
# Display the `predictions` list currently held by the kernel. The detection
# loop rebinds it for every image, so this shows one image's detections only.
# NOTE(review): this cell's execution count (15) is lower than the loop cell's
# (29), so the output below may come from an earlier run - re-run to confirm.
predictions

[{'layout': 'Text',
  'bbox': [92.09, 376.54, 559.89, 450.03],
  'confidence': 0.98},
 {'layout': 'Table',
  'bbox': [83.95, 477.36, 573.77, 581.93],
  'confidence': 0.97},
 {'layout': 'Text',
  'bbox': [88.06, 626.44, 566.38, 719.63],
  'confidence': 0.95},
 {'layout': 'Table',
  'bbox': [78.35, 768.39, 576.31, 928.47],
  'confidence': 0.91},
 {'layout': 'List-item',
  'bbox': [126.4, 184.07, 543.2, 236.99],
  'confidence': 0.81},
 {'layout': 'Section-header',
  'bbox': [91.23, 741.84, 370.38, 755.15],
  'confidence': 0.81},
 {'layout': 'Section-header',
  'bbox': [92.12, 596.14, 300.9, 609.18],
  'confidence': 0.73},
 {'layout': 'Section-header',
  'bbox': [137.19, 84.65, 523.18, 108.53],
  'confidence': 0.72},
 {'layout': 'Text',
  'bbox': [262.48, 111.14, 399.01, 122.04],
  'confidence': 0.7},
 {'layout': 'List-item',
  'bbox': [128.22, 250.45, 452.83, 261.38],
  'confidence': 0.56},
 {'layout': 'List-item',
  'bbox': [151.08, 278.41, 478.59, 289.05],
  'confidence': 0.55},
 {'layo