In [1]:
import os
import json

In [5]:
ROOT = r"X:\vrdu\vrdu-main"
subdirs = [os.path.join(ROOT, r"ad-buy-form"), os.path.join(ROOT, r"registration-form")]

In [10]:
from collections import defaultdict
import fitz
def pages2images(p):
    doc = fitz.open(p)
    images = []
    for page in doc:
        images.append(page.get_pixmap())
    return images

def extract_locs_and_text(data, texts=None, locs=None):
    if texts is None:
        texts = []
    if locs is None:
        locs = []
    
    if isinstance(data, list) and len(data) == 3 and len(data[1]) == 5:
        str_val, list_5, remaining = data[0], data[1], data[2:]
        if isinstance(str_val, str) and isinstance(list_5, list) and len(list_5) == 5:
            texts.append(str_val.strip())
            locs.append((list_5[0], list_5[1:]))
        
        for item in remaining:
            extract_locs_and_text(item, texts, locs)
    
    elif isinstance(data, dict):
        for value in data.values():
            extract_locs_and_text(value, texts, locs)
    
    elif isinstance(data, list):
        for item in data:
            extract_locs_and_text(item, texts, locs)
    
    return locs, texts


dst_dir = os.path.join(ROOT, "images")
os.makedirs(dst_dir, exist_ok=True)

save_images = False
annotations = defaultdict(list)

# run
for subdir in subdirs:
    with open(os.path.join(subdir, "dataset.jsonl"), 'r') as json_file:
        file_level_json = list(json_file)
    for json_obj in file_level_json:
        json_obj = json.loads(json_obj)
        
        if save_images:
            images = pages2images(os.path.join(subdir,"pdfs", json_obj['filename']))
            for page_idx, image in enumerate(images):
                image.save(os.path.join(dst_dir, f"{json_obj['filename'].replace('.pdf', '')}_{page_idx}.png"))

        locs, texts = extract_locs_and_text(json_obj["annotations"])
        for loc, text in zip(locs, texts):
            # Text, Bbox, Page
            # annotations[json_obj['filename']].append((text, loc[1], loc[0]))
            annotations[ f"{json_obj['filename'].replace('.pdf', '')}_{loc[0]}"].append((text, loc[1]))
        


In [11]:
annotations

defaultdict(list,
            {'00a83bbc-0101-f092-4bd7-e75f315e8f14_0': [('KMSP',
               [0.14883721, 0.066215202, 0.19011629, 0.086531229]),
              ('KMSP', [0.54021317, 0.16802508, 0.57994187, 0.18432602]),
              ('4614 Collection Center Drive\nChicago, IL 60693',
               [0.14593023, 0.087283671, 0.3255814, 0.12942062]),
              ('Michael Bloomberg 2020, Inc',
               [0.54118216, 0.090909094, 0.67635661, 0.10721003]),
              ('MIKE BLOOMBERG 2020 INC',
               [0.5397287, 0.10971787, 0.68410856, 0.12413793]),
              ('950658', [0.82623863, 0.16787003, 0.86601537, 0.18501805]),
              ('03/29/20', [0.87772584, 0.22776517, 0.91958725, 0.24137062]),
              ('12/30/19', [0.82885516, 0.22826909, 0.87149531, 0.24187453]),
              ('9', [0.051144011, 0.42490205, 0.063930012, 0.43839791]),
              ('Fox 9 AM News at 6am',
               [0.10133225, 0.42371997, 0.21679451, 0.43939394]),
             

In [None]:
import cv2
import numpy as np

def visualize_annotations(img, ann):
    img = img.copy()
    for obj in ann:
        text, bbox = obj
        xmin, ymin, xmax, ymax = bbox
        xmin = int(img.shape[1] * xmin)
        xmax = int(img.shape[1] * xmax)
        ymin = int(img.shape[0] * ymin)
        ymax = int(img.shape[0] * ymax)

        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
        # cv2.putText(img, text, (xmin, ymin), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    return img

for imgname, anns in annotations.items():
    img_path = os.path.join(dst_dir, f"{imgname}.png")
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    print(anns)
    img = visualize_annotations(img, anns)
    cv2.imshow("img", img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()


[('KMSP', [0.14883721, 0.066215202, 0.19011629, 0.086531229]), ('KMSP', [0.54021317, 0.16802508, 0.57994187, 0.18432602]), ('4614 Collection Center Drive\nChicago, IL 60693', [0.14593023, 0.087283671, 0.3255814, 0.12942062]), ('Michael Bloomberg 2020, Inc', [0.54118216, 0.090909094, 0.67635661, 0.10721003]), ('MIKE BLOOMBERG 2020 INC', [0.5397287, 0.10971787, 0.68410856, 0.12413793]), ('950658', [0.82623863, 0.16787003, 0.86601537, 0.18501805]), ('03/29/20', [0.87772584, 0.22776517, 0.91958725, 0.24137062]), ('12/30/19', [0.82885516, 0.22826909, 0.87149531, 0.24187453]), ('9', [0.051144011, 0.42490205, 0.063930012, 0.43839791]), ('Fox 9 AM News at 6am', [0.10133225, 0.42371997, 0.21679451, 0.43939394]), ('02/17/20', [0.35912532, 0.45645863, 0.40089712, 0.4691582]), ('02/23/20', [0.41295207, 0.45718431, 0.45640594, 0.4691582]), ('9', [0.048788693, 0.50631261, 0.064266488, 0.51937312]), ('Fox 9 AM News at 6am', [0.1025434, 0.50679207, 0.21477593, 0.52194357]), ('02/17/20', [0.35828426, 0

KeyboardInterrupt: 

: 