In [47]:
import fitz
import numpy as np
import io
from pathlib import Path
from IPython.display import display
from PIL import Image

In [65]:
def extract_pdf_pages_data(pdf_path):
  """
  Extract text and images from all pages of a PDF.

  For each page, the non-blank text blocks are concatenated after sorting
  them by the (y0, x0) corner of their bounding box, and every image
  listed for the page is decoded into a PIL image, ordered by the (y0, x0)
  corner of its bounding box on the page.

  Args:
      pdf_path: Path of the PDF file (str or path-like).

  Returns:
      One dict per page, in page order:
      [
        {
          "text": str,
          "images": [PIL.Image.Image, ...]
        },
        ...
      ]
  """
  pdf_path = Path(pdf_path)
  doc = fitz.open(pdf_path)

  pages_data = []

  # try/finally: release the document handle even when a page fails.
  try:
    for page_index in range(doc.page_count):
      page = doc.load_page(page_index)

      # ---- TEXT ----
      positioned_text = []
      for block in page.get_text("blocks"):
        # Only the top-left corner and the text payload are needed.
        x0, y0, _x1, _y1, block_text, *_ = block
        if block_text.strip():
          positioned_text.append((y0, x0, block_text))

      page_text = "".join(t for _, _, t in sorted(positioned_text))

      # ---- IMAGES ----
      positioned_images = []
      for img in page.get_images(full=True):
        xref = img[0]
        img_name = img[7]

        bbox = page.get_image_bbox(img_name)

        pix = fitz.Pixmap(doc, xref)

        # Ensure RGB (PNG-safe)
        # NOTE(review): confirm that converting a pixmap whose colorspace
        # is None (e.g. a mask) is accepted by fitz.Pixmap.
        if pix.colorspace is None or pix.colorspace.n != 3:
          pix = fitz.Pixmap(fitz.csRGB, pix)

        pil_img = Image.open(io.BytesIO(pix.tobytes("png")))
        positioned_images.append((bbox.y0, bbox.x0, pil_img))

        pix = None  # drop the pixmap reference before the next iteration

      # Sort on position only. Sorting the raw tuples would fall back to
      # comparing the image objects whenever two images share the same
      # (y0, x0), and image objects are not orderable (TypeError).
      positioned_images.sort(key=lambda item: (item[0], item[1]))
      images = [img for _, _, img in positioned_images]

      pages_data.append({
          "text": page_text,
          "images": images
      })
  finally:
    doc.close()

  return pages_data

In [71]:
# PDFs to inspect; paths are relative to the notebook's working directory.
pdf_files = [
    "./data/poster-pdfs/multi-per-page/01_Radha_Krishna_Art_Poster.pdf",
    "./data/poster-pdfs/multi-per-page/02_Radha_Krishna_Art_Poster_Vertical.pdf",
    "./data/poster-pdfs/multi-per-page/03_Mixed_Art_Poster_Chinese_Scenery.pdf",
    "./data/poster-pdfs/multi-per-page/04_Ganesh_Art_Poster.pdf",
    "./data/poster-pdfs/multi-per-page/08_Ambedkar_Buddha_Art_Poster.pdf",
    "./data/poster-pdfs/multi-per-page/08_Horse_Art_Poster.pdf",
    "./data/poster-pdfs/multi-per-page/09_Ambedkar_Art_Poster.pdf",
    "./data/poster-pdfs/multi-per-page/38_Nature_Art_Poster.pdf",
]

# For each PDF, print the trailing text lines of every page after the first,
# taking as many lines as the page has images.
for pdf_path in pdf_files:
  print("\n" + "=" * 80)
  print("FILE:", pdf_path)

  data = extract_pdf_pages_data(pdf_path)

  # The first page is skipped; numbering starts at 2 so the printed page
  # number is 1-based. NOTE(review): presumably page 1 is a cover - confirm.
  for page_idx, page in enumerate(data[1:], start=2):
    n_images = len(page["images"])

    # Drop the last two split entries. NOTE(review): presumably a footer
    # line plus the empty string after the final newline - confirm.
    candidate_lines = page["text"].split("\n")[:-2]

    # Keep the last n_images lines. An explicit start index is used because
    # candidate_lines[-0:] would return the whole list when a page has no
    # images, instead of an empty one.
    first_kept = max(len(candidate_lines) - n_images, 0)
    result = candidate_lines[first_kept:]
    print(f"Page {page_idx} | images={n_images}:", result)


FILE: ./data/poster-pdfs/multi-per-page/01_Radha_Krishna_Art_Poster.pdf
Page 2 | images=6: ['7121', '7120', '7119', '7118', '7117', '7116']
Page 3 | images=6: ['7115', '7114', '7113', '7112', '7111', '7110']
Page 4 | images=6: ['7109', '7108', '7107', '7106', '7105', '7104']
Page 5 | images=6: ['7103', '7102', '7101', '7100', '7099', '7098']
Page 6 | images=6: ['7097', '7096', '7095', '7094', '7093', '7092']
Page 7 | images=6: ['7091', '7090', '7089', '7088', '7087', '7086']
Page 8 | images=6: ['7085', '7084', '7083', '7082', '7081', '7080']
Page 9 | images=6: ['7125', '7124', '7123', '7122', '7079', '7078']
Page 10 | images=6: ['7008', '7009', '7010', '7011', '7012', '7013']
Page 11 | images=6: ['7069', '7070', '7071', '7072', '7073', '7074']
Page 12 | images=6: ['7063', '7064', '7065', '7066', '7067', '7068']
Page 13 | images=6: ['6886', '6887', '6888', '6889', '6890', '6891']
Page 14 | images=6: ['6935', '6936', '6937', '6969', '7075', '7076']
Page 15 | images=6: ['6892', '6894', '