In [6]:
import fitz
import numpy as np
import io
from pathlib import Path
from IPython.display import display
from PIL import Image

In [54]:
def _page_text_in_reading_order(page):
  """
  Join the non-blank text blocks of `page`, ordered top-to-bottom and then
  left-to-right by each block's (y0, x0) corner.
  """
  positioned = []

  for block in page.get_text("blocks"):
    # Each block tuple starts with (x0, y0, x1, y1, text); the remaining
    # fields are not needed here.
    x0, y0, _x1, _y1, block_text, *_ = block
    if block_text.strip():
      positioned.append((y0, x0, block_text))

  # Sort on the coordinates only; the sort is stable, so blocks sharing a
  # corner keep the order in which the page reported them.
  positioned.sort(key=lambda item: item[:2])
  return "".join(item[2] for item in positioned)


def _page_images_in_reading_order(doc, page):
  """
  Decode every image listed for `page` into a PIL image, ordered
  top-to-bottom and then left-to-right by the (y0, x0) corner of its bbox.
  """
  positioned = []

  for img in page.get_images(full=True):
    xref = img[0]
    img_name = img[7]

    # NOTE(review): a single bbox is looked up per listed image; if the same
    # image is drawn several times on a page only one position is used.
    bbox = page.get_image_bbox(img_name)

    pix = fitz.Pixmap(doc, xref)

    # Ensure RGB (PNG-safe): anything without exactly three colour
    # components is converted before PNG encoding.
    if pix.colorspace is None or pix.colorspace.n != 3:
      pix = fitz.Pixmap(fitz.csRGB, pix)

    pil_img = Image.open(io.BytesIO(pix.tobytes("png")))
    positioned.append((bbox.y0, bbox.x0, pil_img))

    # Drop the pixmap reference before moving on to the next image.
    pix = None

  # Key on coordinates only so PIL images are never compared with each other.
  positioned.sort(key=lambda item: item[:2])
  return [item[2] for item in positioned]


def extract_pdf_pages_data(pdf_path):
  """
  Extract text and images from all pages of a PDF.

  Args:
      pdf_path: path of the PDF file (str or path-like); it is wrapped in
          `pathlib.Path` before being opened.

  Returns:
      One dict per page, in page order:
      [
        {
          "text": str,                       # text blocks in reading order
          "images": [PIL.Image.Image, ...]   # images in reading order
        },
        ...
      ]

  Any exception raised while opening or reading the document propagates to
  the caller; the document is closed in that case as well.
  """
  pdf_path = Path(pdf_path)
  pages_data = []

  # The context manager guarantees the document is closed even when a page
  # fails part-way through (previously close() ran only on success).
  with fitz.open(pdf_path) as doc:
    for page_index in range(doc.page_count):
      page = doc.load_page(page_index)

      pages_data.append({
          "text": _page_text_in_reading_order(page),
          "images": _page_images_in_reading_order(doc, page)
      })

  return pages_data

In [69]:
# Catalogues that show several posters on each page.
pdf_files = [
    "./data/poster-pdfs/multi-per-page/01_Radha_Krishna_Art_Poster.pdf",
    "./data/poster-pdfs/multi-per-page/02_Radha_Krishna_Art_Poster_Vertical.pdf",
    "./data/poster-pdfs/multi-per-page/03_Mixed_Art_Poster_Chinese_Scenery.pdf",
    "./data/poster-pdfs/multi-per-page/04_Ganesh_Art_Poster.pdf",
    "./data/poster-pdfs/multi-per-page/08_Ambedkar_Buddha_Art_Poster.pdf",
    "./data/poster-pdfs/multi-per-page/08_Horse_Art_Poster.pdf",
    "./data/poster-pdfs/multi-per-page/09_Ambedkar_Art_Poster.pdf",
    "./data/poster-pdfs/multi-per-page/38_Nature_Art_Poster.pdf",
]


def _trailing_codes(page_text, n_images):
  """
  Return the last `n_images` lines of `page_text` once the final two entries
  of `page_text.split("\n")` have been discarded.

  NOTE(review): the two discarded entries are presumably the empty string
  after the trailing newline plus one footer line - confirm against the PDFs.
  """
  lines = page_text.split("\n")[:-2]

  # `lines[-n_images:]` would return *every* line when n_images == 0
  # (because -0 == 0), so compute the start index explicitly. For
  # n_images > 0 this selects exactly the same lines as the negative slice.
  start = max(0, len(lines) - n_images)
  return lines[start:]


for pdf_path in pdf_files:
  print("\n" + "=" * 80)
  print("FILE:", pdf_path)

  data = extract_pdf_pages_data(pdf_path)

  # The first page is skipped; numbering therefore starts at 2 so the printed
  # page numbers match the document.
  for page_idx, page in enumerate(data[1:], start=2):
    n_images = len(page["images"])
    result = _trailing_codes(page["text"], n_images)
    print(f"Page {page_idx} | images={n_images}:", result)


FILE: ./data/poster-pdfs/multi-per-page/01_Radha_Krishna_Art_Poster.pdf
Page 2 | images=6: ['7121', '7120', '7119', '7118', '7117', '7116']
Page 3 | images=6: ['7115', '7114', '7113', '7112', '7111', '7110']
Page 4 | images=6: ['7109', '7108', '7107', '7106', '7105', '7104']
Page 5 | images=6: ['7103', '7102', '7101', '7100', '7099', '7098']
Page 6 | images=6: ['7097', '7096', '7095', '7094', '7093', '7092']
Page 7 | images=6: ['7091', '7090', '7089', '7088', '7087', '7086']
Page 8 | images=6: ['7085', '7084', '7083', '7082', '7081', '7080']
Page 9 | images=6: ['7125', '7124', '7123', '7122', '7079', '7078']
Page 10 | images=6: ['7008', '7009', '7010', '7011', '7012', '7013']
Page 11 | images=6: ['7069', '7070', '7071', '7072', '7073', '7074']
Page 12 | images=6: ['7063', '7064', '7065', '7066', '7067', '7068']
Page 13 | images=6: ['6886', '6887', '6888', '6889', '6890', '6891']
Page 14 | images=6: ['6935', '6936', '6937', '6969', '7075', '7076']
Page 15 | images=6: ['6892', '6894', '

In [68]:
from enum import Enum, auto


class TextExtractMode(Enum):
  """Which line of a page's extracted text is read as the poster code."""

  # The final text line is used as-is.
  LAST_LINE = 1
  # The final text line is used, keeping only what follows its first colon.
  LAST_LINE_AFTER_COLON = 2
  # The line just before the final text line is used as-is.
  SECOND_LAST_LINE = 3


# Catalogues with a single poster per page, paired with the rule that locates
# the poster code inside that page's text.
pdf_files = [
    ('./data/poster-pdfs/1-per-page/02_Ram_Catalogue_PDF_Brochure.pdf', TextExtractMode.SECOND_LAST_LINE),
    ('./data/poster-pdfs/1-per-page/29_Chinese_Scenery_Modern_Art_Poster.pdf', TextExtractMode.LAST_LINE_AFTER_COLON),
    ('./data/poster-pdfs/1-per-page/30_Chinese_Scenery_Modern_Art_Poster.pdf', TextExtractMode.LAST_LINE_AFTER_COLON),
    ('./data/poster-pdfs/1-per-page/31_Chinese_Scenery_Modern_Art_Poster.pdf', TextExtractMode.LAST_LINE),
    ('./data/poster-pdfs/1-per-page/40_Mahakal_Art_Poster_2023.pdf', TextExtractMode.LAST_LINE),
    ('./data/poster-pdfs/1-per-page/45_Tirupati_Balaji_Art_Poster.pdf', TextExtractMode.LAST_LINE),
    ('./data/poster-pdfs/1-per-page/46_Ram_Darbar_Art_Poster.pdf', TextExtractMode.LAST_LINE),
    ('./data/poster-pdfs/1-per-page/47_Shiva_Family_Art_Poster.pdf', TextExtractMode.LAST_LINE),
    ('./data/poster-pdfs/1-per-page/48_Hanuman_Art_Poster.pdf', TextExtractMode.LAST_LINE),
    ('./data/poster-pdfs/1-per-page/51_Buddha_Art_Poster.pdf', TextExtractMode.LAST_LINE),
    ('./data/poster-pdfs/1-per-page/61_Mixed_Gods_Art_Poster.pdf', TextExtractMode.LAST_LINE),
    ('./data/poster-pdfs/1-per-page/72_Buddha_Art_Poster.pdf', TextExtractMode.LAST_LINE),
    ('./data/poster-pdfs/1-per-page/75_Khatu_Shyam_Art_Poster.pdf', TextExtractMode.LAST_LINE),
]


def _extract_code(lines, mode):
  """
  Pick the poster code out of a page's text lines according to `mode`.

  Args:
      lines: the page's text lines, without the trailing empty entry.
      mode: a `TextExtractMode` member selecting which line holds the code.

  Returns:
      The code string, or None when the page has too few lines for the mode
      or (for LAST_LINE_AFTER_COLON) the last line contains no colon.

  Raises:
      ValueError: if `mode` is not a handled `TextExtractMode` member.
  """
  if mode is TextExtractMode.LAST_LINE:
    return lines[-1] if lines else None

  if mode is TextExtractMode.LAST_LINE_AFTER_COLON:
    if not lines:
      return None
    # partition() never raises; `colon` is empty when the line has no colon.
    _, colon, tail = lines[-1].partition(":")
    return tail.strip() if colon else None

  if mode is TextExtractMode.SECOND_LAST_LINE:
    return lines[-2] if len(lines) >= 2 else None

  # Without this, an unhandled mode would silently reuse the previous page's
  # code (or hit an unbound name on the very first page).
  raise ValueError(f"Unsupported text extract mode: {mode!r}")


for pdf_path, mode in pdf_files:
  print("\n" + "=" * 80)
  print("FILE:", pdf_path)

  data = extract_pdf_pages_data(pdf_path)

  # The first page is skipped; numbering therefore starts at 2 so the printed
  # page numbers match the document.
  for page_idx, page in enumerate(data[1:], start=2):
    # Drop the final entry produced by split() (empty when the text ends
    # with a newline).
    lines = page["text"].split("\n")[:-1]
    code = _extract_code(lines, mode)

    # Prints "None" for pages where no code could be located.
    print(f"Page {page_idx}: {code}")


FILE: ./data/poster-pdfs/1-per-page/02_Ram_Catalogue_PDF_Brochure.pdf
Page 2: R-14075
Page 3: R-14076
Page 4: R-14078
Page 5: R-14090
Page 6: R-14071
Page 7: R-14072
Page 8: R-14073
Page 9: R-14074
Page 10: R-14077
Page 11: R-14041
Page 12: R-14042
Page 13: R-14043
Page 14: R-14044
Page 15: R-14045
Page 16: R-14046
Page 17: R-14047
Page 18: R-14048
Page 19: R-14049
Page 20: R-14050
Page 21: R-14051
Page 22: R-14052
Page 23: R-14053
Page 24: R-14054
Page 25: R-14055
Page 26: R-14056
Page 27: R-14057
Page 28: R-14058
Page 29: R-14059
Page 30: R-14060
Page 31: R-14061
Page 32: R-14062
Page 33: R-14063
Page 34: R-14064
Page 35: R-14065
Page 36: R-14066
Page 37: R-14067
Page 38: R-14068
Page 39: R-14069
Page 40: R-14079
Page 41: R-14080
Page 42: R-14081
Page 43: R-14082
Page 44: R-14083
Page 45: R-14084
Page 46: R-14085
Page 47: R-14086
Page 48: R-14087
Page 49: R-14088
Page 50: R-14089
Page 51: R-14010
Page 52: R-14011
Page 53: R-14012
Page 54: R-14015
Page 55: R-14016
Page 56: R-14017
Pa