In [80]:
import io
import json
import re
import hashlib
from collections import defaultdict
from enum import Enum, auto
from pathlib import Path

import fitz
import numpy as np
from IPython.display import display
from PIL import Image

In [81]:
def extract_pdf_pages_data(pdf_path):
  """
  Extract text and images from all pages of a PDF.

  Text blocks are concatenated after sorting by their (y0, x0) corner; images
  are ordered the same way using their bounding box on the page.

  Args:
      pdf_path: path (str or Path) of the PDF to open.

  Returns:
      [
        {
          "text": str,
          "images": [PIL.Image.Image, ...]
        },
        ...
      ]
      One entry per page, in page order.
  """
  pdf_path = Path(pdf_path)
  pages_data = []

  # Open through the context manager so the document is closed even when a
  # page fails to parse; a plain trailing close() would be skipped on error.
  with fitz.open(pdf_path) as doc:
    for page_index in range(doc.page_count):
      page = doc.load_page(page_index)

      # ---- TEXT ----
      positioned_text = []
      for block in page.get_text("blocks"):
        x0, y0, _x1, _y1, block_text, *_ = block
        # Whitespace-only blocks carry no information; drop them.
        if block_text.strip():
          positioned_text.append((y0, x0, block_text))

      # Sort on position only (y0 first, then x0) and join without a separator.
      positioned_text.sort(key=lambda item: (item[0], item[1]))
      text = "".join(item[2] for item in positioned_text)

      # ---- IMAGES ----
      positioned_images = []
      for img in page.get_images(full=True):
        xref = img[0]
        img_name = img[7]

        bbox = page.get_image_bbox(img_name)

        pix = fitz.Pixmap(doc, xref)

        # Ensure RGB (PNG-safe)
        if pix.colorspace is None or pix.colorspace.n != 3:
          pix = fitz.Pixmap(fitz.csRGB, pix)

        pil_img = Image.open(io.BytesIO(pix.tobytes("png")))
        positioned_images.append((bbox.y0, bbox.x0, pil_img))

        # Drop the pixmap reference now that the PNG bytes have been copied out.
        pix = None

      # Position-only key: the PIL images themselves are never compared.
      positioned_images.sort(key=lambda item: (item[0], item[1]))
      images = [item[2] for item in positioned_images]

      pages_data.append({
          "text": text,
          "images": images
      })

  return pages_data

In [82]:
class Poster:
  """
  One poster extracted from a catalogue PDF.

  Attributes:
      pdf_path: path of the PDF the poster was read from.
      page_no: page number supplied by the extractor.
      code: poster code with surrounding whitespace removed.
      image: the poster image object supplied by the extractor.
  """

  def __init__(self, pdf_path: str, page_no: int, code: str, image):
    self.pdf_path = pdf_path
    self.page_no = page_no
    # Codes come from raw page text, so trim stray spaces and newlines.
    cleaned_code = code.strip()
    self.code = cleaned_code
    self.image = image

  def __repr__(self):
    page, code = self.page_no, self.code
    return f"<Poster page={page}, code={code}>"

In [83]:
class TextExtractMode(Enum):
  """Which line of a page's extracted text holds the poster code."""

  # The code is the final line of the page text.
  LAST_LINE = 1
  # The code is whatever follows the first colon on the final line.
  LAST_LINE_AFTER_COLON = 2
  # The code is the line just before the final line.
  SECOND_LAST_LINE = 3

# pdf_registry

In [84]:
# Registry of PDFs handled by extract_multi_per_page (paired in `pipelines`).
# Keys are PDF paths; values are per-PDF option dicts, all empty here because
# extract_multi_per_page does not read any option.
pdfs_multi_per_page = {
    "./data/poster-pdfs/multi-per-page/01_Radha_Krishna_Art_Poster.pdf": {},
    "./data/poster-pdfs/multi-per-page/02_Radha_Krishna_Art_Poster_Vertical.pdf": {},
    "./data/poster-pdfs/multi-per-page/03_Mixed_Art_Poster_Chinese_Scenery.pdf": {},
    "./data/poster-pdfs/multi-per-page/04_Ganesh_Art_Poster.pdf": {},
    "./data/poster-pdfs/multi-per-page/08_Ambedkar_Buddha_Art_Poster.pdf": {},
    "./data/poster-pdfs/multi-per-page/08_Horse_Art_Poster.pdf": {},
    "./data/poster-pdfs/multi-per-page/09_Ambedkar_Art_Poster.pdf": {},
    "./data/poster-pdfs/multi-per-page/38_Nature_Art_Poster.pdf": {},
}

In [85]:
# Registry of PDFs handled by extract_single_per_page (paired in `pipelines`).
# Keys are PDF paths; each value must provide "text_mode", the TextExtractMode
# member that tells the extractor which line of the page text holds the code.
pdfs_single_per_page = {
    "./data/poster-pdfs/1-per-page/02_Ram_Catalogue_PDF_Brochure.pdf": {
        "text_mode": TextExtractMode.SECOND_LAST_LINE,
    },
    "./data/poster-pdfs/1-per-page/29_Chinese_Scenery_Modern_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE_AFTER_COLON,
    },
    "./data/poster-pdfs/1-per-page/30_Chinese_Scenery_Modern_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE_AFTER_COLON,
    },
    "./data/poster-pdfs/1-per-page/31_Chinese_Scenery_Modern_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE,
    },
    "./data/poster-pdfs/1-per-page/40_Mahakal_Art_Poster_2023.pdf": {
        "text_mode": TextExtractMode.LAST_LINE,
    },
    "./data/poster-pdfs/1-per-page/45_Tirupati_Balaji_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE,
    },
    "./data/poster-pdfs/1-per-page/46_Ram_Darbar_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE,
    },
    "./data/poster-pdfs/1-per-page/47_Shiva_Family_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE,
    },
    "./data/poster-pdfs/1-per-page/48_Hanuman_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE,
    },
    "./data/poster-pdfs/1-per-page/51_Buddha_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE,
    },
    "./data/poster-pdfs/1-per-page/61_Mixed_Gods_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE,
    },
    "./data/poster-pdfs/1-per-page/72_Buddha_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE,
    },
    "./data/poster-pdfs/1-per-page/75_Khatu_Shyam_Art_Poster.pdf": {
        "text_mode": TextExtractMode.LAST_LINE,
    },
}

In [86]:
# Registry of PDFs handled by extract_incremental (paired in `pipelines`).
# Codes are not read from the page text for these files: the first page gets
# "start_code" and each following page gets the next integer. A non-empty
# "prefix" is joined in front of the number with "-" (e.g. "LGS-1001").
# NOTE(review): the "ocr-needed" directory suggests the codes are only present
# as pixels in these PDFs — confirm the start codes against the source files.
pdfs_incremental = {
    "./data/poster-pdfs/ocr-needed/39_Modern_Art_Poster_2023.pdf": {
        "start_code": 25284,
        "prefix": "",
    },
    "./data/poster-pdfs/ocr-needed/43_Modern_Art_Poster.pdf": {
        "start_code": 25725,
        "prefix": "",
    },
    "./data/poster-pdfs/ocr-needed/44_Modern_Art_Poster.pdf": {
        "start_code": 25569,
        "prefix": "",
    },
    "./data/poster-pdfs/ocr-needed/49_LGS_Art_Poster_New.pdf": {
        "start_code": 1001,
        "prefix": "LGS",
    },
}

# multi_per_page

In [87]:
def extract_multi_per_page(pdf_path, options):
  """
  Build Posters for PDFs that show several posters on each page.

  The first page is skipped; remaining pages are numbered from 2. On each
  page the final text line is ignored and the last N remaining lines (N being
  the number of images on the page) are paired, in order, with the images.

  Args:
      pdf_path: path of the PDF to read.
      options: registry entry for the PDF; not read by this extractor.

  Returns:
      list of Poster in page order.
  """
  posters = []
  pages = extract_pdf_pages_data(pdf_path)

  for page_no, page in enumerate(pages[1:], start=2):
    page_images = page["images"]
    # Everything except the final text line is a candidate code line.
    candidate_lines = page["text"].splitlines()[:-1]
    page_codes = candidate_lines[-len(page_images):]

    # zip() stops at the shorter sequence, so unmatched items are dropped.
    posters.extend(
        Poster(pdf_path, page_no, code, image)
        for code, image in zip(page_codes, page_images)
    )

  return posters

# single_per_page

In [88]:
def extract_single_per_page(pdf_path, options):
  """
  Build one Poster per page for PDFs that show a single poster on each page.

  The poster code is taken from the page text according to
  options["text_mode"] (a TextExtractMode member); the poster image is the
  last image returned for the page. Pages are numbered from 1.

  Pages without any text are skipped silently. Pages that have text but
  cannot yield a poster (no image, no second-to-last line, or no colon on the
  last line when the mode needs one) are skipped with a printed notice
  instead of aborting the whole PDF.

  Args:
      pdf_path: path of the PDF to read.
      options: registry entry for the PDF; must contain "text_mode".

  Returns:
      list of Poster in page order.

  Raises:
      KeyError: if options has no "text_mode" entry.
      ValueError: if the mode is not one of the supported TextExtractMode
          members.
  """
  mode = options["text_mode"]

  # Validate before parsing the PDF: with an unknown mode no branch below
  # would assign `code`, leaving it unbound on the first page or silently
  # carrying over the previous page's code on later pages.
  supported_modes = (
      TextExtractMode.LAST_LINE,
      TextExtractMode.LAST_LINE_AFTER_COLON,
      TextExtractMode.SECOND_LAST_LINE,
  )
  if mode not in supported_modes:
    raise ValueError(f"Unsupported text_mode: {mode!r}")

  data = extract_pdf_pages_data(pdf_path)
  result = []

  for page_idx, page in enumerate(data, start=1):
    lines = page["text"].splitlines()
    if not lines:
      continue

    images = page["images"]

    # `code` stays None when the page text lacks what the mode requires.
    code = None
    if mode == TextExtractMode.LAST_LINE:
      code = lines[-1]
    elif mode == TextExtractMode.LAST_LINE_AFTER_COLON:
      # partition() splits on the first colon and reports whether one exists.
      _, colon, after_colon = lines[-1].partition(":")
      if colon:
        code = after_colon.strip()
    elif len(lines) >= 2:
      # Remaining supported mode: SECOND_LAST_LINE.
      code = lines[-2]

    if code is None or not images:
      print(f"Skipping page {page_idx} of {pdf_path}: no poster code or image found")
      continue

    result.append(
        Poster(pdf_path, page_idx, code, images[-1])
    )

  return result

# incremental_ocr_needed

In [89]:
def extract_incremental(pdf_path, options):
  """
  Build one Poster per page, numbering the codes sequentially.

  The first page receives options["start_code"] and every following page the
  next integer. When options["prefix"] is non-empty the code becomes
  "<prefix>-<number>"; otherwise it is just the number. The poster image is
  the last image returned for the page. Pages are numbered from 1.

  Args:
      pdf_path: path of the PDF to read.
      options: registry entry with "start_code" and optional "prefix".

  Returns:
      list of Poster in page order.
  """
  first_code = options["start_code"]
  prefix = options.get("prefix", "")

  pages = extract_pdf_pages_data(pdf_path)
  posters = []

  for offset, page in enumerate(pages):
    poster_image = page["images"][-1]
    number = first_code + offset

    if prefix:
      label = f"{prefix}-{number}"
    else:
      label = str(number)

    posters.append(Poster(pdf_path, offset + 1, label, poster_image))

  return posters

# all_posters 

In [90]:
# Each entry pairs a registry (PDF path -> options dict) with the extractor
# function that is called as extractor(pdf_path, options) for every PDF in it.
pipelines = [
    (pdfs_multi_per_page, extract_multi_per_page),
    (pdfs_single_per_page, extract_single_per_page),
    (pdfs_incremental, extract_incremental),
]

In [91]:
# Run every registered PDF through its extractor and collect all Posters in
# one flat list, in pipeline order and then registry order.
all_posters = []

for registry, extractor in pipelines:
  for pdf_path, options in registry.items():
    # Progress line so a failing PDF can be identified from the cell output.
    print(f"Processing: {pdf_path}")

    posters = extractor(pdf_path, options)
    all_posters.extend(posters)

# Total number of posters extracted across all PDFs.
print(len(all_posters))

Processing: ./data/poster-pdfs/multi-per-page/01_Radha_Krishna_Art_Poster.pdf
Processing: ./data/poster-pdfs/multi-per-page/02_Radha_Krishna_Art_Poster_Vertical.pdf
Processing: ./data/poster-pdfs/multi-per-page/03_Mixed_Art_Poster_Chinese_Scenery.pdf
Processing: ./data/poster-pdfs/multi-per-page/04_Ganesh_Art_Poster.pdf
Processing: ./data/poster-pdfs/multi-per-page/08_Ambedkar_Buddha_Art_Poster.pdf
Processing: ./data/poster-pdfs/multi-per-page/08_Horse_Art_Poster.pdf
Processing: ./data/poster-pdfs/multi-per-page/09_Ambedkar_Art_Poster.pdf
Processing: ./data/poster-pdfs/multi-per-page/38_Nature_Art_Poster.pdf
Processing: ./data/poster-pdfs/1-per-page/02_Ram_Catalogue_PDF_Brochure.pdf
Processing: ./data/poster-pdfs/1-per-page/29_Chinese_Scenery_Modern_Art_Poster.pdf
Processing: ./data/poster-pdfs/1-per-page/30_Chinese_Scenery_Modern_Art_Poster.pdf
Processing: ./data/poster-pdfs/1-per-page/31_Chinese_Scenery_Modern_Art_Poster.pdf
Processing: ./data/poster-pdfs/1-per-page/40_Mahakal_Art_Po

# save_poster_images

In [92]:
def poster_image_filename(poster: Poster) -> str:
  """
  Build the image file name for a poster.

  The name has the form "<pdf stem>--p<page>--<code>--<hash>.png", where every
  character of the code that is not a word character or "-" is replaced by
  "_", and <hash> is the first 5 hex digits of the MD5 of the image bytes.

  Args:
      poster: object exposing pdf_path, page_no, code and image.tobytes().

  Returns:
      The file name (no directory part).
  """
  stem = Path(poster.pdf_path).stem
  safe_code = re.sub(r"[^\w\-]", "_", poster.code)

  # A short content hash helps keep names distinct when stem, page and code
  # coincide.
  digest = hashlib.md5(poster.image.tobytes()).hexdigest()
  short_hash = digest[:5]

  return f"{stem}--p{poster.page_no}--{safe_code}--{short_hash}.png"

In [93]:
def save_poster_images(posters, output_dir="frontend/poster-images"):
  """
  Write every poster image to output_dir as a JPEG file.

  The file name comes from poster_image_filename() with its suffix replaced
  by ".jpg". The directory is created (including parents) when missing.

  Args:
      posters: iterable of Poster objects.
      output_dir: destination directory.

  Returns:
      dict mapping each poster object to the name of the file written for it.
  """
  target_dir = Path(output_dir)
  target_dir.mkdir(parents=True, exist_ok=True)

  # Encoder settings shared by every image.
  jpeg_options = dict(
      format="JPEG",
      quality=85,
      subsampling=2,
      optimize=True,
      progressive=True,
  )

  saved_names = {}
  for poster in posters:
    jpeg_path = (target_dir / poster_image_filename(poster)).with_suffix(".jpg")

    # Convert to RGB before encoding as JPEG.
    rgb_image = poster.image.convert("RGB")
    rgb_image.save(jpeg_path, **jpeg_options)

    saved_names[poster] = jpeg_path.name

  return saved_names

# generate_pdf_json

In [94]:
def generate_pdf_json(posters, poster_to_image, output_dir="frontend/json"):
  """
  Write one JSON file per source PDF describing its posters.

  Posters are grouped by pdf_path in the order given. Each file is named
  "<pdf stem>.json" and holds {"pdf_path": ..., "posters": [...]}, where every
  poster entry has "page_no", "code" and "image_file". The directory is
  created (including parents) when missing.

  Args:
      posters: iterable of Poster objects.
      poster_to_image: mapping from poster object to its image file name.
      output_dir: destination directory.

  Returns:
      None.
  """
  json_dir = Path(output_dir)
  json_dir.mkdir(parents=True, exist_ok=True)

  entries_by_pdf = defaultdict(list)
  for poster in posters:
    entry = {
        "page_no": poster.page_no,
        "code": poster.code,
        "image_file": poster_to_image[poster],
    }
    entries_by_pdf[poster.pdf_path].append(entry)

  for source_pdf, entries in entries_by_pdf.items():
    payload = {
        "pdf_path": source_pdf,
        "posters": entries
    }

    target = json_dir / (Path(source_pdf).stem + ".json")
    # ensure_ascii=False keeps non-ASCII codes readable in the UTF-8 file.
    with target.open("w", encoding="utf-8") as handle:
      json.dump(payload, handle, indent=2, ensure_ascii=False)

In [None]:
# poster_to_image = save_poster_images(all_posters)

In [None]:
# generate_pdf_json(all_posters, poster_to_image)