In [3]:
import hashlib
import io
import itertools
import json
import random
import re
import warnings
from collections import defaultdict
from enum import Enum, auto
from pathlib import Path

import fitz
from PIL import Image

In [4]:
def _page_text(page):
  """
  Concatenate the non-blank text blocks of a page.

  Blocks are ordered by their top edge (y0) and then their left edge (x0)
  before being joined without any separator.
  """
  positioned = []

  for block in page.get_text("blocks"):
    x0, y0, _x1, _y1, block_text, *_ = block
    if block_text.strip():
      positioned.append((y0, x0, block_text))

  # sort on (y0, x0) only; the text itself never takes part in the ordering
  positioned.sort(key=lambda item: item[:2])
  return "".join(item[2] for item in positioned)


def _page_images(doc, page):
  """
  Return the images referenced by a page as PIL images.

  Images are ordered by the top edge (y0) and then the left edge (x0) of the
  bounding box they occupy on the page.
  """
  positioned = []

  for img in page.get_images(full=True):
    xref = img[0]
    img_name = img[7]

    bbox = page.get_image_bbox(img_name)

    pix = fitz.Pixmap(doc, xref)

    # ensure rgb (png-safe)
    if pix.colorspace is None or pix.colorspace.n != 3:
      pix = fitz.Pixmap(fitz.csRGB, pix)

    pil_img = Image.open(io.BytesIO(pix.tobytes("png")))
    positioned.append((bbox.y0, bbox.x0, pil_img))

    # drop the pixmap reference right away instead of holding it until the
    # next iteration
    pix = None

  # sort on (y0, x0) only so PIL images are never compared with each other
  positioned.sort(key=lambda item: item[:2])
  return [item[2] for item in positioned]


def extract_pdf_pages_data(pdf_path, base_dir="./data/poster-pdfs"):
  """
  Extract text and images from all pages of a PDF.

  Args:
      pdf_path: path of the PDF, relative to `base_dir`.
      base_dir: folder that `pdf_path` is resolved against.

  Returns:
      [
        {
          "text": str,
          "images": [<PIL Image>, ...]
        },
        ...
      ]
      One entry per page, in page order. Text blocks and images are each
      ordered top-to-bottom, then left-to-right.
  """
  doc = fitz.open(Path(base_dir) / pdf_path)

  # close the document even when a page fails to parse, so a failed run does
  # not leave the file handle open in the kernel
  try:
    pages_data = []

    for page_index in range(doc.page_count):
      page = doc.load_page(page_index)
      pages_data.append({
          "text": _page_text(page),
          "images": _page_images(doc, page),
      })

    return pages_data
  finally:
    doc.close()

In [5]:
class PDF:
  """
  One source PDF together with the posters extracted from it.

  Attributes:
      path: the path exactly as given to the constructor.
      name: file name without extension.
      readable_name: `name` with underscores replaced by spaces.
      category: value of the "category" key of `config` (None if absent).
      config: the per-PDF configuration dict.
      extractor: callable that receives this PDF and returns its posters.
      posters: result of the last `process()` call (empty before that).
  """

  # maximum number of posters included in "postersSample"
  SAMPLE_SIZE = 5

  def __init__(self, path, config, extractor):
    p = Path(path)

    self.path = path
    self.name = p.stem
    self.readable_name = self.name.replace("_", " ")
    self.category = config.get("category")

    self.config = config
    self.extractor = extractor
    self.posters = []

  def process(self):
    """Run the extractor and store the posters it returns."""
    self.posters = self.extractor(self)

  def dictify(self, include_posters):
    """
    Build the JSON-serialisable description of this PDF.

    Args:
        include_posters: when true, "posters" lists every poster; otherwise
            it is an empty list.

    Returns:
        dict with the PDF's identity, poster counts split by orientation,
        "posters" and "postersSample" (at most SAMPLE_SIZE posters).

    The sample is drawn from the landscape posters when there are at least
    SAMPLE_SIZE of them, otherwise from the portrait posters (which may be
    fewer than SAMPLE_SIZE, or none at all).
    """
    landscape = []
    portrait = []
    for poster in self.posters:
      width, height = poster.image_size
      # square images count as landscape
      (landscape if width >= height else portrait).append(poster)

    num_landscape = len(landscape)
    num_portrait = len(portrait)

    used_landscape_for_sample = num_landscape >= self.SAMPLE_SIZE
    collection = landscape if used_landscape_for_sample else portrait

    # Seed a private RNG with the PDF name: repeated calls (and repeated
    # notebook runs) then pick the same sample, and the global `random`
    # state is left untouched.
    rng = random.Random(self.name)
    sample = rng.sample(collection, k=min(self.SAMPLE_SIZE, len(collection)))

    obj = {
        "path": self.path,
        "name": self.name,
        "readableName": self.readable_name,
        "category": self.category,
        "totalPosters": len(self.posters),
        "numLandscapePosters": num_landscape,
        "numPortraitPosters": num_portrait,
        "usedLandscapeForSample": used_landscape_for_sample,
        "posters": [],
        "postersSample": [p.dictify() for p in sample],
    }

    if include_posters:
      obj["posters"] = [p.dictify() for p in self.posters]

    return obj

  def __repr__(self):
    return f"<PDF path={self.path}>"

In [6]:
class Poster:
  """
  A single poster image taken from a PDF page.

  Every new instance receives the next 1-based index for its PDF's name and
  derives its `id` and `image_file` from that index, the sanitised code and
  a short hash of the image bytes.
  """

  # running poster count per PDF name, shared by all instances
  _pdf_counters = defaultdict(int)

  def __init__(self, pdf, page_no, code, image):
    self.pdf = pdf
    self.page_no = page_no
    # anything that is not a word character or a hyphen becomes "_"
    self.code = re.sub(r"[^\w\-]", "_", code.strip())
    self.image = image
    self.image_size = image.size

    counters = Poster._pdf_counters
    pdf_name = self.pdf.name
    counters[pdf_name] += 1
    self.index = counters[pdf_name]

    # the first five hex digits of the MD5 of the raw image bytes end up in
    # the file name
    digest = hashlib.md5(image.tobytes()).hexdigest()
    short_hash = digest[:5]

    self.id = f"{pdf.name}--i{self.index}--{self.code}"
    self.image_file = f'{self.id}--{short_hash}.jpg'

  def dictify(self):
    """Return the JSON-serialisable fields of this poster."""
    return dict(
        pdfName=self.pdf.name,
        pageNo=self.page_no,
        index=self.index,
        code=self.code,
        id=self.id,
        imageFile=self.image_file,
        imageSize=self.image_size,
    )

  def __repr__(self):
    return f"<Poster id={self.id}>"

In [7]:
class TextExtractMode(Enum):
  """Which line of a page's extracted text holds the poster code."""

  # values are the ones `auto()` would hand out: 1, 2, 3
  LAST_LINE = 1              # the final line, as is
  LAST_LINE_AFTER_COLON = 2  # the final line, only the part after the first ":"
  SECOND_LAST_LINE = 3       # the line before the final one

# PDF_REGISTRY, CATEGORY_REGISTRY

In [None]:
# PDFs processed by `extract_multi_per_page`: PDF path -> per-PDF config.
pdfs_multi_per_page = {
    "multi-per-page/01_Radha_Krishna.pdf": {"category": "Radha Krishna"},
    "multi-per-page/02_Radha_Krishna_Vertical.pdf": {"category": "Radha Krishna"},
    "multi-per-page/03_Mixed.pdf": {"category": "Mixed"},
    "multi-per-page/04_Ganesh.pdf": {"category": "Ganesh"},
    "multi-per-page/08_Ambedkar_Buddha.pdf": {"category": "Ambedkar Buddha"},
    "multi-per-page/08_Horse.pdf": {"category": "Horse"},
    "multi-per-page/09_Ambedkar.pdf": {"category": "Ambedkar"},
    "multi-per-page/38_Nature.pdf": {"category": "Nature"},
}

In [9]:
# PDFs processed by `extract_single_per_page`: PDF path -> per-PDF config.
# `text_mode` selects the text line that carries the poster code.
pdfs_single_per_page = {
    "1-per-page/02_Ram.pdf": {
        "text_mode": TextExtractMode.SECOND_LAST_LINE, "category": "Ram"},
    "1-per-page/29_Chinese_Scenery_Modern.pdf": {
        "text_mode": TextExtractMode.LAST_LINE_AFTER_COLON, "category": "Chinese Scenery"},
    "1-per-page/30_Chinese_Scenery_Modern.pdf": {
        "text_mode": TextExtractMode.LAST_LINE_AFTER_COLON, "category": "Chinese Scenery"},
    "1-per-page/31_Chinese_Scenery_Modern.pdf": {
        "text_mode": TextExtractMode.LAST_LINE, "category": "Chinese Scenery"},
    "1-per-page/40_Mahakal.pdf": {
        "text_mode": TextExtractMode.LAST_LINE, "category": "Mahakal"},
    "1-per-page/45_Tirupati_Balaji.pdf": {
        "text_mode": TextExtractMode.LAST_LINE, "category": "Tirupati Balaji"},
    "1-per-page/46_Ram_Darbar.pdf": {
        "text_mode": TextExtractMode.LAST_LINE, "category": "Ram Darbar"},
    "1-per-page/47_Shiva_Family.pdf": {
        "text_mode": TextExtractMode.LAST_LINE, "category": "Shiva Family"},
    "1-per-page/48_Hanuman.pdf": {
        "text_mode": TextExtractMode.LAST_LINE, "category": "Hanuman"},
    "1-per-page/51_Buddha.pdf": {
        "text_mode": TextExtractMode.LAST_LINE, "category": "Buddha"},
    "1-per-page/61_Mixed_Gods.pdf": {
        "text_mode": TextExtractMode.LAST_LINE, "category": "Mixed Gods"},
    "1-per-page/72_Buddha.pdf": {
        "text_mode": TextExtractMode.LAST_LINE, "category": "Buddha"},
    "1-per-page/75_Khatu_Shyam.pdf": {
        "text_mode": TextExtractMode.LAST_LINE, "category": "Khatu Shyam"},
}

In [10]:
# PDFs processed by `extract_incremental`: PDF path -> per-PDF config.
# Codes are not read from the PDF; they count up from `start_code`, one per
# page, and are prefixed with "<prefix>-" when `prefix` is non-empty.
pdfs_incremental = {
    "ocr-needed/39_Modern.pdf": {
        "start_code": 25284, "prefix": "", "category": "Modern"},
    "ocr-needed/43_Modern.pdf": {
        "start_code": 25725, "prefix": "", "category": "Modern"},
    "ocr-needed/44_Modern.pdf": {
        "start_code": 25569, "prefix": "", "category": "Modern"},
    "ocr-needed/49_LGS.pdf": {
        "start_code": 1001, "prefix": "LGS", "category": "Lakshmi Ganesh Saraswati"},
}

In [11]:
# Every known PDF in one mapping (path -> config). Registries are merged in
# the order listed; should a path appear in more than one of them, the later
# registry's config replaces the earlier one.
PDF_REGISTRY = dict(
    itertools.chain(
        pdfs_multi_per_page.items(),
        pdfs_single_per_page.items(),
        pdfs_incremental.items(),
    )
)

In [12]:
# Display metadata for the poster categories. Each entry has the category
# `name` (the same strings used as "category" in the PDF registries) and a
# Hindi `description`. `generate_pdf_json` writes this list, sorted by name,
# into metadata.json.
CATEGORY_REGISTRY = [
    {
        "name": "Radha Krishna",
        "description": (
            "घर में प्रेम, सामंजस्य और भक्ति का संचार करता है। "
            "राधा कृष्ण पोस्टर दिव्य प्रेम और भावनात्मक संतुलन का प्रतीक हैं।"
        ),
    },
    {
        "name": "Chinese Scenery",
        "description": (
            "आपके स्थान में शांति और प्राकृतिक संतुलन लाता है। "
            "चीनी प्राकृतिक दृश्य कला शांति, समृद्धि और सजगता को बढ़ावा देती है।"
        ),
    },
    {
        "name": "Ganesh",
        "description": (
            "भगवान गणेश बाधाओं को दूर करते हैं और सफलता आकर्षित करते हैं। "
            "नए आरंभ और सकारात्मक ऊर्जा के लिए आदर्श पोस्टर।"
        ),
    },
    {
        "name": "Ambedkar Buddha",
        "description": (
            "ज्ञान, समानता और आंतरिक शक्ति की प्रेरक छवि। "
            "प्रेरणा और विचारशील जीवन के लिए उपयुक्त।"
        ),
    },
    {
        "name": "Horse",
        "description": (
            "गति, शक्ति और प्रगति का प्रतीक। "
            "घोड़े के पोस्टर ऊर्जा बढ़ाते हैं और आगे बढ़ने की प्रेरणा देते हैं।"
        ),
    },
    {
        "name": "Ambedkar",
        "description": (
            "बुद्धिमत्ता, साहस और सामाजिक न्याय का सम्मान। "
            "प्रेरणा और आत्मसम्मान के लिए अर्थपूर्ण पोस्टर।"
        ),
    },
    {
        "name": "Nature",
        "description": (
            "शांति और ताजगी से जुड़ने का माध्यम। "
            "प्राकृतिक पोस्टर तनाव कम करते हैं और दृश्य संतुलन लाते हैं।"
        ),
    },
    {
        "name": "Ram",
        "description": (
            "भगवान राम धर्म, अनुशासन और नैतिक शक्ति के प्रतीक हैं। "
            "शांति और पारिवारिक मूल्यों के लिए आदर्श।"
        ),
    },
    {
        "name": "Mahakal",
        "description": (
            "महाकाल समय, शक्ति और निर्भयता का प्रतीक हैं। "
            "आत्मविश्वास बढ़ाने वाली सशक्त आध्यात्मिक उपस्थिति।"
        ),
    },
    {
        "name": "Tirupati Balaji",
        "description": (
            "धन, भक्ति और मनोकामना पूर्ति से जुड़े। "
            "घर और कार्यस्थल के लिए अत्यंत शुभ।"
        ),
    },
    {
        "name": "Ram Darbar",
        "description": (
            "आदर्श पारिवारिक मूल्यों और नेतृत्व का संपूर्ण चित्र। "
            "एकता, भक्ति और सामंजस्य को प्रोत्साहित करता है।"
        ),
    },
    {
        "name": "Shiva Family",
        "description": (
            "शक्ति, करुणा और परिवार के संतुलन का प्रतीक। "
            "घर में आध्यात्मिक स्थिरता लाता है।"
        ),
    },
    {
        "name": "Hanuman",
        "description": (
            "भक्ति, शक्ति और सुरक्षा का प्रतीक। "
            "साहस, एकाग्रता और भय से मुक्ति के लिए उपयुक्त।"
        ),
    },
    {
        "name": "Buddha",
        "description": (
            "शांति, सजगता और ज्ञानोदय का प्रतीक। "
            "ध्यान कक्ष और शांत वातावरण के लिए उपयुक्त।"
        ),
    },
    {
        "name": "Mixed Gods",
        "description": (
            "विभिन्न देवी-देवताओं की सामूहिक दिव्य ऊर्जा। "
            "एक ही चित्र में संपूर्ण आध्यात्मिक संतुलन के लिए।"
        ),
    },
    {
        "name": "Khatu Shyam",
        "description": (
            "आस्था, चमत्कार और अटूट भक्ति के लिए प्रसिद्ध। "
            "भक्तों के लिए आशा और भावनात्मक शक्ति प्रदान करता है।"
        ),
    },
    {
        "name": "Modern",
        "description": (
            "आधुनिक कला जो शैली और व्यक्तित्व जोड़ती है। "
            "आधुनिक इंटीरियर और रचनात्मक स्थानों के लिए आदर्श।"
        ),
    },
    {
        "name": "Lakshmi Ganesh Saraswati",
        "description": (
            "धन, विद्या और सफलता का शक्तिशाली संगम। "
            "समृद्धि और शिक्षा के लिए अत्यंत लाभकारी।"
        ),
    },
    {
        "name": "Mixed",
        "description": (
            "जिसका कोई निश्चित विषय नहीं होता। "
            "इसमें गणेश, बुद्ध, घोड़े, प्रकृति या आधुनिक कला कुछ भी हो सकता है।"
        ),
    },
]

# extract_multi_per_page

In [13]:
def extract_multi_per_page(pdf):
  """
  Extract posters from a PDF that shows several posters on each page.

  The first page is skipped. On every other page holding `n` images, the
  last text line is dropped and the `n` lines before it are taken as the
  poster codes; codes and images are then paired in order.

  Args:
      pdf: object with a `path` attribute naming the PDF to read.

  Returns:
      list of Poster objects in page order.

  Warns:
      UserWarning: when a page yields fewer code lines than images. The
      surplus images get no poster, exactly as before, but no longer
      silently.
  """
  data = extract_pdf_pages_data(pdf.path)
  result = []

  # page numbers are 1-based and page 1 is skipped, hence start=2
  for page_idx, page in enumerate(data[1:], start=2):
    images = page["images"]
    n = len(images)

    if n == 0:
      # nothing to pair on this page (also avoids `[-0:]` selecting every line)
      continue

    codes = page["text"].splitlines()[:-1][-n:]

    if len(codes) < n:
      warnings.warn(
          f"{pdf.path} page {page_idx}: found {len(codes)} code line(s) "
          f"for {n} image(s); images without a code are skipped",
          stacklevel=2,
      )

    for code, image in zip(codes, images):
      result.append(
          Poster(pdf, page_idx, code, image)
      )

  return result

# extract_single_per_page

In [14]:
def extract_single_per_page(pdf):
  """
  Extract posters from a PDF that shows one poster per page.

  Pages without any text are skipped. For all other pages the poster code
  is read from the text according to `pdf.config["text_mode"]` and the
  page's last image becomes the poster image.

  Args:
      pdf: object with `path` and a `config` dict containing "text_mode"
          (a TextExtractMode member).

  Returns:
      list of Poster objects in page order.

  Raises:
      KeyError: if the config has no "text_mode".
      ValueError: if the mode is not supported, or a page with text lacks
          the line / colon / image the mode requires. The message names
          the PDF and the page.
  """
  mode = pdf.config["text_mode"]

  data = extract_pdf_pages_data(pdf.path)
  result = []

  for page_idx, page in enumerate(data, start=1):
    lines = page["text"].splitlines()
    if not lines:
      continue

    where = f"{pdf.path} page {page_idx}"

    if mode == TextExtractMode.LAST_LINE:
      code = lines[-1]
    elif mode == TextExtractMode.LAST_LINE_AFTER_COLON:
      _, colon, after_colon = lines[-1].partition(":")
      if not colon:
        raise ValueError(f"{where}: last line {lines[-1]!r} contains no ':'")
      code = after_colon.strip()
    elif mode == TextExtractMode.SECOND_LAST_LINE:
      if len(lines) < 2:
        raise ValueError(f"{where}: needs at least two text lines, found {len(lines)}")
      code = lines[-2]
    else:
      # previously this fell through and failed later on an unbound `code`
      raise ValueError(f"{where}: unsupported text_mode {mode!r}")

    images = page["images"]
    if not images:
      raise ValueError(f"{where}: page has text but no image")
    image = images[-1]

    result.append(
        Poster(pdf, page_idx, code, image)
    )

  return result

# extract_incremental

In [15]:
def extract_incremental(pdf):
  """
  Build one poster per page with sequentially numbered codes.

  The first page gets `pdf.config["start_code"]` and each following page
  the next integer. With a non-empty `pdf.config["prefix"]` the code reads
  "<prefix>-<number>", otherwise just the number. The last image of each
  page is used as the poster image.
  """
  first_code = pdf.config["start_code"]
  prefix = pdf.config.get("prefix", "")

  pages = extract_pdf_pages_data(pdf.path)
  posters = []

  for offset, page in enumerate(pages):
    image = page["images"][-1]
    number = first_code + offset

    if prefix:
      full_code = f"{prefix}-{number}"
    else:
      full_code = str(number)

    # page numbers are 1-based
    posters.append(Poster(pdf, offset + 1, full_code, image))

  return posters

# process_all_pdfs 

In [16]:
# Processing plan used by `process_all_pdfs`: each registry of PDFs is paired
# with the extractor function that is applied to every PDF in it.
pipelines = [
    (pdfs_multi_per_page, extract_multi_per_page),  # registry, extractor
    (pdfs_single_per_page, extract_single_per_page),
    (pdfs_incremental, extract_incremental),
]

In [17]:
def process_all_pdfs():
  """
  Run every pipeline over its registry and return the processed PDFs.

  Prints one "Processing: <path>" line per PDF and the total number of
  posters at the end.

  Returns:
      list of PDF objects, in pipeline and then registry order, each with
      its `posters` populated.
  """
  # Poster numbers posters through a class-level counter keyed by PDF name.
  # Without this reset, running the function a second time in the same
  # kernel would continue the old counts and change every poster index, id
  # and image file name.
  Poster._pdf_counters.clear()

  pdfs = []
  total_posters = 0

  for registry, extractor in pipelines:
    for path, config in registry.items():
      print(f"Processing: {path}")

      pdf = PDF(path, config, extractor)
      pdf.process()

      pdfs.append(pdf)
      total_posters += len(pdf.posters)

  print(f"Total posters: {total_posters}")
  return pdfs

In [18]:
# Run all pipelines once; `pdfs` is the input of every validation and export cell below.
pdfs = process_all_pdfs()

Processing: multi-per-page/01_Radha_Krishna.pdf
Processing: multi-per-page/02_Radha_Krishna_Vertical.pdf
Processing: multi-per-page/03_Mixed.pdf
Processing: multi-per-page/04_Ganesh.pdf
Processing: multi-per-page/08_Ambedkar_Buddha.pdf
Processing: multi-per-page/08_Horse.pdf
Processing: multi-per-page/09_Ambedkar.pdf
Processing: multi-per-page/38_Nature.pdf
Processing: 1-per-page/02_Ram.pdf
Processing: 1-per-page/29_Chinese_Scenery_Modern.pdf
Processing: 1-per-page/30_Chinese_Scenery_Modern.pdf
Processing: 1-per-page/31_Chinese_Scenery_Modern.pdf
Processing: 1-per-page/40_Mahakal.pdf
Processing: 1-per-page/45_Tirupati_Balaji.pdf
Processing: 1-per-page/46_Ram_Darbar.pdf
Processing: 1-per-page/47_Shiva_Family.pdf
Processing: 1-per-page/48_Hanuman.pdf
Processing: 1-per-page/51_Buddha.pdf
Processing: 1-per-page/61_Mixed_Gods.pdf
Processing: 1-per-page/72_Buddha.pdf
Processing: 1-per-page/75_Khatu_Shyam.pdf
Processing: ocr-needed/39_Modern.pdf
Processing: ocr-needed/43_Modern.pdf
Processing

# validation & integrity checks

In [19]:
def check_duplicate_codes(pdfs, show_details=True):
  """
  Report how many poster codes exist and which of them occur more than once.

  Always prints the total and the unique code count. With `show_details`
  it additionally lists every code that occurs more than once (in several
  PDFs or on several pages of one PDF) together with the PDF paths and
  page numbers where it was found. Returns nothing.
  """
  # code -> {pdf path -> [page numbers]}, in first-seen order
  occurrences = {}
  total_codes = 0

  for pdf in pdfs:
    for poster in pdf.posters:
      total_codes += 1
      per_pdf = occurrences.setdefault(poster.code, {})
      per_pdf.setdefault(pdf.path, []).append(poster.page_no)

  print("Counts:")
  print(f"  Total codes : {total_codes}")
  print(f"  Unique codes: {len(occurrences)}")

  if not show_details:
    return

  print("\nDuplicate codes across PDFs:")

  for code, per_pdf in occurrences.items():
    # every page list holds at least one entry, so more than one occurrence
    # overall means "several PDFs" or "several pages within one PDF"
    occurrence_count = sum(len(pages) for pages in per_pdf.values())
    if occurrence_count <= 1:
      continue

    print(code)
    for path, pages in per_pdf.items():
      if len(pages) == 1:
        print(f"  {path} -> page {pages[0]}")
      else:
        joined = ", ".join(str(page) for page in pages)
        print(f"  {path} -> pages {joined}")

In [20]:
# Counts only; call with show_details=True to list each repeated code with its PDFs and pages.
check_duplicate_codes(pdfs, show_details=False)

Counts:
  Total codes : 3766
  Unique codes: 3739


In [21]:
def check_duplicate_ids(pdfs):
  """
  Verify that no two posters share an id.

  Prints the total and the unique number of ids over all PDFs and fails
  with an AssertionError when the two differ.
  """
  all_ids = [poster.id for pdf in pdfs for poster in pdf.posters]

  total_files = len(all_ids)
  unique_files = len(set(all_ids))

  report = (
      "Counts:",
      f"  Total file names : {total_files}",
      f"  Unique file names: {unique_files}",
  )
  for line in report:
    print(line)

  assert total_files == unique_files, (
      f"Duplicate IDs found! Total: {total_files}, Unique: {unique_files}"
  )

In [22]:
# Stops the run with an AssertionError if two posters ended up with the same id.
check_duplicate_ids(pdfs)

Counts:
  Total file names : 3766
  Unique file names: 3766


In [23]:
def check_duplicate_pdf_names(pdfs):
  """
  Verify that every PDF has a distinct `name`.

  Prints the total and the unique number of names and fails with an
  AssertionError when the two differ.
  """
  seen_names = set()
  total_names = 0

  for pdf in pdfs:
    total_names += 1
    seen_names.add(pdf.name)

  unique_names = len(seen_names)

  print("Counts:")
  print(f"  Total PDF names : {total_names}")
  print(f"  Unique PDF names: {unique_names}")

  assert total_names == unique_names, (
      f"Duplicate PDF names found! Total: {total_names}, Unique: {unique_names}"
  )

In [24]:
# PDF names become JSON file names and poster id prefixes, so they must not repeat.
check_duplicate_pdf_names(pdfs)

Counts:
  Total PDF names : 25
  Unique PDF names: 25


# save_poster_images

In [25]:
def save_poster_images(pdfs, output_dir="frontend/public/poster-images"):
  """
  Write every poster image as a JPEG into `output_dir`.

  A poster whose target file already exists is skipped. New files are
  written to a temporary "<name>.part" file first and renamed afterwards,
  so an interrupted run cannot leave a truncated JPEG that later runs would
  skip as "already exists".

  Args:
      pdfs: iterable of objects whose `posters` have `image` (PIL image)
          and `image_file` (target file name).
      output_dir: destination folder; created if missing.

  Prints the number of skipped and generated files and the total size of
  all files in the folder.
  """
  output_dir = Path(output_dir)
  output_dir.mkdir(parents=True, exist_ok=True)

  skipped_count = 0
  generated_count = 0

  for pdf in pdfs:
    for poster in pdf.posters:
      path = output_dir / poster.image_file

      if path.exists():
        skipped_count += 1
        continue

      tmp_path = path.with_name(path.name + ".part")
      try:
        # format is given explicitly, so the ".part" suffix does not matter
        poster.image.convert("RGB").save(
            tmp_path,
            format="JPEG",
            quality=95,
            subsampling=2,
            optimize=True,
            progressive=True,
        )
        tmp_path.replace(path)
      except BaseException:
        # also covers a kernel interrupt: remove the partial file, then re-raise
        if tmp_path.exists():
          tmp_path.unlink()
        raise

      generated_count += 1

  total_size_bytes = sum(f.stat().st_size for f in output_dir.glob('*') if f.is_file())
  total_size_mb = total_size_bytes / (1024 * 1024)

  print(f"Skipped {skipped_count} files (already exist)")
  print(f"Generated {generated_count} new images")
  print(f"Final folder size: {total_size_mb:.2f} MB")

In [26]:
# Writes only the images that are not on disk yet; existing files are left as they are.
save_poster_images(pdfs)

Skipped 3766 files (already exist)
Generated 0 new images
Final folder size: 144.14 MB


# generate_pdf_json

In [29]:
def generate_pdf_json(pdfs, output_dir="frontend/public/pdfs-data", categories=None):
  """
  Write one "<pdf name>.json" per PDF plus a combined "metadata.json".

  Each per-PDF file holds the full `dictify(include_posters=True)` output.
  "metadata.json" holds the same dicts with "posters" emptied (sorted by
  name) and the category list (sorted by name).

  `dictify` is called once per PDF and the metadata entry is derived from
  that result, so both files always show the same "postersSample" even
  when the sample is drawn at random.

  Args:
      pdfs: iterable of objects with `name` and `dictify(include_posters)`.
      output_dir: destination folder; created if missing.
      categories: list of dicts with a "name" key; defaults to
          CATEGORY_REGISTRY.
  """
  if categories is None:
    categories = CATEGORY_REGISTRY

  output_dir = Path(output_dir)
  output_dir.mkdir(parents=True, exist_ok=True)

  pdfs_metadata = []

  for pdf in pdfs:
    full_data = pdf.dictify(include_posters=True)

    # full PDF data
    with open(output_dir / f"{pdf.name}.json", "w", encoding="utf-8") as f:
      json.dump(full_data, f, indent=2, ensure_ascii=False)

    # metadata entry: identical except that the poster list is left empty
    pdfs_metadata.append({**full_data, "posters": []})

  metadata = {
      "pdfs": sorted(pdfs_metadata, key=lambda pdf: pdf["name"]),
      "categories": sorted(categories, key=lambda cat: cat["name"]),
  }

  with open(output_dir / "metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

In [30]:
# Export the per-PDF JSON files and the combined metadata.json.
generate_pdf_json(pdfs)