In [10]:
from pathlib import Path

# Input and output locations, resolved relative to the project root.
data_dir = Path("data")
input_file = data_dir / "all-forwarded-pdfs.txt"
output_file = data_dir / "unique-pdfs.txt"

# Collect every non-blank line. Only the trailing newline is dropped, so the
# filename text itself (including any inner or trailing spaces) is preserved.
files = []
with input_file.open("r", encoding="utf-8") as src:
  for raw_line in src:
    if not raw_line.strip():
      continue
    files.append(raw_line.rstrip("\n"))

# De-duplicate, then order the names so the output file is stable.
unique_files = sorted(set(files))

# Emit one filename per line.
with output_file.open("w", encoding="utf-8") as dst:
  dst.writelines(f"{entry}\n" for entry in unique_files)

print(f"Written {len(unique_files)} unique files to {output_file}")

Written 25 unique files to data\unique-pdfs.txt


In [11]:
from pathlib import Path
import hashlib
from collections import defaultdict


def remove_duplicate_pdfs(base_dir="data/poster-pdfs", dry_run=True):
  """
  Find duplicate PDFs by checksum.
  Keep the file with the OLDEST modified time.
  Optionally delete newer duplicates.

  Args:
    base_dir: Directory searched recursively for ``*.pdf`` files.
    dry_run: When True (the default) only print the report. When False,
      additionally delete every file listed under "To delete".

  Returns:
    None. The report is printed to stdout.
  """

  base_dir = Path(base_dir)

  # rglob also yields directories whose name ends in ".pdf"; trying to hash
  # one of those would raise, so keep regular files only. Sorting makes the
  # report order (and the tie-break further down) independent of the order
  # in which the filesystem happens to list entries.
  pdf_files = sorted(p for p in base_dir.rglob("*.pdf") if p.is_file())

  def checksum(path, chunk_size=8192):
    """Return the SHA-256 hex digest of ``path``, read in fixed-size chunks."""
    h = hashlib.sha256()
    with path.open("rb") as f:
      # iter(callable, sentinel): keep reading until read() returns b"" (EOF).
      for chunk in iter(lambda: f.read(chunk_size), b""):
        h.update(chunk)
    return h.hexdigest()

  # digest -> every path whose content hashes to that digest
  checksum_map = defaultdict(list)

  for pdf in pdf_files:
    checksum_map[checksum(pdf)].append(pdf)

  unique_files = []
  to_delete = []

  for files in checksum_map.values():
    # Oldest first. sorted() is stable, so files with identical mtimes keep
    # the path order established above and the survivor is repeatable.
    files_sorted = sorted(files, key=lambda p: p.stat().st_mtime)

    unique_files.append(files_sorted[0])   # oldest copy is kept
    to_delete.extend(files_sorted[1:])     # newer duplicates

  print(f"Total PDFs scanned : {len(pdf_files)}")
  print(f"Unique PDFs kept   : {len(unique_files)}")
  print(f"Files to delete   : {len(to_delete)}")

  print("\n--- Kept (oldest) ---")
  for f in unique_files:
    print(f)

  print("\n--- To delete (newer duplicates) ---")
  for f in to_delete:
    print(f)

  # Deletion only happens when the caller explicitly opts out of the dry run.
  if not dry_run:
    for f in to_delete:
      f.unlink()
    print(f"\nDeleted {len(to_delete)} duplicate files.")

In [12]:
# Default call is a dry run (dry_run=True): it prints the report and deletes nothing.
remove_duplicate_pdfs()

Total PDFs scanned : 25
Unique PDFs kept   : 25
Files to delete   : 0

--- Kept (oldest) ---
data\poster-pdfs\1-per-page\02_Ram_Catalogue_PDF_Brochure.pdf
data\poster-pdfs\1-per-page\29_Chinese_Scenery_Modern_Art_Poster.pdf
data\poster-pdfs\1-per-page\30_Chinese_Scenery_Modern_Art_Poster.pdf
data\poster-pdfs\1-per-page\31_Chinese_Scenery_Modern_Art_Poster.pdf
data\poster-pdfs\1-per-page\40_Mahakal_Art_Poster_2023.pdf
data\poster-pdfs\1-per-page\45_Tirupati_Balaji_Art_Poster.pdf
data\poster-pdfs\1-per-page\46_Ram_Darbar_Art_Poster.pdf
data\poster-pdfs\1-per-page\47_Shiva_Family_Art_Poster.pdf
data\poster-pdfs\1-per-page\48_Hanuman_Art_Poster.pdf
data\poster-pdfs\1-per-page\51_Buddha_Art_Poster.pdf
data\poster-pdfs\1-per-page\61_Mixed_Gods_Art_Poster.pdf
data\poster-pdfs\1-per-page\72_Buddha_Art_Poster.pdf
data\poster-pdfs\1-per-page\75_Khatu_Shyam_Art_Poster.pdf
data\poster-pdfs\multi-per-page\01_Radha_Krishna_Art_Poster.pdf
data\poster-pdfs\multi-per-page\02_Radha_Krishna_Art_Poster_Ve

In [13]:
from pathlib import Path
import re


def check_unique_pdfs(
    unique_list_path="data/unique-pdfs.txt",
    pdf_dir="data/poster-pdfs"
):
  """
  Report which filenames from a list file have no matching PDF on disk.

  Names are compared after whitespace normalization: every run of
  whitespace becomes a single space and the ends are trimmed.

  Args:
    unique_list_path: Text file with one expected filename per line
      (blank lines are ignored).
    pdf_dir: Directory searched recursively for ``*.pdf`` entries; only
      the final path component of each hit is compared.

  Returns:
    None. The result is printed to stdout.
  """
  list_file = Path(unique_list_path)
  search_root = Path(pdf_dir)

  def squash(text: str) -> str:
    # Collapse whitespace runs to one space, then trim both ends.
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

  # Names we expect to find, taken from the list file.
  expected = set()
  with list_file.open("r", encoding="utf-8") as handle:
    for raw in handle:
      if raw.strip():
        expected.add(squash(raw.rstrip("\n")))

  # Names actually present under the search root (any depth).
  on_disk = set()
  for pdf_path in search_root.rglob("*.pdf"):
    on_disk.add(squash(pdf_path.name))

  absent = sorted(expected.difference(on_disk))

  if absent:
    print("❌ Missing PDFs:")
    print(f"Missing count: {len(absent)}")
    for entry in absent:
      print(entry)
    return

  print("✅ All unique PDFs are present as actual files.")
  print(f"Total checked: {len(expected)}")

In [14]:
# Runs with the defaults: list file data/unique-pdfs.txt, PDF directory data/poster-pdfs.
check_unique_pdfs()

❌ Missing PDFs:
Missing count: 25
01_Radha Krishna Art Poster.pdf
02_Radha Krishna Art Poster_Vertical.pdf
03_All Mix Art Poster (Chiness Sinnery).pdf
04_Ganesh Art Poster.pdf
08_Ambedkar + Buddha Art Poster.pdf
08_horse Art Poster.pdf
09_Ambedkar Art Poster.pdf
2-Ram Catalouge PDF Brochure.pdf
29_Chinese scenery (Morden Art) Art Poster.pdf
30_Chinese scenery (Morden Art) Art Poster.pdf
31_Chinese scenery (Morden Art) Art Poster.pdf
38_Natural Art Poster.pdf
39_Art Poster Mordern Art 2023.pdf
40_Art Poster Mahakal Art 2023.pdf
43.Art Poster Mordern Art.pdf
44.Art Poster Mordern Art.pdf
45_Art Poster Tirupati Balaji.pdf
46_Art Poster Ramdarbar.pdf
47_Art Poster Shiva Family.pdf
48_Art Poster Hanuman.pdf
49_Art Poster LGS ( NEW ).pdf
51_Art Postar BUDDHA.pdf
61_Art Poster MIX GOD.pdf
72_Art Postar BUDDHA.pdf
75_Art Poster Khatu Shyam.pdf


In [15]:
from pathlib import Path


def rename_poster_pdfs(base_dir="data/poster-pdfs", dry_run=True):
  """
  Rename poster PDFs from their original names to cleaned-up names.

  Only files located directly inside ``base_dir`` are considered; the
  old -> new mapping is the fixed table below. The function is safe to
  re-run: entries whose new name already exists are reported and skipped,
  and an existing target file is never overwritten.

  Args:
    base_dir: Directory that holds the PDFs to rename.
    dry_run: When True (the default) only print what would happen. When
      False, perform the renames.

  Returns:
    None. Progress and a summary are printed to stdout.
  """
  base_dir = Path(base_dir)

  # Keys are the exact on-disk names (including doubled spaces and the
  # original misspellings); values are the normalized replacement names.
  rename_map = {
      # Radha Krishna
      "01_Radha Krishna Art Poster.pdf": "01_Radha_Krishna_Art_Poster.pdf",
      "02_Radha Krishna Art Poster_Vertical.pdf": "02_Radha_Krishna_Art_Poster_Vertical.pdf",

      # Mixed / Scenic
      "03_All Mix Art Poster (Chiness Sinnery).pdf": "03_Mixed_Art_Poster_Chinese_Scenery.pdf",

      # Ganesh
      "04_Ganesh Art Poster.pdf": "04_Ganesh_Art_Poster.pdf",

      # Ambedkar / Buddha
      "08_Ambedkar + Buddha  Art Poster.pdf": "08_Ambedkar_Buddha_Art_Poster.pdf",
      "09_Ambedkar Art Poster.pdf": "09_Ambedkar_Art_Poster.pdf",

      # Horse
      "08_horse Art Poster.pdf": "08_Horse_Art_Poster.pdf",

      # Ram Catalogue
      "2-Ram Catalouge PDF Brochure.pdf": "02_Ram_Catalogue_PDF_Brochure.pdf",

      # Chinese Scenery (Modern)
      "29_Chinese scenery (Morden Art)  Art Poster.pdf": "29_Chinese_Scenery_Modern_Art_Poster.pdf",
      "30_Chinese scenery (Morden Art)  Art Poster.pdf": "30_Chinese_Scenery_Modern_Art_Poster.pdf",
      "31_Chinese scenery (Morden Art)  Art Poster.pdf": "31_Chinese_Scenery_Modern_Art_Poster.pdf",

      # Nature / Modern
      "38_Natural Art Poster.pdf": "38_Nature_Art_Poster.pdf",
      "39_Art Poster Mordern Art 2023.pdf": "39_Modern_Art_Poster_2023.pdf",
      "40_Art Poster Mahakal Art 2023.pdf": "40_Mahakal_Art_Poster_2023.pdf",

      # Modern Art (generic)
      "43.Art Poster Mordern Art.pdf": "43_Modern_Art_Poster.pdf",
      "44.Art Poster Mordern Art.pdf": "44_Modern_Art_Poster.pdf",

      # Gods / Temples
      "45_Art Poster Tirupati Balaji.pdf": "45_Tirupati_Balaji_Art_Poster.pdf",
      "46_Art Poster Ramdarbar.pdf": "46_Ram_Darbar_Art_Poster.pdf",
      "47_Art Poster Shiva Family.pdf": "47_Shiva_Family_Art_Poster.pdf",
      "48_Art Poster Hanuman.pdf": "48_Hanuman_Art_Poster.pdf",

      # LGS
      "49_Art Poster  LGS ( NEW ).pdf": "49_LGS_Art_Poster_New.pdf",

      # Buddha
      "51_Art Postar BUDDHA.pdf": "51_Buddha_Art_Poster.pdf",
      "72_Art Postar BUDDHA.pdf": "72_Buddha_Art_Poster.pdf",

      # Mixed Gods
      "61_Art Poster MIX GOD.pdf": "61_Mixed_Gods_Art_Poster.pdf",

      # Khatu Shyam
      "75_Art Poster Khatu Shyam.pdf": "75_Khatu_Shyam_Art_Poster.pdf",
  }

  print(f"Dry run: {dry_run}\n")

  renamed = 0   # renames actually performed
  skipped = 0   # entries that could not (or need not) be renamed

  for old_name, new_name in rename_map.items():
    old_path = base_dir / old_name
    new_path = base_dir / new_name

    if not old_path.exists():
      # Distinguish "already done on a previous run" from "not there at all"
      # so a re-run does not look like a wall of failures.
      if new_path.exists():
        print(f"[ALREADY RENAMED] {new_name}")
      else:
        print(f"[MISSING] {old_name}")
      skipped += 1
      continue

    # Path.rename silently replaces an existing target on POSIX and raises on
    # Windows; neither is acceptable mid-batch, so never touch a target that
    # exists. samefile() covers case-insensitive filesystems, where a
    # case-only rename makes the "target" resolve to the source itself.
    if new_path.exists() and not new_path.samefile(old_path):
      print(f"[SKIPPED] {old_name}  ->  {new_name} (target already exists)")
      skipped += 1
      continue

    print(f"{old_name}  ->  {new_name}")

    if not dry_run:
      old_path.rename(new_path)
      renamed += 1

  if dry_run:
    print("\nNo files were renamed (dry run).")
  elif skipped:
    # Only claim full success when every mapping entry was actually renamed.
    print(
        f"\nRenamed {renamed} file(s); {skipped} skipped "
        "(missing, already renamed, or target exists)."
    )
  else:
    print("\nAll files renamed successfully.")

In [16]:
# Default call is a dry run (dry_run=True): it only prints the planned renames.
rename_poster_pdfs()

Dry run: True

[MISSING] 01_Radha Krishna Art Poster.pdf
[MISSING] 02_Radha Krishna Art Poster_Vertical.pdf
[MISSING] 03_All Mix Art Poster (Chiness Sinnery).pdf
[MISSING] 04_Ganesh Art Poster.pdf
[MISSING] 08_Ambedkar + Buddha  Art Poster.pdf
[MISSING] 09_Ambedkar Art Poster.pdf
[MISSING] 08_horse Art Poster.pdf
[MISSING] 2-Ram Catalouge PDF Brochure.pdf
[MISSING] 29_Chinese scenery (Morden Art)  Art Poster.pdf
[MISSING] 30_Chinese scenery (Morden Art)  Art Poster.pdf
[MISSING] 31_Chinese scenery (Morden Art)  Art Poster.pdf
[MISSING] 38_Natural Art Poster.pdf
[MISSING] 39_Art Poster Mordern Art 2023.pdf
[MISSING] 40_Art Poster Mahakal Art 2023.pdf
[MISSING] 43.Art Poster Mordern Art.pdf
[MISSING] 44.Art Poster Mordern Art.pdf
[MISSING] 45_Art Poster Tirupati Balaji.pdf
[MISSING] 46_Art Poster Ramdarbar.pdf
[MISSING] 47_Art Poster Shiva Family.pdf
[MISSING] 48_Art Poster Hanuman.pdf
[MISSING] 49_Art Poster  LGS ( NEW ).pdf
[MISSING] 51_Art Postar BUDDHA.pdf
[MISSING] 72_Art Postar BUD