In [3]:
import fitz, uuid, json, pathlib, os

pdf_dir = pathlib.Path("../data/raw/pdf")      # adjust if you used another folder
out = []

# Smoke test: extract one chunk per page from the first 10 PDFs found directly
# inside pdf_dir (non-recursive). The glob is materialised into a list so it can
# be sliced.
# Alternative that avoids building the list:
#     from itertools import islice
#     for pdf in islice(pdf_dir.glob("*.pdf"), 10): ...
for pdf in list(pdf_dir.glob("*.pdf"))[:10]:
    # Context manager closes the document (and its file handle) as soon as
    # its pages have been read.
    with fitz.open(pdf) as doc:
        for page_no, page in enumerate(doc, 1):    # page numbers are 1-based
            out.append({
                "chunk_id": str(uuid.uuid4()),
                "source"  : "pdf",
                "doc_path": str(pdf),
                "loc"     : {"page": page_no},
                "text"    : page.get_text()
            })

os.makedirs("../data/processed", exist_ok=True)
# Close the output file explicitly so the JSON is fully flushed to disk
# before the cell finishes.
with open("../data/processed/pdf_chunks.json", "w") as fh:
    json.dump(out, fh)

len(out)   # how many page-chunks you just saved


285

In [5]:
from pathlib import Path
# Recursive count of every PDF below the raw-data folder (sub-folders included).
pdf_total = sum(1 for _ in Path("../data/raw/pdf").rglob("*.pdf"))
print("PDF files found:", pdf_total)

PDF files found: 1076


In [7]:
from pathlib import Path
import fitz, tqdm

pdf_dir = Path("../data/raw/pdf")
bad, zero_pages = [], []   # (path, error message) pairs / paths of PDFs with no pages

# Materialise and sort the recursive listing so tqdm knows the total (real
# progress bar instead of a bare counter) and the reported samples are stable
# between runs.
pdf_paths = sorted(pdf_dir.rglob("*.pdf"))

for f in tqdm.tqdm(pdf_paths):
    try:
        # Context manager closes each document right after inspection, so the
        # scan does not keep a handle open for every file.
        with fitz.open(f) as doc:
            if len(doc) == 0:
                zero_pages.append(f)
    except Exception as e:
        # Anything fitz cannot open is recorded together with its error text.
        bad.append((f, str(e)))

print("Bad files   :", len(bad))
print("Zero-page   :", len(zero_pages))
for f, err in bad[:5]:     # show at most five failures
    print(" →", f.name, "|", err)


1076it [00:01, 1031.28it/s]

Bad files   : 0
Zero-page   : 9





In [9]:
# Full extraction pass: one chunk per page for every PDF under pdf_dir
# (recursive).
# Start from a fresh list. Appending to the `out` left over from the 10-file
# sample run would store those PDFs' pages twice, and re-running this cell
# would duplicate the whole corpus.
out = []
empty_pdfs = []

for pdf in pdf_dir.rglob("*.pdf"):
    try:
        # Context manager closes the document even when it is skipped or when
        # text extraction raises part-way through.
        with fitz.open(pdf) as doc:
            if len(doc) == 0:
                empty_pdfs.append(pdf)
                continue            # ← skip to next file
            for page_no, page in enumerate(doc, 1):    # page numbers are 1-based
                out.append({
                    "chunk_id": str(uuid.uuid4()),
                    "source"  : "pdf",
                    "doc_path": str(pdf),
                    "loc"     : {"page": page_no},
                    "text"    : page.get_text()
                })
    except Exception as e:
        # Also reached when a page fails during extraction; pages appended
        # before the failure are kept.
        print("⚠️  could not open", pdf.name, "→", e)

print("Skipped zero-page PDFs:", len(empty_pdfs))


MuPDF error: format error: No default Layer config

MuPDF error: format error: No default Layer config

MuPDF error: format error: No default Layer config

MuPDF error: format error: No default Layer config

MuPDF error: format error: object out of range (1167 0 R); xref size 1167

MuPDF error: format error: non-page object in page tree

MuPDF error: format error: object out of range (1167 0 R); xref size 1167

MuPDF error: format error: object out of range (1167 0 R); xref size 1167

MuPDF error: format error: object out of range (1167 0 R); xref size 1167

MuPDF error: format error: object out of range (1167 0 R); xref size 1167

MuPDF error: format error: object out of range (1167 0 R); xref size 1167

MuPDF error: format error: object out of range (1167 0 R); xref size 1167

MuPDF error: format error: object out of range (1167 0 R); xref size 1167

MuPDF error: format error: object out of range (1167 0 R); xref size 1167

MuPDF error: format error: object out of range (1167 0 R); x