In [1]:
import fitz  # PyMuPDF
from pathlib import Path

# -------- CONFIG --------
PDF_PATH = "data/raw_pdfs/bitcoin.pdf"  # üëà change this
MAX_CHARS = 500  # limit text output per block for readability
# ------------------------

def inspect_pdf_structure(pdf_path):
    doc = fitz.open(pdf_path)

    for page_number, page in enumerate(doc, start=1):
        print(f"\n{'='*80}")
        print(f"PAGE {page_number}")
        print(f"{'='*80}")

        blocks = page.get_text("dict")["blocks"]

        # Sort blocks top ‚Üí bottom, then left ‚Üí right
        blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))

        for i, block in enumerate(blocks, start=1):
            btype = block["type"]
            x0, y0, x1, y1 = block["bbox"]

            print(f"\n--- Block {i} ---")
            print(f"Type: {'TEXT' if btype == 0 else 'IMAGE' if btype == 1 else 'OTHER'}")
            print(f"BBox: x0={x0:.1f}, y0={y0:.1f}, x1={x1:.1f}, y1={y1:.1f}")

            if btype == 0:  # TEXT
                text = ""
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        text += span.get("text", "")

                text = text.strip()
                if text:
                    print("Content:")
                    print(text[:MAX_CHARS] + ("..." if len(text) > MAX_CHARS else ""))
                else:
                    print("Content: [EMPTY TEXT BLOCK]")

            elif btype == 1:  # IMAGE
                img = block.get("image", {})
                print("Image info:")
                print(f"  width:  {img.get('width')}")
                print(f"  height: {img.get('height')}")
                print(f"  colorspace: {img.get('colorspace')}")
                print(f"  bpc: {img.get('bpc')}")

        print("\n")

    doc.close()

# -------- RUN --------
inspect_pdf_structure(PDF_PATH)



PAGE 1

--- Block 1 ---
Type: TEXT
BBox: x0=134.3, y0=94.1, x1=477.8, y1=110.6
Content:
Bitcoin: A Peer-to-Peer Electronic Cash System

--- Block 2 ---
Type: TEXT
BBox: x0=266.0, y0=134.4, x1=346.3, y1=157.2
Content:
Satoshi Nakamotosatoshin@gmx.com

--- Block 3 ---
Type: TEXT
BBox: x0=272.3, y0=157.7, x1=339.9, y1=168.9
Content:
www.bitcoin.org

--- Block 4 ---
Type: TEXT
BBox: x0=146.3, y0=204.7, x1=464.0, y1=364.5
Content:
Abstract.  A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution.  Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-ba...

--- Block 5 ---
Type: TEXT
BBox: x0=108.1, y0=385.9, x1=210.2, y