In [2]:
import fitz  # PyMuPDF
import json

def int_to_rgb(color_int):
    """Split a packed ``0xRRGGBB`` integer into ``[red, green, blue]``.

    Each channel is the corresponding 8-bit field of ``color_int``; any bits
    above the low 24 are ignored.

    Args:
        color_int: Integer whose low 24 bits hold the color.

    Returns:
        A three-element list ``[r, g, b]`` with each value in ``0..255``.
    """
    # Red sits in bits 16-23, green in bits 8-15, blue in bits 0-7.
    return [(color_int >> shift) & 0xFF for shift in (16, 8, 0)]

def extract_pdf_info(pdf_path, output_json_path=None):
    """Collect every text span of a PDF with its position and font details.

    Each page is read with ``page.get_text("dict")`` and every span found
    under ``blocks -> lines -> spans`` becomes one record shaped like::

        {
            "page": <1-based page number>,
            "bbox": [x0, y0, x1, y1],
            "text": <span text>,
            "font": {"name": ..., "size": ..., "color": [r, g, b]},
        }

    Blocks without a ``"lines"`` key and lines without a ``"spans"`` key
    contribute no records.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.
        output_json_path: Optional destination. When truthy, the records are
            also written there as UTF-8 JSON (2-space indent, non-ASCII
            characters kept unescaped).

    Returns:
        The list of span records, in the order they were encountered.
    """
    all_data = []

    # Open the document as a context manager so it is closed (and the
    # underlying file released) even if extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                # .get(...) with an empty default skips blocks that carry no
                # text lines instead of raising KeyError.
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        entry = {
                            "page": page_num,
                            "bbox": list(span["bbox"]),  # [x0, y0, x1, y1]
                            "text": span["text"],
                            "font": {
                                "name": span["font"],
                                "size": span["size"],
                                # The span color is a single packed integer;
                                # expand it to [r, g, b] for readable JSON.
                                "color": int_to_rgb(span["color"]),
                            },
                        }
                        all_data.append(entry)

    if output_json_path:
        with open(output_json_path, "w", encoding="utf-8") as f:
            json.dump(all_data, f, indent=2, ensure_ascii=False)

    return all_data

# Example usage: extract all spans from Test.pdf and also save them as JSON.
pdf_file = "Test.pdf"
output_json = "Test_info.json"
result = extract_pdf_info(pdf_file, output_json_path=output_json)

# Show the first five records, each pretty-printed as its own JSON object.
for item in result[:5]:
    pretty = json.dumps(item, indent=2)
    print(pretty)


{
  "page": 1,
  "bbox": [
    56.63999938964844,
    12.161123275756836,
    59.00856018066406,
    22.223243713378906
  ],
  "text": " ",
  "font": {
    "name": "HelveticaNeueLT-Light",
    "size": 8.520000457763672,
    "color": [
      0,
      0,
      0
    ]
  }
}
{
  "page": 1,
  "bbox": [
    56.63999938964844,
    792.15869140625,
    59.00856018066406,
    802.2208251953125
  ],
  "text": " ",
  "font": {
    "name": "HelveticaNeueLT-Light",
    "size": 8.520000457763672,
    "color": [
      0,
      0,
      0
    ]
  }
}
{
  "page": 1,
  "bbox": [
    0.0,
    -0.3703346252441406,
    1.934880256652832,
    7.849425315856934
  ],
  "text": " ",
  "font": {
    "name": "HelveticaNeueLT-Light",
    "size": 6.960000038146973,
    "color": [
      0,
      0,
      0
    ]
  }
}
{
  "page": 1,
  "bbox": [
    595.2000122070312,
    -0.13034439086914062,
    597.1348876953125,
    8.089415550231934
  ],
  "text": " ",
  "font": {
    "name": "HelveticaNeueLT-Light",
    "size