In [None]:
!pip install -q --upgrade pip
!pip install -q docling tqdm qdrant-client
!pip install -q --upgrade flash-attn
!pip install -q pypdf

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m1.2/1.8 MB[0m [31m17.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m112.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m125.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m124.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m541.1/541.1 kB[0m [31m25

In [None]:
import logging
import os

import torch

# Route INFO-and-above records through a "LEVEL:logger: message" format.
# force=True replaces any handlers already attached to the root logger;
# without it basicConfig() is a silent no-op when handlers exist (common in
# hosted notebook kernels), and logging.info() output never appears.
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s:%(name)s: %(message)s",
    force=True,
)

# Report whether a CUDA device is visible and, if so, the name of device 0.
print("CUDA available:", torch.cuda.is_available(),
      "| GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

CUDA available: True | GPU: NVIDIA A100-SXM4-40GB


In [None]:
from pathlib import Path

from google.colab import drive

# Mount Google Drive so the shared-drive folders below are reachable.
drive.mount('/content/drive')

# PDF_DIR holds the input PDFs; OUT_DIR receives the converted output and is
# created (with any missing parents) if it does not exist yet.
PDF_DIR, OUT_DIR = (
    Path('/content/drive/Shareddrives/Colab/medicare-policy-docs'),
    Path('/content/drive/Shareddrives/Colab/docling-json'),
)
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Collect every *.pdf under PDF_DIR (recursively), in sorted path order.
pdf_files = list(PDF_DIR.glob('**/*.pdf'))
pdf_files.sort()
print(f"📄  Found {len(pdf_files)} PDFs")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📄  Found 12 PDFs


In [None]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions, AcceleratorDevice, PdfPipelineOptions, smoldocling_vlm_conversion_options
)
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode, TableItem, PictureItem

# VLM pipeline options: run SmolDocling on the GPU with flash-attention 2.
# The image settings live here because these are the options the VLM pipeline
# actually receives.
vlm_opts = VlmPipelineOptions(
    generate_page_images=True,           # needed for visual grounding later
    generate_picture_images=True,
    images_scale=2.0,                    # tweak if VRAM constrained
)
vlm_opts.accelerator_options.device = AcceleratorDevice.CUDA
vlm_opts.accelerator_options.cuda_use_flash_attention2 = True
vlm_opts.vlm_options = smoldocling_vlm_conversion_options

# Alias kept so any later cell that refers to `pdf_opts` keeps working.
# NOTE(review): the pipeline class and its options are selected on
# PdfFormatOption, not on PdfPipelineOptions; passing `pipeline_cls=` /
# `pipeline_options=` to PdfPipelineOptions (as before) does not select the
# VLM pipeline -- confirm against the installed Docling version.
pdf_opts = vlm_opts

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=vlm_opts,
        )
    }
)

In [None]:
from tqdm.auto import tqdm
import time

def _save_item_crops(document, target_dir, doc_stem):
  """Write a PNG crop for each table and picture item of ``document``.

  Crops are saved in ``target_dir`` as ``<doc_stem>-table-<n>.png`` and
  ``<doc_stem>-picture-<n>.png``, where ``n`` is the 1-based position of the
  item among the tables (resp. pictures) in iteration order. An item whose
  ``get_image`` call yields ``None`` keeps its number but is skipped with a
  warning, so file numbers always match document order.

  Returns:
    ``(table_count, picture_count)`` -- how many items of each kind were seen.
  """
  counts = {"table": 0, "picture": 0}
  for element, _level in document.iterate_items():
    if isinstance(element, TableItem):
      kind = "table"
    elif isinstance(element, PictureItem):
      kind = "picture"
    else:
      continue

    counts[kind] += 1
    crop = element.get_image(document)
    if crop is None:
      logging.warning("%s: no image for %s %d; skipped", doc_stem, kind, counts[kind])
      continue
    crop.save(target_dir / f"{doc_stem}-{kind}-{counts[kind]}.png", "PNG")
  return counts["table"], counts["picture"]


def convert_batch(files):
  """Convert each PDF in ``files`` and export its JSON, page images and crops.

  For every path, the module-level ``converter`` is run and the result is
  written under ``OUT_DIR``:

  * ``OUT_DIR/<stem>.json``            -- document JSON with referenced images
  * ``OUT_DIR/<stem>/<stem>-<page>.png`` -- one PNG per page
  * ``OUT_DIR/<stem>/<stem>-table-<n>.png`` / ``-picture-<n>.png`` -- crops

  Processing is best-effort: a failure on one PDF is logged (with traceback)
  and the batch continues with the next file.

  Args:
    files: iterable of ``pathlib.Path`` objects pointing at PDF files.

  Returns:
    None.
  """
  for pdf_path in tqdm(files, desc="Converting", unit="pdf"):
    tqdm.write(f"→ {pdf_path.name}")
    try:
      t0 = time.perf_counter()
      conv_res = converter.convert(pdf_path)
      document = conv_res.document

      # ---------- create output folders ----------
      doc_stem = conv_res.input.file.stem
      target_dir = OUT_DIR / doc_stem          # e.g., .../Aetna_EOC
      target_dir.mkdir(parents=True, exist_ok=True)

      # ---------- save lean JSON ----------
      # Build the name from the stem directly: Path.with_suffix() would replace
      # anything after the last dot in the stem (e.g. "plan.v2" -> "plan.json").
      document.save_as_json(
        OUT_DIR / f"{doc_stem}.json",          # .../Aetna_EOC.json
        image_mode=ImageRefMode.REFERENCED,
      )

      # ---------- save page images ----------
      for page in document.pages.values():
        page_image = page.image.pil_image if page.image is not None else None
        if page_image is None:
          # Skip rather than abort the whole document on one missing image.
          logging.warning("%s: no image for page %s; skipped", pdf_path.name, page.page_no)
          continue
        page_image.save(target_dir / f"{doc_stem}-{page.page_no}.png", "PNG")

      # ---------- save figure/table crops ----------
      table_ct, picture_ct = _save_item_crops(document, target_dir, doc_stem)

      # One summary line per PDF (previously emitted once per document item).
      logging.info(
        "✓ %s in %.1f s  (pages=%d  tables=%d  pictures=%d)",
        pdf_path.name,
        time.perf_counter() - t0,
        len(document.pages),
        table_ct,
        picture_ct,
      )

    except Exception as err:
      # Deliberate best-effort: record the failure with its traceback and move on.
      logging.error("❌ %s – %s", pdf_path.name, err, exc_info=True)

convert_batch(pdf_files)

Converting:   0%|          | 0/12 [00:00<?, ?pdf/s]

→ Aetna EOC.pdf
→ Aetna SOB.pdf
→ Cigna EOC.pdf
→ Cigna SOB.pdf
→ Humana EOC.pdf
→ Humana SOB.pdf
→ Regence EOC.pdf
→ Regence SOB.pdf
→ UHC 1 EOC.pdf
→ UHC 1 SOB.pdf
→ UHC 2 EOC.pdf
→ UHC 2 SOB.pdf
