## ⚙️ Setup

In [None]:
# Install uv: download the installer script from astral.sh and pipe it to `sh`
!curl -LsSf https://astral.sh/uv/install.sh | sh

# Install deps: docling and pymupdf, via uv's pip interface with the `--system` flag
# NOTE(review): assumes the freshly installed `uv` binary is already on PATH for this kernel — confirm
!uv pip install --system docling pymupdf

## 📚 Data

In [16]:
import re
from pathlib import Path
from utils import split_markdown_by_spans

DATA_DIR = Path("/datasets/client-data-us/AAA/Redaction")


def build_redaction_map(data_dir: Path) -> dict:
    """Pair every unredacted file under ``data_dir`` with its redacted twin.

    Each immediate sub-directory of ``data_dir`` is searched for a
    ``Redacted`` and an ``Unredacted`` folder. An unredacted file
    ``<stem><suffix>`` is paired with the redacted file whose name is
    ``<stem>-redacted<suffix>``; unredacted files without such a twin are
    left out.

    Args:
        data_dir (Path): Directory containing one sub-directory per document set.

    Returns:
        dict: Mapping of unredacted file path -> redacted file path.
    """
    mapping = {}
    for doc_dir in data_dir.glob("*"):
        # Only directories can hold the Redacted/Unredacted folders.
        if not doc_dir.is_dir():
            continue
        print(f"📁 Found: {doc_dir.name}")

        # Index redacted files by name so the match is looked up by name
        # rather than by list position; the two directory listings are not
        # guaranteed to be aligned or to have the same length.
        redacted_by_name = {}
        for rf in (doc_dir / "Redacted").rglob("*.*"):
            # Keep the first file seen if the same name occurs more than once.
            redacted_by_name.setdefault(rf.name, rf)

        for uf in (doc_dir / "Unredacted").rglob("*.*"):
            # Build the expected name from stem + suffix so that only the
            # final extension is touched, even if the suffix text also
            # appears earlier in the file name.
            twin = redacted_by_name.get(f"{uf.stem}-redacted{uf.suffix}")
            if twin is not None:
                mapping[uf] = twin
    return mapping


# Map between unredacted and redacted files
u2r = build_redaction_map(DATA_DIR)

# Reverse lookup: redacted file -> unredacted file
r2u = {v: k for k, v in u2r.items()}
print(f"📚 Total items: {len(r2u)}")

📁 Found: 012100046665
📁 Found: 012200023252
📁 Found: 012300021267
📁 Found: 012300047931
📁 Found: 012300051365
📁 Found: 022200022285
📚 Total items: 66


## 📝 Convert to Markdown

In [None]:
from enum import Enum


class ExportFormat(str, Enum):
    """Output formats a converted document can be exported to.

    The ``str`` mix-in makes each member compare equal to its raw value
    (``"md"`` / ``"html"``), so members and plain strings are interchangeable
    in equality checks.
    """

    # Members are deliberately unannotated: an explicit annotation on an enum
    # member is misleading to type checkers, which infer the member type.
    Markdown = "md"
    HTML = "html"

    
def docling_convert(file_path: str, export_format: str = ExportFormat.Markdown) -> str:
    """Convert a PDF file to text using Docling default conversion.

    Args:
        file_path (str): Path to the PDF file.
        export_format (ExportFormat | str): Target format, given either as an
            ``ExportFormat`` member or as its raw value (``"md"`` / ``"html"``).

    Returns:
        str: The converted document exported as Markdown or HTML.

    Raises:
        ValueError: If ``export_format`` is not a supported format.
    """
    # Normalise first: the signature accepts plain strings, but the code below
    # needs an ExportFormat member (it reads `.name`). Doing this up front also
    # rejects an unsupported format before the conversion is run.
    try:
        export_format = ExportFormat(export_format)
    except ValueError:
        raise ValueError(f"Unsupported format: {export_format}") from None

    # Docling is imported inside the function, so it is only needed when a
    # conversion is actually requested.
    from docling.document_converter import DocumentConverter

    # Use the default Docling conversion service
    # NOTE(review): `logger` is not defined in the visible cells — confirm it
    # is set up elsewhere in the notebook before this cell runs.
    logger.info(f"Using docling to convert {file_path} ➡️ {export_format.name}")
    converter = DocumentConverter()
    result = converter.convert(file_path)

    if export_format is ExportFormat.Markdown:
        return result.document.export_to_markdown()
    if export_format is ExportFormat.HTML:
        return result.document.export_to_html()
    # Reached only if a new ExportFormat member is added without a branch here.
    raise ValueError(f"Unsupported format: {export_format}")