## ⚙️ Setup

In [14]:
# Install uv
!curl -LsSf https://astral.sh/uv/install.sh | sh

# Install deps
!uv pip install docling pymupdf vllm transformers
# !uv pip install docling pymupdf vllm transformers
# !uv pip install 'pydantic==1.10.14' 'protobuf==3.20.3' 'accelerate>=0.26.0' 
# !uv pip install flash-attn #--no-build-isolation

downloading uv 0.6.12 x86_64-unknown-linux-gnu
no checksums to verify
installing to /root/.local/bin
  uv
  uvx
everything's installed!
[2mUsing Python 3.11.7 environment at: /usr[0m
[2K[2mResolved [1m171 packages[0m [2min 121ms[0m[0m                                       [0m
[2K[2mInstalled [1m6 packages[0m [2min 81ms[0m[0m                                [0m
 [32m+[39m [1mdocling[0m[2m==2.28.4[0m
 [32m+[39m [1mprotobuf[0m[2m==6.30.2[0m
 [32m+[39m [1mpydantic[0m[2m==2.11.2[0m
 [32m+[39m [1mpymupdf[0m[2m==1.25.5[0m
 [32m+[39m [1mtransformers[0m[2m==4.51.0[0m
 [32m+[39m [1mvllm[0m[2m==0.8.3[0m


## 📚 AAA Data

In [1]:
import re
from pathlib import Path
from utils import split_markdown_by_spans

DATA_DIR = Path("/datasets/client-data-us/AAA/Redaction")


def pair_case_files(case_dir: Path) -> dict[Path, Path]:
    """Pair every unredacted file of one case with its redacted counterpart.

    A file ``<stem><suffix>`` found anywhere under ``case_dir / "Unredacted"``
    is paired with the file named ``<stem>-redacted<suffix>`` found anywhere
    under ``case_dir / "Redacted"``. Markdown files are ignored on both sides,
    and unredacted files without a redacted counterpart are left out.

    Args:
        case_dir (Path): Directory of a single case.

    Returns:
        dict[Path, Path]: Unredacted path -> redacted path.
    """
    # Index the redacted files by name. Looking the counterpart up by name
    # (instead of by list position) keeps the pairing correct even when the
    # two directory listings come back in a different order or length.
    redacted_by_name = {
        f.name: f
        for f in (case_dir / "Redacted").rglob("*.*")
        if f.is_file() and f.suffix != ".md"
    }

    pairs = {}
    for uf in (case_dir / "Unredacted").rglob("*.*"):
        if not uf.is_file() or uf.suffix == ".md":
            continue
        # Build the expected name from stem + suffix so only the final
        # extension is touched (str.replace would hit every occurrence).
        rf = redacted_by_name.get(f"{uf.stem}-redacted{uf.suffix}")
        if rf is not None:
            pairs[uf] = rf
    return pairs


# Map between unredacted and redacted files
u2r = {}
for doc_dir in DATA_DIR.glob("*/"):
    print(f"📁 Found: {doc_dir.name}")
    u2r.update(pair_case_files(doc_dir))

r2u = {v: k for k, v in u2r.items()}  # redacted to unredacted
print(f"📚 Total items: {len(r2u)}")

📁 Found: 012100046665
📁 Found: 012200023252
📁 Found: 012300021267
📁 Found: 012300047931
📁 Found: 012300051365
📁 Found: 022200022285
📚 Total items: 66


In [2]:
from collections import defaultdict

# Bucket the unredacted documents by case: the case directory is the
# great-grandparent of each file, and its name is used as the key.
case_files = defaultdict(list)
for fpath in u2r.keys():
    case_dir = fpath.parent.parent.parent
    case_files[case_dir.name].append(fpath)

case_files.keys()

dict_keys(['012100046665', '012200023252', '012300021267', '012300047931', '012300051365', '022200022285'])

In [3]:
import fitz


def pdf_page_count(pdf_path: Path | str) -> int:
    """Return the number of pages in the PDF at ``pdf_path``.

    The document is opened in a ``with`` block so it is closed again as soon
    as the page count has been read, instead of staying open for every file
    that gets counted.

    Args:
        pdf_path (Path | str): Path to the PDF file.

    Returns:
        int: The document's ``page_count``.
    """
    with fitz.open(pdf_path) as pdf:
        return pdf.page_count


# Get a count of pages for all the PDF documents.
# The suffix is compared case-insensitively so ".PDF" files are counted too.
page_counts = {}
for fp in u2r:
    if fp.suffix.lower() == ".pdf":
        page_counts[fp] = pdf_page_count(fp)

# Document and page count per case (non-PDF documents contribute 0 pages)
for case, files in case_files.items():
    total_pages = sum(page_counts.get(f, 0) for f in files)
    print(f"📂 Case '{case}': documents: {len(files)} | pages: {total_pages}")

# Overall totals. The average is guarded so that a run which finds no PDFs
# reports 0.00 instead of failing with ZeroDivisionError.
total_page_count = sum(page_counts.values())
page_avg_count = total_page_count / len(page_counts) if page_counts else 0.0
print(f"🪣 Total pages: {total_page_count}")
print(f"\n📏 Average pages per doc: {page_avg_count:.2f}")

📂 Case '012100046665': documents: 34 | pages: 91
📂 Case '012200023252': documents: 2 | pages: 5
📂 Case '012300021267': documents: 8 | pages: 269
📂 Case '012300047931': documents: 16 | pages: 402
📂 Case '012300051365': documents: 3 | pages: 17
📂 Case '022200022285': documents: 3 | pages: 15
🪣 Total pages: 799

📏 Average pages per doc: 12.106060606060606


## 📝 Convert to Markdown

Here we use `docling` to produce markdown output files. 

> 💡 We can either use the `docling` CLI or run the conversion from Python.

In [None]:
from enum import Enum
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
)


class ExportFormat(str, Enum):
    """Export formats accepted by ``docling_convert``.

    Each member's value is the short format string ("md" / "html"). Because
    the class also derives from ``str``, members compare equal to those plain
    strings.
    """

    # Members are intentionally not annotated: an explicit annotation on an
    # enum member is flagged by type checkers and adds nothing at runtime.
    Markdown = "md"
    HTML = "html"
    
def build_vllm_pipeline_options():
    """Assemble the ``VlmPipelineOptions`` used for the PDF conversion.

    The accelerator device is switched to CUDA when torch reports that CUDA
    is available. The VLM options are then chosen from the two SmolDocling
    presets: the MLX one when ``platform.processor()`` contains "arm", the
    default one otherwise.

    Returns:
        The configured ``VlmPipelineOptions`` instance.
    """
    import platform
    import torch

    options = VlmPipelineOptions()

    # Run on the GPU whenever CUDA is available.
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        print("🚀 Using CUDA for GPU acceleration.")
        options.accelerator_options.device = AcceleratorDevice.CUDA
        # flash_attention_2 is currently left switched off:
        # options.accelerator_options.cuda_use_flash_attention2 = True

    # Pick the VLM preset based on the reported processor string.
    runs_on_arm = "arm" in platform.processor()
    if not runs_on_arm:
        # Everything that is not ARM gets the default SmolDocling-256M preset.
        print("🖥️ Using default SmolDocling-256M model for VLM conversion.")
        options.vlm_options = smoldocling_vlm_conversion_options
        return options

    # ARM (Apple Silicon): SmolDocling-256M through the MLX preset.
    print("🍏 Using Apple Silicon implementation for SmolDocling-256M via MLX.")
    options.vlm_options = smoldocling_vlm_mlx_conversion_options
    return options


def docling_convert(
    converter,
    file_path: Path | str,
    export_format: ExportFormat = ExportFormat.Markdown,
) -> str:
    """Run ``converter`` on one file and return the exported text.

    Args:
        converter: Object whose ``convert(path)`` result exposes a
            ``document`` with ``export_to_markdown()`` and
            ``export_to_html()``.
        file_path (Path | str): Path to the file to convert.
        export_format (ExportFormat): Export format (Markdown or HTML).

    Returns:
        str: The document exported in the requested format.

    Raises:
        ValueError: If ``export_format`` is neither Markdown nor HTML. The
            check happens after the conversion itself has already run.
    """
    source = Path(file_path)
    print(f"🪄 {source.name} ➡️ {export_format.name}")

    # Convert first; the export format only decides which exporter is called.
    result = converter.convert(source)

    if export_format == ExportFormat.Markdown:
        return result.document.export_to_markdown()
    if export_format == ExportFormat.HTML:
        return result.document.export_to_html()
    raise ValueError(f"Unsupported format: {export_format}")
        
        
def convert_doc(doc: Path | str):
    for redacted_file in r2u:
        doc = Path(doc)
        outfile = rf.with_suffix(".md")
        if outfile.exists():
            print(f"🦘 Skipping {rf.name}")
            continue

        md_text = docling_convert(converter, doc)
        with outfile.open("w") as f:
            f.write(md_text)

            
print("🐣 Initializing converter...")
# PDFs are routed through the VLM pipeline, configured by
# build_vllm_pipeline_options().
pdf_format_option = PdfFormatOption(
    pipeline_cls=VlmPipeline,
    pipeline_options=build_vllm_pipeline_options(),
)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

# Convert redacted files to markdown
for redacted_file in r2u.keys():
    convert_doc(Path(redacted_file))

# Convert unredacted files to markdown
for unredacted_file in u2r.keys():
    convert_doc(Path(unredacted_file))

## 🪣 Tokenization

In [8]:
# tiktoken provides the tokenizer used for the token counts in this section.
!uv pip install -q --system tiktoken

In [4]:
# Keep only the markdown of unredacted documents. Redacted outputs are named
# "<stem>-redacted.md", so match on that stem suffix: a bare substring test
# for "redacted" would also drop any file whose name contains "Unredacted".
md_docs = [mf for mf in DATA_DIR.rglob("*.md") if not mf.stem.endswith("-redacted")]
print(f"Total markdown files: {len(md_docs)}")

Total markdown files: 43


In [18]:
import tiktoken

# This is not the LLaMA 3 tokenizer, but it gives a rough idea of the counts.
enc = tiktoken.get_encoding("cl100k_base")


total_toks = 0
for doc in md_docs:
    # read_text opens and closes the file itself, so no handle is left open.
    text = doc.read_text(encoding="utf-8")
    # disallowed_special=() makes encode treat special-token text such as
    # "<|endoftext|>" as ordinary text instead of raising ValueError.
    tokens = enc.encode(text, disallowed_special=())
    total_toks += len(tokens)
    print(f"{doc.name} 👉 tokens: {len(tokens)}")

# Guard the mean so an empty file list does not raise ZeroDivisionError.
if md_docs:
    print(f"Mean tok/doc: {total_toks/len(md_docs)}")
else:
    print("Mean tok/doc: n/a (no markdown files found)")

2020-07-02 AAA Clause Pg.5 Sect. 19-redacted.md 👉 tokens: 5364
2021-07-08 Claimant's Demand for Arbitration-redacted.md 👉 tokens: 736
2021-08-02 Respondent's Answering Statement and Counterclaim-redacted.md 👉 tokens: 527
2021-08-02 Respondent's Supplemental Attachment to Answering Statement and Counterclaim-redacted.md 👉 tokens: 413
2021-08-03 Claimant's Answer to Counterclaim-redacted.md 👉 tokens: 224
2021-09-22 Respondent's Counterclaim Withdrawal-redacted.md 👉 tokens: 1094
2021-09-24 Claimant's Amended Claim Amount plus Interest and Penalty-redacted.md 👉 tokens: 328
2021-09-24 Claimant's Amended Claim Amount plus Interest-redacted.md 👉 tokens: 327
2021-12-07 Arbitrator's Final Award-redacted.md 👉 tokens: 1095
012100046665_6365_28557680_Claimant Case Stated-redacted.md 👉 tokens: 498
012100046665_6366_28557682_C-28 Claimant job outline-redacted.md 👉 tokens: 2170
012100046665_6368_28557688_C-26 Claimant bio-redacted.md 👉 tokens: 665
012100046665_6369_28557690_C-25a, 25b 4.11.21 invoice