## ⚙️ Setup

In [3]:
# Install uv by piping the installer script served from astral.sh into `sh`
!curl -LsSf https://astral.sh/uv/install.sh | sh

# Install deps with uv; `--system` targets the system Python environment
# rather than a virtual environment
!uv pip install --system docling pymupdf vllm 'accelerate>=0.26.0' 
# flash-attn is installed in its own step with build isolation disabled
# NOTE(review): presumably its build needs packages installed by the previous command — confirm
!uv pip install --system flash-attn --no-build-isolation

downloading uv 0.6.11 x86_64-unknown-linux-gnu
no checksums to verify
installing to /root/.local/bin
  uv
  uvx
everything's installed!
[2mUsing Python 3.11.7 environment at: /usr[0m
[2K[2mResolved [1m169 packages[0m [2min 291ms[0m[0m                                       [0m
[2K[2mPrepared [1m1 package[0m [2min 23ms[0m[0m                                               
[2mUninstalled [1m1 package[0m [2min 9ms[0m[0m
[2K[2mInstalled [1m1 package[0m [2min 5ms[0m[0m                                  [0m
 [31m-[39m [1maccelerate[0m[2m==0.24.1[0m
 [32m+[39m [1maccelerate[0m[2m==1.6.0[0m


## 📚 Data

In [2]:
import re
from pathlib import Path
from utils import split_markdown_by_spans

DATA_DIR = Path("/datasets/client-data-us/AAA/Redaction")


def pair_redacted_files(unredacted_files, redacted_files):
    """Pair each unredacted file with its redacted counterpart, matched by name.

    The counterpart of ``<stem><suffix>`` is the redacted file named
    ``<stem>-redacted<suffix>``. Matching is done by file name only, so the two
    listings may be in any order and may have different lengths.

    Args:
        unredacted_files: Iterable of ``Path`` objects for the original files.
        redacted_files: Iterable of ``Path`` objects for the redacted files.

    Returns:
        dict: ``{unredacted_path: redacted_path}`` for every unredacted file
        that has a counterpart; files without one are left out.
    """
    redacted_by_name = {}
    for redacted_path in redacted_files:
        # If the same name occurs in several sub-folders, keep the first one seen
        redacted_by_name.setdefault(redacted_path.name, redacted_path)

    pairs = {}
    for unredacted_path in unredacted_files:
        # Build the name from stem + suffix so only the final extension is
        # touched, even if the same text appears earlier in the name
        expected_name = f"{unredacted_path.stem}-redacted{unredacted_path.suffix}"
        counterpart = redacted_by_name.get(expected_name)
        if counterpart is not None:
            pairs[unredacted_path] = counterpart
    return pairs


# Map between unredacted and redacted files
u2r = {}
for doc_dir in DATA_DIR.glob("*/"):
    print(f"📁 Found: {doc_dir.name}")
    redacted_files = list((doc_dir / "Redacted").rglob("*.*"))
    unredacted_files = list((doc_dir / "Unredacted").rglob("*.*"))
    # Look the counterpart up by name: the two listings are independent, so a
    # shared positional index does not identify the matching file
    u2r.update(pair_redacted_files(unredacted_files, redacted_files))

# Reverse lookup: redacted file -> unredacted file
r2u = {v: k for k, v in u2r.items()}
print(f"📚 Total items: {len(r2u)}")

📁 Found: 012100046665
📁 Found: 012200023252
📁 Found: 012300021267
📁 Found: 012300047931
📁 Found: 012300051365
📁 Found: 022200022285
📚 Total items: 66


## 📝 Convert to Markdown

Here we use `docling` to produce markdown output files. 

> 💡 Docling can be run either from its CLI or from Python; this notebook uses the Python API.

In [None]:
from enum import Enum
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
)


class ExportFormat(str, Enum):
    """Output formats that `docling_convert` can export to.

    Members are also `str` instances, so each compares equal to its value
    (for example ``ExportFormat.Markdown == "md"``).
    """

    # Members are left unannotated on purpose: an explicit `: str` annotation
    # misstates the member's type, which is the enum class itself.
    Markdown = "md"
    HTML = "html"

    
def build_vllm_pipeline_options():
    """Build the `VlmPipelineOptions` used by the PDF converter.

    The accelerator device is set to CUDA when `torch` reports a CUDA device,
    and the SmolDocling conversion options are selected according to the
    processor string reported by `platform.processor()`.

    Returns:
        VlmPipelineOptions: The configured pipeline options.
    """
    import platform
    import torch

    options = VlmPipelineOptions()

    # Run on the GPU when one is available. Flash-attention 2 is not enabled
    # here (accelerator_options.cuda_use_flash_attention2 is left untouched).
    if torch.cuda.is_available():
        options.accelerator_options.device = AcceleratorDevice.CUDA

    # Pick a VLM model: the MLX build of SmolDocling-256M on ARM processors
    # (Apple Silicon friendly), the regular SmolDocling-256M everywhere else.
    on_arm = "arm" in platform.processor()
    options.vlm_options = (
        smoldocling_vlm_mlx_conversion_options
        if on_arm
        else smoldocling_vlm_conversion_options
    )

    return options


def docling_convert(
    converter,
    file_path: str | Path,
    export_format: ExportFormat = ExportFormat.Markdown,
) -> str:
    """Convert a PDF file with the given Docling converter and export it as text.

    Args:
        converter: Object whose ``convert(file_path)`` result exposes a
            ``document`` providing ``export_to_markdown()`` and
            ``export_to_html()`` (the notebook passes a ``DocumentConverter``).
        file_path (str | Path): Path to the PDF file
        export_format (ExportFormat): Export format (Markdown or HTML). The raw
            values ``"md"`` and ``"html"`` are accepted as well.

    Returns:
        str: The exported document text.

    Raises:
        ValueError: If ``export_format`` is not a supported format. This is
            raised before any conversion work is started.
    """
    # Normalise first so raw string values work and `.name` below is always
    # available; an unknown value is reported with the function's own message.
    try:
        export_format = ExportFormat(export_format)
    except ValueError:
        raise ValueError(f"Unsupported format: {export_format}") from None

    # Resolve the exporter before converting, so an unsupported format fails
    # immediately instead of after the document has been processed.
    exporter_names = {
        ExportFormat.Markdown: "export_to_markdown",
        ExportFormat.HTML: "export_to_html",
    }
    if export_format not in exporter_names:
        raise ValueError(f"Unsupported format: {export_format}")

    print(f"Using docling to convert {file_path} ➡️ {export_format.name}")
    result = converter.convert(file_path)
    return getattr(result.document, exporter_names[export_format])()
        


# Build a single converter and reuse it for every document in the loop below
converter = DocumentConverter(format_options={
    InputFormat.PDF: PdfFormatOption(
        pipeline_cls=VlmPipeline,
        pipeline_options=build_vllm_pipeline_options(),
    ),
})

# Convert every redacted file to Markdown, written next to the source file
for redacted_file in r2u:
    rf = Path(redacted_file)
    outfile = rf.with_suffix(".md")
    # An existing output means this file was already converted; skipping it
    # lets the cell be re-run without redoing finished conversions
    if outfile.exists():
        print(f"🦘 Skipping {rf.name}")
        continue

    print(f"⚙️ Processing: {rf.name} --> {rf.parent}")
    md_text = docling_convert(converter, rf)
    # Write with an explicit encoding so the result does not depend on the
    # locale's default (which may be unable to encode non-ASCII characters)
    outfile.write_text(md_text, encoding="utf-8")

🦘 Skipping 2020-07-02 AAA Clause Pg.5 Sect. 19-redacted.pdf
🦘 Skipping 2021-07-08 Claimant's Demand for Arbitration-redacted.pdf
🦘 Skipping 2021-08-02 Respondent's Answering Statement and Counterclaim-redacted.pdf
🦘 Skipping 2021-08-02 Respondent's Supplemental Attachment to Answering Statement and Counterclaim-redacted.pdf
🦘 Skipping 2021-08-03 Claimant's Answer to Counterclaim-redacted.pdf
⚙️ Processing: 2021-09-22 Respondent's Counterclaim Withdrawal-redacted.pdf --> /datasets/client-data-us/AAA/Redaction/012100046665/Redacted/Case Docs - Production - Redacted
Using docling to convert /datasets/client-data-us/AAA/Redaction/012100046665/Redacted/Case Docs - Production - Redacted/2021-09-22 Respondent's Counterclaim Withdrawal-redacted.pdf ➡️ Markdown


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


⚙️ Processing: 2021-09-24 Claimant's Amended Claim Amount plus Interest and Penalty-redacted.pdf --> /datasets/client-data-us/AAA/Redaction/012100046665/Redacted/Case Docs - Production - Redacted
Using docling to convert /datasets/client-data-us/AAA/Redaction/012100046665/Redacted/Case Docs - Production - Redacted/2021-09-24 Claimant's Amended Claim Amount plus Interest and Penalty-redacted.pdf ➡️ Markdown
⚙️ Processing: 2021-09-24 Claimant's Amended Claim Amount plus Interest-redacted.pdf --> /datasets/client-data-us/AAA/Redaction/012100046665/Redacted/Case Docs - Production - Redacted
Using docling to convert /datasets/client-data-us/AAA/Redaction/012100046665/Redacted/Case Docs - Production - Redacted/2021-09-24 Claimant's Amended Claim Amount plus Interest-redacted.pdf ➡️ Markdown
⚙️ Processing: 2021-12-07 Arbitrator's Final Award-redacted.pdf --> /datasets/client-data-us/AAA/Redaction/012100046665/Redacted/Case Docs - Production - Redacted
Using docling to convert /datasets/client

## 🪣 Tokenization

In [8]:
# Install tiktoken (imported below to tokenize the Markdown files) into the system Python environment
!uv pip install -q --system tiktoken

In [4]:
# Collect every Markdown file found anywhere below DATA_DIR
md_docs = [md_path for md_path in DATA_DIR.rglob("*.md")]
print(f"Total markdown files: {len(md_docs)}")

Total markdown files: 43


In [14]:
import tiktoken

# Load tiktoken's "cl100k_base" encoding.
# NOTE(review): this is one of tiktoken's OpenAI encodings, not the LLaMA 3
# tokenizer — token counts for a LLaMA 3 model will only be approximate.
# Confirm which tokenizer is actually intended.
enc = tiktoken.get_encoding("cl100k_base")

In [18]:
# Tokenize every Markdown document, printing a per-document token count and
# finally the mean number of tokens per document.
total_toks = 0
for doc in md_docs:
    # To restrict the count to unredacted documents, skip paths whose name
    # contains "redacted" at this point.

    # read_text opens and closes the file itself, and the explicit encoding
    # keeps decoding independent of the locale's default
    text = doc.read_text(encoding="utf-8")
    tokens = enc.encode(text)
    total_toks += len(tokens)
    print(f"{doc.name} 👉 tokens: {len(tokens)}")

# The mean is undefined when no files were found; avoid a ZeroDivisionError
if md_docs:
    print(f"Mean tok/doc: {total_toks/len(md_docs)}")
else:
    print("No markdown files found; mean tok/doc is undefined")

2020-07-02 AAA Clause Pg.5 Sect. 19-redacted.md 👉 tokens: 5364
2021-07-08 Claimant's Demand for Arbitration-redacted.md 👉 tokens: 736
2021-08-02 Respondent's Answering Statement and Counterclaim-redacted.md 👉 tokens: 527
2021-08-02 Respondent's Supplemental Attachment to Answering Statement and Counterclaim-redacted.md 👉 tokens: 413
2021-08-03 Claimant's Answer to Counterclaim-redacted.md 👉 tokens: 224
2021-09-22 Respondent's Counterclaim Withdrawal-redacted.md 👉 tokens: 1094
2021-09-24 Claimant's Amended Claim Amount plus Interest and Penalty-redacted.md 👉 tokens: 328
2021-09-24 Claimant's Amended Claim Amount plus Interest-redacted.md 👉 tokens: 327
2021-12-07 Arbitrator's Final Award-redacted.md 👉 tokens: 1095
012100046665_6365_28557680_Claimant Case Stated-redacted.md 👉 tokens: 498
012100046665_6366_28557682_C-28 Claimant job outline-redacted.md 👉 tokens: 2170
012100046665_6368_28557688_C-26 Claimant bio-redacted.md 👉 tokens: 665
012100046665_6369_28557690_C-25a, 25b 4.11.21 invoice