In [None]:
import json
import logging
import time
from pathlib import Path

from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.chunking import HybridChunker
from docling_core.transforms.chunker.hierarchical_chunker import (
    ChunkingDocSerializer,
    ChunkingSerializerProvider,
)
from docling_core.transforms.serializer.markdown import MarkdownTableSerializer

## Document ingestion and conversion

In [None]:
# Change this to a local path or another URL if desired.
# Note: using the default URL requires network access; if offline, provide a
# local file path (e.g., Path("/path/to/file.pdf")).
# source = "https://arxiv.org/pdf/2408.09869"
# Raw string literal: the Windows path contains backslash sequences such as "\d",
# "\p" and "\V" that are invalid escapes in a normal string literal and trigger a
# SyntaxWarning on Python 3.12+. The resulting path value is unchanged.
source = Path(
    r"C:\dev\projects\docling-experimentation\data\Vodafone 2025 Annual Report 10.pdf"
)

In [None]:
# Custom pipeline
# -------------------------------
# OCR is switched off; table-structure recovery (with cell matching) is switched on.
pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=True)
pipeline_options.table_structure_options.do_cell_matching = True

# OCR settings are still attached to the pipeline options: English only,
# no forced full-page OCR.
ocr_options = EasyOcrOptions(force_full_page_ocr=False, lang=["en"])
pipeline_options.ocr_options = ocr_options
# pipeline_options.generate_page_images = True  # Include page images in HTML

# Four worker threads; the accelerator device is chosen automatically.
pipeline_options.accelerator_options = AcceleratorOptions(
    device=AcceleratorDevice.AUTO,
    num_threads=4,
)

# Converter that applies the options above to PDF inputs.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

In [None]:
# Run the configured converter on `source`; the result object is reused by the
# inspection, confidence and chunking cells below.
conv_result = doc_converter.convert(source=source)

In [None]:
# Show which attributes are stored on the conversion result.
vars(conv_result).keys()

In [None]:
# Keep a handle on the converted document; it is the input to the chunker below.
doc = conv_result.document

## Confidence scores

In [None]:
# Dump the conversion confidence report to a plain structure and pretty-print it as JSON.
confidence_report = conv_result.confidence.model_dump()
print(json.dumps(confidence_report, indent=4))

## Chunking

In [None]:
class MDTableSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider whose chunking serializer uses a Markdown table serializer."""

    def get_serializer(self, doc):
        """Return a ChunkingDocSerializer for `doc` configured with MarkdownTableSerializer."""
        # Configure a different table serializer than the chunking default.
        markdown_tables = MarkdownTableSerializer()
        return ChunkingDocSerializer(doc=doc, table_serializer=markdown_tables)

In [None]:
# Hybrid chunker wired to the Markdown-table serializer provider defined above.
chunker = HybridChunker(serializer_provider=MDTableSerializerProvider())

# Materialize the chunk iterator so chunks can be indexed and re-read later.
chunk_iter = chunker.chunk(dl_doc=doc)
chunk_list = [*chunk_iter]

In [None]:
# Preview every chunk that mentions "Revenue": first 300 characters of the raw
# chunk text, then the first 300 characters of its contextualized form.
for i, chunk in enumerate(chunk_list):
    if "Revenue" in chunk.text:
        print(f"=== {i} ===")
        print(f"chunk.text:\n{f'{chunk.text[:300]}…'!r}\n")

        enriched_text = chunker.contextualize(chunk=chunk)
        print(f"chunker.contextualize(chunk):\n{f'{enriched_text[:300]}…'!r}\n")

        print()

In [None]:
# Index of the chunk to print in full. The value is specific to the converted
# document; change it to any valid index into chunk_list.
i = 51
selected_chunk = chunk_list[i]

print(f"=== {i} ===")
print()

print("chunk.text:\n")
print(selected_chunk.text)
print()

# Contextualize the chunk selected by index. The leftover loop variable `chunk`
# is deliberately not used here: it refers to whichever chunk an earlier loop
# visited last, not to chunk_list[i]. The result is computed once and printed.
enriched_text = chunker.contextualize(chunk=selected_chunk)
print("chunker.contextualize(chunk):\n")
print(enriched_text)
print()