In [1]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook configuration loaded through pydantic-settings.

    The settings source is configured to read ``../.env`` (UTF-8) and to
    ignore any entries that do not correspond to a declared field.
    """

    model_config = SettingsConfigDict(
        env_file="../.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Required (no default). Used below as the value of the HF_HOME
    # environment variable.
    docling_model_dir: str


settings = Settings()

In [19]:
import json
import os
from pathlib import Path
import time

# Set HF_HOME from the notebook settings *before* docling is imported below.
# NOTE(review): presumably this redirects where Hugging Face-backed model files
# are cached/loaded from — confirm that docling honors HF_HOME at import time.
os.environ["HF_HOME"] = settings.docling_model_dir

from docling.document_converter import DocumentConverter

In [7]:
# source = "https://arxiv.org/pdf/2408.09869"  # document per local path or URL

# Input document: "Attention Is All You Need", read from the local samples/ folder.
fname = "1706.03762v7"
source = f"samples/{fname}.pdf"

# Output directory for this document's conversion artifacts. Kept as a plain
# str because the JSON export cell joins it with os.path.join.
# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of os.path.exists() followed by os.makedirs().
result_dir = f"results/docling/{fname}"
os.makedirs(result_dir, exist_ok=True)

# 1. Simple Conversion

In [4]:
# Converter built with no arguments, i.e. relying entirely on docling's defaults.
converter = DocumentConverter()

In [5]:
# Convert the sample PDF at `source`; the following cells read `result.document`.
result = converter.convert(source)

In [14]:
# Markdown export: print only the first 300 characters as a quick preview.
print(result.document.export_to_markdown()[:300])

Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.

## Attention Is All You Need

Ashish Vaswani ∗ Google Brain avaswani@google.com

Noam Shazeer ∗ Google Brain noam@google.com



In [10]:
# Export the converted document to a dict, show its top-level keys, and
# persist it as tab-indented JSON inside the result directory.
converted_dict = result.document.export_to_dict()
print(converted_dict.keys())

json_path = os.path.join(result_dir, "result.json")
with open(json_path, "w") as f:
    json.dump(converted_dict, f, indent="\t")

dict_keys(['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tables', 'key_value_items', 'pages'])


# 2. Docling-V2
* https://ds4sd.github.io/docling/v2/#setting-up-a-documentconverter

API Docs
* https://ds4sd.github.io/docling/reference/document_converter/

In [15]:
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

In [None]:
# PDF pipeline settings for this section: OCR off, table-structure parsing on.
pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=True)

In [None]:
# Explicit converter configuration. Every argument below is optional and has
# an internal default.
pdf_format_option = PdfFormatOption(
    pipeline_options=pipeline_options,  # pipeline options go here
    backend=PyPdfiumDocumentBackend,  # optional: pick an alternative backend
)

# To handle more formats, extend both collections below, e.g. add
# InputFormat.IMAGE / DOCX / HTML / PPTX to `allowed_formats`, and
# InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline) to
# `format_options` (SimplePipeline is the default for office formats and HTML).
doc_converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],  # whitelist; non-matching files are ignored
    format_options={InputFormat.PDF: pdf_format_option},
)

## Figure Export Without Table Structure
* https://ds4sd.github.io/docling/examples/export_figures/


Treating tables as images
* https://github.com/DS4SD/docling/issues/590

In [27]:
# Output directory for the figure-export run. Built directly as a Path so
# `result_dir` never holds a str in this cell (the cells below use the `/`
# operator on it), and created with exist_ok=True so re-running the cell
# cannot fail or race with another process creating the same directory.
result_dir = Path(f"results/docling/{fname}/figure_export_without_table_structure")
result_dir.mkdir(parents=True, exist_ok=True)

In [28]:
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

In [29]:
# Value passed to the pipeline as `images_scale`.
IMAGE_RESOLUTION_SCALE = 2.0

# Ask the pipeline for page images and picture images, but skip
# table-structure parsing.
pipeline_options = PdfPipelineOptions(
    images_scale=IMAGE_RESOLUTION_SCALE,
    generate_page_images=True,
    generate_picture_images=True,
    do_table_structure=False,  # Don't Parse Table
)

# Converter that applies the options above to PDF inputs.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    },
)

In [30]:
# Time the conversion. perf_counter() is a monotonic, high-resolution clock
# intended for measuring intervals; time.time() follows the wall clock and can
# jump if the system time is adjusted mid-run.
start = time.perf_counter()
conv_res = doc_converter.convert(source)
end = time.perf_counter()
print("Converted in {:.3f}".format(end - start))

Converted in 15.882


In [31]:
# Stem of the input file name; used as the prefix of every exported file.
doc_filename = conv_res.input.file.stem

# Save one PNG per page, named "<doc_filename>-<page number>.png".
# The page number is taken from page.page_no, so the mapping's keys are not
# needed and only the page objects are iterated.
for page in conv_res.document.pages.values():
    page_image_filename = result_dir / f"{doc_filename}-{page.page_no}.png"
    with page_image_filename.open("wb") as fp:
        page.image.pil_image.save(fp, format="PNG")

In [32]:
# Export every table and picture as its own PNG, numbered separately per kind
# in document iteration order:
#   "<doc_filename>-table-<n>.png" / "<doc_filename>-picture-<n>.png"
table_counter = 0
picture_counter = 0
for element, _level in conv_res.document.iterate_items():
    if isinstance(element, TableItem):
        table_counter += 1
        kind, index = "table", table_counter
    elif isinstance(element, PictureItem):
        picture_counter += 1
        kind, index = "picture", picture_counter
    else:
        continue  # other item types have no image to export here

    # Fetch the image *before* opening the output file: get_image() can return
    # None (per docling-core, when no usable page image/provenance exists), and
    # opening first would leave a zero-byte PNG behind before failing.
    element_image = element.get_image(conv_res.document)
    if element_image is None:
        print(f"Skipping {kind} {index}: no image available")
        continue

    element_image_filename = result_dir / f"{doc_filename}-{kind}-{index}.png"
    with element_image_filename.open("wb") as fp:
        element_image.save(fp, "PNG")

In [33]:
# Inspect the concrete type of the converted document. It is
# docling_core.types.doc.document.DoclingDocument, defined at:
# https://github.com/DS4SD/docling-core/blob/127dd2f6f8862e2c74f821cdb3a1995ee0a243cc/docling_core/types/doc/document.py#L1323
type(conv_res.document)

docling_core.types.doc.document.DoclingDocument

In [34]:
# Write the document as Markdown whose pictures are externally referenced
# (ImageRefMode.REFERENCED).
# save_as_markdown is defined at:
# https://github.com/DS4SD/docling-core/blob/127dd2f6f8862e2c74f821cdb3a1995ee0a243cc/docling_core/types/doc/document.py#L1936
md_filename = result_dir / f"{doc_filename}-with-image-refs.md"
conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

In [36]:
# Dump the DoclingDocument as tab-indented JSON in the result directory,
# after printing its top-level keys.
converted_dict = conv_res.document.export_to_dict()
print(converted_dict.keys())

json_path = result_dir / f"{doc_filename}-with-image-refs.json"
with json_path.open("w") as f:
    json.dump(converted_dict, f, indent="\t")

dict_keys(['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tables', 'key_value_items', 'pages'])


In [45]:
# First 20 children of the document body. Each entry is a RefItem whose `cref`
# points into the document's collections (e.g. '#/texts/0', '#/groups/0')
# rather than being the item itself.
conv_res.document.body.children[:20]

[RefItem(cref='#/texts/0'),
 RefItem(cref='#/texts/1'),
 RefItem(cref='#/texts/2'),
 RefItem(cref='#/texts/3'),
 RefItem(cref='#/texts/4'),
 RefItem(cref='#/texts/5'),
 RefItem(cref='#/texts/6'),
 RefItem(cref='#/groups/0'),
 RefItem(cref='#/texts/11'),
 RefItem(cref='#/texts/12'),
 RefItem(cref='#/texts/13'),
 RefItem(cref='#/texts/14'),
 RefItem(cref='#/texts/15'),
 RefItem(cref='#/texts/16'),
 RefItem(cref='#/texts/17'),
 RefItem(cref='#/texts/18'),
 RefItem(cref='#/texts/19'),
 RefItem(cref='#/texts/20'),
 RefItem(cref='#/texts/21'),
 RefItem(cref='#/texts/22')]

In [50]:
# Children of the first group (presumably the '#/groups/0' entry referenced
# from the body); these are again RefItems pointing at text items.
conv_res.document.groups[0].children

[RefItem(cref='#/texts/7'),
 RefItem(cref='#/texts/8'),
 RefItem(cref='#/texts/9'),
 RefItem(cref='#/texts/10')]

In [51]:
# Look up a referenced text item by index: texts[7] is the item whose
# self_ref is '#/texts/7' and whose parent is '#/groups/0'.
conv_res.document.texts[7]

TextItem(self_ref='#/texts/7', parent=RefItem(cref='#/groups/0'), children=[], label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=126.882, t=508.153, r=210.552, b=475.27699999999993, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 46))], orig='Llion Jones ∗ Google Research llion@google.com', text='Llion Jones ∗ Google Research llion@google.com')