In [36]:
import json
import logging
import time
from pathlib import Path

In [37]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode

In [38]:
# Input PDFs for the conversion cells; each value may be a local path or a URL.
source1, source2 = (
    "../Event_Sequence_Task_Typology.pdf",
    "../Event_Sequence_Task_Typology_supp.pdf",
)

In [39]:
# PDF pipeline with table-structure recognition switched on.
pipeline_options = PdfPipelineOptions(do_table_structure=True)

# Table-structure settings: enable cell matching and select the ACCURATE
# TableFormer mode (the more accurate TableFormer model).
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

# Converter that applies the options above to PDF inputs.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)

In [40]:
# Logger for this notebook's own messages; the recorded cell output shows its
# records under the name "__main__".
_log = logging.getLogger(__name__)

In [41]:
# Emit INFO-level records so docling's progress messages and the timing line
# below appear in the cell output.
logging.basicConfig(level=logging.INFO)

# perf_counter() is a monotonic clock meant for measuring durations;
# time.time() can jump if the system clock is adjusted mid-conversion.
start_time = time.perf_counter()
conv_result = doc_converter.convert(source1)
end_time = time.perf_counter() - start_time  # NOTE: elapsed seconds, not a timestamp
_log.info(f"Document converted in {end_time:.2f} seconds.")

# Mode "w" starts output.md afresh with the first document. UTF-8 is set
# explicitly because remove_references() reads this file back as UTF-8;
# relying on the platform default encoding can garble or reject non-ASCII text.
with open("output.md", "w", encoding="utf-8") as md_file:
    md_file.write(conv_result.document.export_to_markdown())

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.pipeline.base_pipeline:Processing document Event_Sequence_Task_Typology.pdf
INFO:docling.document_converter:Finished converting document Event_Sequence_Task_Typology.pdf in 12.87 sec.
INFO:__main__:Document converted in 12.87 seconds.


In [42]:
# perf_counter() is a monotonic clock meant for measuring durations.
start_time = time.perf_counter()
conv_result = doc_converter.convert(source2)
end_time = time.perf_counter() - start_time  # NOTE: elapsed seconds, not a timestamp
_log.info(f"Document converted in {end_time:.2f} seconds.")

# Append the second document to output.md. UTF-8 matches the encoding that
# remove_references() uses when it reads the file back.
# NOTE: mode "a" means re-running this cell appends the document again.
with open("output.md", "a", encoding="utf-8") as md_file:
    # Blank-line separator so the first heading of this document cannot be
    # glued onto the last line of the previous one (the export is not
    # guaranteed here to end with a newline).
    md_file.write("\n\n")
    md_file.write(conv_result.document.export_to_markdown())

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Event_Sequence_Task_Typology_supp.pdf
INFO:docling.document_converter:Finished converting document Event_Sequence_Task_Typology_supp.pdf in 6.24 sec.
INFO:__main__:Document converted in 6.24 seconds.


## Remove references

In [43]:
import re

In [44]:
def remove_references(input_file, output_file):
    """Remove ``[ ? ]`` reference markers from a UTF-8 text file.

    Every ``[ ? ]`` marker (whitespace inside the brackets is optional) is
    removed together with the spaces/tabs directly around it. A marker in the
    middle of a line is replaced by a single space; a marker at the start or
    end of a line is dropped entirely. Line breaks are never consumed, so
    paragraphs and headings that follow a marker keep their own lines.

    Args:
        input_file: Path of the file to read (decoded as UTF-8).
        output_file: Path of the file to write (encoded as UTF-8). May be the
            same path as ``input_file``; the input is fully read first.

    Returns:
        None. Progress is reported with ``print``. File-system and decoding
        errors are reported with ``print`` instead of being raised; any other
        exception propagates to the caller.
    """

    def _replacement(match):
        # Keep one space between neighbouring words, but leave nothing behind
        # when the marker touches the start or end of a line.
        text = match.string
        at_line_start = match.start() == 0 or text[match.start() - 1] == "\n"
        at_line_end = match.end() == len(text) or text[match.end()] == "\n"
        return "" if at_line_start or at_line_end else " "

    try:
        # Read the content of the file
        with open(input_file, "r", encoding="utf-8") as f:
            content = f.read()

        # Only spaces/tabs around the marker are matched. Matching "\s*" here
        # would also swallow newlines and merge the following line (for
        # example a "## heading") into the current one.
        cleaned_content = re.sub(r"[ \t]*\[\s*\?\s*\][ \t]*", _replacement, content)

        # Write the cleaned content to the output file
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(cleaned_content)

        print(f"Successfully removed references from {input_file}")
        print(f"Output saved to {output_file}")

    except (OSError, UnicodeError) as e:
        # Best-effort for expected I/O and decoding failures only; programming
        # errors are no longer hidden behind a blanket ``except Exception``.
        print(f"An error occurred: {str(e)}")

In [None]:
# Strip the reference markers from output.md and save the result separately.
input_path, output_path = "output.md", "output_cleaned.md"

remove_references(input_file=input_path, output_file=output_path)

## Previous pipeline


In [None]:
# Converter built with no arguments, i.e. without the explicit PDF
# format_options that doc_converter is given earlier in this notebook.
converter = DocumentConverter()

In [None]:
# Convert source1 only; `result` feeds the markdown and JSON export cells below.
result = converter.convert(source1)

In [None]:
# Export once and reuse the string for both the preview and the file, instead
# of running the markdown export twice.
markdown_text = result.document.export_to_markdown()
print(markdown_text)

# NOTE: mode "w" replaces any output.md written by earlier cells.
# UTF-8 is set explicitly so the file does not depend on the platform default
# encoding (remove_references() reads output.md as UTF-8).
with open("output.md", "w", encoding="utf-8") as md_file:
    md_file.write(markdown_text)

In [None]:
import re
from xml.etree import ElementTree as ET
from xml.dom import minidom

In [None]:
def markdown_to_xml(md_file, xml_file):
    """Convert a markdown file into a flat XML document of sections.

    The markdown is cut at every line that starts with a level-2 (or deeper)
    heading, i.e. ``## `` up to ``###### ``. Each piece becomes::

        <section><title>…</title><content>…</content></section>

    under a ``<document>`` root. The first line of a piece is its title (with
    the leading ``#`` marks removed); the remaining lines, stripped of
    surrounding whitespace, are its content. Text that precedes the first
    heading forms a section of its own whose first line is used as the title.

    Args:
        md_file: Path of the markdown file to read (decoded as UTF-8).
        xml_file: Path of the XML file to write (encoded as UTF-8,
            pretty-printed with four-space indentation).

    Returns:
        None.
    """
    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(md_file, "r", encoding="utf-8") as f:
        markdown_text = f.read()

    # Create root element
    root = ET.Element("document")

    # Split only where a heading starts a line. An unanchored "(?=## )" would
    # also split in the middle of a line and inside "### ", leaving a stray
    # "#" at the end of the preceding section.
    sections = re.split(r"(?m)^(?=#{2,6} )", markdown_text)

    for section in sections:
        if not section.strip():
            continue

        # partition() keeps a heading that has no body (e.g. the last line of
        # the file without a trailing newline) instead of dropping it.
        heading_line, _, body = section.partition("\n")

        section_elem = ET.SubElement(root, "section")
        title_elem = ET.SubElement(section_elem, "title")
        # Strip the heading marks from the left only, so a title that ends in
        # "#" (such as "C#") stays intact.
        title_elem.text = heading_line.lstrip("# ").rstrip()
        content_elem = ET.SubElement(section_elem, "content")
        content_elem.text = body.strip()

    # Pretty print XML
    xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent="    ")

    # The generated XML declaration names no encoding, which XML parsers treat
    # as UTF-8, so the file must be written as UTF-8.
    with open(xml_file, "w", encoding="utf-8") as f:
        f.write(xml_str)

In [None]:
# Build output.xml from the current contents of output.md.
markdown_to_xml(md_file="output.md", xml_file="output.xml")

In [None]:
# Serialise the document once and write it straight to disk. The earlier bare
# `result.document.model_dump_json()` statement discarded its value (it was not
# the last expression of the cell), so it only repeated the serialisation.
# UTF-8 is set explicitly so non-ASCII text does not depend on the platform
# default encoding.
with open("output.json", "w", encoding="utf-8") as file:
    file.write(result.document.model_dump_json())