In [3]:
%pip install docling

Collecting docling
  Obtaining dependency information for docling from https://files.pythonhosted.org/packages/7f/62/6aaf9263770df9716f9c8b56a9b2eb186440e54d327d8ea72bb0a7b1dc31/docling-2.31.0-py3-none-any.whl.metadata
  Downloading docling-2.31.0-py3-none-any.whl.metadata (10 kB)
Collecting docling-core[chunking]<3.0.0,>=2.26.0 (from docling)
  Obtaining dependency information for docling-core[chunking]<3.0.0,>=2.26.0 from https://files.pythonhosted.org/packages/80/b1/f966d3516e314b8206fc069f9e6509d67f75331bd4bd29ceb3215a912305/docling_core-2.28.1-py3-none-any.whl.metadata
  Downloading docling_core-2.28.1-py3-none-any.whl.metadata (6.0 kB)
Collecting docling-ibm-models<4.0.0,>=3.4.0 (from docling)
  Obtaining dependency information for docling-ibm-models<4.0.0,>=3.4.0 from https://files.pythonhosted.org/packages/2c/42/c672af0db176e27fe00aa02c4f3168bc639851381fea136e0d7530378309/docling_ibm_models-3.4.2-py3-none-any.whl.metadata
  Downloading docling_ibm_models-3.4.2-py3-none-any.whl.

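Preload the docling models (optional). Run the following commands in a terminal to download the models from Hugging Face ahead of time: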

pip install -U "huggingface_hub[cli]"
huggingface-cli login
huggingface-cli download ds4sd/docling-models

In [4]:
import glob
import json
import logging
import os
from pathlib import Path

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.pipeline_options import PipelineOptions, PdfPipelineOptions
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from dotenv import load_dotenv


# Notebook configuration: set up logging, read settings from the environment
# (optionally via a .env file) and fail fast on unusable values.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Extensions that FILE_TYPE is allowed to name.
SUPPORTED_FILE_TYPES = frozenset({".pdf", ".docx", ".odf", ".txt"})

load_dotenv()
file_type = os.getenv("FILE_TYPE", ".PDF")
# NOTE(review): this reads the same FILE_TYPE variable as `file_type`; only
# the fallback differs, so both names hold the same value whenever FILE_TYPE
# is set. Confirm whether a separate variable was intended.
file_type_docx = os.getenv("FILE_TYPE", ".DOCX")
file_source = os.getenv("FILE_SOURCE_LOCATION", "sourcedocs")
md_destination = os.getenv("MARKDOWN_LOCATION", "sourcedocs")

# Validate each requested extension as a whole token. A plain substring test
# against a single string would also accept "", "." or "p".
requested_types = file_type.lower().replace(",", " ").split()
if not requested_types or any(ext not in SUPPORTED_FILE_TYPES for ext in requested_types):
    raise ValueError("Invalid or empty file type. Only PDF, DOCX or ODF files supported. Set in FILE_TYPE envar")

if not file_source:
    raise ValueError("Invalid or empty file source location. Set in FILE_SOURCE_LOCATION")

if not md_destination:
    raise ValueError("Invalid or empty markdown destination location. Set in MARKDOWN_LOCATION")

# Filled in by the file-listing cell further down.
file_list = []

Determine which file types we are dealing with, then list the files in the source location and filter them. The result is the list of files that we need to process.

In [5]:
def filter_file_ext(filename) -> bool:
    """Return True when ``filename`` has one of the configured extensions.

    The accepted extensions come from the module-level ``file_type`` and
    ``file_type_docx`` settings. Each setting may hold one extension or
    several separated by whitespace or commas (for example ``".pdf .docx"``).
    Comparison is case-insensitive and requires the whole extension to
    match, so partial extensions such as ``"."`` or ``".p"`` are rejected.

    Args:
        filename: Path or file name to test.

    Returns:
        True if the extension is configured, False otherwise (including
        when the name has no extension at all).
    """
    _, file_extension = os.path.splitext(filename)
    extension = file_extension.lower().strip()
    if not extension:
        return False

    # Build the set of whole extensions; substring matching against the raw
    # setting would let ".p" or "." slip through as a match for ".pdf".
    wanted = set()
    for configured in (file_type, file_type_docx):
        wanted.update(configured.lower().replace(",", " ").split())

    return extension in wanted

In [6]:
# Rebuild the listing from scratch on every run. Appending to the list
# created in the configuration cell would duplicate entries each time this
# cell is re-executed. Sorting makes the processing order deterministic.
file_list = sorted(glob.iglob(os.path.join(file_source, "*"), recursive=False))

# Keep a concrete list rather than a one-shot `filter` iterator, so later
# cells can be re-run without silently receiving an exhausted iterator.
# Directories are skipped even if their name ends in a matching extension.
filtered_files = [
    path for path in file_list
    if os.path.isfile(path) and filter_file_ext(path)
]

In [7]:
# PDF pipeline settings; the do_ocr flag is switched off for this run.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False

# Per-format conversion settings: PDFs go through the standard PDF pipeline
# with the pypdfium2 backend, Word documents through the simple pipeline.
pdf_format_option = PdfFormatOption(
    pipeline_cls=StandardPdfPipeline,
    backend=PyPdfiumDocumentBackend,
    pipeline_options=pipeline_options,
)
docx_format_option = WordFormatOption(pipeline_cls=SimplePipeline)

# Converter restricted to the two input formats configured above.
doc_converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
    format_options={
        InputFormat.PDF: pdf_format_option,
        InputFormat.DOCX: docx_format_option,
    },
)

In [8]:
# Convert every selected file and write one Markdown file per source
# document into the destination directory.
process_files = list(filtered_files)

# Create the destination up front so the first write cannot fail because
# the directory is missing.
out_path = Path(md_destination)
out_path.mkdir(parents=True, exist_ok=True)

conv_results = doc_converter.convert_all(
    process_files,
    raises_on_error=False,
)

# NOTE(review): output names are derived from the file stem only, so
# "a.pdf" and "a.docx" would both be written to "a.md".
for res in conv_results:
    # With raises_on_error=False a failed conversion is reported through
    # the result status instead of an exception; skip those rather than
    # writing a Markdown file for a document that did not convert.
    if res.status not in (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS):
        logger.warning("Skipping %s: conversion status %s", res.input.file.name, res.status)
        continue

    md_file = out_path / f"{res.input.file.stem}.md"
    # Write bytes so the output is UTF-8 with unmodified newlines on every
    # platform.
    md_file.write_bytes(res.document.export_to_markdown().encode("UTF-8"))

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash a6199fbf72cfc8f691d8f6329c854161
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered picture descriptions: ['vlm', 'api']
INFO:docling.pipeline.base_pipeline:Processing document 2304.14953v2-part1.pdf
INFO:docling.document_converter:Finished converting document 2304.14953v2-part1.pdf in 59.47 sec.
INFO:docling.document_converter:Initializing pipeline for SimplePipeline with options hash 4cc01982ae99b46a2a63fcda46c47c35
INFO:docling.pipeline.base_pipe