In [None]:
%pip install docling

Preload the docling models by running the following commands in a shell:

pip install -U "huggingface_hub[cli]"
huggingface-cli login
huggingface-cli download ds4sd/docling-models

In [None]:
import glob
import json
import logging
import os
from pathlib import Path

from dotenv import load_dotenv

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.pipeline_options import PipelineOptions, PdfPipelineOptions
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()
file_type=os.getenv("FILE_TYPE",".PDF")
file_type_docx=os.getenv("FILE_TYPE",".DOCX")
file_source=os.getenv("FILE_SOURCE_LOCATION","sourcedocs")
md_destination=os.getenv("MARKDOWN_LOCATION","sourcedocs")

if file_type.lower() not in ".pdf .docx .odf .txt":
    raise Exception("Invalid or empty file type. Only PDF, DOCX or ODF files supported. Set in FILE_TYPE envar")

if not file_source:
    raise Exception("Invalid or empty file source location. Set in FILE_SOURCE_LOCATION")

if not md_destination:
    raise Exception("Invalid or empty file source location. Set in MARKDOWN_LOCATION")

file_list=[]

Work out which kind of source files we are dealing with, then list and filter them, producing the list of files that need to be processed.

In [None]:
def filter_file_ext(filename) -> bool:
    _, file_extension = os.path.splitext(filename)

    if not file_extension:
        return False
    
    if file_extension.lower().strip() in file_type.lower() or file_extension.lower().strip() in file_type_docx.lower():
        return True
    else:
        return False

In [None]:
for file in glob.iglob(file_source+"/*", recursive=False):
    #file_path = Path.joinpath(Path(file_source), file)
    file_path = file
    file_list.append(file_path)
  
filtered_files = filter(filter_file_ext,file_list)

In [None]:
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False


doc_converter = (
    DocumentConverter(  
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.DOCX,
        ],  
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend,pipeline_options=pipeline_options,
            ),
            InputFormat.DOCX: WordFormatOption(
                pipeline_cls=SimplePipeline  
            ),
        },
    )
)

In [None]:
process_files=list(filtered_files)

conv_results = doc_converter.convert_all(
        process_files,
        raises_on_error=False, 
    )
out_path = Path(md_destination)

for res in conv_results:
    with (out_path / f"{res.input.file.stem}.md").open("wb") as fp:
                fp.write(res.document.export_to_markdown().encode("UTF-8"))