# Advanced Docling - PDF (or Any Document) to Markdown with Images and Tables

### Enrich Markdown with Image Explanations

In [1]:
# Silence all Python warnings for the rest of the session
import warnings
warnings.filterwarnings("ignore")

from dotenv import load_dotenv

# Read variables from ../.env into the process environment
# (presumably the API key needed by ChatOpenAI further down — TODO confirm)
load_dotenv('../.env')

True

In [2]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama
import base64

In [3]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama
import base64

# Chat model that get_image_description() invokes with text + image parts.
model = ChatOpenAI(model="gpt-4o-mini")
# Local alternative served by Ollama (left disabled):
# model = ChatOllama(model="llama3.2-vision", base_url="http://localhost:11434")

# System prompt sent ahead of every image-description request; it steers the
# model towards financial-document extraction and tabular output.
system_message = SystemMessage("""
                Extract detailed financial information from the provided image.
                Start by identifying the company name, document title, and any relevant details from the header and footer.

                Ensure to:
                    - Thoroughly extract all financial figures and metrics mentioned, such as revenue, profit, assets, liabilities, etc.
                    - Explain the financial data with technical details, including any relevant financial terminology or calculation methods.
                    - Summarize any regulatory or legal information provided in the document.
                
                Provide a complete and detailed description of the image in the form of table if possible.
                       """)

def get_image_description(image_data_path, image_urls, cleaned_content):
    """Ask the chat model to describe the images referenced by a markdown chunk.

    Args:
        image_data_path: Directory that the relative image references are
            resolved against.
        image_urls: Image references as they appear in the markdown. They may
            be percent-encoded and may use backslash separators
            (e.g. ``artifacts%5Cimage_1.png``).
        cleaned_content: Chunk text with the image references removed; it is
            embedded in the prompt as context for the description.

    Returns:
        The ``content`` of the model response, or an empty string when
        ``image_urls`` is empty (no request is sent in that case).

    Raises:
        OSError: If a referenced image file cannot be opened.
    """
    # Function-scope imports so this definition does not depend on other cells.
    import mimetypes
    from urllib.parse import unquote

    # Nothing to describe: skip the request instead of asking the model to
    # describe an image that does not exist.
    if not image_urls:
        return ""

    images_data = []
    for url in image_urls:
        # Markdown links are percent-encoded (%5C is a backslash, %20 a space);
        # decode them, then normalise Windows separators to "/".
        relative_path = unquote(url).replace("\\", "/")
        image_path = f"{image_data_path}/{relative_path}"

        with open(image_path, "rb") as f:
            img_base64 = base64.b64encode(f.read()).decode("utf-8")

        # Prefer the registered MIME type (".jpg" -> "image/jpeg"); fall back
        # to the bare extension for types the mimetypes table does not know.
        mime_type = mimetypes.guess_type(image_path)[0] or f"image/{image_path.split('.')[-1]}"

        images_data.append({"type": "image_url",
                            "image_url": {"url": f"data:{mime_type};base64,{img_base64}"}})

    text = f"""Here is some reference content for the image. You need to ensure the generated image description fits into the given context.
                Do not write any preamble or explanation other than asked in the task described.

                ### Content to Get The Idea What This Image Is About:
                {cleaned_content}

                Generate a detailed description of the image. 
                Ensure that the description is comprehensive and no important data is missed.
                ### Image Description:"""

    text_message = {"type": "text", "text": text}

    # One human message: the text prompt followed by every image part.
    message = HumanMessage(content=[text_message] + images_data)

    # ResponseError: vision model only supports a single image per message
    # in case of Ollama Model, LLAMA3.2 Vision
    # message = HumanMessage(content=[text_message, images_data[0]])

    response = model.invoke([system_message, message])

    return response.content

### Advanced Docling - PDF to Markdown with Images and Tables

In [4]:
# https://ds4sd.github.io/docling/examples/export_figures/
# https://github.com/laxmimerit/agentic-rag-with-langchain-and-langgraph

In [5]:
import logging
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat

# Value assigned to PdfPipelineOptions.images_scale in get_pdf_markdown().
IMAGE_RESOLUTION_SCALE = 2.0

def get_pdf_markdown(input_doc_path, target_dir, scratch_dir="scratch"):
    """Convert a PDF with Docling and export it as Markdown with referenced images.

    Besides the Markdown file, PNG renderings of every page, table and picture
    are written to ``scratch_dir``.

    Args:
        input_doc_path: Path (``str`` or ``Path``) of the PDF to convert.
        target_dir: Directory that receives the Markdown file; created if
            missing.
        scratch_dir: Directory for the page/table/picture PNGs; created if
            missing. Defaults to ``"scratch"``, the location that used to be
            hard-coded.

    Returns:
        ``Path`` of the written ``<stem>-with-image-refs.md`` file.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path(input_doc_path)  # Ensure it's a Path object
    target_dir = Path(target_dir)  # Ensure it's a Path object
    output_dir = Path(scratch_dir)  # Intermediate directory for storing images

    # Configure the pipeline so page and picture images are rendered
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    # Initialize the document converter
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Convert the input PDF document
    conv_res = doc_converter.convert(input_doc_path)

    # Ensure the output directories exist
    output_dir.mkdir(parents=True, exist_ok=True)
    target_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page_no, page in conv_res.document.pages.items():
        # The page image is optional in the document model; skip rather than
        # fail with an AttributeError when it was not rendered.
        if page.image is None:
            logging.warning("No image available for page %s; skipping", page_no)
            continue
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables; each kind is numbered independently,
    # starting at 1, in document iteration order.
    counters = {"table": 0, "picture": 0}
    for element, _level in conv_res.document.iterate_items():
        for item_type, label in ((TableItem, "table"), (PictureItem, "picture")):
            if not isinstance(element, item_type):
                continue
            counters[label] += 1
            # Fetch the image before opening the file so a missing image does
            # not leave an empty PNG behind.
            element_image = element.get_image(conv_res.document)
            if element_image is None:
                logging.warning("No image available for %s %s; skipping", label, counters[label])
                continue
            element_image_filename = (
                output_dir / f"{doc_filename}-{label}-{counters[label]}.png"
            )
            with element_image_filename.open("wb") as fp:
                element_image.save(fp, "PNG")

    # Save markdown with externally referenced pictures
    md_filename = target_dir / f"{doc_filename}-with-image-refs.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

    return md_filename

# Usage example: convert the sample earnings deck (path is relative to the
# notebook's working directory) and write the markdown under "output".
input_doc_path = r"Earnings-Presentation-Q3-2024.pdf"
md_filename = get_pdf_markdown(input_doc_path, target_dir="output")
print(md_filename)

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
Could not load the custom kernel for multi-scale deformable attention: Command '['where', 'cl']' returned non-zero exit status 1.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specifie

output\Earnings-Presentation-Q3-2024-with-image-refs.md


### Markdown Splitters

In [6]:
# Show the path returned by get_pdf_markdown() in the previous section.
md_filename

WindowsPath('output/Earnings-Presentation-Q3-2024-with-image-refs.md')

In [7]:
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter

# md_filename = r"output\Earnings-Presentation-Q3-2024-with-image-refs.md"

def get_markdown_splits(md_filename):
    """Split a markdown file into chunks at its level 1-3 headers.

    Args:
        md_filename: Path of the UTF-8 encoded markdown file to read.

    Returns:
        Whatever ``MarkdownHeaderTextSplitter.split_text`` produces for the
        file's text; header lines are kept in the chunk text
        (``strip_headers=False``).
    """
    # Header marker -> metadata key under which the splitter records it.
    header_levels = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(header_levels, strip_headers=False)

    with open(md_filename, "r", encoding="utf-8") as md_file:
        raw_markdown = md_file.read()

    return splitter.split_text(raw_markdown)

# Split the markdown produced by get_pdf_markdown() into header-based chunks.
md_header_splits = get_markdown_splits(md_filename)

In [12]:
# Inspect one chunk; index 9 is a section of the sample document that
# contains image references (see the output below).
print(md_header_splits[9].page_content)

## Family Daily Active People (DAP)  
In Billions  
![Image](Earnings-Presentation-Q3-2024-with-image-refs_artifacts%5Cimage_000015_b38533143e867d40a45b28680284201d641d46c933fd05b19a2f1b7b6933c0b4.png)  
We define a daily active person (DAP) as a registered and logged-in user of Facebook, Instagram, Messenger, and/or WhatsApp (collectively, our "Family" of products) who visited at least one of these Family products through a mobile device application or using a web or mobile browser on a given day. The numbers for DAP do not include users on our other products unless they would otherwise qualify as DAP based on their other activities on our Family products.  
We do not require people to use a common identifier or link their accounts to use multiple products in our Family, and therefore must seek to attribute multiple user accounts within and across products to individual people. Our calculations of DAP rely upon complex techniques, algorithms, and machine learning models that seek to e

In [13]:
import re

def extract_image_urls_and_clean_content(page_content):
    """Separate ``![Image](...)`` references from the surrounding markdown.

    Args:
        page_content: Markdown text of one chunk.

    Returns:
        A ``(image_urls, cleaned_content)`` tuple: the link targets in order of
        appearance, and the text with every matched reference removed.
    """
    # Only references whose alt text is exactly "Image" are matched; the
    # capture group holds the link target.
    image_ref = re.compile(r"!\[Image\]\(([^)]+)\)")

    targets = image_ref.findall(page_content)
    text_without_refs = image_ref.sub("", page_content)
    return targets, text_without_refs

In [14]:
# Demonstrate the extraction on the chunk inspected above: returns the image
# links and the chunk text with those links removed.
extract_image_urls_and_clean_content(md_header_splits[9].page_content)

(['Earnings-Presentation-Q3-2024-with-image-refs_artifacts%5Cimage_000015_b38533143e867d40a45b28680284201d641d46c933fd05b19a2f1b7b6933c0b4.png',
  'Earnings-Presentation-Q3-2024-with-image-refs_artifacts%5Cimage_000016_278dad5e9c44f5552629e781407508746b2c8c43e8597f0a69705fe8f19aa660.png'],
 '## Family Daily Active People (DAP)  \nIn Billions  \n  \nWe define a daily active person (DAP) as a registered and logged-in user of Facebook, Instagram, Messenger, and/or WhatsApp (collectively, our "Family" of products) who visited at least one of these Family products through a mobile device application or using a web or mobile browser on a given day. The numbers for DAP do not include users on our other products unless they would otherwise qualify as DAP based on their other activities on our Family products.  \nWe do not require people to use a common identifier or link their accounts to use multiple products in our Family, and therefore must seek to attribute multiple user accounts within an

### Enrich Markdown with Image Explanations

In [16]:
def enrich_document_with_image(md_header_splits, image_data_path="output"):
    """Replace image references in each markdown chunk with a generated description.

    Args:
        md_header_splits: Iterable of chunks exposing a ``page_content`` string.
        image_data_path: Directory that the chunks' relative image references
            are resolved against. Defaults to ``"output"``, the value that used
            to be hard-coded.

    Returns:
        One string with all processed chunks joined by blank lines. Chunks
        that reference images have those references removed and an
        "Extracted Image Description" section appended; chunks without images
        are passed through unchanged.
    """
    documents = []
    for page in md_header_splits:
        image_urls, cleaned_content = extract_image_urls_and_clean_content(page.page_content)

        # Without images there is nothing to describe: keep the text as-is
        # rather than requesting a description of a non-existent image.
        if not image_urls:
            documents.append(cleaned_content)
            continue

        image_description = get_image_description(image_data_path, image_urls, cleaned_content)
        documents.append(
            cleaned_content + "\n\nExtracted Image Description:\n" + image_description
        )

    return "\n\n".join(documents)

# Build the enriched markdown for all header splits and persist it.
enriched_content = enrich_document_with_image(md_header_splits)

# The `with` block closes the file on exit, so no explicit close() is needed.
with open("output/enriched_content.md", "w", encoding="utf-8") as f:
    f.write(enriched_content)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

### Reading Entire Directory

In [None]:
# Refer to the previous section