In [1]:
## hide warning
import warnings
warnings.filterwarnings("ignore")

In [30]:
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

FILE_PATH = r"..\00 Dataset\docs\facebook\Meta-09-30-2024-Exhibit-99-1_FINAL.pdf"

loader = DoclingLoader(file_path=FILE_PATH,
                       export_type=ExportType.MARKDOWN,
                       )

In [31]:
docs = loader.load()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importin

In [33]:
# print(docs[0].page_content)
# write markdown to file
with open("output.md", "w", encoding="utf-8") as f:
    f.write(docs[0].page_content)
    f.close()

### PDF to MarkDown with Images and Tables

In [36]:
import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat

_log = logging.getLogger(__name__)
IMAGE_RESOLUTION_SCALE = 2.0

def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path('localfile path')
    input_doc_path = r"..\00 Dataset\docs\facebook\Earnings-Presentation-Q3-2024.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    # Save markdown with externally referenced pictures
    md_filename = output_dir / f"{doc_filename}-with-image-refs.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

    end_time = time.time() - start_time

    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")

main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importin

### Describe Image with LLM

In [19]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama

import base64

# read image from file
with open(r"scratch\Earnings-Presentation-Q3-2024-with-image-refs_artifacts\image_000004_8139d1246423312e74e78335d99a132a935f7de1b62a31df96dc7d8c91a47a7a.png", "rb") as f:
    image_data = base64.b64encode(f.read()).decode("utf-8")

# model = ChatOpenAI(model="gpt-4o-mini")
model = ChatOllama(model="llama3.2-vision", base_url="http://localhost:11434")
# model = ChatOllama(model="llama3.2", base_url="http://localhost:11434")


system = SystemMessage("""
                Extract detailed financial information from the provided image. Start by identifying the company name, document title, and any relevant details from the header and footer.

                Ensure to:
                    - Thoroughly extract all financial figures and metrics mentioned, such as revenue, profit, assets, liabilities, etc.
                    - Explain the financial data with technical details, including any relevant financial terminology or calculation methods.
                    - Summarize any regulatory or legal information provided in the document.
                
                Provide a complete and detailed description of the image, ensuring no important data is missed.
                       """)

message = HumanMessage(
    content=[
        {"type": "text", "text": "Extract financial information from the image below."},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{image_data}"}

        }
    ],
)
response = model.invoke([message])
print(response.content)

The image presents a comprehensive overview of financial data across four distinct categories: Rest of World, Asia-Pacific, Europe, and US & Canada. Each category is further divided into six quarters, spanning from Q1'23 to Q3'24.

**Category Breakdown**

*   **Rest of World**
    *   Q1'23: $27,714
    *   Q2'23: $31,999
    *   Q3'23: $32,165
    *   Q4'22: $28,645
    *   Q1'24: $39,071
    *   Q2'24: $40,589
*   **Asia-Pacific**
    *   Q1'23: $5,782
    *   Q2'23: $6,515
    *   Q3'23: $7,050
    *   Q4'22: $5,960
    *   Q1'24: $8,483
    *   Q2'24: $9,492
*   **Europe**
    *   Q1'23: $13,035
    *   Q2'23: $14,422
    *   Q3'23: $15,636
    *   Q4'22: $13,048
    *   Q1'24: $16,847
    *   Q2'24: $17,609
*   **US & Canada**
    *   Q1'23: $5,100
    *   Q2'23: $6,515
    *   Q3'23: $7,050
    *   Q4'22: $5,960
    *   Q1'24: $8,483
    *   Q2'24: $9,492

In conclusion, the image provides a detailed breakdown of financial data across four categories and six quarters. The data hi

### MarkDown Splitters

In [2]:
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter

markdown_path = r"scratch\Earnings-Presentation-Q3-2024-with-image-refs.md"
with open(markdown_path, "r", encoding="utf-8") as f:
    markdown_content = f.read()

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
md_header_splits = markdown_splitter.split_text(markdown_content)



In [3]:
import re

def extract_image_urls_and_clean_content(page_content):
    # Define the regex pattern to match image URLs
    pattern = r"!\[Image\]\(([^)]+)\)"

    # Find all matches in the page content
    image_urls = re.findall(pattern, page_content)

    # Remove all matched image URLs from the content
    cleaned_content = re.sub(pattern, "", page_content)

    return image_urls, cleaned_content

In [4]:
for page in md_header_splits:
    # print(f"Header: {page.metadata}")
    # print(f"Content: {page.page_content}")
    # Extract image URLs from the page content
    image_urls, cleaned_content = extract_image_urls_and_clean_content(page.page_content)
    print(f"Image URLs: {image_urls}")
    # print(f"Cleaned Content: {cleaned_content}")
    # print("\n")

# md_header_splits

Image URLs: ['Earnings-Presentation-Q3-2024-with-image-refs_artifacts%5Cimage_000000_3db7e1eab3213fbdaaf83be4c917fbb9fd97bdf42a5033bd7e7e4df4b53f7b18.png']
Image URLs: ['Earnings-Presentation-Q3-2024-with-image-refs_artifacts%5Cimage_000001_278dad5e9c44f5552629e781407508746b2c8c43e8597f0a69705fe8f19aa660.png', 'Earnings-Presentation-Q3-2024-with-image-refs_artifacts%5Cimage_000002_1251d14efb5cba3ebd6d19703f740b8b0bb6b1d1a99275c9eb52941cb5e4c0f1.png']
Image URLs: ['Earnings-Presentation-Q3-2024-with-image-refs_artifacts%5Cimage_000003_7a5af1bd15760003c9cd20a1f0ac84b7eb0585e9c23c8d401c7e4a3226887e25.png', 'Earnings-Presentation-Q3-2024-with-image-refs_artifacts%5Cimage_000004_8139d1246423312e74e78335d99a132a935f7de1b62a31df96dc7d8c91a47a7a.png']
Image URLs: ['Earnings-Presentation-Q3-2024-with-image-refs_artifacts%5Cimage_000005_278dad5e9c44f5552629e781407508746b2c8c43e8597f0a69705fe8f19aa660.png']
Image URLs: ['Earnings-Presentation-Q3-2024-with-image-refs_artifacts%5Cimage_000006_dee32

### Enrich MarkDown with Images Explanation

In [14]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama
import base64

# model = ChatOpenAI(model="gpt-4o-mini")
model = ChatOllama(model="llama3.2-vision", base_url="http://localhost:11434")


system_message = SystemMessage("""
                Extract detailed financial information from the provided image. Start by identifying the company name, document title, and any relevant details from the header and footer.

                Ensure to:
                    - Thoroughly extract all financial figures and metrics mentioned, such as revenue, profit, assets, liabilities, etc.
                    - Explain the financial data with technical details, including any relevant financial terminology or calculation methods.
                    - Summarize any regulatory or legal information provided in the document.
                
                Provide a complete and detailed description of the image, ensuring no important data is missed.
                       """)


documents = []

for page in md_header_splits:
    image_urls, cleaned_content = extract_image_urls_and_clean_content(page.page_content)
    # read image from file
    images_data = []
    for url in image_urls:
        url = f"scratch/{url}".replace("%5C", "/")
        with open(url, "rb") as f:
            img_base64 = base64.b64encode(f.read()).decode("utf-8")
            # {"type": "image_url",
            # "image_url": {"url": f"data:image/png;base64,{image_data}"}}

            images_data.append({"type": "image_url", "image_url": {"url": f"data:image/{url.split('.')[-1]};base64,{img_base64}"}})

    text = f"""Here is some reference content for the image. You need to ensure the generated image description fits into the given context.
                Do not write any preamble or explanation other than asked in the task described.

                ### Content to Get The Idea What This Image Is About:
                {cleaned_content}

                Generate a detailed description of the image. 
                Ensure that the description is comprehensive and no important data is missed.
                ### Image Description:"""
    
    text_message = {"type": "text", "text": text}
    
    message = HumanMessage(content=[text_message] + images_data)      

    # ResponseError: vision model only supports a single image per message
    message = HumanMessage(content=[text_message, images_data[0]])               
    
    response = model.invoke([system_message, message])
    
    merged_content = cleaned_content + "\n\nExtracted Image Description:\n" + response.content

    documents.append(merged_content)

    print("page: ", page.metadata)
    print("response: ", response.content)
    print("\n\n")


enriched_content = "\n\n".join(documents)

with open("enriched_content.md", "w", encoding="utf-8") as f:
    f.write(enriched_content)
    f.close()

page:  {'Header 2': 'Meta Earnings Presentation Q3 2024'}
response:  The image presents a financial report from Meta, detailing its quarterly earnings for the third quarter (Q3) of 2024.

**Header Section**

*   The top section features the company's logo on the left side, accompanied by the title "Meta" in large font.
*   Below the title, the text "Earnings Presentation Q3 2024" is displayed.

**Financial Data**

The report provides detailed financial information for the third quarter of 2024, including:

*   **Revenue**: $34.4 billion
    *   Represented as a bar graph with a blue line indicating growth.
    *   A slight decline in revenue compared to Q3 2023 ($35.8 billion).
*   **Net Income**: $9.2 billion
    *   Shown as a bar graph with a red line indicating a decrease.
    *   Decrease of approximately $1.6 billion compared to Q3 2023 ($10.8 billion).
*   **Operating Expenses**: $15.7 billion (up from $14.4 billion in Q3 2023)
    *   Represented as a bar graph with a green lin

ResponseError: vision model only supports a single image per message