### Enrich Markdown with Image Explanations

In [None]:
# Suppress every Python warning for the rest of the session; this must run
# before the imports below so that warnings they emit are hidden as well.
import warnings
warnings.filterwarnings("ignore")

from dotenv import load_dotenv

# Load environment variables from the file at ../.env (resolved against the
# current working directory).
# NOTE(review): presumably this provides the API key used by ChatOpenAI - confirm.
load_dotenv('../.env')

In [None]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama
import base64

# Chat model that get_image_description() invokes with the text and images.
model = ChatOpenAI(model="gpt-4o-mini")
# Alternative: a local model served by Ollama (see the single-image note in
# get_image_description before switching).
# model = ChatOllama(model="llama3.2-vision", base_url="http://localhost:11434")

# System prompt sent with every request. It is written for financial documents:
# it asks for header/footer details, all financial figures, technical
# explanations and a summary of regulatory or legal information.
system_message = SystemMessage("""
                Extract detailed financial information from the provided image. Start by identifying the company name, document title, and any relevant details from the header and footer.

                Ensure to:
                    - Thoroughly extract all financial figures and metrics mentioned, such as revenue, profit, assets, liabilities, etc.
                    - Explain the financial data with technical details, including any relevant financial terminology or calculation methods.
                    - Summarize any regulatory or legal information provided in the document.
                
                Provide a complete and detailed description of the image, ensuring no important data is missed.
                       """)

def get_image_description(image_data_path, image_urls, cleaned_content):
    """Ask the chat model to describe the images referenced by a markdown section.

    Each image is read from disk, base64-encoded and sent as a data URL in a
    single human message, together with the section text as context.

    Args:
        image_data_path: Directory that the entries of ``image_urls`` are
            relative to.
        image_urls: Image references as they appear in the markdown. They may
            be percent-encoded (e.g. ``%5C`` for a backslash, ``%20`` for a
            space) and may use Windows path separators.
        cleaned_content: Section text with the image references removed; it is
            embedded in the prompt as reference content.

    Returns:
        The ``content`` attribute of the model response.

    Raises:
        OSError: If a referenced image file cannot be opened.
    """
    # Imported locally so the function does not depend on an extra
    # notebook-level import.
    from urllib.parse import unquote

    images_data = []
    for url in image_urls:
        # Decode *all* percent-escapes (not only "%5C") and normalise Windows
        # separators so the reference resolves to a real file path.
        image_path = f"{image_data_path}/{unquote(url)}".replace("\\", "/")

        with open(image_path, "rb") as f:
            img_base64 = base64.b64encode(f.read()).decode("utf-8")

        # The data URL needs a MIME subtype; ".jpg" files are "image/jpeg".
        extension = image_path.rsplit(".", 1)[-1].lower()
        mime_subtype = "jpeg" if extension == "jpg" else extension

        images_data.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/{mime_subtype};base64,{img_base64}"},
            }
        )

    text = f"""Here is some reference content for the image. You need to ensure the generated image description fits into the given context.
                Do not write any preamble or explanation other than asked in the task described.

                ### Content to Get The Idea What This Image Is About:
                {cleaned_content}

                Generate a detailed description of the image. 
                Ensure that the description is comprehensive and no important data is missed.
                ### Image Description:"""

    text_message = {"type": "text", "text": text}

    # One human message carrying the prompt text followed by every image.
    message = HumanMessage(content=[text_message] + images_data)

    # NOTE: with the Ollama llama3.2-vision model this fails with
    # "ResponseError: vision model only supports a single image per message".
    # In that case send only the first image:
    # message = HumanMessage(content=[text_message, images_data[0]])

    response = model.invoke([system_message, message])

    return response.content

### Docling
- 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
- 📑 Advanced PDF document understanding including page layout, reading order & table structures
- 🧩 Unified, expressive DoclingDocument representation format
- 🤖 Plug-and-play integrations incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
- 🔍 OCR support for scanned PDFs
- 💻 Simple and convenient CLI

In [None]:
# https://github.com/DS4SD/docling

In [None]:
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

# Sample inputs; exactly one FILE_PATH is active, the others are kept as
# commented-out alternatives.
# FILE_PATH = r"..\00 Dataset\docs\facebook\Meta-09-30-2024-Exhibit-99-1_FINAL.pdf"
FILE_PATH = r"..\00 Dataset\docs\facebook\Downloadable-BS-Q3-24.xlsx"
# FILE_PATH = "https://pdfobject.com/pdf/sample.pdf"

# get markdown of the files which don't have any figure
def get_markdown_without_figure(input_doc_path, target_dir):
    """Convert a document to markdown with DoclingLoader and save it to disk.

    Only the ``page_content`` of the first loaded document is written; no
    images are exported, so this is meant for files without figures.

    Args:
        input_doc_path: Path of the document handed to ``DoclingLoader``.
        target_dir: Directory that receives the markdown file. It is created
            if it does not exist yet.

    Returns:
        The path of the written file as a string, in the form
        ``"{target_dir}/{stem}-with-image-refs.md"`` where ``stem`` is the
        input file name without its extension.
    """
    # Imported locally: this function is defined (and called) before the
    # notebook-level ``from pathlib import Path`` has been executed.
    from pathlib import Path

    loader = DoclingLoader(file_path=input_doc_path,
                           export_type=ExportType.MARKDOWN)
    docs = loader.load()

    # File name of the input document without its extension.
    doc_filename = Path(input_doc_path).stem
    md_filename = f"{target_dir}/{doc_filename}-with-image-refs.md"

    # Make sure the destination exists; open() does not create directories.
    Path(target_dir).mkdir(parents=True, exist_ok=True)

    # The with-statement closes the file, no explicit close() is required.
    with open(md_filename, "w", encoding="utf-8") as f:
        f.write(docs[0].page_content)

    return md_filename


# Convert the sample earnings presentation; the markdown file is written
# into the "output" directory.
input_doc_path = r"..\00 Dataset\docs\facebook\Earnings-Presentation-Q3-2024.pdf"
md_filename = get_markdown_without_figure(input_doc_path, 'output')

# Bare expression: in a notebook this displays the path of the generated file.
md_filename
# docs

### PDF to Markdown with Images and Tables

In [None]:
import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat

# Logger named after the current module.
_log = logging.getLogger(__name__)
# Value assigned to PdfPipelineOptions.images_scale in get_pdf_markdown().
IMAGE_RESOLUTION_SCALE = 2.0

def get_pdf_markdown(input_doc_path, target_dir="output", output_dir="scratch"):
    """Convert a PDF to markdown and export its page, table and picture images.

    Args:
        input_doc_path: Path of the PDF passed to ``DocumentConverter.convert``.
        target_dir: Directory that receives the markdown file. Defaults to
            ``"output"`` so the function can be called with the document path
            only. Created if missing.
        output_dir: Directory that receives the PNG files of pages, tables and
            pictures. Defaults to ``"scratch"``. Created if missing.

    Returns:
        The markdown file path as a string, in the form
        ``"{target_dir}/{stem}-with-image-refs.md"``.
    """
    logging.basicConfig(level=logging.INFO)

    output_dir = Path(output_dir)

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 corresponds to a standard 72 DPI image.
    # The PdfPipelineOptions.generate_* flags select the document elements which will be enriched
    # with the image field.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images; the page number comes from the page object itself.
    for page in conv_res.document.pages.values():
        page_image_filename = output_dir / f"{doc_filename}-{page.page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables, numbered independently per kind.
    counters = {"table": 0, "picture": 0}
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            kind = "table"
        elif isinstance(element, PictureItem):
            kind = "picture"
        else:
            continue

        counters[kind] += 1
        element_image_filename = (
            output_dir / f"{doc_filename}-{kind}-{counters[kind]}.png"
        )
        with element_image_filename.open("wb") as fp:
            element.get_image(conv_res.document).save(fp, "PNG")

    # Save markdown with externally referenced pictures
    Path(target_dir).mkdir(parents=True, exist_ok=True)
    md_filename = f"{target_dir}/{doc_filename}-with-image-refs.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

    return md_filename

# Convert the sample earnings presentation; the markdown file is written
# into the "output" directory.
input_doc_path = r"..\00 Dataset\docs\facebook\Earnings-Presentation-Q3-2024.pdf"
md_filename = get_pdf_markdown(input_doc_path, target_dir="output")
# Bare expression: in a notebook this displays the path of the generated file.
md_filename

### Markdown Splitters

In [6]:
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter

# md_filename = r"scratch\Earnings-Presentation-Q3-2024-with-image-refs.md"

def get_markdown_splits(md_filename):
    """Read a markdown file and split it into sections at its headers.

    The text is split on ``#``, ``##`` and ``###`` headers, recorded under the
    metadata keys ``Header 1`` to ``Header 3``. Header lines are kept in the
    section text (``strip_headers=False``).

    Args:
        md_filename: Path of the UTF-8 encoded markdown file.

    Returns:
        The result of ``MarkdownHeaderTextSplitter.split_text`` for the file.
    """
    with open(md_filename, "r", encoding="utf-8") as md_file:
        raw_markdown = md_file.read()

    # ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")
    header_levels = [("#" * depth, f"Header {depth}") for depth in (1, 2, 3)]
    splitter = MarkdownHeaderTextSplitter(header_levels, strip_headers=False)

    return splitter.split_text(raw_markdown)

# Split the markdown generated by the previous conversion cell into header sections.
md_header_splits = get_markdown_splits(md_filename)

In [7]:
import re

def extract_image_urls_and_clean_content(page_content):
    """Separate ``![Image](...)`` references from the surrounding text.

    Only markdown images whose alt text is exactly ``Image`` are recognised.

    Args:
        page_content: Markdown text of one section.

    Returns:
        A tuple ``(image_urls, cleaned_content)``: the targets of all matched
        image references in order of appearance, and the text with those
        references removed.
    """
    image_ref = re.compile(r"!\[Image\]\(([^)]+)\)")
    return image_ref.findall(page_content), image_ref.sub("", page_content)

In [None]:
# Inspection cell: print the image references found in every header section.
for page in md_header_splits:
    # print(f"Header: {page.metadata}")
    # print(f"Content: {page.page_content}")
    # Extract image URLs from the page content
    image_urls, cleaned_content = extract_image_urls_and_clean_content(page.page_content)
    print(f"Image URLs: {image_urls}")
    # print(f"Cleaned Content: {cleaned_content}")
    # print("\n")

# md_header_splits

### Enrich Markdown with Image Explanations

In [None]:
def enrich_document_with_image(md_header_splits, image_data_path="scratch"):
    """Replace image references in each section with a generated description.

    For every section the ``![Image](...)`` references are removed from the
    text. If the section referenced at least one image, the model-generated
    description is appended under an "Extracted Image Description" label.
    Sections without images are kept as plain text and the model is not
    called for them.

    Args:
        md_header_splits: Iterable of section objects exposing a
            ``page_content`` string (as returned by ``get_markdown_splits``).
        image_data_path: Directory that the image references are resolved
            against. Defaults to ``"scratch"``.
            NOTE(review): this should be the directory the markdown's image
            links are relative to - confirm it matches where the markdown
            and its images were written.

    Returns:
        All processed sections joined by blank lines, as a single string.
    """
    documents = []
    for page in md_header_splits:
        image_urls, cleaned_content = extract_image_urls_and_clean_content(page.page_content)

        if not image_urls:
            # Nothing to describe: avoid a model call that would be asked to
            # describe an image that was never sent.
            documents.append(cleaned_content)
            continue

        image_description = get_image_description(image_data_path, image_urls, cleaned_content)

        documents.append(
            cleaned_content + "\n\nExtracted Image Description:\n" + image_description
        )

    return "\n\n".join(documents)

# Generate image descriptions for the split document and persist the result.
enriched_content = enrich_document_with_image(md_header_splits)

# The with-statement closes the file; no explicit close() is needed.
with open("enriched_content.md", "w", encoding="utf-8") as f:
    f.write(enriched_content)

### Reading Entire Directory

In [None]:
# End-to-end run for one PDF: convert -> split by headers -> enrich -> save.
input_doc_path = r"..\00 Dataset\docs\facebook\Earnings-Presentation-Q3-2024.pdf"
# target_dir is passed explicitly, as in the earlier conversion cell.
md_filename = get_pdf_markdown(input_doc_path, target_dir="output")

md_header_splits = get_markdown_splits(md_filename)

enriched_content = enrich_document_with_image(md_header_splits)

# The with-statement closes the file; no explicit close() is needed.
with open("enriched_content.md", "w", encoding="utf-8") as f:
    f.write(enriched_content)