In [None]:
# References

# https://docling-project.github.io/docling/examples/minimal/
# https://docling-project.github.io/docling/reference/docling_document/#docling_core.types.doc
# https://github.com/casedone/rag-multimodal

In [None]:
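# Required Libraries

# pip install docling
# pip install tokencost
# pip install pandas tabulate          (tabulate is required by DataFrame.to_markdown)
# pip install langchain langchain-community langchain-openai faiss-cpu
# pip install python-dotenv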

In [None]:
import os
import time
import pandas as pd

from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (PdfPipelineOptions, PictureDescriptionApiOptions)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode
from langchain.schema import Document
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
from dotenv import load_dotenv

# Load settings from a .env file (if one is present) into the process environment;
# the Azure OpenAI endpoint and API key are read from environment variables below.
load_dotenv()

In [None]:
# PDF to convert; this value is passed to doc_converter.convert() in the conversion cell.
# Alternative input (currently disabled):
# source = "https://arxiv.org/pdf/2408.09869"
source = "https://emcdevstoragev2.blob.core.windows.net/public/efba9f0b-70cc-4dab-b6b7-5812a22c0c37.pdf"

In [None]:
# Options for describing each extracted picture through a remote API endpoint
# (an Azure OpenAI GPT-4o deployment, judging by the settings used here).
#
# os.environ[...] is used instead of os.getenv(...): a missing variable then fails
# right here with a KeyError that names it, instead of handing None to the options
# object as the URL or the api-key header.
picture_desc_api_option = PictureDescriptionApiOptions(
    url=os.environ["AZURE_OPENAI_GPT_4O_FULL_ENDPOINT"],
    prompt="Describe this image in sentences in a single paragraph.",
    # Request parameters forwarded with every description call.
    params={
        "model": "gpt-4o",
        "max_tokens": 200,
        "temperature": 0.5,
    },
    headers={
        "api-key": os.environ["AZURE_OPENAI_API_KEY"],
    },
    timeout=90,  # presumably seconds per request — confirm against the Docling docs
)

In [None]:
# PDF pipeline configuration: parsing without OCR (so EasyOCR is not used), with
# table structure recovery and API-based picture descriptions switched on.
#
# About do_ocr: when it is True, Docling runs Optical Character Recognition to pull
# text out of images and scanned pages; when it is False, only selectable/digital
# text is extracted. Enable it for scanned or image-based PDFs whose text cannot be
# read directly.
pipeline_options = PdfPipelineOptions()

# Text and tables.
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Pictures: allow remote calls, keep the picture images, and describe each one
# using the API options configured in the previous cell.
pipeline_options.enable_remote_services = True
pipeline_options.generate_picture_images = True
pipeline_options.picture_description_options = picture_desc_api_option
pipeline_options.do_picture_description = True

# Converter that applies the options above to PDF inputs.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

In [None]:
# Convert the source document and report how long it took.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it is not affected by system clock adjustments.
start_time = time.perf_counter()
conv_result = doc_converter.convert(source)
elapsed_seconds = time.perf_counter() - start_time  # a duration, not a timestamp

print(f"Document converted in {elapsed_seconds:.2f} seconds.")

In [None]:
# Markdown export

mark_down = conv_result.document.export_to_markdown(
    page_break_placeholder="--- PAGE BREAK ---", 
    image_mode=ImageRefMode.PLACEHOLDER
)

In [None]:
# Split the exported Markdown on the page-break sentinel and wrap each page in a
# LangChain Document whose metadata carries the 1-based page number.
page_split = mark_down.split("--- PAGE BREAK ---")

# Pages that are empty after stripping are skipped: they contain nothing to retrieve,
# and an empty string is not a valid input for the embedding step that follows.
# Numbering comes from the position in page_split, so the remaining documents keep
# their original page numbers even when a blank page is dropped.
documents: list[Document] = [
    Document(page_content=page_text, metadata={"page": page_number})
    for page_number, page_text in enumerate(
        (page.strip() for page in page_split), start=1
    )
    if page_text
]

In [None]:
# Workaround for the MKL duplicate-library error: set the flag before the vector
# store is built.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Embedding client. No endpoint or key is passed here — presumably they are picked
# up from the environment variables loaded earlier; confirm against your .env file.
azOpenAIembeddings = AzureOpenAIEmbeddings(
    api_version="2023-05-15",
    model="text-embedding-ada-002",
)

# Embed every page document and index the resulting vectors in FAISS.
vectorstore = FAISS.from_documents(documents, azOpenAIembeddings)

In [None]:
# Retrieve up to 10 page documents for the query, each paired with a relevance score.
query = "what securities azure provides?"
relevant_docs = vectorstore.similarity_search_with_relevance_scores(query, k=10)

# Print every hit with its page number (from the document metadata), its score and
# the page text, followed by a separator line.
for doc, score in relevant_docs:
    print(f"Relevant Document {doc.metadata['page']} (Score: {score}):\n\n{doc.page_content}\n")
    print('--- END OF RELEVANT DOCUMENT ---')

### Docling table export POC

In [None]:
# Table export: print every table found in the converted document as a Markdown
# table, under a heading that carries its 0-based index.
for table_ix, table in enumerate(conv_result.document.tables):
    table_df: pd.DataFrame = table.export_to_dataframe()
    # One print call with a newline separator produces the same two output lines.
    print(f"## Table {table_ix}", table_df.to_markdown(), sep="\n")

### Get total token & cost POC

In [None]:
from tokencost import calculate_prompt_cost, count_string_tokens

def get_token_cost(text: str, model_name: str) -> dict:
    """Count the tokens in ``text`` and price it as a prompt for ``model_name``.

    Args:
        text: Prompt text to measure.
        model_name: Model identifier, forwarded to tokencost as ``model``.

    Returns:
        A dict with two keys:
          - ``"tokens"``: the result of ``count_string_tokens(text, model=model_name)``.
          - ``"cost"``: the result of ``calculate_prompt_cost(text, model=model_name)``
            (units are defined by tokencost — presumably USD; confirm in its docs).
    """
    return {
        "tokens": count_string_tokens(text, model=model_name),
        "cost": calculate_prompt_cost(text, model=model_name),
    }