In [1]:
import langchain
print(langchain.__version__)

0.3.13


In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

True

###Call LLM

In [3]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from typing import List
from langchain_core.documents import Document

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=32,
    length_function=len
)

In [4]:
from llama_parse import LlamaParse
from langchain_community.document_loaders import UnstructuredMarkdownLoader

instruction = """The provided document is a PDF file containing structured and unstructured content.
It may include financial information, tables, management discussions, and analyses.
Try to capture the essence of the document, including text, tables, and key highlights.
Be precise and ensure data integrity while processing."""

async def parse_pdf(file_path: str):
  parser = LlamaParse(
      result_type="markdown",
      parsing_instruction=instruction,
      max_timeout=5000,
  )
  return await parser.aload_data(file_path)

async def load_and_combine_documents(folder_path: str, output_file: str):
  combined_content = ""
  for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if filename.endswith('.pdf'):
        print(f"Parsing {filename}...")
        parsed_data = await parse_pdf(file_path)
        combined_content += f"# Document: {filename}\n\n{parsed_data}\n\n"
    else:
        print(f"Unsupported file type: {filename}")
  with open(output_file, "w", encoding="utf-8") as md_file:
      md_file.write(combined_content)
  print(f"All documents combined into {output_file}")

def read_markdown_with_loader(file_path: str):
  loader = UnstructuredMarkdownLoader(file_path)
  documents = loader.load()
  return documents

In [5]:
folder_path = "./dataset"
output_file = "./dataset/combined_documents.md"
await load_and_combine_documents(folder_path, output_file)

Parsing ColPali_2407.01449v3.pdf...
Started parsing the file under job_id efadb68a-cc72-4f97-8c49-a9cf299b9954
All documents combined into ./dataset/combined_documents.md


In [6]:
documents = read_markdown_with_loader(output_file)

In [7]:
splits = text_splitter.split_documents(documents)

###Create and persist Chroma vector store

In [8]:
from langchain_chroma import Chroma

embedding_function = OpenAIEmbeddings()
collection_name = "rag_service_collection_nb_llama_parse"
vectorstore = Chroma.from_documents(collection_name=collection_name, documents=splits, embedding=embedding_function, persist_directory="./chroma_db")
#db.persist()

print("Vector store created and persisted to './chroma_db'")

Vector store created and persisted to './chroma_db'


In [9]:
query = "How to understand documents visually?"

In [10]:
# 5. Perform similarity search
search_results = vectorstore.similarity_search(query, k=5)

print(f"\nTop 2 most relevant chunks for the query: '{query}'\n")
for i, result in enumerate(search_results, 1):
    print(f"Result {i}:")
    print(f"Source: {result.metadata.get('source', 'Unknown')}")
    print(f"Content: {result.page_content}")
    print()


Top 2 most relevant chunks for the query: 'How to understand documents visually?'

Result 1:
Source: ./dataset/combined_documents.md
Content: Visually Rich Document Understanding:\n- Models that encode text alongside visual features have been developed.\n- Large Language Models (LLMs) are combined with Vision Transformers (ViTs) to enhance understanding.\n\n3. PaliGemma Model:\n- A model that integrates visual and textual embeddings, fine-tuned for enhanced performance in tasks like Visual Question Answering and document understanding.\n\n### ViDoRe Benchmark\n- Purpose: To evaluate retrieval systems' ability to match queries to relevant

Result 2:
Source: ./dataset/combined_documents.md
Content: considers visual elements.\n- ColPali Model: A novel architecture that leverages Vision Language Models for better document understanding and retrieval efficiency.\n\n----\n\nFigure 1 Description:\nThe figure illustrates how ColPali identifies relevant document image patches in response to us

In [11]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
retriever.invoke(query)

[Document(metadata={'source': './dataset/combined_documents.md'}, page_content="Visually Rich Document Understanding:\\n- Models that encode text alongside visual features have been developed.\\n- Large Language Models (LLMs) are combined with Vision Transformers (ViTs) to enhance understanding.\\n\\n3. PaliGemma Model:\\n- A model that integrates visual and textual embeddings, fine-tuned for enhanced performance in tasks like Visual Question Answering and document understanding.\\n\\n### ViDoRe Benchmark\\n- Purpose: To evaluate retrieval systems' ability to match queries to relevant"),
 Document(metadata={'source': './dataset/combined_documents.md'}, page_content='considers visual elements.\\n- ColPali Model: A novel architecture that leverages Vision Language Models for better document understanding and retrieval efficiency.\\n\\n----\\n\\nFigure 1 Description:\\nThe figure illustrates how ColPali identifies relevant document image patches in response to user queries, highlighting t

In [12]:
from langchain_core.prompts import ChatPromptTemplate
template = """You are a highly capable assistant specializing in answering questions from visually rich documents. Consider both textual and visual elements as context.

Given the context below:
{context}

And the question:
{question}

Provide a precise and concise answer based solely on the provided context. Do not include any information that is not explicitly present in the context.

Answer:"""
prompt = ChatPromptTemplate.from_template(template)


In [13]:
from langchain.schema.runnable import RunnablePassthrough
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} | prompt
)
rag_chain.invoke(query)

ChatPromptValue(messages=[HumanMessage(content='You are a highly capable assistant specializing in answering questions from visually rich documents. Consider both textual and visual elements as context.\n\nGiven the context below:\n[Document(metadata={\'source\': \'./dataset/combined_documents.md\'}, page_content="Visually Rich Document Understanding:\\\\n- Models that encode text alongside visual features have been developed.\\\\n- Large Language Models (LLMs) are combined with Vision Transformers (ViTs) to enhance understanding.\\\\n\\\\n3. PaliGemma Model:\\\\n- A model that integrates visual and textual embeddings, fine-tuned for enhanced performance in tasks like Visual Question Answering and document understanding.\\\\n\\\\n### ViDoRe Benchmark\\\\n- Purpose: To evaluate retrieval systems\' ability to match queries to relevant"), Document(metadata={\'source\': \'./dataset/combined_documents.md\'}, page_content=\'considers visual elements.\\\\n- ColPali Model: A novel architecture

In [14]:
def docs2str(docs):
  return "\n\n".join(doc.page_content for doc in docs)

In [15]:
rag_chain = (
  {"context": retriever | docs2str, "question": RunnablePassthrough()} | prompt
)
rag_chain.invoke(query)

ChatPromptValue(messages=[HumanMessage(content="You are a highly capable assistant specializing in answering questions from visually rich documents. Consider both textual and visual elements as context.\n\nGiven the context below:\nVisually Rich Document Understanding:\\n- Models that encode text alongside visual features have been developed.\\n- Large Language Models (LLMs) are combined with Vision Transformers (ViTs) to enhance understanding.\\n\\n3. PaliGemma Model:\\n- A model that integrates visual and textual embeddings, fine-tuned for enhanced performance in tasks like Visual Question Answering and document understanding.\\n\\n### ViDoRe Benchmark\\n- Purpose: To evaluate retrieval systems' ability to match queries to relevant\n\nconsiders visual elements.\\n- ColPali Model: A novel architecture that leverages Vision Language Models for better document understanding and retrieval efficiency.\\n\\n----\\n\\nFigure 1 Description:\\nThe figure illustrates how ColPali identifies rel

In [16]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

llm = ChatOpenAI(model="gpt-4o-mini")

In [17]:
rag_chain = (
    {"context": retriever | docs2str, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
question = query
response = rag_chain.invoke(question)
print(response)

Understanding documents visually involves integrating visual features with textual information to enhance retrieval systems. This is achieved through models like the PaliGemma and ColPali, which utilize visual and textual embeddings to improve performance in tasks such as Visual Question Answering and document understanding. Techniques such as Optical Character Recognition (OCR) and captioning are employed to extract and process visual elements, enabling systems to identify relevant document image patches in response to queries. The ViDoRe benchmark serves as a framework for evaluating the effectiveness of these multimodal retrieval systems.
