In [1]:
import langchain
print(langchain.__version__)

0.3.13


In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

True

###Call LLM

In [3]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from typing import List
from langchain_core.documents import Document

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
    length_function=len
)

In [4]:
from llama_parse import LlamaParse
from langchain_community.document_loaders import UnstructuredMarkdownLoader

instruction = """The provided document is a PDF file containing structured and unstructured content.
It may include financial information, tables, management discussions, and analyses.
Try to capture the essence of the document, including text, tables, and key highlights.
Be precise and ensure data integrity while processing."""

async def parse_pdf(file_path: str):
  parser = LlamaParse(
      result_type="markdown",
      parsing_instruction=instruction,
      max_timeout=5000,
  )
  return await parser.aload_data(file_path)

async def load_and_combine_documents(folder_path: str, output_file: str):
  combined_content = ""
  for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if filename.endswith('.pdf'):
        print(f"Parsing {filename}...")
        parsed_data = await parse_pdf(file_path)
        combined_content += f"# Document: {filename}\n\n{parsed_data}\n\n"
    else:
        print(f"Unsupported file type: {filename}")
  with open(output_file, "w", encoding="utf-8") as md_file:
      md_file.write(combined_content)
  print(f"All documents combined into {output_file}")

def read_markdown_with_loader(file_path: str):
  loader = UnstructuredMarkdownLoader(file_path)
  documents = loader.load()
  return documents

In [5]:
from langchain.retrievers import ContextualCompressionRetriever, BM25Retriever, EnsembleRetriever
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.document_transformers.embeddings_redundant_filter import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import FlashrankRerank
from rank_bm25 import BM25Okapi

def create_bm25_index(chunks: List[Document]) -> BM25Okapi:
  tokenized_chunks = [chunk.page_content.split() for chunk in chunks]
  return BM25Okapi(tokenized_chunks)

def create_bm25_retriever(chunks: List[Document]) -> BM25Retriever:
  bm25_retriever = BM25Retriever.from_documents(chunks)
  return bm25_retriever

def create_flashrank_index(vectorstore):
  retriever = vectorstore.as_retriever(search_kwargs={"k":20})
  compression_retriever = ContextualCompressionRetriever(base_compressor=FlashrankRerank(), base_retriever=retriever)
  return compression_retriever

def create_ensemble_retriever_reranker(vectorstore, bm25_retriever, embeddings) -> EnsembleRetriever:
  retriever_vs = vectorstore.as_retriever(search_kwargs={"k":20})
  bm25_retriever.k =10
  ensemble_retriever = EnsembleRetriever(
      retrievers=[retriever_vs, bm25_retriever],
      weights=[0.5, 0.5]
  )
  redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)

  reranker = FlashrankRerank()
  pipeline_compressor = DocumentCompressorPipeline(transformers=[redundant_filter, reranker])

  compression_pipeline = ContextualCompressionRetriever(base_compressor=pipeline_compressor,
                                                base_retriever=ensemble_retriever)
  return compression_pipeline

In [6]:
folder_path = "./dataset"
output_file = "./dataset/combined_documents.md"
await load_and_combine_documents(folder_path, output_file)

Unsupported file type: combined_documents.md
Parsing ColPali_2407.01449v3.pdf...
Error while parsing the file './dataset/ColPali_2407.01449v3.pdf': Failed to parse the file: {"detail":"Oops! Something went wrong on our end: Internal Server Error. Please try again in a few minutes. If the problem persists, please contact support by clicking the chat icon on cloud.llamaindex.ai providing this correlation ID: 1f123061-9c27-4526-b386-4ec866de72ea"}
All documents combined into ./dataset/combined_documents.md


In [7]:
documents = read_markdown_with_loader(output_file)

In [8]:
splits = text_splitter.split_documents(documents)

###Create and persist Chroma vector store

In [9]:
from langchain_chroma import Chroma

embedding_function = OpenAIEmbeddings()
collection_name = "rag_service_collection_nb_llama_parse"
vectorstore = Chroma.from_documents(collection_name=collection_name, documents=splits, embedding=embedding_function, persist_directory="./chroma_db")
#db.persist()

print("Vector store created and persisted to './chroma_db'")

Vector store created and persisted to './chroma_db'


In [10]:
query = "How to understand documents visually?"

In [11]:
# # 5. Perform similarity search
# search_results = vectorstore.similarity_search(query, k=5)

# print(f"\nTop 2 most relevant chunks for the query: '{query}'\n")
# for i, result in enumerate(search_results, 1):
#     print(f"Result {i}:")
#     print(f"Source: {result.metadata.get('source', 'Unknown')}")
#     print(f"Content: {result.page_content}")
#     print()

In [12]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
retriever.invoke(query)

[Document(metadata={'source': './dataset/combined_documents.md'}, page_content="for text understanding.\\n\\n2. Visually Rich Document Understanding:\\n- Models that encode text alongside visual features have been developed.\\n- Large Language Models (LLMs) are combined with Vision Transformers (ViTs) to enhance understanding.\\n\\n3. PaliGemma Model:\\n- A model that integrates visual and textual embeddings, fine-tuned for enhanced performance in tasks like Visual Question Answering and document understanding.\\n\\n### ViDoRe Benchmark\\n- Purpose: To evaluate retrieval systems' ability to match queries to relevant documents at the page level, addressing the gap in existing benchmarks that focus on either natural images or textual passages.\\n- Design: Includes various modalities (text, figures, tables) and thematic domains (e.g., medical).\\n\\nThis document emphasizes the importance of integrating visual features into retrieval systems to improve performance and better mimic human u

In [13]:
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import FlashrankRerank

# compressor = FlashrankRerank(model="ms-marco-MiniLM-L-12-v2")
# compression_retriever = ContextualCompressionRetriever(
#     base_compressor=compressor, base_retriever=retriever
# )

In [14]:
original_bm25_index = create_bm25_index(splits)
original_reranker = create_flashrank_index(vectorstore)

In [15]:
bm25_retriever_original = create_bm25_retriever(splits)
original_ensemble_retriever_reranker = create_ensemble_retriever_reranker(vectorstore, bm25_retriever_original, embedding_function)

In [16]:
from langchain_core.prompts import ChatPromptTemplate
template = """You are a highly capable assistant specializing in answering questions from visually rich documents. Consider both textual and visual elements as context.

Given the context below:
{context}

And the question:
{question}

Provide a precise and concise answer based solely on the provided context. Do not include any information that is not explicitly present in the context.

Answer:"""
prompt = ChatPromptTemplate.from_template(template)


In [17]:
# from langchain.schema.runnable import RunnablePassthrough
# rag_chain = (
#     {"context": retriever, "question": RunnablePassthrough()} | prompt
# )
# rag_chain.invoke(query)

In [18]:
def docs2str(docs):
  return "\n\n".join(doc.page_content for doc in docs)

In [19]:
# rag_chain = (
#   {"context": retriever | docs2str, "question": RunnablePassthrough()} | prompt
# )
# rag_chain.invoke(query)

In [20]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

llm = ChatOpenAI(model="gpt-4o-mini")

In [21]:
from langchain.schema.runnable import RunnablePassthrough
rag_chain = (
    {"context": original_ensemble_retriever_reranker | docs2str, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
question = query
response = rag_chain.invoke(question)
print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown t

To understand documents visually, it is important to integrate visual features alongside text. This can be achieved by using models that encode both text and visual elements, such as combining Large Language Models (LLMs) with Vision Transformers (ViTs). The PaliGemma model, for instance, integrates visual and textual embeddings to enhance tasks like Visual Question Answering and document understanding. Additionally, benchmarks like ViDoRe are designed to evaluate retrieval systems' ability to match queries to relevant documents at the page level, emphasizing the importance of processing visual components like figures and tables.
