In this notebook, I leveraged Docling for high-fidelity PDF parsing, specifically to ensure the structural integrity of complex tables by converting them into Markdown. The processed documents were then partitioned using LlamaIndex’s chunking strategies, with Gemini embeddings persisted in a local ChromaDB instance. I implemented and evaluated two retrieval pipelines: Hybrid Search and Rerank-optimized retrieval. Benchmarking results on specific financial queries (e.g., 'R&D expense in 2025Q3') demonstrated that both methods achieved consistent and accurate performance.

In [None]:
from docling.document_converter import DocumentConverter
from pathlib import Path

source = Path("../data/goog_2025Q3.pdf")
converter = DocumentConverter()
result = converter.convert(source)
# print(result.document.export_to_markdown())

2025-12-24 15:20:38,879 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-24 15:20:39,012 - INFO - Going to convert document batch...
2025-12-24 15:20:39,018 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2025-12-24 15:20:39,031 - INFO - Loading plugin 'docling_defaults'
2025-12-24 15:20:39,035 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-12-24 15:20:39,048 - INFO - Loading plugin 'docling_defaults'
2025-12-24 15:20:39,057 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-12-24 15:20:40,061 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-12-24 15:20:40,086 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-24 15:20:40,103 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\jyflo\miniforge3\envs\fr_rag_side\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-24 15:20:

In [3]:
from llama_index.core import Document

content_md = result.document.export_to_markdown()
file_name = result.input.file.name

meta_data = {
    "file_name": file_name,
    "company": "Alphabet", 
    "year": "2025",
    "quarter": "Q3"
}
doc = Document(text=content_md, metadata=meta_data)

In [None]:
# Initialize Chromadb and turn into LlamaIndex Vector Store
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

db = chromadb.PersistentClient(path="../chroma_db")
chroma_collection = db.get_or_create_collection("financial_reports")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
import os
from dotenv import load_dotenv

load_dotenv(override=True)
llm = GoogleGenAI(
    model="gemini-2.5-flash",
    api_key=os.environ.get("GOOGLE_API_KEY")
)


In [12]:

# Generate Embedding and store into local chromadb

embed_model = GoogleGenAIEmbedding(model_name="models/text-embedding-004")

node_parser = MarkdownElementNodeParser(
    llm=llm, 
    num_workers=4  
)

nodes = node_parser.get_nodes_from_documents([doc])

index = VectorStoreIndex(
    nodes, 
    embed_model=embed_model,
    storage_context=storage_context,
    show_progress=True
)

print(f"Successfully stored {len(nodes)} of nodes from {file_name} in ChromaDB.")

64it [00:00, 62674.63it/s]
2025-12-24 15:50:56,123 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:50:56,130 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:50:56,133 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:50:56,136 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:51:01,911 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:51:02,293 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:51:02,638 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:51:02,838 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:51:06,841 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:51:08,806 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:51:09,462 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:51:10,371 - INFO - AFC is enabled with max remote calls: 10.
2025-12-24 15:51:10,420 - INFO - AFC is enabled with max remote calls: 10

Generating embeddings:   0%|          | 0/202 [00:00<?, ?it/s]

2025-12-24 15:52:53,762 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
2025-12-24 15:52:54,213 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
2025-12-24 15:52:54,570 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
2025-12-24 15:52:54,924 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
2025-12-24 15:52:55,340 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
2025-12-24 15:52:55,668 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
2025-12-24 15:52

Successfully stored 202 of nodes from goog_2025Q3.pdf in ChromaDB.


In [15]:
# Check if the embedding is correctly stored

print(f"List all the collections: {db.list_collections()}")
collection = db.get_collection("financial_reports")
count = collection.count()
print(f"The number of collections: {count}")
assert count == len(nodes)

if count > 0:
    sample = collection.get(limit=2, include=['embeddings', 'documents', 'metadatas'])
    print(f"The sample record id: {sample['ids']}")
    print(f"The sample meta data: {sample['metadatas'][0]}")

    if sample['embeddings'] is not None:
        print(f"Embedding checked! The dimension is {len(sample['embeddings'][0])}")


List all the collections: [Collection(name=financial_reports)]
The number of collections: 202
The sample record id: ['e2e0b256-d602-4898-a6d1-9014b6844c78', '9f1ca5b6-0790-424f-9b13-b440d24cefdb']
The sample meta data: {'year': '2025', '_node_type': 'TextNode', 'document_id': 'ace18e95-631c-454e-af19-3aa4d570974f', 'quarter': 'Q3', 'company': 'Alphabet', 'ref_doc_id': 'ace18e95-631c-454e-af19-3aa4d570974f', 'doc_id': 'ace18e95-631c-454e-af19-3aa4d570974f', 'file_name': 'goog_2025Q3.pdf', '_node_content': '{"id_": "e2e0b256-d602-4898-a6d1-9014b6844c78", "embedding": null, "metadata": {"file_name": "goog_2025Q3.pdf", "company": "Alphabet", "year": "2025", "quarter": "Q3"}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "ace18e95-631c-454e-af19-3aa4d570974f", "node_type": "4", "metadata": {"file_name": "goog_2025Q3.pdf", "company": "Alphabet", "year": "2025", "quarter": "Q3"}, "hash": "bca5990f068a244c9428202bb7a1eb6b90403664ea53d1

In [17]:
# Try one query and similarity search in local chromadb
query_text = "What is the revenue of 2025Q3?"
query_vector = embed_model.get_query_embedding(query_text)

results = collection.query(
    query_embeddings=[query_vector]
)
print("The most relevant document is:", results['documents'][0])


2025-12-24 18:48:46,170 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"


The most relevant document is: ['This table details revenues by geographic region (United States, EMEA, APAC, Other Americas) and hedging gains/losses, along with total revenues, for the three and nine months ended September 30, in both 2024 and 2025, presented as both dollar amounts and their respective percentages of total revenue.,\nwith the following columns:\n- Three Months Ended September 30, 2024 Revenue: Revenue in dollar amounts for the three months ended September 30, 2024.\n- Three Months Ended September 30, 2024 Percentage: Percentage of total revenue for the three months ended September 30, 2024.\n- Three Months Ended September 30, 2025 Revenue: Revenue in dollar amounts for the three months ended September 30, 2025.\n- Three Months Ended September 30, 2025 Percentage: Percentage of total revenue for the three months ended September 30, 2025.\n- Nine Months Ended September 30, 2024 Revenue: Revenue in dollar amounts for the nine months ended September 30, 2024.\n- Nine Mon

In [None]:
# hybrid retrieve 
from llama_index.core import Settings

Settings.llm = GoogleGenAI(model="models/gemini-2.5-flash")
Settings.embed_model = GoogleGenAIEmbedding(model_name="models/text-embedding-004")

index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=Settings.embed_model 
)

fast_hybrid_query_engine = index.as_query_engine(
    vector_store_query_mode="hybrid", 
    similarity_top_k=5,        
    sparse_top_k=5             
)


In [5]:
# Hybrid retrieve and Rerank
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.postprocessor import SentenceTransformerRerank

rerank_postprocessor = SentenceTransformerRerank(
    model="BAAI/bge-reranker-v2-m3", 
    top_n=3  
)

deep_rerank_engine = index.as_query_engine(
    vector_store_query_mode="hybrid",
    similarity_top_k=10,             
    sparse_top_k=10,
    node_postprocessors=[rerank_postprocessor] 
)

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [8]:
# Generate the answer
import nest_asyncio
nest_asyncio.apply()

response_fast = fast_hybrid_query_engine.query("What is the R&D expense of 2025Q3？")
print("--- The response of fast ---")
print(response_fast.response)

response_deep = deep_rerank_engine.query("What is the R&D expense of 2025Q3？")
print("--- The response of deep rerank ---")
print(response_deep.response)

--- The response of fast ---
Research and development expenses for the three months ended September 30, 2025 were $15,151 million.
--- The response of deep rerank ---
Research and development expenses for the three months ended September 30, 2025 were $15,151 million.
