In [1]:
# Required packages:
# pip install -U langchain langgraph langchain-chroma langchain-ollama langchain-community pypdf

import os
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# ============================================================================
# STEP 1: DOCUMENTS AND DOCUMENT LOADERS
# ============================================================================
print("\n1.1 Loading PDF Document...")
print("Using PyPDFLoader which loads one Document object per PDF page.")
print("For each page, we can access:")
print("- The string content of the page")
print("- Metadata containing the file name and page number")

# Load PDF - works with both online URLs and local file paths
pdf_url = "https://arxiv.org/pdf/2501.04040.pdf"
loader = PyPDFLoader(pdf_url)
documents = loader.load()

print(f"\n✓ Loaded {len(documents)} pages from PDF")


1.1 Loading PDF Document...
Using PyPDFLoader which loads one Document object per PDF page.
For each page, we can access:
- The string content of the page
- Metadata containing the file name and page number

✓ Loaded 174 pages from PDF


In [3]:
sample_doc = documents[0]
print(f"\nSample Document Structure:")
print(f"- Content length: {len(sample_doc.page_content)} characters")
print(f"- Metadata: {sample_doc.metadata}")
print(f"- Content preview: {sample_doc.page_content[:200]}...")


Sample Document Structure:
- Content length: 2140 characters
- Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-11T01:48:37+00:00', 'author': '', 'keywords': '', 'moddate': '2025-02-11T01:48:37+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'https://arxiv.org/pdf/2501.04040.pdf', 'total_pages': 174, 'page': 0, 'page_label': '1'}
- Content preview: A Survey on Large Language Models with some Insights
on their Capabilities and Limitations
Andrea Matarazzo
Expedia Group
Italy
a.matarazzo@gmail.com
Riccardo Torlone
Roma Tre University
Italy
riccard...


In [4]:
# ============================================================================
# STEP 2: TEXT SPLITTING  
# ============================================================================
print("\n2.1 Configuring Text Splitter...")
print("- Chunk size: 4096 characters (as specified)")
print("- Overlap: 410 characters (10% overlap)")
print("- Method: Recursive character splitting")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4096,
    chunk_overlap=410,  # 10% of 4096
    length_function=len,
    add_start_index=True,  # Preserves character index as metadata
)

print("\n2.2 Splitting documents into chunks...")
chunks = text_splitter.split_documents(documents)

print(f"\n✓ Split {len(documents)} pages into {len(chunks)} chunks")

chunk_sizes = [len(chunk.page_content) for chunk in chunks]
print(f"\nChunk Analysis:")
print(f"- Average chunk size: {sum(chunk_sizes) / len(chunk_sizes):.0f} characters")
print(f"- Largest chunk: {max(chunk_sizes)} characters")
print(f"- Smallest chunk: {min(chunk_sizes)} characters")


2.1 Configuring Text Splitter...
- Chunk size: 4096 characters (as specified)
- Overlap: 410 characters (10% overlap)
- Method: Recursive character splitting

2.2 Splitting documents into chunks...

✓ Split 174 pages into 183 chunks

Chunk Analysis:
- Average chunk size: 2623 characters
- Largest chunk: 4079 characters
- Smallest chunk: 420 characters


In [5]:
# ============================================================================
# STEP 3: EMBEDDINGS
# ============================================================================
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    base_url="http://localhost:11434"
)

In [6]:
# ============================================================================
# STEP 4: VECTOR STORES
# ============================================================================
print("\n4.1 Creating Chroma Vector Store...")
print("- Collection name: pdf_collection")
print("- Storage: Local persistent directory")
print("- Embedding function: nomic-embed-text via Ollama")

vector_store = Chroma(
    collection_name="pdf_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

vector_store.add_documents(documents=chunks)

print(f"✓ Added {len(chunks)} document chunks to vector store")


4.1 Creating Chroma Vector Store...
- Collection name: pdf_collection
- Storage: Local persistent directory
- Embedding function: nomic-embed-text via Ollama
✓ Added 183 document chunks to vector store


In [17]:
# ============================================================================
# STEP 5: QUERYING THE VECTOR STORE
# ============================================================================


print("\n5.1 Basic Similarity Search")
print("Finding documents most similar to a query using cosine similarity...")

query = "What is the main methods available for RAG?"
results = vector_store.similarity_search(query, k=3)

print(f"\nQuery: '{query}'")
print(f"Retrieved {len(results)} most similar chunks:")

for i, doc in enumerate(results, 1):
    print(f"\n--- Result {i} ---")
    print(f"Content: {doc.page_content[:300]}...")
    print(f"Source: Page {doc.metadata.get('page', 'unknown')}")


5.1 Basic Similarity Search
Finding documents most similar to a query using cosine similarity...

Query: 'What is the main methods available for RAG?'
Retrieved 3 most similar chunks:

--- Result 1 ---
Content: or where the sought information is highly nuanced. IRCoT [223] employs a chain-of-
thought (CoT) approach, using retrieval results to iteratively refine the CoT reasoning
process. ToC (Tree of Clarifications) [284] systematically addresses ambiguities in queries
by constructing clarification trees t...
Source: Page 122

--- Result 2 ---
Content: Figure 65: Technology tree of RAG research. The stages of involving RAG mainly include pre-
training, fine-tuning, and inference. With the emergence of LLMs, research on RAG initially focused
on leveraging the powerful in context learning abilities of LLMs, primarily concentrating on the in-
ference...
Source: Page 118

--- Result 3 ---
Content: Figure 64: Final Pass rates of models across LLM Modulo Iterations. Source: Kambhampati et a

In [8]:
print("\n5.2 Similarity Search with Scores")
print("Same search but with similarity scores to see confidence levels...")

results_with_scores = vector_store.similarity_search_with_score(query, k=2)

for i, (doc, score) in enumerate(results_with_scores, 1):
    print(f"\n--- Result {i} (Similarity Score: {score:.4f}) ---")
    print(f"Content: {doc.page_content[:200]}...")
    print(f"Source: Page {doc.metadata.get('page', 'unknown')}")


5.2 Similarity Search with Scores
Same search but with similarity scores to see confidence levels...

--- Result 1 (Similarity Score: 0.9371) ---
Content: Figure 65: Technology tree of RAG research. The stages of involving RAG mainly include pre-
training, fine-tuning, and inference. With the emergence of LLMs, research on RAG initially focused
on lever...
Source: Page 118

--- Result 2 (Similarity Score: 0.9464) ---
Content: probability for a hypothesis as more evidence or information becomes available. Fundamentally, Bayesian
inference uses prior knowledge, in the form of a prior distribution in order to estimate posteri...
Source: Page 86


In [9]:
print("\n5.3 Metadata Filtering")
print("Using metadata filters to search specific parts of the document...")

# First, let's see what metadata is available
print("\nAvailable metadata in our chunks:")
if chunks:
    sample_metadata = chunks[0].metadata
    print(f"Sample metadata: {sample_metadata}")
    
    # Get unique page numbers for filtering examples
    page_numbers = set()
    for chunk in chunks[:10]:  # Check first 10 chunks
        if 'page' in chunk.metadata:
            page_numbers.add(chunk.metadata['page'])
    print(f"Available page numbers (sample): {sorted(list(page_numbers))[:5]}...")


5.3 Metadata Filtering
Using metadata filters to search specific parts of the document...

Available metadata in our chunks:
Sample metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-11T01:48:37+00:00', 'author': '', 'keywords': '', 'moddate': '2025-02-11T01:48:37+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'https://arxiv.org/pdf/2501.04040.pdf', 'total_pages': 174, 'page': 0, 'page_label': '1', 'start_index': 0}
Available page numbers (sample): [0, 1, 2, 3, 4]...


In [10]:
print("\n5.3.1 Filter by Specific Page")
if page_numbers:
    target_page = sorted(list(page_numbers))[0]  # Use first available page
    page_results = vector_store.similarity_search(
        "methodology approach",
        k=2,
        filter={"page": target_page}
    )
    print(f"Searching only in Page {target_page}:")
    for i, doc in enumerate(page_results, 1):
        print(f"  Result {i}: Page {doc.metadata.get('page')} - {doc.page_content[:150]}...")


5.3.1 Filter by Specific Page
Searching only in Page 0:
  Result 1: Page 0 - A Survey on Large Language Models with some Insights
on their Capabilities and Limitations
Andrea Matarazzo
Expedia Group
Italy
a.matarazzo@gmail.com
...


In [11]:
print("\n5.3.2 Filter by Page Range")
if len(page_numbers) > 1:
    # Filter for pages greater than or equal to a certain page
    min_page = sorted(list(page_numbers))[1] if len(page_numbers) > 1 else sorted(list(page_numbers))[0]
    range_results = vector_store.similarity_search(
        "results conclusions",
        k=3,
        filter={"page": {"$gte": min_page}}  # Pages >= min_page
    )
    print(f"Searching in pages >= {min_page}:")
    for i, doc in enumerate(range_results, 1):
        print(f"  Result {i}: Page {doc.metadata.get('page')} - {doc.page_content[:150]}...")



5.3.2 Filter by Page Range
Searching in pages >= 1:
  Result 1: Page 90 - Figure 40: A: Chain of thoughts (in blue) are intermediate reasoning steps towards a final answer.
The input of CoT prompting is a stack of a few (oft...
  Result 2: Page 111 - Figure 57: Reflexion works on decision-making, programming, and reasoning tasks. Source: Shinn
et al. [322]
Figure 58: (a) Diagram of Reflexion. (b) R...
  Result 3: Page 36 - hℓ = hℓ−1 + SA(LN(hℓ−1))
hℓ = hℓ + FFN(LN(hℓ))
where SA is multi-head self-attention, LN is layer-normalization, and FFN is a feed-forward net-
work w...


In [12]:
print("\n5.3.3 Multiple Metadata Filters")
# Complex filtering with multiple conditions
complex_results = vector_store.similarity_search(
    "research findings",
    k=2,
    filter={
        "$and": [
            {"page": {"$gte": 0}},  # Page 0 or higher
            {"source": {"$ne": ""}}  # Has a source
        ]
    }
)

print("Using complex filter (page >= 0 AND has source):")
for i, doc in enumerate(complex_results, 1):
    print(f"  Result {i}: Page {doc.metadata.get('page')} - {doc.page_content[:150]}...")



5.3.3 Multiple Metadata Filters
Using complex filter (page >= 0 AND has source):
  Result 1: Page 118 - Figure 65: Technology tree of RAG research. The stages of involving RAG mainly include pre-
training, fine-tuning, and inference. With the emergence o...
  Result 2: Page 119 - Figure 66: Retrieval-Augmented Generation (RAG) Framework mainly consists of 3 steps. 1) In-
dexing. Documents are split into chunks, encoded into vec...


In [13]:
print("\n5.4 Search by Vector with Filtering")
print("Combining vector search with metadata filtering...")

# Generate embedding for a query and search with filter
query_embedding = embeddings.embed_query("experimental setup")
if page_numbers:
    target_page = sorted(list(page_numbers))[0]
    vector_results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=2,
        filter={"page": target_page}
    )
    print(f"Vector search in Page {target_page}:")
    for i, doc in enumerate(vector_results, 1):
        print(f"  Result {i}: {doc.page_content[:150]}...")



5.4 Search by Vector with Filtering
Combining vector search with metadata filtering...
Vector search in Page 0:
  Result 1: A Survey on Large Language Models with some Insights
on their Capabilities and Limitations
Andrea Matarazzo
Expedia Group
Italy
a.matarazzo@gmail.com
...


In [14]:
# ============================================================================
# STEP 6: RETRIEVERS
# ============================================================================

print("\n6. Creating Retriever...")

# Similarity Retriever
similarity_retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4}
)

# MMR Retriever
print("MMR balances similarity with diversity in retrieved results")
mmr_retriever = vector_store.as_retriever(
    search_type="mmr", 
    search_kwargs={"k": 3, "fetch_k": 10, "lambda_mult": 0.5}
)

# Score Threshold Retriever
print("Score threshold retriever only returns documents above similarity threshold")
threshold_retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5, "k": 5}
)


6. Creating Retriever...
MMR balances similarity with diversity in retrieved results
Score threshold retriever only returns documents above similarity threshold


In [15]:
# ============================================================================
# STEP 7: RAG FOUNDATION
# ============================================================================
final_query = "What are the key contributions of this paper?"
context_docs = similarity_retriever.invoke(final_query)

print(f"\nQuery: '{final_query}'")
print(f"✓ Retrieved {len(context_docs)} relevant document chunks")

# Show what would be sent to LLM
print(f"\nContext that would be sent to LLM:")
for i, doc in enumerate(context_docs[:2], 1):  # Show first 2 for brevity
    print(f"\nChunk {i}: {doc.page_content[:250]}...")


Query: 'What are the key contributions of this paper?'
✓ Retrieved 4 relevant document chunks

Context that would be sent to LLM:

Chunk 1: Figure 56: Overview of the DEPS interactive plannet architecture. Source: Wang et al. [344]
as an explainer to locate the errors in the previous plan. Finally, a planner will refine the plan
using the descriptor and explainer information. To improve ...

Chunk 2: Figure 57: Reflexion works on decision-making, programming, and reasoning tasks. Source: Shinn
et al. [322]
Figure 58: (a) Diagram of Reflexion. (b) Reflexion reinforcement algorithm. Source: Shinn et al.
[322]
process are the notion of short-term an...


In [16]:
context_docs

[Document(id='32a679fe-1fbd-4b4f-84d5-278d7aba5e2c', metadata={'subject': '', 'source': 'https://arxiv.org/pdf/2501.04040.pdf', 'page_label': '111', 'trapped': '/False', 'total_pages': 174, 'title': '', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'moddate': '2025-02-11T01:48:37+00:00', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-11T01:48:37+00:00', 'keywords': '', 'author': '', 'producer': 'pdfTeX-1.40.25', 'page': 110, 'start_index': 0}, page_content='Figure 56: Overview of the DEPS interactive plannet architecture. Source: Wang et al. [344]\nas an explainer to locate the errors in the previous plan. Finally, a planner will refine the plan\nusing the descriptor and explainer information. To improve the feasibility of generated plans\nconditioned on the current state, which is the second identified challenge, Wang et al. [344] use\na learned goal-selector to choose the most accessible sub-task based on the 