In [1]:
# Required packages:
# pip install -U langchain langgraph langchain-chroma langchain-ollama langchain-community pypdf

import os
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# ============================================================================
# STEP 1: DOCUMENTS AND DOCUMENT LOADERS
# ============================================================================
# The loader is pointed at a remote PDF here; per the original note it works
# with a local file path just as well as with a URL.
pdf_url = "https://arxiv.org/pdf/2501.04040.pdf"
loader = PyPDFLoader(file_path=pdf_url)
documents = loader.load()

In [7]:
# Report how many page-level documents the loader returned.
# (A bare `documents[0]` used to sit above this line; as a non-final expression
# in the cell its value was never displayed, so it has been dropped.)
print(f"\n✓ Loaded {len(documents)} pages from PDF")



✓ Loaded 174 pages from PDF


In [8]:
# Inspect the first loaded document: text length, loader metadata, and a
# 200-character excerpt of its content.
sample_doc = documents[0]
page_text = sample_doc.page_content
report_lines = [
    "\nSample Document Structure:",
    f"- Content length: {len(page_text)} characters",
    f"- Metadata: {sample_doc.metadata}",
    f"- Content preview: {page_text[:200]}...",
]
print("\n".join(report_lines))


Sample Document Structure:
- Content length: 2140 characters
- Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-11T01:48:37+00:00', 'author': '', 'keywords': '', 'moddate': '2025-02-11T01:48:37+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'https://arxiv.org/pdf/2501.04040.pdf', 'total_pages': 174, 'page': 0, 'page_label': '1'}
- Content preview: A Survey on Large Language Models with some Insights
on their Capabilities and Limitations
Andrea Matarazzo
Expedia Group
Italy
a.matarazzo@gmail.com
Riccardo Torlone
Roma Tre University
Italy
riccard...


In [10]:
# ============================================================================
# STEP 2: TEXT SPLITTING
# ============================================================================
# Splitter settings are declared once so the log lines below cannot drift from
# the values actually handed to the splitter.
CHUNK_SIZE = 1024     # upper bound on chunk length, measured with len()
CHUNK_OVERLAP = 100   # characters shared between neighbouring chunks

print("\n2.1 Configuring Text Splitter...")
print(f"- Chunk size: {CHUNK_SIZE} characters (as specified)")
print(f"- Overlap: {CHUNK_OVERLAP} characters ({CHUNK_OVERLAP / CHUNK_SIZE:.0%} overlap)")
print("- Method: Recursive character splitting")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    add_start_index=True,  # stores each chunk's character offset as `start_index` metadata
)

print("\n2.2 Splitting documents into chunks...")
chunks = text_splitter.split_documents(documents)

print(f"\n✓ Split {len(documents)} pages into {len(chunks)} chunks")

chunk_sizes = [len(chunk.page_content) for chunk in chunks]
print("\nChunk Analysis:")
if chunk_sizes:
    print(f"- Average chunk size: {sum(chunk_sizes) / len(chunk_sizes):.0f} characters")
    print(f"- Largest chunk: {max(chunk_sizes)} characters")
    print(f"- Smallest chunk: {min(chunk_sizes)} characters")
else:
    # Without this guard the statistics would fail (division by zero, and
    # max()/min() of an empty list) whenever splitting yields nothing.
    print("- No chunks were produced; nothing to analyze")


2.1 Configuring Text Splitter...
- Chunk size: 1024 characters (as specified)
- Overlap: 100 characters (10% overlap)
- Method: Recursive character splitting

2.2 Splitting documents into chunks...

✓ Split 174 pages into 593 chunks

Chunk Analysis:
- Average chunk size: 858 characters
- Largest chunk: 1024 characters
- Smallest chunk: 25 characters


In [12]:
# chunks

In [13]:
# ============================================================================
# STEP 3: EMBEDDINGS
# ============================================================================
# Embedding client for the `nomic-embed-text` model, addressed at an Ollama
# endpoint on localhost port 11434.
embedding_settings = {
    "model": "nomic-embed-text",
    "base_url": "http://localhost:11434",
}
embeddings = OllamaEmbeddings(**embedding_settings)

In [16]:
# Sanity check: embed a short probe string and display the vector's length.
probe_vector = embeddings.embed_query("Hello world")
len(probe_vector)

768

In [None]:
# ============================================================================
# STEP 4: VECTOR STORES
# ============================================================================
print("\n4.1 Creating Chroma Vector Store...")
print("- Collection name: pdf_collection")
print("- Storage: Local persistent directory")
print("- Embedding function: nomic-embed-text via Ollama")

vector_store = Chroma(
    collection_name="pdf_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

# The collection lives in a persistent directory, so re-running this cell with
# auto-generated ids would append a second copy of every chunk. Deterministic
# ids (source + page + start offset + position) make a re-run overwrite the
# same entries instead. The position suffix keeps ids unique within one batch.
# NOTE(review): entries written by earlier runs with auto-generated ids are not
# removed by this; clear ./chroma_db once if such duplicates already exist.
chunk_ids = [
    f"{chunk.metadata.get('source', 'unknown')}"
    f"#page={chunk.metadata.get('page', 'na')}"
    f"#start={chunk.metadata.get('start_index', 'na')}"
    f"#n={position}"
    for position, chunk in enumerate(chunks)
]

# Skip the write entirely when there is nothing to add.
if chunks:
    vector_store.add_documents(documents=chunks, ids=chunk_ids)

print(f"✓ Added {len(chunks)} document chunks to vector store")


4.1 Creating Chroma Vector Store...
- Collection name: pdf_collection
- Storage: Local persistent directory
- Embedding function: nomic-embed-text via Ollama
✓ Added 593 document chunks to vector store


In [20]:
# ============================================================================
# STEP 5: QUERYING THE VECTOR STORE
# ============================================================================

# NOTE(review): the vector store in this notebook is created without an explicit
# distance setting; confirm that the "cosine similarity" wording printed below
# matches the metric the collection actually uses.
print("\n5.1 Basic Similarity Search")
print("Finding documents most similar to a query using cosine similarity...")

query = "What is the main methods available for RAG?"
results = vector_store.similarity_search(query, k=5)

print(f"\nQuery: '{query}'")
print(f"Retrieved {len(results)} most similar chunks:")

# One block per hit: a 300-character excerpt and the page it came from.
for rank, hit in enumerate(results, start=1):
    excerpt = hit.page_content[:300]
    page_label = hit.metadata.get('page', 'unknown')
    print(f"\n--- Result {rank} ---")
    print(f"Content: {excerpt}...")
    print(f"Source: Page {page_label}")


5.1 Basic Similarity Search
Finding documents most similar to a query using cosine similarity...

Query: 'What is the main methods available for RAG?'
Retrieved 5 most similar chunks:

--- Result 1 ---
Content: Benchmarks such as SQuAD [33], Natural Questions [71], and specialized datasets for re-
trieval tasks are widely used for assessment.
Despite its promise, RAG faces several challenges:
1. Retrieval Latency: Efficiently querying large databases in real time remains a technical
hurdle.
2. Data Quality...
Source: Page 122

--- Result 2 ---
Content: hance the relevance of retrieved data. For indexing, it uses more sophisticated techniques
like sliding window approach, fine-grained segmentation and metadata. It incorporates
additional optimization techniques to streamline the retrieval process [280].
3. Modular RAG: this architecture advances be...
Source: Page 117

--- Result 3 ---
Content: Figure 65: Technology tree of RAG research. The stages of involving RAG mainly include pre-


In [22]:
print("\n5.2 Similarity Search with Scores")
print("Same search, now showing each hit's distance score (lower = closer match)...")

# `similarity_search_with_score` yields (document, score) pairs. In the recorded
# run the score grows as the rank gets worse, i.e. it behaves as a distance, so
# it is labelled as one here rather than as a similarity or confidence value.
results_with_scores = vector_store.similarity_search_with_score(query, k=5)

for i, (doc, score) in enumerate(results_with_scores, 1):
    print(f"\n--- Result {i} (Distance: {score:.4f}, lower is better) ---")
    print(f"Content: {doc.page_content[:200]}...")
    print(f"Source: Page {doc.metadata.get('page', 'unknown')}")


5.2 Similarity Search with Scores
Same search but with similarity scores to see confidence levels...

--- Result 1 (Similarity Score: 0.6112) ---
Content: Benchmarks such as SQuAD [33], Natural Questions [71], and specialized datasets for re-
trieval tasks are widely used for assessment.
Despite its promise, RAG faces several challenges:
1. Retrieval La...
Source: Page 122

--- Result 2 (Similarity Score: 0.6444) ---
Content: hance the relevance of retrieved data. For indexing, it uses more sophisticated techniques
like sliding window approach, fine-grained segmentation and metadata. It incorporates
additional optimization...
Source: Page 117

--- Result 3 (Similarity Score: 0.6950) ---
Content: Figure 65: Technology tree of RAG research. The stages of involving RAG mainly include pre-
training, fine-tuning, and inference. With the emergence of LLMs, research on RAG initially focused
on lever...
Source: Page 118

--- Result 4 (Similarity Score: 0.6959) ---
Content: Self-RAG [262] enab

In [23]:
print("\n5.3 Metadata Filtering")
print("Using metadata filters to search specific parts of the document...")

# Bound unconditionally: the page-filter step later in the notebook tests
# `if page_numbers:` and would raise NameError if this name only existed
# inside the `if chunks:` branch.
page_numbers = set()

# First, let's see what metadata is available
print("\nAvailable metadata in our chunks:")
if chunks:
    sample_metadata = chunks[0].metadata
    print(f"Sample metadata: {sample_metadata}")

    # Distinct page numbers found among the first 10 chunks only; this is a
    # sample for the filtering examples, not the full set of pages.
    page_numbers = {
        chunk.metadata['page']
        for chunk in chunks[:10]
        if 'page' in chunk.metadata
    }
    print(f"Available page numbers (sample): {sorted(page_numbers)[:5]}...")


5.3 Metadata Filtering
Using metadata filters to search specific parts of the document...

Available metadata in our chunks:
Sample metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-11T01:48:37+00:00', 'author': '', 'keywords': '', 'moddate': '2025-02-11T01:48:37+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'https://arxiv.org/pdf/2501.04040.pdf', 'total_pages': 174, 'page': 0, 'page_label': '1', 'start_index': 0}
Available page numbers (sample): [0, 1, 2]...


In [25]:
print("\n5.3.1 Filter by Specific Page")
if page_numbers:
    # Restrict the search to the lowest page number collected in the sample.
    target_page = min(page_numbers)
    page_results = vector_store.similarity_search(
        "methodology approach", k=10, filter={"page": target_page}
    )
    print(f"Searching only in Page {target_page}:")
    for rank, hit in enumerate(page_results, start=1):
        snippet = hit.page_content[:150]
        print(f"  Result {rank}: Page {hit.metadata.get('page')} - {snippet}...")



5.3.1 Filter by Specific Page
Searching only in Page 0:
  Result 1: Page 0 - frameworks that integrate external systems, allowing LLMs to handle complex, dynamic
tasks. By analyzing these factors, this paper aims to foster the ...
  Result 2: Page 0 - architectural strategies that drive these capabilities. Emphasizing models like GPT and
LLaMA, we analyze the impact of exponential data and computati...
  Result 3: Page 0 - A Survey on Large Language Models with some Insights
on their Capabilities and Limitations
Andrea Matarazzo
Expedia Group
Italy
a.matarazzo@gmail.com
...


In [27]:
print("\n5.3.3 Multiple Metadata Filters")
# Two conditions combined with $and: the page number is at most 10, and the
# `source` field is not the empty string. (The earlier comment and message
# described this as "page >= 0", which did not match the `$lte: 10` filter.)
complex_results = vector_store.similarity_search(
    "research findings",
    k=2,
    filter={
        "$and": [
            {"page": {"$lte": 10}},  # page number <= 10
            {"source": {"$ne": ""}}  # source is not empty
        ]
    }
)

print("Using complex filter (page <= 10 AND has source):")
for i, doc in enumerate(complex_results, 1):
    print(f"  Result {i}: Page {doc.metadata.get('page')} - {doc.page_content[:150]}...")



5.3.3 Multiple Metadata Filters
Using complex filter (page >= 0 AND has source):
  Result 1: Page 4 - the transformative impact of LLMs across various domains, including healthcare, finance,
education, law, and scientific research.
• Section 3 focuses ...
  Result 2: Page 4 - The central motivation of this work is therefore to investigate the current capabilities
and boundaries of LLMs, focusing on their ability to generali...


In [28]:
# ============================================================================
# STEP 6: RETRIEVERS
# ============================================================================
print("\n6. Creating Retriever...")

# Expose the vector store through the retriever interface: plain similarity
# search with k=4 passed through as a search keyword argument.
retriever_options = {
    "search_type": "similarity",
    "search_kwargs": {"k": 4},
}
similarity_retriever = vector_store.as_retriever(**retriever_options)



6. Creating Retriever...


In [31]:
# ============================================================================
# STEP 7: RAG FOUNDATION
# ============================================================================
# Alternative question to try instead:
# final_query = "What are the key contributions of this paper?"
final_query = "What are the main LLM models used for RAG?"
context_docs = similarity_retriever.invoke(final_query)

# Echo the question and how many chunks the retriever handed back.
retrieval_summary = (
    f"\nQuery: '{final_query}'\n"
    f"✓ Retrieved {len(context_docs)} relevant document chunks"
)
print(retrieval_summary)


Query: 'What are the main LLM models used for RAG?'
✓ Retrieved 4 relevant document chunks


In [32]:
# Preview of the retrieved context: only the first two chunks are printed,
# each cut to 250 characters, to keep the output short.
print("\nContext that would be sent to LLM:")
for position, context_doc in enumerate(context_docs[:2], start=1):
    excerpt = context_doc.page_content[:250]
    print(f"\nChunk {position}: {excerpt}...")


Context that would be sent to LLM:

Chunk 1: Figure 64: Final Pass rates of models across LLM Modulo Iterations. Source: Kambhampati et al.
[379]
3. Domain Adaptability: RAG enables LLMs to integrate domain-specific information,
improving performance in specialized areas like law, medicine, and...

Chunk 2: Figure 65: Technology tree of RAG research. The stages of involving RAG mainly include pre-
training, fine-tuning, and inference. With the emergence of LLMs, research on RAG initially focused
on leveraging the powerful in context learning abilities o...
