In [1]:
%pip install -Uq "unstructured[all-docs]"
%pip install -Uq langchain_community langchain langchain_openai
%pip install -Uq langchain_chroma
%pip install dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
from typing import List

# Unstructured for document parsing
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# Langchain components
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
def partition_document(file_path:str):
    """Parse a PDF with unstructured and return the extracted elements.

    Args:
        file_path: Location of the PDF to parse.

    Returns:
        Whatever ``partition_pdf`` returns for the file (a sized collection of elements).
    """
    print(f"Partitioning document: {file_path}")

    partition_options = {
        "filename": file_path,                     # PDF to read
        "strategy": "hi_res",                      # most accurate (but slower) extraction method
        "infer_table_structure": True,             # keep tables as structured HTML, not jumbled text
        "extract_image_block_types": ["Image"],    # grab images found in the PDF
        "extract_image_block_to_payload": True,    # store images as base64 data on the element
    }
    parsed_elements = partition_pdf(**partition_options)

    print(f"Extracted {len(parsed_elements)} elements")
    return parsed_elements

# Test with your PDF file
file_path = "K:/RAG/docs/paper.pdf"
elements = partition_document(file_path)

Partitioning document: K:/RAG/docs/paper.pdf


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Extracted 138 elements


In [3]:
# Number of elements returned by partition_document
len(elements)

138

In [4]:
# Distinct element classes that unstructured produced for this document
{str(type(element)) for element in elements}

{"<class 'unstructured.documents.elements.FigureCaption'>",
 "<class 'unstructured.documents.elements.Footer'>",
 "<class 'unstructured.documents.elements.Header'>",
 "<class 'unstructured.documents.elements.Image'>",
 "<class 'unstructured.documents.elements.ListItem'>",
 "<class 'unstructured.documents.elements.NarrativeText'>",
 "<class 'unstructured.documents.elements.Table'>",
 "<class 'unstructured.documents.elements.Text'>",
 "<class 'unstructured.documents.elements.Title'>"}

In [5]:
# Inspect a single parsed element (index 30) as a plain dict
elements[30].to_dict()

{'type': 'NarrativeText',
 'element_id': 'a6976c4055597825c98bc7515dec34be',
 'text': 'Inspired by [17], we believe that a RAG mechanism can en- hance LLM spatial reasoning and offer a promising direction for enhancing context-aware, personalized itinerary recommendations. By coupling generative language models with spatial and contex- tual knowledge, spatial RAG systems can retrieve geographically grounded information, generate personalized walking itineraries, and present them to the user in natural language. This approach enables dynamic and adaptive recommendations that align with user intents and local context. In this short research paper, we thus investigate the following research question:',
 'metadata': {'detection_class_prob': 0.9541615843772888,
  'is_extracted': 'true',
  'coordinates': {'points': ((np.float64(881.877197265625),
     np.float64(976.7021422222222)),
    (np.float64(881.877197265625), np.float64(1275.5838088888888)),
    (np.float64(1557.1773681640625), np.fl

In [6]:
# Collect every element whose category is "Image"
images = [el for el in elements if el.category == "Image"]
print(f"Found {len(images)} images")
# Show the full record of the image at index 7
images[7].to_dict()

Found 8 images


{'type': 'Image',
 'element_id': 'd183ee86ca3d85f334ea55645bdd51ba',
 'text': 'Legend Ke @ Place des Vosges - Jardin des plantes ® Café de la Paix — LLM-CB Route — WalkRAG Route:',
 'metadata': {'coordinates': {'points': ((np.float64(199.48888888888888),
     np.float64(1198.5017777777778)),
    (np.float64(199.48888888888888), np.float64(1539.236111111111)),
    (np.float64(766.7553888888888), np.float64(1539.236111111111)),
    (np.float64(766.7553888888888), np.float64(1198.5017777777778))),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2025-12-07T01:15:30',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 4,
  'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAFUAjgDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8Q

In [7]:
def create_chunks_by_title(elements):
    """Group parsed elements into title-based chunks.

    Args:
        elements: Parsed PDF elements, passed to ``chunk_by_title`` unchanged.

    Returns:
        The chunks produced by ``chunk_by_title``.
    """
    print("Creating chunks by title...")

    size_limits = {
        "max_characters": 3000,             # hard limit - never exceed 3000 characters per chunk
        "new_after_n_chars": 2400,          # try to start a new chunk after 2400 characters
        "combine_text_under_n_chars": 500,  # merge chunks under 500 characters with neighbours
    }
    title_chunks = chunk_by_title(elements, **size_limits)

    print(f"Created {len(title_chunks)} chunks")
    return title_chunks

# Chunk the parsed elements; later cells read the result from `chunks`
chunks = create_chunks_by_title(elements)

Creating chunks by title...
Created 16 chunks


In [8]:
# Display the list of chunks (rendered as object reprs; use .to_dict() to see content)
chunks

[<unstructured.documents.elements.CompositeElement at 0x26043d05a90>,
 <unstructured.documents.elements.CompositeElement at 0x26036ec0690>,
 <unstructured.documents.elements.CompositeElement at 0x26036ec07d0>,
 <unstructured.documents.elements.CompositeElement at 0x26036aca650>,
 <unstructured.documents.elements.CompositeElement at 0x26036aca780>,
 <unstructured.documents.elements.CompositeElement at 0x26036a86330>,
 <unstructured.documents.elements.CompositeElement at 0x26036a6fdf0>,
 <unstructured.documents.elements.CompositeElement at 0x26036a6f570>,
 <unstructured.documents.elements.CompositeElement at 0x26036aae550>,
 <unstructured.documents.elements.CompositeElement at 0x26036aae350>,
 <unstructured.documents.elements.CompositeElement at 0x26036ace030>,
 <unstructured.documents.elements.CompositeElement at 0x26043d77a70>,
 <unstructured.documents.elements.CompositeElement at 0x26036ab1d30>,
 <unstructured.documents.elements.CompositeElement at 0x26036ab1010>,
 <unstructured.docum

In [9]:
# Distinct classes among the chunks
{str(type(chunk)) for chunk in chunks}

{"<class 'unstructured.documents.elements.CompositeElement'>"}

In [10]:
# Inspect a single chunk (index 2) as a plain dict
chunks[2].to_dict()

{'type': 'CompositeElement',
 'element_id': '66671613-cc36-4024-8c93-93adcf762b1f',
 'text': 'To answer this RQ, we design and evaluate WalkRAG, a spatial RAG framework with a conversational interface for recommending walkable urban itineraries. Leveraging LLMs, users can ask in natu- ral language for itineraries respecting specific spatial constraints and personal preferences, enabling personalized and context-aware route generation. Moreover, to further enhance the engagement and improve the walking experience, they can interactively retrieve information about the route or the points of interest (POIs) located along their walking paths.\n\n∗Both authors contributed equally to this research.\n\nPermission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for

In [15]:
def separate_content_types(chunk):
    """Split a chunk into its text, table and image payloads.

    Args:
        chunk: Object exposing ``text`` and, optionally, ``metadata.orig_elements``
            (the elements the chunk was built from).

    Returns:
        dict with the keys:
            "text":   ``chunk.text`` unchanged.
            "tables": one entry per Table element with content (``text_as_html`` when
                      it is set, otherwise the element's plain ``text``).
            "images": the ``image_base64`` value of every Image element that has one.
            "types":  content kinds present, de-duplicated in first-seen order and
                      always starting with "text".
    """
    content_data = {
        "text": chunk.text,
        "tables": [],
        "images": [],
        "types": ["text"]
    }

    chunk_metadata = getattr(chunk, "metadata", None)
    # `or []`: the attribute can exist but hold None, which must not break the loop.
    orig_elements = getattr(chunk_metadata, "orig_elements", None) or []

    for element in orig_elements:
        element_type = type(element).__name__
        element_metadata = getattr(element, "metadata", None)

        # NOTE(review): unstructured's ElementMetadata appears to return None for unset
        # known fields, so hasattr()/getattr-default guards never trigger there. The
        # checks below therefore test the value itself - confirm against the installed version.
        if element_type == "Table":
            table_html = getattr(element_metadata, "text_as_html", None) or element.text
            if table_html:
                content_data["tables"].append(table_html)
                content_data["types"].append("table")

        elif element_type == "Image":
            image_base64 = getattr(element_metadata, "image_base64", None)
            if image_base64:
                content_data["images"].append(image_base64)
                content_data["types"].append("image")

    # dict.fromkeys de-duplicates while keeping a deterministic order (a set would not).
    content_data["types"] = list(dict.fromkeys(content_data["types"]))
    return content_data


def create_ai_enchanced_summary(text: str, tables: List[str], images: List[str]) -> str:
    """Ask the chat model for a searchable description of a chunk.

    Args:
        text: Raw text of the chunk.
        tables: Table contents, each inserted into the prompt as-is.
        images: Base64 image payloads; only the first three are attached.

    Returns:
        The model's reply content, or ``text`` unchanged if anything raises
        (the error is printed, not re-raised).
    """
    image_limit = 3  # cap on the number of images attached to one request

    task_instructions = """
TASK:
Generate a comprehensive, searchable description that covers:

- Key facts, numbers, and data points from text and tables
- Main topics and concepts discussed
- Questions this content could answer
- Visual content analysis (charts, diagrams, patterns in images)
- Alternative search terms users might use

Make it detailed and searchable — prioritize findability over brevity.

SEARCHABLE DESCRIPTION:
"""

    try:
        import os

        llm = ChatOpenAI(
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-4o-mini",
            temperature=0.3
        )

        # Assemble the prompt from sections: header + text, optional tables, optional task
        sections = [f"""
You are creating a searchable summary of the following content.

CONTENT TO ANALYZE
-------------------
TEXT:
{text}
"""]

        if tables:
            sections.append("\nTABLES:\n")
            sections.extend(
                f"\nTable {number}:\n{table}\n"
                for number, table in enumerate(tables, start=1)
            )

        # The task instructions are only included when tables or images exist
        if tables or images:
            sections.append(task_instructions)

        payload = [{"type": "text", "text": "".join(sections)}]
        payload.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}
            }
            for encoded in images[:image_limit]
        )

        reply = llm.invoke([HumanMessage(content=payload)])
        return reply.content

    except Exception as e:
        print(f"Error creating AI enhanced summary: {str(e)}")
        return text



def summarise_chunks(chunks):
    """Turn every chunk into a LangChain Document, summarising rich chunks with AI.

    Chunks that contain tables or images get an AI-generated description as their
    page content; all other chunks keep their raw text. The raw text, tables and
    images are always stored as a JSON string under metadata["original_content"].

    Args:
        chunks: Sized iterable of chunks accepted by ``separate_content_types``.

    Returns:
        List with one Document per chunk, in input order.
    """
    print("Processing chunks with AI summarise")

    total = len(chunks)
    documents = []

    for position, chunk in enumerate(chunks, start=1):
        print(f"Processing chunk {position}/{total}")

        # Break the chunk down into its text / table / image parts
        parts = separate_content_types(chunk)
        raw_text = parts['text']
        tables = parts['tables']
        images = parts['images']

        # Debug prints
        print(f"Types found: {parts['types']}")
        print(f"Tables: {len(tables)}")
        print(f"Images: {len(images)}")

        # Raw text is the default; it is only replaced by a successful summary
        page_content = raw_text
        if not (tables or images):
            print(f"Chunk {position}/{total} has no tables or images")
        else:
            print("Creating AI enhanced summary")
            try:
                summary = create_ai_enchanced_summary(raw_text, tables, images)
                # The summariser hands back the input text when it falls back
                if summary == raw_text:
                    print("Fallback summary used")
                else:
                    print("AI enhanced summary created successfully")
                print(f"Enhanced content: {summary[:200]}")
                page_content = summary
            except Exception as e:
                print(f"Error creating AI enhanced summary: {str(e)}")

        # Keep the untouched source material next to the searchable content
        original_content = json.dumps({
            "raw_text": raw_text,
            "tables": tables,
            "images": images
        })
        documents.append(
            Document(
                page_content=page_content,
                metadata={"original_content": original_content}
            )
        )

    print(f"Processed {len(documents)} chunks")
    return documents

# Convert the chunks into Documents (calls the chat model for chunks with tables/images)
processed_chunks = summarise_chunks(chunks)

Processing chunks with AI summarise
Processing chunk 1/16
Types found: ['text']
Tables: 0
Images: 0
Chunk 1/16 has no tables or images
Processing chunk 2/16
Types found: ['text']
Tables: 0
Images: 0
Chunk 2/16 has no tables or images
Processing chunk 3/16
Types found: ['text']
Tables: 0
Images: 0
Chunk 3/16 has no tables or images
Processing chunk 4/16
Types found: ['text']
Tables: 0
Images: 0
Chunk 4/16 has no tables or images
Processing chunk 5/16
Types found: ['text']
Tables: 0
Images: 0
Chunk 5/16 has no tables or images
Processing chunk 6/16
Types found: ['text']
Tables: 0
Images: 0
Chunk 6/16 has no tables or images
Processing chunk 7/16
Types found: ['text']
Tables: 0
Images: 0
Chunk 7/16 has no tables or images
Processing chunk 8/16
Types found: ['text', 'image']
Tables: 0
Images: 7
Creating AI enhanced summary
AI enhanced summary created successfully
Enhanced content: ### SEARCHABLE DESCRIPTION

**Title:** WalkRAG Assessment: Spatially-Enhanced Retrieval-Augmented Generation f

In [16]:
# Display the generated Documents
processed_chunks

[Document(metadata={'original_content': '{"raw_text": "5\\n\\n2025\\n\\n2\\n\\n0\\n\\n2\\n\\nc e D 4 ] R I . s c [ 1 v 0 9 7 4 0 . 2 1 5\\n\\n2\\n\\n:\\n\\nv\\n\\narXiv\\n\\ni\\n\\nX\\n\\nr\\n\\na\\n\\nSpatially-Enhanced Retrieval-Augmented Generation for Walkability and Urban Discovery\\n\\nMaddalena Amendola, Chiara Pugliese\\u2217\\n\\nRaffaele Perego, Chiara Renso\\n\\nIIT-CNR\\n\\nISTI-CNR\\n\\nPisa, Italy\\n\\nPisa, Italy\\n\\nAbstract\\n\\nLarge Language Models (LLMs) have become foundational tools in artificial intelligence, supporting a wide range of applications beyond traditional natural language processing, including urban systems and tourist recommendations. However, their tendency to hallucinate and their limitations in spatial retrieval and reasoning are well known, pointing to the need for novel solutions. Retrieval- augmented generation (RAG) has recently emerged as a promising way to enhance LLMs with accurate, domain-specific, and timely information. Spatial RAG exte

In [17]:
def create_vector_store(documents,persistance_directory="database/chroma_db"):
    """Embed the documents and store them in a persisted Chroma collection.

    Args:
        documents: Documents handed to ``Chroma.from_documents``.
        persistance_directory: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    print("Creating embeddings and storing in ChromaDB..")
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    print("---Creating vector store---")
    store_options = {
        "documents": documents,
        "embedding": embedder,
        "persist_directory": persistance_directory,
        # use cosine as the collection's HNSW space
        "collection_metadata": {"hnsw:space":"cosine"},
    }
    store = Chroma.from_documents(**store_options)
    print("---Finished creating vector store---")

    print(f"Vector store created at {persistance_directory}")
    return store

# Build the vector store in the default directory; later cells query it through `db`
db = create_vector_store(processed_chunks)

Craeting embeddings and storing in ChromaDB..
---Creating vector store---
---Finished creating vector store---
Vector store created at database/chroma_db


In [18]:
def export_chunks_to_json(chunks, output_file):
    """Write documents to a JSON file as a list of records.

    Each record holds a 1-based "chunk_id", the document's "page_content" and
    its "metadata".

    Args:
        chunks: Iterable of objects exposing ``page_content`` and ``metadata``.
        output_file: Path of the UTF-8 JSON file to (over)write.
    """
    records = [
        {
            "chunk_id": number,
            "page_content": document.page_content,
            "metadata": document.metadata
        }
        for number, document in enumerate(chunks, start=1)
    ]

    # ensure_ascii=False keeps non-ASCII characters readable in the file
    with open(output_file, "w", encoding="utf-8") as handle:
        json.dump(records, handle, indent=4, ensure_ascii=False)

    print(f"✅ Exported {len(records)} chunks to {output_file}")


In [21]:
# Retrieve the top 5 documents for a sample question and dump them to JSON for inspection
query = "What does Figure 1 show in WalkRAG?"
retriever = db.as_retriever(search_kwargs={"k": 5})
relevant_docs = retriever.invoke(query)

export_chunks_to_json(relevant_docs, "relevant_docs.json")

✅ Exported 5 chunks to relevant_docs.json


In [None]:
def run_complete_ingestion_pipeline(pdf_path:str, persist_directory:str="dbv2/ChromaDB"):
    """Run the complete RAG ingestion pipeline for one PDF.

    Steps: partition the PDF, chunk it by title, summarise the chunks into
    Documents, and store those Documents in a Chroma vector store.

    Args:
        pdf_path: Path of the PDF to ingest.
        persist_directory: Directory in which the vector store is persisted.

    Returns:
        The vector store returned by ``create_vector_store``.
    """
    print("--Starting RAG Ingestion Pipeline--")
    print("="*50)

    # Step 1: Partition
    elements = partition_document(pdf_path)

    # Step 2: Chunk
    chunks = create_chunks_by_title(elements)

    # Step 3: AI summary
    # The result must not be named `summarise_chunks`: assigning to that name makes it
    # local to this function, so the call itself raises UnboundLocalError.
    summarised_documents = summarise_chunks(chunks)

    # Step 4: Vector Store
    # create_vector_store spells its keyword `persistance_directory`.
    db = create_vector_store(summarised_documents, persistance_directory=persist_directory)

    print("="*50)
    print("--RAG Ingestion Pipeline Complete--")

    return db

In [None]:
# Run the full pipeline on the paper; rebinds `db` to the store the pipeline returns
db = run_complete_ingestion_pipeline(r"K:\RAG\docs\paper.pdf")

In [None]:
# Retrieve the top 3 documents for the question
query = "What is WalkRAG ?"

retriever = db.as_retriever(search_kwargs={"k": 3})
# NOTE: rebinds `chunks`, which earlier cells use for the title-based chunks
chunks = retriever.invoke(query)

def generate_final_answer(chunks,query):
    """Answer ``query`` with the chat model, using retrieved documents as context.

    For every document the prompt receives its page content and, when the
    document carries metadata["original_content"] (a JSON string with the keys
    "raw_text", "tables" and "images"), the raw text (only if it differs from the
    page content), the tables, and up to three images per document.

    Args:
        chunks: Iterable of documents exposing ``page_content`` and ``metadata``.
        query: The user's question.

    Returns:
        The model's reply content. If anything raises, the error is printed and
        its message is returned as the result instead.
    """
    # Imported here (as create_ai_enchanced_summary does) because the notebook's
    # import cell does not import os.
    import os

    MAX_IMAGES = 3  # per-document cap on attached images

    try:
        model = ChatOpenAI(
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-4o-mini",
            temperature=0.3
        )

        prompt = f"""Based on the given context, answer the question.
        Question: {query}
        Context to analyze:
        """

        image_parts = []
        for context_number, chunk in enumerate(chunks, start=1):
            prompt += f"\n\nContext {context_number}: {chunk.page_content}"

            if "original_content" not in chunk.metadata:
                continue

            # Parse the stored payload once and use it for text, tables and images
            original_data = json.loads(chunk.metadata['original_content'])

            # Add raw text, unless it would merely repeat the page content
            raw_text = original_data.get('raw_text', "")
            if raw_text and raw_text != chunk.page_content:
                prompt += f"\n\nRaw Text: {raw_text}"

            # Add tables as HTML
            tables = original_data.get('tables') or []
            if tables:
                prompt += "\n\nTables:"
                for table_number, table in enumerate(tables, start=1):
                    prompt += f"\n\nTable {table_number}:\n{table}"

            # Images are stored under the "images" key of the payload
            images = [image for image in (original_data.get('images') or []) if image]
            for image_base64 in images[:MAX_IMAGES]:
                image_parts.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
                })

        prompt += """
        Please provide a concise and accurate answer using text, tables, images and raw text.
        If the context is insufficient to answer the question, respond with 'I don't know' or 'I'm not sure'.
        """
        # The answer cue goes once at the very end, after all context and instructions
        prompt += "\n\nAnswer: "

        # Text part first, then every collected image
        message_content = [{"type": "text", "text": prompt}] + image_parts

        messages = HumanMessage(content=message_content)
        result = model.invoke([messages])

        return result.content
    except Exception as e:
        print("Error generating final answer:",str(e))
        return str(e)
    