In [None]:
# Install required packages (run once)
# %pip install "unstructured[pdf]" langchain langchain-openai langchain-chroma chromadb openai python-dotenv

import json
from typing import List

# Unstructured for document parsing
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# LangChain components
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [18]:
def partition_document(file_path: str):
    """Parse a PDF into a list of unstructured elements.

    Args:
        file_path: Path of the PDF handed to ``partition_pdf``.

    Returns:
        Whatever ``partition_pdf`` returns for the file (a sized collection of
        elements; its length is reported on stdout).
    """
    print(f"📄 Partitioning document: {file_path}")

    # Options forwarded verbatim to partition_pdf. Image blocks are requested as
    # in-payload data; presumably this is what populates the image_base64
    # metadata read later in the notebook - confirm against unstructured docs.
    pdf_options = {
        "strategy": "hi_res",
        "infer_table_structure": True,
        "extract_image_block_types": ["Image"],
        "extract_image_block_to_payload": True,
    }
    parsed_elements = partition_pdf(filename=file_path, **pdf_options)

    print(f"✅ Extracted {len(parsed_elements)} elements")
    return parsed_elements

# Parse the sample PDF; `elements` is the input to create_chunks_by_title.
file_path = "./docs/rag_patterns.pdf"  # Change this to your PDF path
elements = partition_document(file_path)

📄 Partitioning document: ./docs/rag_patterns.pdf
✅ Extracted 90 elements


In [19]:
# Inspect the element objects returned by partition_document (shows their reprs).
elements

[<unstructured.documents.elements.Text at 0x14fbf4280>,
 <unstructured.documents.elements.Text at 0x14fbf4130>,
 <unstructured.documents.elements.NarrativeText at 0x14fbf41a0>,
 <unstructured.documents.elements.Image at 0x14fb42350>,
 <unstructured.documents.elements.Image at 0x14fb42510>,
 <unstructured.documents.elements.Title at 0x14fbf4360>,
 <unstructured.documents.elements.Image at 0x14fa93040>,
 <unstructured.documents.elements.NarrativeText at 0x14fbf42f0>,
 <unstructured.documents.elements.NarrativeText at 0x14fbf4210>,
 <unstructured.documents.elements.Text at 0x14fbf4590>,
 <unstructured.documents.elements.Title at 0x14fbf4670>,
 <unstructured.documents.elements.NarrativeText at 0x14fbf4750>,
 <unstructured.documents.elements.Text at 0x14fbf4830>,
 <unstructured.documents.elements.Text at 0x14fbf4910>,
 <unstructured.documents.elements.Text at 0x14fbf49f0>,
 <unstructured.documents.elements.Text at 0x14fbf4ad0>,
 <unstructured.documents.elements.Title at 0x14fbf4a60>,
 <unst

In [None]:
def create_chunks_by_title(elements):
    """Group parsed elements into chunks with unstructured's ``chunk_by_title``.

    Args:
        elements: Elements to chunk (e.g. the result of ``partition_document``).

    Returns:
        The chunk collection returned by ``chunk_by_title``; its length is
        reported on stdout.
    """
    print("🔨 Creating smart chunks...")

    # Size limits passed straight through to chunk_by_title.
    chunking_limits = dict(
        max_characters=6000,
        combine_text_under_n_chars=1000,
        new_after_n_chars=3000,
    )
    title_chunks = chunk_by_title(elements, **chunking_limits)

    print(f"✅ Created {len(title_chunks)} chunks")
    return title_chunks

# Chunk the parsed elements; `chunks` is the input to summarise_chunks.
chunks = create_chunks_by_title(elements)

🔨 Creating smart chunks...
✅ Created 4 chunks


In [24]:
# Inspect the chunk objects returned by create_chunks_by_title (shows their reprs).
chunks

[<unstructured.documents.elements.CompositeElement at 0x14f686c10>,
 <unstructured.documents.elements.CompositeElement at 0x14f6843d0>,
 <unstructured.documents.elements.CompositeElement at 0x14f686cf0>,
 <unstructured.documents.elements.CompositeElement at 0x14f684d00>]

In [None]:
def separate_content_types(chunk):
    """Collect the text, table content and base64 images that make up a chunk.

    Args:
        chunk: Object exposing ``.text`` and, optionally,
            ``.metadata.orig_elements`` (the elements the chunk was built from).

    Returns:
        dict with the keys:
            'text'   - ``chunk.text``
            'tables' - one entry per ``Table`` element: its ``text_as_html``
                       when set, otherwise its plain ``text``
            'images' - the ``image_base64`` payload of every ``Image`` element
                       that actually carries one
            'types'  - sorted, de-duplicated subset of 'text', 'table', 'image'
    """
    content_data = {
        'text': chunk.text,
        'tables': [],
        'images': [],
        'types': ['text']
    }

    # A metadata object may report an unset field as None instead of raising
    # AttributeError (unstructured's ElementMetadata appears to do this for its
    # known fields - confirm). hasattr()/getattr(default) then cannot detect a
    # missing value, so read with a None default and test the value itself.
    chunk_metadata = getattr(chunk, 'metadata', None)
    orig_elements = getattr(chunk_metadata, 'orig_elements', None) or []

    for element in orig_elements:
        element_type = type(element).__name__
        element_metadata = getattr(element, 'metadata', None)

        # Handle tables: prefer the HTML rendering, fall back to plain text
        if element_type == 'Table':
            content_data['types'].append('table')
            table_html = getattr(element_metadata, 'text_as_html', None) or element.text
            content_data['tables'].append(table_html)

        # Handle images: only keep elements that carry an actual payload
        elif element_type == 'Image':
            image_base64 = getattr(element_metadata, 'image_base64', None)
            if image_base64:
                content_data['types'].append('image')
                content_data['images'].append(image_base64)

    # sorted() makes the reported order deterministic across runs
    content_data['types'] = sorted(set(content_data['types']))
    return content_data

def create_ai_enhanced_summary(text: str, tables: List[str], images: List[str]) -> str:
    """Ask a chat model for a searchable description of mixed chunk content.

    The prompt holds the chunk text, each table (when present) and the task
    instructions; every image is attached to the same message as a base64
    data URL.

    Args:
        text: Raw text of the chunk.
        tables: Table contents to include in the prompt; may be empty.
        images: Base64-encoded image payloads to attach; may be empty.

    Returns:
        The ``content`` of the model response. If anything raises while the
        request is built or sent, the error is printed and a fallback is
        returned instead: the first 300 characters of ``text`` followed by
        "..." and bracketed table/image counts.
    """
    
    try:
        # Initialize LLM (needs vision model for images)
        llm = ChatOpenAI(model="gpt-4o", temperature=0)
        
        # Build the text prompt
        prompt_text = f"""You are creating a searchable description for document content retrieval.

        CONTENT TO ANALYZE:
        TEXT CONTENT:
        {text}

        """
        
        # Add tables if present
        if tables:
            prompt_text += "TABLES:\n"
            for i, table in enumerate(tables):
                prompt_text += f"Table {i+1}:\n{table}\n\n"
        
        # Task instructions are appended exactly once, whether or not the chunk
        # has tables (they used to sit inside the table loop, so image-only
        # chunks were sent without any instructions).
        prompt_text += """
                YOUR TASK:
                Generate a comprehensive, searchable description that covers:

                1. Key facts, numbers, and data points from text and tables
                2. Main topics and concepts discussed  
                3. Questions this content could answer
                4. Visual content analysis (charts, diagrams, patterns in images)
                5. Alternative search terms users might use

                Make it detailed and searchable - prioritize findability over brevity.

                SEARCHABLE DESCRIPTION:"""

        # Build message content starting with text
        message_content = [{"type": "text", "text": prompt_text}]
        
        # Add images to the message
        for image_base64 in images:
            message_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
            })
        
        # Send to AI and get response
        message = HumanMessage(content=message_content)
        response = llm.invoke([message])
        
        return response.content
        
    except Exception as e:
        print(f"     ❌ AI summary failed: {e}")
        # Fallback to simple summary
        summary = f"{text[:300]}..."
        if tables:
            summary += f" [Contains {len(tables)} table(s)]"
        if images:
            summary += f" [Contains {len(images)} image(s)]"
        return summary

def summarise_chunks(chunks, source: str = "docs/abc.pdf"):
    """Turn chunks into LangChain Documents, summarising mixed content with AI.

    For each chunk the content is split with ``separate_content_types``. When
    the chunk has tables or images, ``create_ai_enhanced_summary`` provides the
    page content; otherwise (or if that call raises) the raw chunk text is used.

    Args:
        chunks: Iterable (with ``len``) of chunks accepted by
            ``separate_content_types``.
        source: Value stored under ``metadata["source"]`` of every Document.
            Defaults to the previously hard-coded "docs/abc.pdf"; pass the real
            path of the ingested file.

    Returns:
        list of ``Document`` objects, one per chunk, in input order. Each has
        ``metadata["original_content"]``: a JSON string with the keys
        ``raw_text``, ``tables_html`` and ``images_base64``.
    """
    print("🧠 Processing chunks with AI Summaries...")
    
    langchain_documents = []
    total_chunks = len(chunks)
    
    for current_chunk, chunk in enumerate(chunks, start=1):
        print(f"   Processing chunk {current_chunk}/{total_chunks}")
        
        # Analyze chunk content
        content_data = separate_content_types(chunk)
        
        # Debug prints
        print(f"     Types found: {content_data['types']}")
        print(f"     Tables: {len(content_data['tables'])}, Images: {len(content_data['images'])}")
        
        # Create AI-enhanced summary if chunk has tables/images
        if content_data['tables'] or content_data['images']:
            print("     → Creating AI summary for mixed content...")
            try:
                enhanced_content = create_ai_enhanced_summary(
                    content_data['text'],
                    content_data['tables'], 
                    content_data['images']
                )
                print("     → AI summary created successfully")
                print(f"     → Enhanced content preview: {enhanced_content[:200]}...")
            except Exception as e:
                # Best effort: keep ingesting with the raw text for this chunk
                print(f"     ❌ AI summary failed: {e}")
                enhanced_content = content_data['text']
        else:
            print("     → Using raw text (no tables/images)")
            enhanced_content = content_data['text']
        
        # Create LangChain Document with rich metadata. The original content is
        # serialised to a JSON string so the metadata value is a plain str.
        doc = Document(
            page_content=enhanced_content,
            metadata={
                "source": source,
                "original_content": json.dumps({
                    "raw_text": content_data['text'],
                    "tables_html": content_data['tables'],
                    "images_base64": content_data['images']
                })
            }
        )
        
        langchain_documents.append(doc)
    
    print(f"✅ Processed {len(langchain_documents)} chunks")
    return langchain_documents


# Summarise every chunk; `processed_chunks` is what gets exported and embedded.
processed_chunks = summarise_chunks(chunks)

🧠 Processing chunks with AI Summaries...
   Processing chunk 1/4
     Types found: ['image', 'text']
     Tables: 0, Images: 4
     → Creating AI summary for mixed content...
     → AI summary created successfully
     → Enhanced content preview: **Searchable Description for Document Content Retrieval:**

---

**Title:** Guide to 7 Popular Agentic RAG System Architectures

**Author:** Dipanjan (DJ)

**Overview:**
This document serves as a comp...
   Processing chunk 2/4
     Types found: ['image', 'text']
     Tables: 0, Images: 2
     → Creating AI summary for mixed content...
     → AI summary created successfully
     → Enhanced content preview: **Searchable Description for Document Content Retrieval**

**Key Facts, Numbers, and Data Points:**
- **Methodologies Discussed:** 
  - Query Planning Agentic RAG (Retrieval-Augmented Generation)
  - ...
   Processing chunk 3/4
     Types found: ['image', 'text']
     Tables: 0, Images: 3
     → Creating AI summary for mixed content...
     

In [47]:
# Inspect the Documents returned by summarise_chunks (full repr, including metadata).
processed_chunks

[Document(metadata={'source': 'docs/abc.pdf', 'original_content': '{"raw_text": "Guide\\n\\nGuide to\\n\\n7 Popular Agentic RAG System Architectures\\n\\nis part of Not found in either database Vector DB Selection DB2 DB3 iS Vv Vector Database 1 Vector Database 2 Vector Database 3 |< VB1: This VB3: This inlcude data to inlcude data to Generate Code mere Integrate into LLM Prompt Generate Charts > The agent will decide the vector database most relevant to the ee ee ee to Generate charts uery. 4 7 ito Generate code | Response Type Selection Route to Failsafe Text Generate Text Response Return \\u2018Sorry, I don\'t have the information you\'re looking for\\u2019\\n\\nAgentic RAG Workflow\\n\\nis part of Vector DB Selection > | Vector Database 1 | Es 1 | Vector Database 2 | or 2 | Vector Database 3 | | Vector Database 3 | 3 \\\\< VB1: This [rome | Retrieve Context | Context VB3: This inleude data to inlcude data to G te Code enerate Co aa \\u2014 Generate Charts to Generate code The agent

In [48]:
def export_chunks_to_json(processed_chunks, filename="chunks_export.json"):
    """Write the processed documents to a JSON file and return the exported data.

    Args:
        processed_chunks: Iterable of documents exposing ``page_content`` and a
            ``metadata`` dict; ``metadata["original_content"]`` is expected to
            be a JSON string when present.
        filename: Destination path; the file is written as UTF-8.

    Returns:
        list of dicts, one per document, with ``chunk_id`` (1-based position),
        ``enhanced_content`` and ``metadata`` (``source`` plus the decoded
        ``original_content``).
    """
    export_data = [
        {
            "chunk_id": position,
            "enhanced_content": doc.page_content,
            "metadata": {
                "source": doc.metadata.get("source", ""),
                # Stored as a JSON string; decode it so the export is nested JSON
                "original_content": json.loads(doc.metadata.get("original_content", "{}")),
            },
        }
        for position, doc in enumerate(processed_chunks, start=1)
    ]

    # Save to file
    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(export_data, out_file, indent=2, ensure_ascii=False)

    print(f"✅ Exported {len(export_data)} chunks to {filename}")
    return export_data

# Export the processed chunks to chunks_export.json (the function's default filename).
json_data = export_chunks_to_json(processed_chunks)

✅ Exported 4 chunks to chunks_export.json


In [42]:
def create_vector_store(documents, persist_directory="testing/chroma_db"):
    """Embed the documents and store them in a Chroma collection.

    Args:
        documents: Documents passed to ``Chroma.from_documents``.
        persist_directory: Directory handed to Chroma as ``persist_directory``.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.
    """
    print("🔮 Creating embeddings and storing in ChromaDB...")

    embedder = OpenAIEmbeddings(model="text-embedding-3-small")

    # Create ChromaDB vector store
    print("--- Creating vector store ---")
    store_options = {
        "documents": documents,
        "embedding": embedder,
        "persist_directory": persist_directory,
        # Collection metadata requesting the cosine space for the index
        "collection_metadata": {"hnsw:space": "cosine"},
    }
    store = Chroma.from_documents(**store_options)
    print("--- Finished creating vector store ---")

    print(f"✅ Vector store created and saved to {persist_directory}")
    return store

# Embed the processed chunks; `db` is the vector store queried afterwards.
db = create_vector_store(processed_chunks)

🔮 Creating embeddings and storing in ChromaDB...
--- Creating vector store ---
--- Finished creating vector store ---
✅ Vector store created and saved to testing/chroma_db


In [43]:
# Query the vector store
query = "Different types of RAG architectures?"

retriever = db.as_retriever(search_kwargs={"k": 5})
# Keep the hits under their own name: `chunks` holds the unstructured chunks
# from create_chunks_by_title, and overwriting it with retrieved Documents
# breaks any re-run of summarise_chunks(chunks).
retrieved_docs = retriever.invoke(query)

# Visualize results
print(f"🔍 Query: {query}")
print(f"📊 Found {len(retrieved_docs)} relevant chunks:\n")

for i, doc in enumerate(retrieved_docs, 1):
    print(f"--- Chunk {i} ---")
    print(f"Content: {doc.page_content}...")
    
    # Show original content if available
    if 'original_content' in doc.metadata:
        original = json.loads(doc.metadata['original_content'])
        # .get(): tolerate stored records that lack one of the keys
        if original.get('tables_html'):
            print(f"📊 Contains {len(original['tables_html'])} table(s)")
        if original.get('images_base64'):
            print(f"🖼️ Contains {len(original['images_base64'])} image(s)")
    
    print(f"Source: {doc.metadata.get('source', 'Unknown')}")
    print("-" * 50)

🔍 Query: Different types of RAG architectures?
📊 Found 4 relevant chunks:

--- Chunk 1 ---
Content: **Searchable Description for Document Content Retrieval:**

**Title:** 7 Agentic RAG System Architectures to Build AI Agents  
**Author:** Dipanjan (DJ)  
**Last Updated:** January 7, 2025  
**Estimated Read Time:** 26 minutes  

**Key Facts, Numbers, and Data Points:**
- The document discusses **seven distinct architectures** of Agentic Retrieval-Augmented Generation (RAG) systems.
- It highlights the evolution of AI agents from **2023 to 2025**, emphasizing the transition from static tools to dynamic, agent-driven processes.
- The article suggests that **2025** will be the "Year of AI Agents," indicating a significant shift in productivity and industry practices.
- The content includes specific architectures such as **Speculative RAG** and **Self Route Agentic RAG**, detailing their functionalities and efficiencies.

**Main Topics and Concepts Discussed:**
1. **Agentic RAG Systems:** A

In [16]:
def run_complete_ingestion_pipeline(pdf_path: str, persist_directory=None):
    """Run the complete RAG ingestion pipeline for one PDF.

    Steps, in order: ``partition_document`` -> ``create_chunks_by_title`` ->
    ``summarise_chunks`` -> ``create_vector_store``.

    Args:
        pdf_path: Path of the PDF to ingest.
        persist_directory: Optional directory for the vector store. When left
            as ``None`` the default of ``create_vector_store`` is used, which
            is the previous behaviour.

    Returns:
        The vector store returned by ``create_vector_store``.
    """
    print("🚀 Starting RAG Ingestion Pipeline")
    print("=" * 50)
    
    # Step 1: Partition
    elements = partition_document(pdf_path)
    
    # Step 2: Chunk
    chunks = create_chunks_by_title(elements)
    
    # Step 3: AI Summarisation
    summarised_chunks = summarise_chunks(chunks)
    
    # Step 4: Vector Store (only override the directory when the caller asked to)
    if persist_directory is None:
        db = create_vector_store(summarised_chunks)
    else:
        db = create_vector_store(summarised_chunks, persist_directory=persist_directory)
    
    print("🎉 Pipeline completed successfully!")
    return db

# Run the complete pipeline

In [17]:
# Run all four stages end to end; this rebinds `db` to the newly built vector store.
db = run_complete_ingestion_pipeline("./docs/rag_patterns.pdf")

🚀 Starting RAG Ingestion Pipeline
📄 Partitioning document: ./docs/rag_patterns.pdf
✅ Extracted 90 elements
🔨 Creating smart chunks...
✅ Created 4 chunks
🧠 Processing chunks with AI enhancement...
   Processing chunk 1/4
   Processing chunk 2/4
   Processing chunk 3/4
   Processing chunk 4/4
✅ Processed 4 chunks
🔮 Creating embeddings and storing in ChromaDB...
--- Creating vector store ---
--- Finished creating vector store ---
✅ Vector store created and saved to db1/chroma_db
🎉 Pipeline completed successfully!


In [None]:
# Query the vector store
query = "What are the different agentic RAG system patterns?"

retriever = db.as_retriever(search_kwargs={"k": 5})
# Keep the hits under their own name: `chunks` holds the unstructured chunks
# from create_chunks_by_title, and overwriting it with retrieved Documents
# breaks any re-run of summarise_chunks(chunks).
retrieved_docs = retriever.invoke(query)

# Visualize results
print(f"🔍 Query: {query}")
print(f"📊 Found {len(retrieved_docs)} relevant chunks:\n")

for i, doc in enumerate(retrieved_docs, 1):
    print(f"--- Chunk {i} ---")
    print(f"Content: {doc.page_content}...")
    
    # Show original content if available
    if 'original_content' in doc.metadata:
        original = json.loads(doc.metadata['original_content'])
        # .get(): tolerate stored records that lack one of the keys
        if original.get('tables_html'):
            print(f"📊 Contains {len(original['tables_html'])} table(s)")
        if original.get('images_base64'):
            print(f"🖼️ Contains {len(original['images_base64'])} image(s)")
    
    print(f"Source: {doc.metadata.get('source', 'Unknown')}")
    print("-" * 50)