# Week 5: Building Complete RAG Systems

## Learning Objectives
- Build production-quality RAG pipelines
- Implement document chunking strategies
- Handle multiple document sources
- Improve retrieval quality
- Add metadata and filtering

In [None]:
import os
import numpy as np
import json
from datetime import datetime
from dotenv import load_dotenv
from google import genai
from google.genai import types
import google.auth

# Run first so anything configured via a local .env file is in place before
# credentials are resolved (presumably it populates os.environ — see python-dotenv).
load_dotenv()
# Resolve default Google credentials and the project they belong to.
# Only `project` is used below; `creds` is not referenced again in this notebook.
creds, project = google.auth.default()
# Single shared client, in Vertex AI mode, used by every embedding and
# generation call in the cells that follow.
client = genai.Client(vertexai=True, project=project, location="us-central1")

## Part 1: Document Processing Pipeline

In [None]:
class Document:
    """A piece of source text together with descriptive metadata.

    Args:
        content: The raw text of the document.
        metadata: Optional mapping of descriptive fields (for example
            ``topic`` or ``source``). The mapping is copied, so the caller's
            object is never modified and can safely be reused for several
            documents.

    Attributes:
        content: The text passed in.
        metadata: A private copy of ``metadata`` plus a ``created`` key
            holding the construction time as an ISO-8601 string taken from
            ``datetime.now()``. An incoming ``created`` value is overwritten.
    """
    def __init__(self, content, metadata=None):
        self.content = content
        # Copy before stamping: writing 'created' into the caller's dict would
        # leak the timestamp back to the caller and make documents built from
        # the same dict share (and overwrite) one metadata object.
        self.metadata = dict(metadata) if metadata else {}
        self.metadata['created'] = datetime.now().isoformat()
    
    def __repr__(self):
        # Only the first 50 characters of the content are shown.
        return f"Document(content={self.content[:50]}..., metadata={self.metadata})"

class DocumentProcessor:
    """Split raw text into overlapping chunks of sentences or words.

    Both strategies slide a window of ``chunk_size`` units over the text and
    advance it by ``chunk_size - overlap`` units, so consecutive chunks share
    ``overlap`` units.
    """

    @staticmethod
    def _sliding_chunks(units, chunk_size, overlap):
        """Join sliding windows over ``units`` into space-separated chunks.

        Args:
            units: List of non-empty strings (sentences or words).
            chunk_size: Maximum number of units per chunk; must be >= 1.
            overlap: Units shared by consecutive chunks. Values outside
                ``0 .. chunk_size - 1`` are clamped into that range.

        Returns:
            List of chunk strings; empty when ``units`` is empty.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")
        # An overlap >= chunk_size would give a zero step (range() raises) or
        # a negative step (range() is empty and the text is silently dropped),
        # so keep the window moving forward by at least one unit.
        overlap = max(0, min(overlap, chunk_size - 1))
        step = chunk_size - overlap

        chunks = []
        for start in range(0, len(units), step):
            chunks.append(' '.join(units[start:start + chunk_size]))
            # Once a window reaches the end of the text, any further window
            # would only repeat the tail of this one.
            if start + chunk_size >= len(units):
                break
        return chunks

    @staticmethod
    def chunk_by_sentences(text, chunk_size=3, overlap=1):
        """Chunk ``text`` into groups of ``chunk_size`` sentences.

        Sentences are found by splitting on every ``'.'`` character, so
        decimals and abbreviations are split as well, and ``'?'`` / ``'!'``
        do not end a sentence. Each non-empty piece is stripped and given a
        trailing ``'.'``.

        Args:
            text: Text to split.
            chunk_size: Sentences per chunk; must be >= 1.
            overlap: Sentences shared by consecutive chunks; clamped to
                ``0 .. chunk_size - 1``.

        Returns:
            List of chunk strings (empty for empty or whitespace-only text).

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
        return DocumentProcessor._sliding_chunks(sentences, chunk_size, overlap)

    @staticmethod
    def chunk_by_words(text, chunk_size=200, overlap=50):
        """Chunk ``text`` into groups of ``chunk_size`` whitespace-separated words.

        Args:
            text: Text to split.
            chunk_size: Words per chunk; must be >= 1.
            overlap: Words shared by consecutive chunks; clamped to
                ``0 .. chunk_size - 1``.

        Returns:
            List of chunk strings (empty for empty or whitespace-only text).

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        return DocumentProcessor._sliding_chunks(text.split(), chunk_size, overlap)

print("✓ Document classes ready")

## Part 2: Advanced Document Store

In [None]:
class RAGDocumentStore:
    """In-memory vector store for document chunks with metadata filtering.

    Chunks, their embeddings and their metadata are kept in three parallel
    lists; index ``i`` in each list refers to the same chunk.

    Args:
        embedding_model: Model name sent to the embedding API.
        embed_fn: Optional callable ``text -> sequence of floats``. When
            given it replaces the API call, which allows a different
            embedder (or offline use) without touching the module-level
            ``client``.
    """

    def __init__(self, embedding_model="text-embedding-004", embed_fn=None):
        self.embedding_model = embedding_model
        self._embed_fn = embed_fn
        self.chunks = []
        self.embeddings = []
        self.metadata = []

    def _embed(self, text):
        """Return the embedding vector for ``text``."""
        if self._embed_fn is not None:
            return self._embed_fn(text)
        response = client.models.embed_content(
            model=self.embedding_model,
            contents=text
        )
        return response.embeddings[0].values

    def add_document(self, document, chunk_strategy='sentences', chunk_size=3, overlap=None):
        """Chunk ``document``, embed every chunk and add them to the store.

        Args:
            document: Object exposing ``content`` (text) and ``metadata`` (dict).
            chunk_strategy: ``'sentences'`` chunks by sentence count; any
                other value chunks by word count.
            chunk_size: Sentences or words per chunk; must be >= 1. Note that
                the default of 3 is small for the word strategy.
            overlap: Units shared by consecutive chunks. ``None`` uses 1 for
                sentences and 50 for words. The value is clamped to
                ``0 .. chunk_size - 1`` so the chunker always moves forward.

        Returns:
            Number of chunks added.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")

        if chunk_strategy == 'sentences':
            chunker, default_overlap = DocumentProcessor.chunk_by_sentences, 1
        else:
            chunker, default_overlap = DocumentProcessor.chunk_by_words, 50

        if overlap is None:
            overlap = default_overlap
        # Without the clamp a small chunk_size combined with the word
        # chunker's overlap of 50 gives a non-positive step, and the document
        # is indexed as zero chunks (or range() raises on a zero step).
        overlap = max(0, min(overlap, chunk_size - 1))

        chunks = chunker(document.content, chunk_size, overlap)

        # Embed everything before touching the store, so a failed API call
        # cannot leave a partially indexed document behind. Newlines are
        # flattened for the embedding only; the stored chunk keeps them.
        vectors = [self._embed(chunk.replace("\n", " ")) for chunk in chunks]

        for i, (chunk, vector) in enumerate(zip(chunks, vectors)):
            chunk_metadata = document.metadata.copy()
            chunk_metadata['chunk_index'] = i
            chunk_metadata['total_chunks'] = len(chunks)

            self.chunks.append(chunk)
            self.embeddings.append(vector)
            self.metadata.append(chunk_metadata)

        return len(chunks)

    def search(self, query, top_k=5, filters=None):
        """Return the ``top_k`` chunks most similar to ``query``.

        Args:
            query: Query text to embed and compare against stored chunks.
            top_k: Maximum number of results; values <= 0 give no results.
            filters: Optional dict; a chunk is considered only if its
                metadata contains every key with an equal value.

        Returns:
            List of dicts with keys ``chunk``, ``similarity`` (cosine
            similarity as a float) and ``metadata`` (the stored dict itself,
            not a copy), ordered from most to least similar.
        """
        if top_k <= 0 or not self.embeddings:
            return []

        if filters:
            candidates = [
                i for i, meta in enumerate(self.metadata)
                if all(key in meta and meta[key] == value
                       for key, value in filters.items())
            ]
        else:
            candidates = list(range(len(self.embeddings)))

        # Nothing to rank: skip the embedding call entirely.
        if not candidates:
            return []

        query_vec = np.asarray(self._embed(query), dtype=float)
        matrix = np.asarray([self.embeddings[i] for i in candidates], dtype=float)

        dots = matrix @ query_vec
        norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
        # A zero-length vector has no direction; score it 0 instead of NaN so
        # the ordering below stays well defined.
        scores = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

        # sorted() is stable, so equally scored chunks keep insertion order.
        ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)

        return [
            {
                "chunk": self.chunks[idx],
                "similarity": float(score),
                "metadata": self.metadata[idx]
            }
            for idx, score in ranked[:top_k]
        ]

    def __len__(self):
        """Number of chunks currently stored."""
        return len(self.chunks)

print("✓ RAG document store ready")

## Part 3: Complete RAG System

In [None]:
class RAGSystem:
    """Retrieval-augmented question answering over a document store.

    Args:
        model: Name of the generation model used to write answers.
            NOTE(review): confirm the default is still served for your
            project and region before relying on it.
        store: Optional pre-built store exposing ``add_document``,
            ``search``, ``metadata`` and ``__len__``. A new
            ``RAGDocumentStore`` is created when omitted.
    """

    def __init__(self, model="gemini-1.5-flash", store=None):
        self.model = model
        self.store = store if store is not None else RAGDocumentStore()

    def add_document(self, content, metadata=None, **kwargs):
        """Wrap ``content`` in a ``Document`` and index it.

        Args:
            content: Document text.
            metadata: Optional dict of descriptive fields.
            **kwargs: Forwarded to the store's ``add_document`` (for example
                ``chunk_strategy`` and ``chunk_size``).

        Returns:
            Number of chunks added to the store.
        """
        doc = Document(content, metadata)
        return self.store.add_document(doc, **kwargs)

    @staticmethod
    def _build_prompt(question, results):
        """Assemble the generation prompt from retrieved chunks and the question."""
        context = "\n\n".join(
            f"[Source {i+1}] {r['chunk']}"
            for i, r in enumerate(results)
        )
        # Written line by line so the exact whitespace sent to the model
        # (including the space after "context.") is visible.
        return (
            "Answer the question based on the provided context. \n"
            "If the answer is not in the context, say so.\n"
            "\n"
            "Context:\n"
            f"{context}\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Answer:"
        )

    def query(self, question, top_k=3, filters=None, temperature=0.3):
        """Answer ``question`` using the most relevant stored chunks.

        Args:
            question: Natural-language question.
            top_k: Number of chunks to retrieve as context.
            filters: Optional metadata filter passed to the store's search.
            temperature: Sampling temperature for generation.

        Returns:
            Dict with ``answer`` (str) and ``sources`` (the search results
            used as context). When nothing is retrieved the model is not
            called and a fixed "no relevant documents" answer is returned.
        """
        results = self.store.search(question, top_k=top_k, filters=filters)

        if not results:
            return {
                "answer": "No relevant documents found.",
                "sources": []
            }

        response = client.models.generate_content(
            model=self.model,
            contents=self._build_prompt(question, results),
            config=types.GenerateContentConfig(temperature=temperature)
        )

        # response.text can be None when the response carries no text parts
        # (for example a blocked or empty candidate); keep "answer" a string.
        answer = response.text
        if answer is None:
            answer = "The model returned no text for this question."

        return {
            "answer": answer,
            "sources": results
        }

    def stats(self):
        """Summarize what is currently indexed.

        Returns:
            Dict with:
              * ``total_chunks`` - number of stored chunks.
              * ``unique_documents`` - number of distinct ``source`` metadata
                values (chunks without a source count together as one).
                Documents sharing a source are therefore counted once.
              * ``total_documents`` - number of indexed documents, counted
                as chunks whose ``chunk_index`` is 0.
        """
        metadata = self.store.metadata
        return {
            "total_chunks": len(self.store),
            "unique_documents": len({m.get('source', '') for m in metadata}),
            "total_documents": sum(1 for m in metadata if m.get('chunk_index') == 0)
        }

print("✓ RAG system ready")

## Part 4: Example - Medical Knowledge Base

In [None]:
# Create the RAG system; `rag` is reused by the query cells further down.
rag = RAGSystem()

# Sample documents. Each entry pairs the text with metadata that can later be
# used as a search filter (see the `topic` filter in Part 5).
# The content strings are triple-quoted, so their line breaks and leading
# indentation are part of the text that gets chunked and stored.
documents = [
    {
        "content": """Hypertension, or high blood pressure, is a common condition where blood 
        pressure is consistently elevated. Treatment includes lifestyle changes like diet and 
        exercise, and medications such as ACE inhibitors or diuretics. Regular monitoring is essential.""",
        "metadata": {"topic": "cardiovascular", "source": "clinical_guide", "date": "2024"}
    },
    {
        "content": """Type 2 diabetes is characterized by insulin resistance and high blood sugar. 
        Management includes blood glucose monitoring, dietary modifications, exercise, and medications 
        like metformin. Complications can affect kidneys, eyes, and nerves if uncontrolled.""",
        "metadata": {"topic": "endocrine", "source": "clinical_guide", "date": "2024"}
    },
    {
        "content": """Asthma is a chronic respiratory condition causing airway inflammation and bronchospasm. 
        Symptoms include wheezing, shortness of breath, and coughing. Treatment involves inhaled 
        corticosteroids for prevention and bronchodilators for acute symptoms.""",
        "metadata": {"topic": "respiratory", "source": "clinical_guide", "date": "2024"}
    }
]

# Index every document in chunks of two sentences. Each chunk is embedded
# through the API inside add_document, so this loop makes network calls.
for doc in documents:
    chunks = rag.add_document(doc["content"], doc["metadata"], chunk_strategy='sentences', chunk_size=2)
    print(f"Added document: {chunks} chunks")

# All three documents share the source "clinical_guide", and stats() counts
# distinct `source` values under "unique_documents".
print(f"\nSystem stats: {rag.stats()}")

In [None]:
# Ask a few sample questions and show each answer with the chunks behind it
questions = [
    "What medications are used to treat high blood pressure?",
    "How is diabetes managed?",
    "Tell me about respiratory conditions"
]

divider = "="*70
for question in questions:
    print(f"\nQ: {question}")
    print(divider)

    # Retrieve the two best chunks and let the model answer from them
    result = rag.query(question, top_k=2)
    print(f"A: {result['answer']}")

    # List the evidence, best match first, with score and metadata
    print("\nSources used:")
    for rank, source in enumerate(result['sources'], start=1):
        preview = source['chunk'][:80]
        print(f"  {rank}. [Score: {source['similarity']:.3f}] {preview}...")
        print(f"     Metadata: {source['metadata']}")

## Part 5: Filtered Search

In [None]:
# Restrict retrieval to chunks whose metadata marks them as cardiovascular
cardio_only = {"topic": "cardiovascular"}
result = rag.query("What treatments are available?", top_k=2, filters=cardio_only)

print("Filtered to cardiovascular only:")
print(result['answer'])

## Key Takeaways

1. **Chunk strategically** - Consider document structure
2. **Use metadata** - Filter and organize retrieved content
3. **Balance chunk size** - Too small loses context, too large loses precision
4. **Track sources** - Always show where information came from
5. **Test retrieval quality** - Verify relevant chunks are found

## Next Week

Best practices and production patterns:
- Error handling
- Cost optimization
- Testing strategies
- Deployment considerations