# Performance Optimization and Deployment

In this notebook, we'll learn how to optimize RAG systems for production deployment and analyze performance trade-offs.

## Learning Objectives
By the end of this notebook, you will:
1. Optimize RAG systems for speed and memory usage
2. Analyze latency and cost trade-offs
3. Implement caching and batch processing
4. Learn about deployment strategies and scaling
5. Monitor and debug production RAG systems


## Setup and Imports

Let's import the libraries we need for performance optimization and analysis.


In [None]:
# Standard library and third-party imports
import json
import time
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Any
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import psutil
import threading
from contextlib import contextmanager

# Add project root to path
import sys
sys.path.append(str(Path.cwd().parent))

# Import our modules
from src.optimization.performance_analysis import PerformanceProfiler, CostAnalyzer, LatencyAnalyzer, PerformanceOptimizer
from src.retrieval.retrieval_system import RetrievalSystem, RetrievalConfig
from src.models.llm_models import RAGGenerator, PromptTemplate
from src.config import DATA_DIR

# Set up plotting: select matplotlib's 'default' style and seaborn's "husl"
# colour palette.
# NOTE(review): the palette is set after the style call, presumably so the
# style does not override it -- confirm before reordering these two lines.
plt.style.use('default')
sns.set_palette("husl")

# Signals that every import and the plotting setup above ran without error.
print("Libraries imported successfully!")

# Build a small synthetic corpus so the performance tests have documents to
# index; every chunk shares the same template and differs only by its number.
print("Creating sample data for performance testing...")

sample_chunks = [
    dict(
        id=f'chunk_{doc_number}',
        text=f'This is sample document {doc_number} with some content about machine learning and artificial intelligence.',
        title=f'Document {doc_number}',
        source='test',
        chunk_id=f'chunk_{doc_number}',
    )
    for doc_number in range(100)  # 100 chunks, numbered 0..99
]

print(f"Created {len(sample_chunks)} sample chunks for performance testing")

# Create the three monitoring helpers used throughout the notebook, in the
# same order as before: request profiler, cost analyzer, latency analyzer.
profiler, cost_analyzer, latency_analyzer = (
    PerformanceProfiler(),
    CostAnalyzer(),
    LatencyAnalyzer(),
)

print("Performance monitoring tools initialized!")


## Performance Profiling

Let's profile our RAG system to identify bottlenecks and optimization opportunities.


In [None]:
# Announce the setup step before any work starts
print("Setting up retrieval system for performance testing...")

# Fixed set of queries exercised by the performance evaluation
test_queries = [
    "What is machine learning?",
    "How does artificial intelligence work?",
    "Explain deep learning",
    "What are neural networks?",
    "Tell me about computer vision",
]

# Retrieval system configured for top_k=5 with reranking disabled, loaded
# with the sample chunks created earlier
retrieval_config = RetrievalConfig(top_k=5, use_reranking=False)
retrieval_system = RetrievalSystem(retrieval_config)
retrieval_system.add_documents(sample_chunks)

print(f"Created retrieval system with {len(sample_chunks)} documents")
print(f"Prepared {len(test_queries)} test queries")

# Performance testing function
def performance_test(queries=None, top_k: int = 5, method: str = "dense") -> List[Dict[str, Any]]:
    """Run timed retrievals and collect one result record per query.

    Each query is retrieved inside ``profiler.profile_request`` and its
    measured duration is also reported to ``latency_analyzer`` under the
    ``"retrieval"`` label.

    Args:
        queries: Iterable of query strings to run. When ``None`` (the
            default) the module-level ``test_queries`` list is used.
        top_k: Number of documents requested from the retrieval system.
        method: Retrieval method name forwarded to
            ``retrieval_system.retrieve``.

    Returns:
        A list with one dict per query containing the keys ``query``,
        ``duration`` (seconds), ``num_results`` and ``results``.
    """
    # Materialise the queries so len() works for any iterable argument.
    queries = test_queries if queries is None else list(queries)
    total = len(queries)
    results = []

    for i, query in enumerate(queries):
        print(f"Testing query {i+1}/{total}: {query}")

        # Profile the retrieval
        with profiler.profile_request(f"query_{i}", "retrieval"):
            # perf_counter() is monotonic and high-resolution, so short
            # intervals are not distorted by system clock adjustments the
            # way time.time() differences can be.
            start_time = time.perf_counter()
            retrieved_docs = retrieval_system.retrieve(query, method=method, top_k=top_k)
            duration = time.perf_counter() - start_time

            # Record latency
            latency_analyzer.record_latency("retrieval", duration, {
                'query': query,
                'num_results': len(retrieved_docs)
            })

            results.append({
                'query': query,
                'duration': duration,
                'num_results': len(retrieved_docs),
                'results': retrieved_docs
            })

    return results

# Execute the benchmark and keep the per-query records for reporting
print("\nRunning performance tests...")
performance_results = performance_test()

# Aggregate timing statistics over all queries
print("\nPerformance Analysis:")
print("=" * 40)

durations = [entry['duration'] for entry in performance_results]
for stat_label, stat_fn in (
    ("Average retrieval time", np.mean),
    ("Min retrieval time", np.min),
    ("Max retrieval time", np.max),
    ("Standard deviation", np.std),
):
    print(f"{stat_label}: {stat_fn(durations):.3f} seconds")

# Per-query breakdown (query text is cut to its first 50 characters)
print("\nDetailed Results:")
for position, entry in enumerate(performance_results, start=1):
    query_preview = entry['query'][:50]
    print(f"{position}. Query: {query_preview}...")
    print(f"   Duration: {entry['duration']:.3f}s, Results: {entry['num_results']}")

# Dump whatever the profiler collected during the run
summary = profiler.get_performance_summary()
print("\nPerformance Summary:")
for metric_name, metric_value in summary.items():
    print(f"  {metric_name}: {metric_value}")
