In [None]:
"""
RAG Modular Architecture Showcase
=================================

This notebook demonstrates how to use the modular RAG evaluation system for:
- Building and configuring RAG pipelines
- Using the flexible metrics system
- Running systematic evaluations
- Comparing different configurations
- Analyzing results

Architecture Overview:
- Core: Configuration management and type definitions
- Metrics: Pluggable metrics registry with RAGAS and custom metrics
- Evaluation: Modular evaluator that works with registered metrics
- Pipelines: Different pipeline patterns (linear, parallel)
- System: Main RAG orchestrator

Key Features:
- 🏗️ Factory Pattern: Consistent component creation
- 🔌 Strategy Pattern: Swappable implementations  
- 📊 Metrics Registry: Add custom metrics without touching core code
- ⚡ Async First: High-performance async operations
- 🎯 Type Safe: Full type hints for better IDE support
"""

# ============================================================================
# 1. SETUP AND ENVIRONMENT CONFIGURATION
# ============================================================================

In [None]:
# Core imports
import asyncio
import json
import re
import time
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import nest_asyncio
import pandas as pd
import seaborn as sns
from IPython.display import display, Markdown

# Apply nest_asyncio to handle async in Jupyter
nest_asyncio.apply()

# Import configuration and system components
from core.config import (
    ConfigManager, RAGConfig, LoaderConfig, ChunkerConfig,
    EmbeddingConfig, StorageConfig, RetrievalConfig, 
    GenerationConfig, MetricsConfig
)
from system.rag_system import ModularRAGSystem
from utils.helpers import create_project_directories, verify_api_keys

# Import metrics components
from core.metrics_registry import BaseMetric, MetricRequirements, MetricResult

# Create the project's working directories via the utils helper
create_project_directories()

# The configuration below uses OpenAI for both embeddings and generation,
# so report up front whether that key was found
api_keys = verify_api_keys()
if api_keys.get("OPENAI_API_KEY"):
    print("✅ Environment configured successfully")
else:
    print("⚠️ Please set OPENAI_API_KEY in your .env file")

# ============================================================================
# 2. BASIC CONFIGURATION
# ============================================================================

In [None]:
# Assemble the showcase configuration one section at a time, then combine the
# sections into a single RAGConfig.

# Document loading
loader_config = LoaderConfig(
    type="text",                    # Options: "text", "text_image", "none"
    pdf_extract_images=False,       # Whether to extract images from PDFs
    supported_formats=["pdf", "txt", "docx"],
)

# Chunking strategy
chunker_config = ChunkerConfig(
    method="recursive",             # Options: "recursive", "semantic", "sentence", "fixed", "sliding_window"
    chunk_size=500,                 # Characters per chunk
    chunk_overlap=50,               # Overlap between chunks
    semantic_threshold=0.8,         # For semantic chunker only
)

# Embedding model
embedding_config = EmbeddingConfig(
    provider="openai",              # Options: "openai", "cohere", "huggingface", "sentence_transformers"
    model="text-embedding-3-small", # Specific model to use
    dimension=1536,                 # Embedding dimension
    batch_size=100,                 # Batch size for embedding
)

# Vector storage
storage_config = StorageConfig(
    type="faiss",                   # Options: "faiss", "chroma", "pinecone", "weaviate", "qdrant"
    persist=True,                   # Whether to save vector store to disk
    metric="cosine",                # Distance metric for similarity
)

# Retrieval strategy
retrieval_config = RetrievalConfig(
    strategy="vector",              # Options: "vector", "bm25", "hybrid", "mmr", "rerank"
    top_k=5,                        # Number of documents to retrieve
    search_type="similarity",       # Type of search to perform
    hybrid_weights=[0.7, 0.3],      # Weights for hybrid retrieval [vector, bm25]
    mmr_lambda=0.5,                 # For MMR diversity
)

# Generation model
generation_config = GenerationConfig(
    provider="openai",              # Options: "openai", "anthropic", "cohere", "huggingface", "ollama"
    model="gpt-4o-mini",            # Specific model to use
    temperature=0.0,                # 0.0 = deterministic, higher = more creative
    max_tokens=1000,                # Maximum tokens in response
    prompt_template="default",      # Which prompt template to use
)

# Metrics: explicit metric names plus a whole metric group
metrics_config = MetricsConfig(
    metric_names=["ragas_faithfulness", "ragas_answer_relevancy", "response_time"],
    metric_groups=["performance"],  # Will include all performance metrics
)

config = RAGConfig(
    # Experiment metadata
    experiment_name="rag_showcase",
    tags=["showcase", "modular", "evaluation"],
    # Component sections defined above
    loader=loader_config,
    chunker=chunker_config,
    embedding=embedding_config,
    storage=storage_config,
    retrieval=retrieval_config,
    generation=generation_config,
    metrics=metrics_config,
    # Pipeline configuration
    pipeline_type="linear",         # Options: "linear", "parallel", "iterative"
)

# Wrap the config in a manager; the system in the next cell is built from it
config_manager = ConfigManager(config)
print(f"Configuration created with ID: {config.experiment_id}")
print(f"Variant ID: {config.get_variant_id()}")

# ============================================================================
# 3. INITIALIZE RAG SYSTEM
# ============================================================================

In [None]:
# Build the system from the configuration manager and create its components
rag_system = ModularRAGSystem(config_manager)
rag_system.initialize_components()

display(Markdown("### Available Metrics"))

# Short alias for the registry exposed by the metric factory
metric_registry = rag_system.metric_factory.registry

# Metrics usable when no reference answer is supplied
no_ref_metrics = metric_registry.get_available_metrics(has_reference=False)
print(f"Without reference: {no_ref_metrics}")

# Metrics usable when a reference answer is supplied
ref_metrics = metric_registry.get_available_metrics(has_reference=True)
print(f"With reference: {ref_metrics}")

# List each metric group with the names of its members.
# `_metric_groups` is a private attribute, so fall back to "no groups" if it is absent.
print("\nMetric groups available:")
for group_name in getattr(metric_registry, '_metric_groups', ()):
    member_names = [m.name for m in metric_registry.get_metrics_by_group(group_name)]
    print(f"- {group_name}: {member_names}")

# ============================================================================
# 4. DOCUMENT PROCESSING
# ============================================================================

In [None]:
# Specify the path to your documents
DOCUMENT_PATH = "./documents/sample.pdf"  # Update this to your document path

# Fall back to a small generated text file when the document is missing,
# so the remaining cells can still run end to end
if not Path(DOCUMENT_PATH).exists():
    print(f"⚠️ Document not found at {DOCUMENT_PATH}")
    print("Creating a sample document for demonstration...")
    Path("./documents").mkdir(exist_ok=True)
    # Explicit encoding so the written file does not depend on the platform default
    with open("./documents/sample.txt", "w", encoding="utf-8") as f:
        f.write("""
        Retrieval-Augmented Generation (RAG) is a powerful technique that combines 
        information retrieval with language generation. The system works by first 
        retrieving relevant documents from a knowledge base, then using these documents 
        as context for generating accurate and informed responses.
        
        The modular architecture of our RAG system provides several key benefits:
        1. Flexibility: Easy component swapping through configuration
        2. Extensibility: Add new strategies without modifying core code
        3. Testability: Each component can be tested independently
        4. Performance: Optimized async operations throughout
        5. Observability: Comprehensive logging and metrics
        
        Key components include:
        - Document loader and processor
        - Text chunking strategy
        - Embedding model for vectorization
        - Vector store for efficient retrieval
        - Retrieval strategy
        - Language model for generation
        - Evaluation metrics
        """)
    DOCUMENT_PATH = "./documents/sample.txt"

# Load and process documents
chunks = rag_system.load_and_process_documents(DOCUMENT_PATH)

# Fail with a clear message here rather than with a ZeroDivisionError in the
# average-size computation further down
if not chunks:
    raise ValueError(f"No chunks were produced from {DOCUMENT_PATH}")

# Create or load vector store
# force_rebuild=True will recreate the vector store even if it exists
vector_store = rag_system.create_or_load_vector_store(chunks, force_rebuild=True)

total_chars = sum(len(c.page_content) for c in chunks)
print("\n📊 Processing Summary:")
print(f"- Chunks created: {len(chunks)}")
print(f"- Average chunk size: {total_chars / len(chunks):.0f} chars")

# Test the vector store with a sample query
rag_system.vs_manager.test_retrieval("What is RAG?", k=3)

# ============================================================================
# 5. BUILDING RAG PIPELINES
# ============================================================================

In [None]:
# Build a linear pipeline (standard RAG flow).
# The return value is kept in `pipeline`; the query cells below go through
# `rag_system.query(...)` rather than using this variable directly.
pipeline = rag_system.build_pipeline(pipeline_type="linear")

# Alternative: Build a parallel pipeline with multiple retrievers
# This uses both vector and BM25 retrieval in parallel
# pipeline = rag_system.build_pipeline(pipeline_type="parallel")

# ============================================================================
# 6. BASIC QUERYING WITH EVALUATION
# ============================================================================

In [None]:
async def run_simple_query():
    """Ask a single question with evaluation switched off and print the outcome.

    Returns:
        The result mapping returned by ``rag_system.query``.
    """
    # evaluate=False: only the answer is wanted here, no metrics
    response = await rag_system.query(
        question="What is the main topic of the document?",
        evaluate=False,
    )

    report_lines = [
        f"\n❓ Question: {response['question']}",
        f"💡 Answer: {response['answer']}",
        f"⏱️  Response time: {response['response_time']:.2f}s",
        f"📄 Retrieved {response['num_contexts']} contexts",
    ]
    print("\n".join(report_lines))

    return response

# Run the simple query (the coroutine is awaited directly at cell level);
# the returned result mapping is kept in `simple_result`
simple_result = await run_simple_query()

# ============================================================================
# 7. QUERY WITH AUTOMATIC EVALUATION (NO REFERENCE)
# ============================================================================

In [None]:
async def run_evaluated_query():
    """Run one query with reference-free evaluation and print every metric.

    No reference answer is passed, so only the metrics named in the call are
    requested. Each metric entry is printed either as an error or as a value
    together with its computation time.

    Returns:
        The result mapping returned by ``rag_system.query``.
    """

    # Query with automatic evaluation using modular system
    result = await rag_system.query(
        question="What are the key components of the RAG system?",
        evaluate=True,                  # Enable evaluation
        use_modular=True,               # Use new modular evaluation system
        metrics=["ragas_faithfulness", "ragas_answer_relevancy", "response_time"]
    )

    print(f"\n💡 Answer: {result['answer']}")

    # Display evaluation results
    if 'evaluation' in result:
        print("\n📊 Evaluation Results:")
        for metric_name, metric_result in result['evaluation'].items():
            # Read 'error' with .get(), like the other cells do, so an entry
            # that carries no 'error' key is treated as a success instead of
            # raising KeyError
            error = metric_result.get('error')
            if error:
                print(f"❌ {metric_name}: Error - {error}")
            else:
                print(f"✅ {metric_name}: {metric_result['value']:.3f} "
                      f"(computed in {metric_result['computation_time']:.2f}s)")

    return result

# Run the evaluated query; `evaluated_result` keeps the full result mapping,
# including the 'evaluation' entry when the system returns one
evaluated_result = await run_evaluated_query()

# ============================================================================
# 8. EVALUATION WITH REFERENCE ANSWERS
# ============================================================================

In [None]:
async def run_reference_evaluation():
    """Evaluate a fixed set of questions against ground-truth reference answers.

    Each question is sent to ``rag_system.query`` together with its reference
    answer. The successful metric values are then collected into one table row
    per question (metrics that reported an error are left out of the row).

    Returns:
        A tuple ``(results, eval_df)``: the raw query results in question
        order, and a DataFrame with the question, a truncated answer and the
        metric values.
    """
    # Metrics requested for every test case
    reference_metrics = ["ragas_faithfulness", "ragas_answer_relevancy",
                         "ragas_context_precision", "semantic_similarity"]

    # Questions paired with the answer we expect
    test_cases = [
        {
            "question": "What is RAG?",
            "reference": "RAG (Retrieval-Augmented Generation) is a technique that combines "
                        "retrieval systems with language models to generate more accurate "
                        "and contextual responses."
        },
        {
            "question": "What are the benefits of modular architecture?",
            "reference": "The modular architecture provides flexibility, extensibility, "
                        "testability, performance optimization, and observability."
        },
        {
            "question": "What metrics are available?",
            "reference": "The system supports RAGAS metrics (faithfulness, answer relevancy, "
                        "context precision/recall), custom metrics (response time, token "
                        "efficiency, semantic similarity), and is extensible for new metrics."
        }
    ]

    # Query the system once per test case
    results = []
    for case in test_cases:
        print(f"\n❓ Evaluating: {case['question']}")
        outcome = await rag_system.query(
            question=case["question"],
            reference=case["reference"],  # Ground truth to compare against
            evaluate=True,
            use_modular=True,
            metrics=list(reference_metrics),  # fresh list for every call
        )
        results.append(outcome)

    # Flatten each result into one table row
    rows = []
    for outcome in results:
        row = {
            'question': outcome['question'],
            'answer': outcome['answer'][:100] + '...',  # Truncate for display
        }
        for metric_name, metric_result in outcome.get('evaluation', {}).items():
            if not metric_result.get('error'):
                row[metric_name] = metric_result['value']
        rows.append(row)
    eval_df = pd.DataFrame(rows)

    print("\n📊 Evaluation Summary:")
    print(eval_df.to_string())

    return results, eval_df

# Run reference evaluation: `ref_results` holds the raw query results,
# `ref_df` the per-question summary table
ref_results, ref_df = await run_reference_evaluation()

# ============================================================================
# 9. USING METRIC GROUPS
# ============================================================================

In [None]:
def demonstrate_metric_groups():
    """Print the metric set that the metric factory recommends per use case.

    Returns:
        The list of use-case names that were looked up.
    """
    print("\n📋 Recommended Metric Sets:")

    # Use cases to look up recommendations for
    use_cases = ["general", "quality_focus", "performance_focus", "no_reference"]

    for case_name in use_cases:
        recommended = rag_system.metric_factory.get_recommended_metrics(case_name)
        print(f"\n{case_name}:")
        for metric_name in recommended:
            print(f"  - {metric_name}")

    return use_cases

# Show the recommended metric sets; the list of use-case names that were
# looked up is kept in `use_cases`
use_cases = demonstrate_metric_groups()

# ============================================================================
# 10. A/B TESTING DIFFERENT CONFIGURATIONS
# ============================================================================

In [None]:
async def run_ab_testing():
    """Compare RAG configuration variants on a shared set of questions.

    Every variant gets its own ``ModularRAGSystem``, re-processes the document
    at ``DOCUMENT_PATH`` with its own chunker settings and builds its own
    vector store. Sharing the base system's chunks and vector store (built
    with ``chunk_size=500``) would make the ``small_chunks`` and
    ``large_chunks`` variants retrieve from identical chunks, so their chunker
    settings would never influence the comparison.

    Returns:
        A tuple ``(experiment_results, test_questions)``. ``experiment_results``
        maps each variant name to the list of query results collected for it,
        in the order of ``test_questions``.
    """

    # Create configuration variants to test
    configs = {
        "small_chunks": RAGConfig(
            experiment_name="small_chunks",
            chunker=ChunkerConfig(
                method="recursive",
                chunk_size=200,      # Smaller chunks
                chunk_overlap=20
            ),
            retrieval=RetrievalConfig(
                strategy="vector",
                top_k=7              # More chunks to compensate
            ),
            generation=GenerationConfig(
                model="gpt-4o-mini",
                temperature=0.0
            )
        ),

        "large_chunks": RAGConfig(
            experiment_name="large_chunks",
            chunker=ChunkerConfig(
                method="recursive",
                chunk_size=1000,     # Larger chunks
                chunk_overlap=100
            ),
            retrieval=RetrievalConfig(
                strategy="vector",
                top_k=3              # Fewer chunks needed
            ),
            generation=GenerationConfig(
                model="gpt-4o-mini",
                temperature=0.0
            )
        ),

        "hybrid_retrieval": RAGConfig(
            experiment_name="hybrid_retrieval",
            chunker=ChunkerConfig(
                method="recursive",
                chunk_size=500,
                chunk_overlap=50
            ),
            retrieval=RetrievalConfig(
                strategy="hybrid",    # Use both vector and BM25
                top_k=5,
                hybrid_weights=[0.7, 0.3]
            ),
            generation=GenerationConfig(
                model="gpt-4o-mini",
                temperature=0.0
            )
        )
    }

    # Test questions for comparison
    test_questions = [
        "What is the main purpose of this system?",
        "How does the modular architecture work?",
        "What are the benefits of using this approach?"
    ]

    # Run experiments for each configuration
    experiment_results = {}

    for config_name, config_variant in configs.items():
        print(f"\n🧪 Testing configuration: {config_name}")

        # Create new system with variant configuration
        variant_manager = ConfigManager(config_variant)
        variant_system = ModularRAGSystem(variant_manager)
        variant_system.initialize_components()

        # Chunk and index the document with THIS variant's settings, following
        # the same load -> vector store -> pipeline sequence used for the base
        # system earlier in the notebook.
        variant_chunks = variant_system.load_and_process_documents(DOCUMENT_PATH)
        # NOTE(review): if all systems persist to the same on-disk location,
        # force_rebuild=True may overwrite the base system's saved store;
        # confirm where the storage component persists.
        variant_store = variant_system.create_or_load_vector_store(
            variant_chunks, force_rebuild=True
        )

        # Keep the explicit retriever wiring, now pointing at the variant's
        # own vector store and chunks
        variant_system.retriever_factory.vector_store = variant_store
        variant_system.retriever_factory.documents = variant_chunks

        # Build pipeline with new configuration
        variant_system.build_pipeline()

        # Run tests
        variant_results = []
        for question in test_questions:
            result = await variant_system.query(
                question=question,
                evaluate=True,
                metrics=["ragas_faithfulness", "ragas_answer_relevancy", "response_time"]
            )
            variant_results.append(result)
            print(f"  ✓ Tested: {question[:50]}...")

        experiment_results[config_name] = variant_results

    return experiment_results, test_questions

# Run A/B testing: every variant answers each test question with evaluation
# enabled, so this cell issues several evaluated queries
experiment_results, test_questions = await run_ab_testing()

# ============================================================================
# 11. RESULTS ANALYSIS AND VISUALIZATION
# ============================================================================

In [None]:
def analyze_experiment_results(experiment_results):
    """Analyze and visualize A/B test results.

    Args:
        experiment_results: Mapping of configuration name to a list of query
            results. Each result must provide 'question', 'response_time' and
            'num_contexts'; an optional 'evaluation' mapping supplies metric
            entries, of which only those without an 'error' are used.

    Returns:
        A tuple ``(analysis_df, summary)``: one row per (configuration,
        question), and the per-configuration means rounded to 3 decimals.
        For empty input both frames are empty and nothing is plotted.
    """
    summary_columns = ['ragas_faithfulness', 'ragas_answer_relevancy',
                       'response_time', 'num_contexts']
    # Metrics where the smallest mean is the best outcome
    lower_is_better = {'response_time'}

    # Prepare data for analysis
    analysis_data = []
    for config_name, results in experiment_results.items():
        for result in results:
            row = {
                'config': config_name,
                'question': result['question'],
                'response_time': result['response_time'],
                'num_contexts': result['num_contexts']
            }

            # Add evaluation metrics that were computed without an error
            for metric_name, metric_data in (result.get('evaluation') or {}).items():
                if not metric_data.get('error'):
                    row[metric_name] = metric_data['value']

            analysis_data.append(row)

    # Create DataFrame for analysis
    analysis_df = pd.DataFrame(analysis_data)
    if analysis_df.empty:
        # Nothing to group or plot; groupby('config') would raise KeyError
        print("\n⚠️ No experiment results to analyze")
        return analysis_df, pd.DataFrame(columns=summary_columns)

    # A metric that failed for every query leaves no column behind; add it as
    # NaN so the aggregation below does not raise KeyError
    for column in summary_columns:
        if column not in analysis_df.columns:
            analysis_df[column] = float('nan')

    # Display summary statistics
    print("\n📊 Summary Statistics by Configuration:")
    summary = (
        analysis_df.groupby('config')
        .agg({column: 'mean' for column in summary_columns})
        .round(3)
    )
    print(summary)

    # Create visualizations
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle('Configuration Comparison', fontsize=16)

    # (axis, column, title, y-limits) for the three box plots
    panels = [
        (axes[0, 0], 'ragas_faithfulness', 'Faithfulness Scores', (0, 1.1)),
        (axes[0, 1], 'ragas_answer_relevancy', 'Answer Relevancy Scores', (0, 1.1)),
        (axes[1, 0], 'response_time', 'Response Time (seconds)', None),
    ]
    for panel_ax, column, title, y_limits in panels:
        if analysis_df[column].notna().any():
            sns.boxplot(data=analysis_df, x='config', y=column, ax=panel_ax)
        else:
            panel_ax.text(0.5, 0.5, 'no data', ha='center', va='center',
                          transform=panel_ax.transAxes)
        panel_ax.set_title(title)
        # tick_params rotates the labels without re-setting them, which avoids
        # matplotlib's fixed-tick-label warning
        panel_ax.tick_params(axis='x', labelrotation=45)
        if y_limits is not None:
            panel_ax.set_ylim(*y_limits)

    # Fourth panel: text summary instead of a plot
    ax = axes[1, 1]
    ax.axis('off')

    best_lines = []
    worst_lines = []
    for metric in summary.columns:
        scores = summary[metric].dropna()
        if scores.empty:
            continue  # metric was never computed successfully
        # Pick best/worst according to the metric's direction
        if metric in lower_is_better:
            best_config, worst_config = scores.idxmin(), scores.idxmax()
        else:
            best_config, worst_config = scores.idxmax(), scores.idxmin()
        best_lines.append(f"- {metric}: {best_config} ({scores.loc[best_config]:.3f})\n")
        worst_lines.append(f"- {metric}: {worst_config} ({scores.loc[worst_config]:.3f})\n")

    summary_text = (
        "Performance Summary:\n\n"
        "Best performers:\n" + "".join(best_lines) +
        "\nAreas for improvement:\n" + "".join(worst_lines)
    )

    ax.text(0.1, 0.9, summary_text, transform=ax.transAxes,
            fontsize=12, verticalalignment='top', fontfamily='monospace')

    plt.tight_layout()
    plt.show()

    return analysis_df, summary

# Analyze the experiment results: prints the per-configuration summary table
# and renders the comparison figure
analysis_df, summary = analyze_experiment_results(experiment_results)

# ============================================================================
# 12. CREATING CUSTOM METRICS
# ============================================================================

In [None]:
# Words ignored when comparing question, answer and context vocabularies
_COMPLETENESS_STOP_WORDS = frozenset({
    'the', 'a', 'an', 'is', 'are', 'was', 'were',
    'what', 'how', 'why', 'when', 'where', 'who',
    'of', 'to', 'for', 'with', 'in', 'on', 'at'
})
# Runs of word characters; drops punctuation attached to words
_COMPLETENESS_TOKEN = re.compile(r"\w+")
# Answers with at least this many words get the full length score
_COMPLETENESS_TARGET_WORDS = 50
# Weights of the three score components (they sum to 1.0)
_COMPLETENESS_WEIGHTS = {"coverage": 0.4, "length": 0.3, "context": 0.3}


def _completeness_keywords(text):
    """Return the lower-cased words of ``text`` without punctuation or stop words."""
    return set(_COMPLETENESS_TOKEN.findall(text.lower())) - _COMPLETENESS_STOP_WORDS


class CompletenessMetric(BaseMetric):
    """
    Custom metric to evaluate answer completeness.

    The score is a weighted sum of three heuristics, each in [0, 1]:
    1. Question coverage (40%): share of question keywords that appear in the answer
    2. Length (30%): answer word count relative to 50 words, capped at 1.0
    3. Context usage (30%): share of answer keywords that appear in the retrieved contexts

    Keywords are lower-cased words with punctuation and common stop words
    removed, so "components?" in a question matches "components" in an answer.
    """

    def __init__(self):
        super().__init__(
            name="answer_completeness",
            requirements=MetricRequirements(
                requires_reference=False,    # Doesn't need ground truth
                requires_contexts=True,      # Uses retrieved contexts
                requires_question=True,      # Needs the question
                requires_answer=True         # Needs the generated answer
            )
        )

    async def compute(self, **kwargs) -> MetricResult:
        """Compute the completeness score.

        Args:
            **kwargs: Must contain 'question' (str), 'answer' (str) and
                'contexts'.

        Returns:
            A MetricResult whose value is the weighted score and whose
            metadata holds the three component scores, the answer word count
            and up to five covered keywords.

        Raises:
            KeyError: If 'question', 'answer' or 'contexts' is missing.
        """
        start_time = time.time()

        # Extract inputs
        question = kwargs['question']
        answer = kwargs['answer']
        # assumes contexts is an iterable of plain strings — TODO confirm against the evaluator
        contexts = kwargs['contexts']

        question_keywords = _completeness_keywords(question)
        answer_keywords = _completeness_keywords(answer)

        # Share of question keywords that the answer mentions
        covered_words = question_keywords & answer_keywords
        coverage = len(covered_words) / len(question_keywords) if question_keywords else 0.0

        # Answer length relative to the target, capped at 1.0
        word_count = len(answer.split())
        length_score = min(1.0, word_count / _COMPLETENESS_TARGET_WORDS)

        # Share of answer keywords found in the contexts. Stop words are
        # removed on both sides so they cannot deflate the ratio.
        context_keywords = _completeness_keywords(' '.join(contexts))
        answer_context_overlap = answer_keywords & context_keywords
        context_usage = (
            len(answer_context_overlap) / len(answer_keywords) if answer_keywords else 0.0
        )

        # Combined score with weights
        completeness_score = (
            coverage * _COMPLETENESS_WEIGHTS["coverage"] +
            length_score * _COMPLETENESS_WEIGHTS["length"] +
            context_usage * _COMPLETENESS_WEIGHTS["context"]
        )

        return MetricResult(
            metric_name=self.name,
            value=completeness_score,
            metadata={
                "question_coverage": coverage,
                "length_score": length_score,
                "context_usage": context_usage,
                "answer_word_count": word_count,
                # Sorted so the reported sample is deterministic (first five alphabetically)
                "covered_keywords": sorted(covered_words)[:5]
            },
            computation_time=time.time() - start_time
        )

# Register the custom metric under its name ("answer_completeness") so later
# queries can request it through `metrics=[...]`.
# NOTE(review): re-running this cell registers the metric again; whether the
# registry tolerates duplicate names is not visible here — confirm.
completeness_metric = CompletenessMetric()
rag_system.metric_factory.registry.register_metric(
    completeness_metric,
    groups=["custom", "quality"]  # Add to custom and quality groups
)

print(f"✅ Registered custom metric: {completeness_metric.name}")

# Test the custom metric
async def test_custom_metric():
    """Query once with the custom completeness metric and print its breakdown.

    Nothing is printed when the result has no 'evaluation' entry or that entry
    lacks 'answer_completeness'.

    Returns:
        The result mapping returned by ``rag_system.query``.
    """
    result = await rag_system.query(
        question="What are all the components of the RAG pipeline and how do they work together?",
        evaluate=True,
        metrics=["answer_completeness", "ragas_faithfulness", "response_time"]
    )

    # Guard: leave early when the custom metric did not make it into the result
    evaluation = result['evaluation'] if 'evaluation' in result else {}
    if 'answer_completeness' not in evaluation:
        return result

    completeness_data = evaluation['answer_completeness']

    report_lines = [
        "\n📊 Answer Completeness Analysis:",
        f"Overall Score: {completeness_data['value']:.3f}",
        "\nDetailed Breakdown:",
    ]
    report_lines.extend(
        f"  - {key}: {value}" for key, value in completeness_data['metadata'].items()
    )
    print("\n".join(report_lines))

    return result

# Run custom metric test; `custom_result` keeps the full query result,
# including the other requested metrics
custom_result = await test_custom_metric()

# ============================================================================
# 13. BATCH EVALUATION AND EXPORT
# ============================================================================

In [None]:
async def run_batch_evaluation():
    """Run evaluation on multiple questions and export results.

    Each question of the test suite is queried with evaluation enabled and
    tagged with its category. A flat table (question, category, answer
    preview, successful metric values) is written as CSV and the full results
    as JSON, both under ``./results`` with a shared timestamp.

    Returns:
        A tuple ``(batch_df, batch_results)``: the flat table and the list of
        raw query results (each extended with a 'category' key).
    """

    # Prepare a comprehensive test suite
    test_suite = [
        {"question": "What is the purpose of the metrics registry?", "category": "architecture"},
        {"question": "How do I add a new evaluation metric?", "category": "usage"},
        {"question": "What are the different pipeline types available?", "category": "features"},
        {"question": "How does the vector store work?", "category": "technical"},
        {"question": "What configuration options are available?", "category": "configuration"},
        {"question": "How can I optimize retrieval performance?", "category": "optimization"},
        {"question": "What are the benefits of modular design?", "category": "architecture"},
        {"question": "How do I export evaluation results?", "category": "usage"}
    ]

    # Run batch evaluation
    batch_results = []
    print("\n🔄 Running batch evaluation...")

    for position, test in enumerate(test_suite, start=1):
        print(f"\n[{position}/{len(test_suite)}] Evaluating: {test['question'][:50]}...")

        result = await rag_system.query(
            question=test["question"],
            evaluate=True,
            metrics=["ragas_faithfulness", "ragas_answer_relevancy",
                    "answer_completeness", "response_time"]
        )

        # Add category to result
        result['category'] = test['category']
        batch_results.append(result)

    # Convert to DataFrame for analysis
    batch_df = pd.DataFrame([
        {
            'question': r['question'],
            'category': r['category'],
            'answer_preview': r['answer'][:100] + '...',
            **{k: v['value'] for k, v in r.get('evaluation', {}).items()
               if not v.get('error')}
        }
        for r in batch_results
    ])

    # Display results
    print("\n📊 Batch Evaluation Results:")
    print(batch_df.to_string())

    # Make sure the output directory exists, so the results of the (costly)
    # evaluated queries are not lost to a missing-directory error on export
    Path("./results").mkdir(parents=True, exist_ok=True)

    # Export results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"./results/rag_showcase_{timestamp}.csv"
    batch_df.to_csv(output_file, index=False)
    print(f"\n📁 Results exported to: {output_file}")

    # Also save detailed results as JSON; default=str stringifies values that
    # json cannot serialize natively
    json_file = f"./results/rag_showcase_detailed_{timestamp}.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(batch_results, f, indent=2, default=str)
    print(f"📁 Detailed results saved to: {json_file}")

    return batch_df, batch_results

# Run batch evaluation: `batch_df` is the flat results table that was exported,
# `batch_results` the raw query results
batch_df, batch_results = await run_batch_evaluation()

# ============================================================================
# 14. SAVE CONFIGURATION AND FINAL SUMMARY
# ============================================================================

In [None]:
# Persist the configuration used in this notebook so it can be reloaded later
config_path = "./configs/showcase_config.yaml"
config_manager.save_config(config_path)
print(f"\n✅ Configuration saved to: {config_path}")

# Closing report, assembled as a list of lines and printed in one go.
# A leading "\n" inside an entry yields the blank line before a section.
banner = "=" * 60
report_lines = [
    "\n" + banner,
    "🎉 RAG MODULAR ARCHITECTURE SHOWCASE - COMPLETE",
    banner,

    "\n📋 What we demonstrated:",
    "✓ Simple configuration with smart defaults",
    "✓ Component swapping through configuration",
    "✓ Flexible metrics system with custom metrics",
    "✓ Query evaluation with and without references",
    "✓ A/B testing different configurations",
    "✓ Results analysis and visualization",
    "✓ Batch evaluation and export",

    "\n🏆 Key Architecture Benefits:",
    "1. Factory Pattern:",
    "   - Consistent component creation",
    "   - Easy to add new strategies",
    "   - No code changes needed",

    "\n2. Strategy Pattern:",
    "   - Swappable implementations",
    "   - Clean interfaces",
    "   - Independent testing",

    "\n3. Metrics Registry:",
    "   - Pluggable metrics",
    "   - Custom metrics without core changes",
    "   - Organized by groups",

    "\n4. Async Operations:",
    "   - High performance",
    "   - Concurrent processing",
    "   - Non-blocking evaluation",

    "\n💡 Next Steps:",
    "1. Try different configurations",
    "2. Add your own custom metrics",
    "3. Extend with new strategies",
    "4. Run large-scale experiments",
    "5. Deploy to production",

    "\n" + banner,
    "Happy experimenting with the modular RAG system! 🚀",
    banner,
]
print("\n".join(report_lines))