# Advanced Retrieval Systems

In this notebook, we'll explore advanced retrieval techniques including hybrid search, reranking, and query expansion.

## Learning Objectives
By the end of this notebook, you will:
1. Implement hybrid search combining dense and sparse retrieval
2. Learn about reranking techniques to improve result quality
3. Explore query expansion and reformulation strategies
4. Understand the trade-offs between different retrieval methods
5. Build a complete retrieval pipeline with multiple strategies


## Setup and Imports

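Let's import the libraries we need and load the document chunks we'll be indexing. The imports below fail with `ModuleNotFoundError` if the `rank_bm25` package is missing, so install it in the kernel's environment first (for example, `%pip install rank_bm25`).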


In [1]:
# Standard library and third-party imports
import json
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Any
import time
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# Add project root to path
import sys
sys.path.append(str(Path.cwd().parent))

# Import our modules
from src.retrieval.retrieval_system import RetrievalSystem, RetrievalConfig
from src.models.embedding_models import BGEEmbeddingModel, E5EmbeddingModel
from src.models.reranker_models import BGEReranker
from src.config import DATA_DIR

# Plot styling: apply matplotlib's "default" style sheet, then select
# seaborn's "husl" colour palette.
plt.style.use("default")
sns.set_palette(palette="husl")

print("Libraries imported successfully!")

# Load sample data
#
# Prefer the chunk file at DATA_DIR/processed/all_chunks.json. If that file
# is missing, cannot be decoded/parsed, or contains no chunks, fall back to a
# small built-in corpus so that `all_chunks` is never left empty.
chunks_file = DATA_DIR / "processed" / "all_chunks.json"
all_chunks = None

if chunks_file.exists():
    try:
        with open(chunks_file, 'r', encoding='utf-8') as f:
            all_chunks = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # A corrupt file should not abort the notebook; report it and fall
        # through to the sample data below.
        print(f"Could not read {chunks_file}: {exc}")
    else:
        if all_chunks:
            print(f"Loaded {len(all_chunks)} chunks from previous notebook")
        else:
            # Covers an empty list/dict as well as a JSON `null`.
            print(f"{chunks_file} contains no chunks")

if not all_chunks:
    print("Creating sample data...")
    all_chunks = [
        {
            'id': 'chunk1',
            'text': 'Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data.',
            'title': 'Machine Learning',
            'source': 'wikipedia',
            'chunk_id': 'chunk_1'
        },
        {
            'id': 'chunk2',
            'text': 'Deep learning uses neural networks with multiple layers to process complex data patterns.',
            'title': 'Deep Learning',
            'source': 'wikipedia',
            'chunk_id': 'chunk_2'
        },
        {
            'id': 'chunk3',
            'text': 'Natural language processing is a field of AI that focuses on the interaction between computers and human language.',
            'title': 'NLP',
            'source': 'wikipedia',
            'chunk_id': 'chunk_3'
        },
        {
            'id': 'chunk4',
            'text': 'Computer vision is a field of artificial intelligence that trains computers to interpret and understand visual information.',
            'title': 'Computer Vision',
            'source': 'wikipedia',
            'chunk_id': 'chunk_4'
        },
        {
            'id': 'chunk5',
            'text': 'Reinforcement learning is a type of machine learning where agents learn to make decisions through trial and error.',
            'title': 'Reinforcement Learning',
            'source': 'wikipedia',
            'chunk_id': 'chunk_5'
        }
    ]
    print(f"Created {len(all_chunks)} sample chunks")


ModuleNotFoundError: No module named 'rank_bm25'

## Building a Complete Retrieval System

Let's create three retrieval systems, each with a different configuration: basic retrieval, retrieval with reranking, and hybrid search.


In [None]:
# Build one retrieval system per configuration and index the same chunks in
# each of them.
print("Creating retrieval systems...")

# Configuration 1: basic retrieval (reranking switched off).
basic_config = RetrievalConfig(
    top_k=5, rerank_top_k=3, use_reranking=False, similarity_threshold=0.5
)

# Configurations 2 and 3 share these settings; the hybrid variant only adds
# its two weighting parameters on top.
reranked_settings = dict(
    top_k=10, rerank_top_k=5, use_reranking=True, similarity_threshold=0.3
)

# Configuration 2: with reranking.
rerank_config = RetrievalConfig(**reranked_settings)

# Configuration 3: hybrid search.
hybrid_config = RetrievalConfig(
    **reranked_settings, hybrid_alpha=0.7, hybrid_beta=0.3
)

# Instantiate the systems in the order basic -> rerank -> hybrid.
basic_retrieval, rerank_retrieval, hybrid_retrieval = [
    RetrievalSystem(config)
    for config in (basic_config, rerank_config, hybrid_config)
]

# Index the same documents in every system.
print("Adding documents to retrieval systems...")
for retrieval_system in (basic_retrieval, rerank_retrieval, hybrid_retrieval):
    retrieval_system.add_documents(all_chunks)

print(f"Added {len(all_chunks)} documents to each retrieval system")
print("Retrieval systems ready!")
