# Week 5: Hybrid Search Evaluation

## Test Queries:
1. machine learning
2. neural networks  
3. transformer models
4. attention mechanism
5. language models
6. deep learning
7. natural language processing
8. computer vision
9. reinforcement learning
10. speech recognition

In [4]:
from hybrid_searcher import HybridSearcher
from sentence_transformers import SentenceTransformer
import pandas as pd

# Build the embedding model and the searcher that the evaluation below queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
searcher = HybridSearcher(
    faiss_index_path="embeddings/faiss.index",
    db_path="arxiv_hybrid_final.db",
    embedding_model=embedding_model,
)

# Evaluation queries, in the same order as the "Test Queries" list above.
test_queries = [
    "machine learning",
    "neural networks",
    "transformer models",
    "attention mechanism",
    "language models",
    "deep learning",
    "natural language processing",
    "computer vision",
    "reinforcement learning",
    "speech recognition",
]

def recall_at_k(results, relevant, k=3):
    """Return the fraction of relevant chunk ids found in the top-k results.

    Args:
        results: Ranked search hits; each hit is indexed with ``'chunk_id'``.
        relevant: Chunk ids treated as the relevant set. Duplicates are
            collapsed so they cannot inflate the denominator.
        k: Number of leading hits to inspect.

    Returns:
        Recall as a float in [0.0, 1.0]; 0.0 when ``relevant`` is empty.
    """
    relevant_ids = set(relevant)
    if not relevant_ids:
        return 0.0
    top_k_ids = {r['chunk_id'] for r in results[:k]}
    return len(top_k_ids & relevant_ids) / len(relevant_ids)


results = []
print("\n## Evaluation Results:")

for query in test_queries:
    # Get results from all methods
    vector_results = searcher.vector_search(query, k=10)
    keyword_results = searcher.db.keyword_search(query, k=10)
    hybrid_results = searcher.hybrid_search(query, k=10)

    # Use hybrid top results as ground truth.
    # NOTE(review): this makes the evaluation circular. The hybrid top-3 is
    # always a subset of the hybrid top-5, so hybrid recall@3 is 3/5 = 0.6
    # whenever hybrid search returns at least five distinct chunk ids, and
    # 'improvement' only measures how far the other methods diverge from
    # hybrid. Replace with independent relevance labels for a real comparison.
    relevant_chunks = [r['chunk_id'] for r in hybrid_results[:5]]

    # Calculate recall@3 for each method against the same relevant set
    vector_recall = recall_at_k(vector_results, relevant_chunks)
    keyword_recall = recall_at_k(keyword_results, relevant_chunks)
    hybrid_recall = recall_at_k(hybrid_results, relevant_chunks)

    results.append({
        'query': query,
        'vector_recall@3': round(vector_recall, 3),
        'keyword_recall@3': round(keyword_recall, 3),
        'hybrid_recall@3': round(hybrid_recall, 3),
        # Margin of hybrid over the better of the two single-method searches
        'improvement': round(hybrid_recall - max(vector_recall, keyword_recall), 3)
    })

# Create results table: one row per query, one column per metric
df = pd.DataFrame(results)
print("\n### Recall@3 Results:")
print(df.to_string(index=False))

# Summary: per-method averages across all evaluated queries
print("\n### Summary:")
print(f"Average Vector Recall@3:   {df['vector_recall@3'].mean():.3f}")
print(f"Average Keyword Recall@3:  {df['keyword_recall@3'].mean():.3f}")
print(f"Average Hybrid Recall@3:   {df['hybrid_recall@3'].mean():.3f}")
print(f"Average Improvement:       {df['improvement'].mean():.3f}")

# Count queries where hybrid strictly beats the best single method. The
# denominator comes from the table itself so it stays correct if the query
# list grows or shrinks.
hybrid_better = int((df['improvement'] > 0).sum())
print(f"Hybrid performs better on: {hybrid_better}/{len(df)} queries")

print("\n## Evaluation complete!")

Hybrid database initialized at: arxiv_hybrid_final.db

## Evaluation Results:

### Recall@3 Results:
                      query  vector_recall@3  keyword_recall@3  hybrid_recall@3  improvement
           machine learning              0.2               0.2              0.6          0.4
            neural networks              0.0               0.2              0.6          0.4
         transformer models              0.2               0.2              0.6          0.4
        attention mechanism              0.4               0.4              0.6          0.2
            language models              0.4               0.4              0.6          0.2
              deep learning              0.2               0.4              0.6          0.2
natural language processing              0.0               0.4              0.6          0.2
            computer vision              0.2               0.4              0.6          0.2
     reinforcement learning              0.4               0.6