# RAG System Demo: Querying arXiv cs.CL Papers

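This notebook demonstrates the retrieval stage of the Retrieval-Augmented Generation (RAG) system built for searching through 50 arXiv cs.CL papers: it loads the prebuilt FAISS index, embeds each query, and displays the best-matching passages.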

## Setup

In [None]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from pathlib import Path
from typing import List, Dict, Tuple

# Configuration
# EMBEDDING_MODEL names the sentence-transformers model used to embed queries;
# INDEX_DIR is the directory the index, chunk, and metadata files are read from.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
INDEX_DIR = Path("data", "index")

## Load Resources

Load the FAISS index, chunks, and metadata.

In [None]:
# Sentence encoder used to embed queries
print("Loading embedding model...")
model = SentenceTransformer(EMBEDDING_MODEL)
print(f"Loaded model: {EMBEDDING_MODEL}")

# Vector index, read from disk
print("\nLoading FAISS index...")
index_path = INDEX_DIR / "faiss_index.bin"
faiss_index = faiss.read_index(str(index_path))
print(f"Loaded index with {faiss_index.ntotal} vectors")

# Chunk texts; search_papers looks these up by the position FAISS returns
print("\nLoading chunks...")
chunks_path = INDEX_DIR / "chunks.json"
chunks = json.loads(chunks_path.read_text(encoding='utf-8'))
print(f"Loaded {len(chunks)} chunks")

# Per-chunk metadata; search_papers indexes it with the same position as chunks
print("\nLoading metadata...")
metadata_path = INDEX_DIR / "metadata.json"
metadata = json.loads(metadata_path.read_text(encoding='utf-8'))
print(f"Loaded metadata for {len(metadata)} chunks")

# Number of distinct papers the metadata entries refer to
unique_papers = len({entry['paper_id'] for entry in metadata})
print(f"\nTotal unique papers: {unique_papers}")

## Define Search Function

In [None]:
def search_papers(query: str, k: int = 3) -> List[Dict]:
    """
    Search for relevant passages based on a query.

    The query is embedded with the module-level ``model``, L2-normalized,
    and looked up in the module-level ``faiss_index``. Each hit is resolved
    against ``chunks`` and ``metadata`` using the position FAISS returns.

    Args:
        query: Search query string
        k: Number of top results to return

    Returns:
        List of dictionaries, one per valid hit, with the keys ``rank``
        (1-based position in the FAISS result), ``distance``, ``paper_id``,
        ``paper_title``, ``chunk_index`` and ``text``. May hold fewer than
        ``k`` entries when the index cannot supply ``k`` neighbours.
    """
    # Encode query
    query_embedding = model.encode([query])[0]

    # Normalize (index was normalized). A zero vector is left untouched so a
    # degenerate embedding does not turn into NaNs through division by zero.
    norm = np.linalg.norm(query_embedding)
    if norm > 0:
        query_embedding = query_embedding / norm

    # Search: FAISS expects a 2-D float32 array of shape (n_queries, dim)
    query_vector = np.array([query_embedding]).astype('float32')
    distances, indices = faiss_index.search(query_vector, k)

    # Format results
    results = []
    for i, (idx, distance) in enumerate(zip(indices[0], distances[0])):
        idx = int(idx)
        # FAISS pads missing neighbours with -1; without the lower bound a
        # negative index would silently wrap around to the last chunk.
        if 0 <= idx < len(chunks):
            results.append({
                'rank': i + 1,
                'distance': float(distance),
                'paper_id': metadata[idx]['paper_id'],
                'paper_title': metadata[idx]['paper_title'],
                'chunk_index': metadata[idx]['chunk_index'],
                'text': chunks[idx]
            })

    return results

def display_results(query: str, results: List[Dict]):
    """
    Print a query banner followed by one formatted entry per search result.

    Args:
        query: The query string shown in the banner
        results: Result dictionaries carrying the keys ``rank``, ``distance``,
            ``paper_title``, ``paper_id``, ``chunk_index`` and ``text``
    """
    print(f"\n{'='*80}")
    print(f"QUERY: {query}")
    print(f"{'='*80}\n")

    for hit in results:
        print(f"Rank {hit['rank']} | Distance: {hit['distance']:.4f}")
        print(f"Paper: {hit['paper_title']}")
        print(f"Paper ID: {hit['paper_id']} | Chunk: {hit['chunk_index']}")
        print(f"\nText excerpt:")
        # Long passages are cut to 500 characters and marked with an ellipsis
        passage = hit['text']
        if len(passage) > 500:
            text_preview = passage[:500] + "..."
        else:
            text_preview = passage
        print(text_preview)
        print(f"\n{'-'*80}\n")

## Example Queries

Let's try several different types of queries to demonstrate the system's capabilities.

### Query 1: Transformer Models

In [None]:
# Example 1: retrieve the top 3 passages for the query and print them.
# query1/results1 are reused by the analysis cell further down.
query1 = "What are transformer models and how do they work?"
results1 = search_papers(query1, k=3)
display_results(query1, results1)

### Query 2: Attention Mechanisms

In [None]:
# Example 2: retrieve the top 3 passages for the query and print them.
# query2/results2 are reused by the analysis cell further down.
query2 = "Explain attention mechanisms in natural language processing"
results2 = search_papers(query2, k=3)
display_results(query2, results2)

### Query 3: Large Language Models

In [None]:
# Example 3: retrieve the top 3 passages for the query and print them.
# query3/results3 are reused by the analysis cell further down.
query3 = "How do large language models learn from data?"
results3 = search_papers(query3, k=3)
display_results(query3, results3)

### Query 4: Model Training

In [None]:
# Example 4: retrieve the top 3 passages for the query and print them.
# query4/results4 are reused by the analysis cell further down.
query4 = "What techniques are used for training language models?"
results4 = search_papers(query4, k=3)
display_results(query4, results4)

### Query 5: Evaluation Metrics

In [None]:
# Example 5: retrieve the top 3 passages for the query and print them.
# query5/results5 are reused by the analysis cell further down.
query5 = "How do we evaluate the performance of NLP models?"
results5 = search_papers(query5, k=3)
display_results(query5, results5)

## Statistics and Analysis

Let's analyze the retrieval results.

In [None]:
# Aggregate statistics over the five example queries run above.
from collections import Counter

# Collect all results: each query string paired with the hits it produced
all_queries = [
    (query1, results1),
    (query2, results2),
    (query3, results3),
    (query4, results4),
    (query5, results5)
]

# Flatten the per-query result lists once; every statistic below derives from it
all_hits = [hit for _, hits in all_queries for hit in hits]

# Analyze paper distribution
retrieved_papers = [hit['paper_id'] for hit in all_hits]
paper_counts = Counter(retrieved_papers)

# Every hit already carries its paper title, so map id -> title from the hits
# instead of rescanning the whole metadata list once per paper.
paper_titles = {}
for hit in all_hits:
    paper_titles.setdefault(hit['paper_id'], hit['paper_title'])

print("\n" + "="*80)
print("ANALYSIS OF RETRIEVAL RESULTS")
print("="*80)

print(f"\nTotal queries: {len(all_queries)}")
print(f"Total results retrieved: {len(retrieved_papers)}")
print(f"Unique papers in results: {len(paper_counts)}")

print(f"\nMost frequently retrieved papers:")
for paper_id, count in paper_counts.most_common(5):
    title = paper_titles[paper_id]
    # Only mark the title as truncated when it actually was cut
    short_title = title[:80] + "..." if len(title) > 80 else title
    print(f"  {count}x - {paper_id}")
    print(f"      {short_title}")

# Average distances
all_distances = [hit['distance'] for hit in all_hits]

# NOTE(review): the label below assumes the FAISS index uses the L2 metric.
# If it is an inner-product index these values are similarities - confirm
# against the index-building code.
print(f"\nRetrieval quality (L2 distances):")
if all_distances:
    print(f"  Average distance: {np.mean(all_distances):.4f}")
    print(f"  Min distance: {np.min(all_distances):.4f}")
    print(f"  Max distance: {np.max(all_distances):.4f}")
else:
    # np.min / np.max raise ValueError on an empty sequence
    print("  No results retrieved; distance statistics unavailable.")

## Custom Query

Try your own query!

In [None]:
# Enter your own query here: replace the placeholder string, then run the cell
# to retrieve and print the top 3 matching passages.
custom_query = "Your question here"
custom_results = search_papers(custom_query, k=3)
display_results(custom_query, custom_results)