# RAG LLM Evaluation with Embedding Metrics

This notebook evaluates RAG performance by:
1. Taking a question and selected document IDs
2. Creating prompts with retrieved documents
3. Passing to an LLM (**Ollama - runs locally, no API limits!**)
4. Comparing generated answers with ground truth using:
   - **Text-based metrics** (exact match, F1 score)
   - **Embedding-based metrics** (cosine similarity, euclidean distance)


## 1. Setup and Imports


In [86]:
import pandas as pd
import numpy as np
import os
import ollama
from typing import List, Dict, Tuple
import json
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time
from bert_score import score as bert_score


## 2. Load Data


In [87]:
# Load Q&A data
qa_df = pd.read_csv('data/rag-mini-wikipedia_q_and_a.csv')
print(f"Loaded {len(qa_df)} questions")
print(qa_df.head())


Loaded 918 questions
                                            question     answer  id
0  Was Abraham Lincoln the sixteenth President of...        yes   0
1  Did Lincoln sign the National Banking Act of 1...        yes   2
2                   Did his mother die of pneumonia?         no   4
3      How many long was Lincoln's formal education?  18 months   6
4       When did Lincoln begin his political career?       1832   8


In [88]:
# Load document data
doc_df = pd.read_csv('data/rag-mini-wikipedia_document.csv')
print(f"Loaded {len(doc_df)} documents")
print(doc_df.head())


Loaded 3200 documents
                                             passage  id
0  Uruguay (official full name in  ; pron.  , Eas...   0
1  It is bordered by Brazil to the north, by Arge...   1
2  Montevideo was founded by the Spanish in the e...   2
3  The economy is largely based in agriculture (m...   3
4  According to Transparency International, Urugu...   4


## 3. Setup Ollama (Local LLM)

**‚úÖ Ollama is already running on your system!**

You have these models installed:
- `llama3` (8B params) ‚Üê **Using this one**
- `gemma3` (3B params)

**To use a different model:**
```bash
# Change OLLAMA_MODEL in the cell below to one of:
OLLAMA_MODEL = "llama3"   # Currently selected
OLLAMA_MODEL = "gemma3"   # Alternative option
```




In [89]:
# Configure Ollama
OLLAMA_MODEL = "llama3"  # Using your installed model

# Test connection to Ollama
try:
    # List available models
    models_response = ollama.list()
    
    # Parse the response - it might be a dict with 'models' key or have different structure
    if isinstance(models_response, dict) and 'models' in models_response:
        models_list = models_response['models']
    else:
        models_list = models_response
    
    # Extract model names - handle different response formats
    available_models = []
    for model in models_list:
        if isinstance(model, dict):
            # Try different possible keys
            name = model.get('name') or model.get('model') or str(model)
            available_models.append(name)
        else:
            available_models.append(str(model))
    
    print(f"‚úì Ollama is running!")
    print(f"üì¶ Available models:")
    for model in available_models:
        print(f"   - {model}")
    
    # Check if selected model is available
    if any(OLLAMA_MODEL in str(model_name) for model_name in available_models):
        print(f"\n‚úì Using model: {OLLAMA_MODEL}")
    else:
        print(f"\n‚ö†Ô∏è  Model '{OLLAMA_MODEL}' not found.")
        print(f"Available: {available_models}")
        print(f"Run: ollama pull {OLLAMA_MODEL}")
        
except Exception as e:
    import traceback
    print(f"‚ùå Error connecting to Ollama: {e}")
    print(f"\nDebug info:")
    traceback.print_exc()
    print("\nMake sure Ollama is running.")
    print("\nTo start Ollama:")
    print("  1. Open the Ollama app from Applications, OR")
    print("  2. Run in terminal: ollama serve")


‚úì Ollama is running!
üì¶ Available models:
   - ('models', [Model(model='llama3:latest', modified_at=datetime.datetime(2025, 10, 7, 13, 18, 0, 391725, tzinfo=TzInfo(-14400)), digest='365c0bd3c000a25d28ddbf732fe1c6add414de7275464c4e4d1c3b5fcb5d8ad1', size=4661224676, details=ModelDetails(parent_model='', format='gguf', family='llama', families=['llama'], parameter_size='8.0B', quantization_level='Q4_0')), Model(model='gemma3:latest', modified_at=datetime.datetime(2025, 10, 7, 12, 9, 38, 308655, tzinfo=TzInfo(-14400)), digest='a2af6cc3eb7fa8be8504abaf9b04e88f17a119ec3f04a3addf55f92841195f5a', size=3338801804, details=ModelDetails(parent_model='', format='gguf', family='gemma3', families=['gemma3'], parameter_size='4.3B', quantization_level='Q4_K_M'))])

‚úì Using model: llama3


## 4. Setup Embedding Model


In [90]:
# Initialize embedding model for semantic similarity evaluation
# Using the SAME model as in Embedding_process.ipynb for consistency
embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
print(f"Embedding model loaded: multi-qa-mpnet-base-dot-v1")

def get_documents_by_ids(doc_df: pd.DataFrame, doc_ids: List[int]) -> List[str]:
    """
    Retrieve document passages by their IDs.
    
    Args:
        doc_df: DataFrame containing documents
        doc_ids: List of document IDs to retrieve
    
    Returns:
        List of document passages
    """
    documents = []
    for doc_id in doc_ids:
        doc = doc_df[doc_df['id'] == doc_id]
        if not doc.empty:
            documents.append(doc['passage'].values[0])
        else:
            print(f"Warning: Document ID {doc_id} not found")
    return documents


Embedding model loaded: multi-qa-mpnet-base-dot-v1


In [91]:
def create_rag_prompt(question: str, documents: List[str]) -> str:
    """
    Create a RAG prompt with question and retrieved documents.
    
    Args:
        question: The user's question
        documents: List of retrieved document passages
    
    Returns:
        Formatted prompt string
    """
    context = "\n\n".join([f"Document {i+1}:\n{doc}" for i, doc in enumerate(documents)])
    
    prompt = f"""You are a precise question-answering assistant. Based on the provided documents, answer the question with ONLY the direct answer. Do not include explanations, context, or additional information.

Context Documents:
{context}

Question: {question}

Instructions:
- Provide ONLY the direct answer to the question
- Do NOT add phrases like "Based on the documents..." or "According to..."
- Do NOT provide explanations or reasoning
- If the answer is a single word, number, or short phrase, return only that
- If the answer requires a sentence, make it as brief as possible

Answer:"""
    
    return prompt


In [92]:
def query_llm(prompt: str, temperature: float = 0.0, max_retries: int = 3, base_delay: float = 1.0) -> str:
    """
    Send prompt to Ollama (local LLM) and get response with retry logic.
    
    Args:
        prompt: The prompt to send
        temperature: Sampling temperature (0.0 for deterministic)
        max_retries: Maximum number of retry attempts  
        base_delay: Base delay between retries (will use exponential backoff)
    
    Returns:
        Generated answer from LLM, or None if all retries fail
    """
    for attempt in range(max_retries):
        try:
            response = ollama.generate(
                model=OLLAMA_MODEL,
                prompt=prompt,
                options={
                    'temperature': temperature,
                    'num_predict': 500,  # Max tokens
                }
            )
            
            # Extract the response text
            answer = response.get('response', '').strip()
            return answer if answer else None
            
        except Exception as e:
            error_msg = str(e)
            
            # Check if Ollama is not running
            if "connection" in error_msg.lower() or "refused" in error_msg.lower():
                print(f"‚ùå Cannot connect to Ollama. Make sure it's running: ollama serve")
                return None
            
            # Generic error with retry
            print(f"‚ö†Ô∏è  Error calling Ollama (attempt {attempt + 1}/{max_retries}): {error_msg}")
            if attempt < max_retries - 1:
                wait_time = base_delay * (2 ** attempt)
                print(f"   Retrying in {wait_time:.1f}s...")
                time.sleep(wait_time)
            else:
                print(f"‚ùå Failed after {max_retries} attempts")
                return None
    
    return None


In [93]:
def normalize_answer(text: str) -> str:
    """
    Normalize answer text for comparison.
    
    Args:
        text: Text to normalize
    
    Returns:
        Normalized text
    """
    # Convert to lowercase
    text = text.lower().strip()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text


In [94]:
def exact_match_score(predicted: str, ground_truth: str) -> bool:
    """
    Calculate exact match score (normalized).
    
    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer
    
    Returns:
        True if answers match exactly (after normalization)
    """
    return normalize_answer(predicted) == normalize_answer(ground_truth)


In [95]:
def contains_answer_score(predicted: str, ground_truth: str) -> bool:
    """
    Check if predicted answer contains the ground truth.
    
    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer
    
    Returns:
        True if predicted contains ground truth
    """
    return normalize_answer(ground_truth) in normalize_answer(predicted)


In [96]:
def f1_score(predicted: str, ground_truth: str) -> float:
    """
    Calculate token-level F1 score.
    
    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer
    
    Returns:
        F1 score (0.0 to 1.0)
    """
    pred_tokens = set(normalize_answer(predicted).split())
    gt_tokens = set(normalize_answer(ground_truth).split())
    
    if len(pred_tokens) == 0 or len(gt_tokens) == 0:
        return 0.0
    
    common_tokens = pred_tokens.intersection(gt_tokens)
    
    if len(common_tokens) == 0:
        return 0.0
    
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(gt_tokens)
    
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1


**Note:** You created embeddings with 4 different models in `Embedding_process.ipynb`:
- `bert-base-uncased`
- `multi-qa-mpnet-base-dot-v1` ‚Üê **Using this one**
- `intfloat/e5-small-v2`
- `hkunlp/instructor-large`

You can change the model above to test which embedding model gives the best evaluation correlation!


## 5. Embedding-based Evaluation Metrics


### Alternative: Use E5 or Instructor models (uncomment to use)


**‚ö†Ô∏è Important Note on Embedding Metrics:**

Embedding-based metrics work best for **longer text** (paragraphs, sentences). For **very short answers** like "yes" vs "no", they can give misleading results:

- ‚úÖ **Good for**: Comparing longer answers where semantic meaning is clear
- ‚ùå **Not ideal for**: Single-word answers, especially yes/no questions
- üí° **Recommendation**: For short answers, rely more on **text-based metrics** (exact match, F1 score)


In [97]:
# # Option 1: E5 model (requires "query: " prefix for answers)
# embedding_model = SentenceTransformer('intfloat/e5-small-v2')
# 
# def compute_embedding_metrics_e5(predicted: str, ground_truth: str) -> Dict[str, float]:
#     # Add "query: " prefix as done in Embedding_process.ipynb
#     emb1 = embedding_model.encode(["query: " + predicted])[0]
#     emb2 = embedding_model.encode(["query: " + ground_truth])[0]
#     
#     from sklearn.preprocessing import normalize
#     emb1 = normalize([emb1])[0]
#     emb2 = normalize([emb2])[0]
#     
#     return {
#         'cosine_similarity': float(np.dot(emb1, emb2)),
#         'euclidean_distance': float(np.linalg.norm(emb1 - emb2)),
#         'dot_product_similarity': float(np.dot(emb1, emb2)),
#         'manhattan_distance': float(np.sum(np.abs(emb1 - emb2)))
#     }

# # Option 2: Instructor model (requires instruction)
# from InstructorEmbedding import INSTRUCTOR
# embedding_model = INSTRUCTOR('hkunlp/instructor-large')
# 
# def compute_embedding_metrics_instructor(predicted: str, ground_truth: str) -> Dict[str, float]:
#     # Add instruction as done in Embedding_process.ipynb
#     emb1 = embedding_model.encode([["Represent the answer:", predicted]])[0]
#     emb2 = embedding_model.encode([["Represent the answer:", ground_truth]])[0]
#     
#     # Note: Instructor embeddings are NOT normalized in Embedding_process.ipynb
#     return {
#         'cosine_similarity': float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))),
#         'euclidean_distance': float(np.linalg.norm(emb1 - emb2)),
#         'dot_product_similarity': float(np.dot(emb1, emb2)),
#         'manhattan_distance': float(np.sum(np.abs(emb1 - emb2)))
#     }


In [98]:
def compute_embedding_metrics(predicted: str, ground_truth: str) -> Dict[str, float]:
    """
    Compute multiple embedding-based metrics between predicted and ground truth answers.
    
    Metrics:
    - Cosine Similarity: Measures angular similarity (0 to 1, higher is better)
    - Euclidean Distance: L2 distance between embeddings (lower is better)
    - Dot Product: Inner product of normalized embeddings (0 to 1, higher is better)
    - Manhattan Distance: L1 distance between embeddings (lower is better)
    
    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer
    
    Returns:
        Dictionary with embedding-based metrics
    """
    # Generate embeddings
    emb1 = embedding_model.encode([predicted])[0]
    emb2 = embedding_model.encode([ground_truth])[0]
    
    # Normalize embeddings (same as in Embedding_process.ipynb)
    from sklearn.preprocessing import normalize
    emb1 = normalize([emb1])[0]
    emb2 = normalize([emb2])[0]
    
    # Cosine similarity (for normalized vectors, this equals dot product)
    cos_sim = np.dot(emb1, emb2)
    
    # Euclidean distance
    euclidean_dist = np.linalg.norm(emb1 - emb2)
    
    # Dot product similarity (same as cosine for normalized embeddings)
    dot_product = np.dot(emb1, emb2)
    
    # Manhattan distance
    manhattan_dist = np.sum(np.abs(emb1 - emb2))
    
    return {
        'cosine_similarity': float(cos_sim),
        'euclidean_distance': float(euclidean_dist),
        'dot_product_similarity': float(dot_product),
        'manhattan_distance': float(manhattan_dist)
    }


### üìä Understanding the Metrics

**For SHORT answers (yes/no, single words):**
- ‚úÖ **Exact Match** - Most reliable! 0 or 1, no ambiguity
- ‚úÖ **F1 Score** - Good for token overlap
- ‚ùå **Embedding Metrics** - Can be misleading (e.g., "yes" vs "no" might have high similarity)

**For LONG answers (sentences, paragraphs):**
- ‚úÖ **Embedding Metrics** - Excellent! Captures semantic meaning
- ‚úÖ **F1 Score** - Good for word overlap
- ‚ö†Ô∏è **Exact Match** - Too strict, rarely matches

**Example of the issue:**
```
Ground Truth: "yes"
Predicted: "no"
Exact Match: 0.0 ‚úì (Correct - they're different)
Cosine Similarity: 0.933 ‚úó (Misleading - seems similar but they're opposite!)
```

**Why?** Embeddings capture that both are short, single-word boolean answers in similar contexts, not that they're semantically opposite.


## 5.5 BERTScore Metric


**‚öôÔ∏è BERTScore Model Options:**

The default model (`microsoft/deberta-xlarge-mnli`) is the most accurate but slower. For faster computation:

```python
# Fast (but less accurate)
compute_bertscore(predicted, ground_truth, model_type='bert-base-uncased')

# Balanced
compute_bertscore(predicted, ground_truth, model_type='roberta-large')

# Most accurate (default, slower)
compute_bertscore(predicted, ground_truth, model_type='microsoft/deberta-xlarge-mnli')
```

For batch evaluation, consider using the faster model to save time!


In [99]:
def compute_bertscore(predicted: str, ground_truth: str, lang: str = 'en', model_type: str = 'distilbert-base-uncased') -> Dict[str, float]:
    """
    Compute BERTScore metrics between predicted and ground truth answers.
    
    BERTScore leverages pre-trained BERT embeddings and computes token-level similarity
    using cosine similarity. It returns precision, recall, and F1 scores.
    
    - **Precision**: How much of the predicted answer is relevant?
    - **Recall**: How much of the ground truth is captured?
    - **F1**: Harmonic mean of precision and recall
    
    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer
        lang: Language code (default: 'en' for English)
        model_type: BERT model to use. Options:
            - 'distilbert-base-uncased' (default, fastest, ~250MB)
            - 'bert-base-uncased' (fast, ~420MB)
            - 'roberta-large' (slower but more accurate, ~1.4GB)
            - 'microsoft/deberta-xlarge-mnli' (slowest but most accurate, ~1.5GB)
    
    Returns:
        Dictionary with BERTScore precision, recall, and F1
    """
    # BERTScore expects lists of predictions and references
    P, R, F1 = bert_score([predicted], [ground_truth], lang=lang, model_type=model_type, verbose=False)
    
    return {
        'bertscore_precision': float(P[0]),
        'bertscore_recall': float(R[0]),
        'bertscore_f1': float(F1[0])
    }


**‚öôÔ∏è BERTScore Speed Guide:**

The default model (`distilbert-base-uncased`) is **much faster** and good enough for most cases:

| Model | Speed | Size | Use Case |
|-------|-------|------|----------|
| `distilbert-base-uncased` ‚úÖ | Fastest (~1 sec) | 250MB | **Default - Recommended** |
| `bert-base-uncased` | Fast (~2 sec) | 420MB | Slightly more accurate |
| `roberta-large` | Slow (~5 sec) | 1.4GB | High accuracy needed |
| `microsoft/deberta-xlarge-mnli` | Very Slow (~10 sec) | 1.5GB | Research-grade only |

**üí° Speed Tips:**
- **First run is slower** - it downloads the model (one-time delay)
- Use DistilBERT for batch evaluations (100+ questions)
- If still too slow, set `use_bertscore=False` to disable it
- To use a different model: `compute_bertscore(pred, gt, model_type='bert-base-uncased')`


**To disable BERTScore** (if it's too slow for your use case):

```python
result = evaluate_rag_single(
    question_id=10,
    doc_ids=[1],
    qa_df=qa_df,
    doc_df=doc_df,
    use_embedding_metrics=True,
    use_bertscore=False,  # ‚Üê Disable BERTScore
    verbose=True
)
```


### üí° Recommendations

**Option 1: Filter your analysis**
When analyzing results, separate short vs long answers:
```python
# After getting results_df
short_answers = results_df[results_df['ground_truth'].str.split().str.len() <= 2]
long_answers = results_df[results_df['ground_truth'].str.split().str.len() > 2]

# For short answers: focus on exact_match and f1_score
# For long answers: embedding metrics are more reliable
```

**Option 2: Use text metrics primarily**
For this RAG evaluation, **Exact Match** and **F1 Score** are your most reliable metrics across all answer lengths.

### üìä About BERTScore

**BERTScore** is a learned metric that uses contextualized embeddings from BERT models to evaluate text generation:

- **Better than embedding similarity**: Uses token-level matching with contextual embeddings
- **Better than F1**: Captures semantic similarity, not just exact word overlap
- **Works well for**: Both short and long answers
- **Ranges**: All scores are between 0 and 1 (higher is better)
  - Precision: How much of the predicted answer is relevant
  - Recall: How much of the ground truth is captured
  - F1: Harmonic mean (most commonly used)

**‚ö†Ô∏è Performance Note:** BERTScore is slower than other metrics (~1 sec per evaluation with DistilBERT). The default uses a fast model, but you can disable it with `use_bertscore=False` if speed is critical.


## 6. RAG Evaluation Pipeline


In [100]:
def evaluate_rag_single(
    question_id: int,
    doc_ids: List[int],
    qa_df: pd.DataFrame,
    doc_df: pd.DataFrame,
    use_embedding_metrics: bool = True,
    use_bertscore: bool = True,
    verbose: bool = True
) -> Dict:
    """
    Evaluate RAG for a single question.
    
    Args:
        question_id: ID of the question in qa_df
        doc_ids: List of document IDs to use for context
        qa_df: DataFrame with questions and answers
        doc_df: DataFrame with documents
        use_embedding_metrics: Whether to compute embedding-based metrics
        use_bertscore: Whether to compute BERTScore metrics
        verbose: Whether to print detailed output
    
    Returns:
        Dictionary with evaluation results
    """
    # Get question and ground truth
    qa_row = qa_df[qa_df['id'] == question_id]
    if qa_row.empty:
        print(f"Error: Question ID {question_id} not found")
        return None
    
    question = qa_row['question'].values[0]
    ground_truth = qa_row['answer'].values[0]
    
    # Retrieve documents
    documents = get_documents_by_ids(doc_df, doc_ids)
    
    if not documents:
        print("Error: No documents retrieved")
        return None
    
    # Create prompt
    prompt = create_rag_prompt(question, documents)
    
    # Query LLM
    predicted_answer = query_llm(prompt)
    
    if predicted_answer is None:
        print("Error: Failed to get LLM response")
        return None
    
    # Calculate text-based metrics
    exact_match = exact_match_score(predicted_answer, ground_truth)
    contains = contains_answer_score(predicted_answer, ground_truth)
    f1 = f1_score(predicted_answer, ground_truth)
    
    # Prepare results
    results = {
        'question_id': question_id,
        'question': question,
        'ground_truth': ground_truth,
        'predicted_answer': predicted_answer,
        'doc_ids': doc_ids,
        'num_docs': len(documents),
        'exact_match': exact_match,
        'contains_answer': contains,
        'f1_score': f1
    }
    
    # Calculate embedding-based metrics
    if use_embedding_metrics:
        embedding_metrics = compute_embedding_metrics(predicted_answer, ground_truth)
        results.update(embedding_metrics)
    
    # Calculate BERTScore
    if use_bertscore:
        bertscore_metrics = compute_bertscore(predicted_answer, ground_truth)
        results.update(bertscore_metrics)
    
    if verbose:
        print("="*80)
        print(f"Question ID: {question_id}")
        print(f"Question: {question}")
        print(f"\nRetrieved Documents: {doc_ids}")
        print(f"Number of Documents: {len(documents)}")
        print(f"\nGround Truth: {ground_truth}")
        print(f"Predicted Answer: {predicted_answer}")
        print(f"\nText-based Metrics:")
        print(f"  Exact Match: {exact_match}")
        print(f"  Contains Answer: {contains}")
        print(f"  F1 Score: {f1:.3f}")
        
        if use_embedding_metrics:
            print(f"\nEmbedding-based Metrics:")
            print(f"  Cosine Similarity: {results['cosine_similarity']:.4f}")
            print(f"  Euclidean Distance: {results['euclidean_distance']:.4f}")
            print(f"  Dot Product Similarity: {results['dot_product_similarity']:.4f}")
            print(f"  Manhattan Distance: {results['manhattan_distance']:.4f}")
        
        if use_bertscore:
            print(f"\nBERTScore Metrics:")
            print(f"  Precision: {results['bertscore_precision']:.4f}")
            print(f"  Recall: {results['bertscore_recall']:.4f}")
            print(f"  F1: {results['bertscore_f1']:.4f}")
        
        print("="*80)
    
    return results


## 7. Single Example Evaluation


In [101]:
# Example: Evaluate a single question with selected documents
# Replace question_id and doc_ids with your actual values

question_id = 10  # Question ID from qa_df
doc_ids = [1]  # Document IDs you determined are best for this question

result = evaluate_rag_single(
    question_id=question_id,
    doc_ids=doc_ids,
    qa_df=qa_df,
    doc_df=doc_df,
    use_embedding_metrics=True,  # Enable embedding-based metrics
    use_bertscore=True,  # Enable BERTScore metrics
    verbose=True
)


Question ID: 10
Question: What did The Legal Tender Act of 1862 establish?

Retrieved Documents: [1]
Number of Documents: 1

Ground Truth: the United States Note, the first paper currency in United States history
Predicted Answer: Greenback currency.

Text-based Metrics:
  Exact Match: False
  Contains Answer: False
  F1 Score: 0.182

Embedding-based Metrics:
  Cosine Similarity: 0.6025
  Euclidean Distance: 0.8916
  Dot Product Similarity: 0.6025
  Manhattan Distance: 19.2048

BERTScore Metrics:
  Precision: 0.7555
  Recall: 0.6857
  F1: 0.7189


## 8. Batch Evaluation from CSV

If you have a CSV file with retrieval results (question IDs and their retrieved document IDs), you can evaluate all of them at once!


In [102]:
def parse_doc_ids(doc_ids_str) -> List[int]:
    """
    Parse document IDs from various string formats.
    
    Handles formats like:
    - "[1, 2, 3]" (list format)
    - "1, 2, 3" (comma-separated)
    - "1 2 3" (space-separated)
    - "1" (single ID)
    
    Args:
        doc_ids_str: String or list of document IDs
    
    Returns:
        List of integer document IDs
    """
    # If already a list, convert to integers and return
    if isinstance(doc_ids_str, list):
        return [int(x) for x in doc_ids_str]
    
    # Convert to string and clean up
    doc_ids_str = str(doc_ids_str).strip()
    
    # Remove brackets if present
    doc_ids_str = doc_ids_str.replace('[', '').replace(']', '')
    
    # Try comma-separated first
    if ',' in doc_ids_str:
        return [int(x.strip()) for x in doc_ids_str.split(',') if x.strip()]
    
    # Try space-separated
    if ' ' in doc_ids_str:
        return [int(x.strip()) for x in doc_ids_str.split() if x.strip()]
    
    # Single ID
    return [int(doc_ids_str)]


In [103]:
def evaluate_rag_batch(
    retrieval_results_df: pd.DataFrame,
    qa_df: pd.DataFrame,
    doc_df: pd.DataFrame,
    question_id_col: str = 'question_id',
    doc_ids_col: str = 'document_idx',
    use_embedding_metrics: bool = True,
    use_bertscore: bool = True,
    verbose: bool = False,
    show_progress: bool = True
) -> pd.DataFrame:
    """
    Evaluate RAG for multiple questions from a CSV file.
    
    The question_id column in retrieval_results_df should match the INDEX (row number) in qa_df.
    
    Args:
        retrieval_results_df: DataFrame with question_id and doc_ids columns
        qa_df: DataFrame with questions and answers (will be looked up by index)
        doc_df: DataFrame with documents
        question_id_col: Name of the column containing question IDs (default: 'question_id')
        doc_ids_col: Name of the column containing document IDs (default: 'document_idx')
        use_embedding_metrics: Whether to compute embedding-based metrics
        use_bertscore: Whether to compute BERTScore metrics
        verbose: Whether to print detailed output for each question
        show_progress: Whether to show progress updates
    
    Returns:
        DataFrame with evaluation results for all questions
    """
    results_list = []
    total = len(retrieval_results_df)
    
    for row_idx, row in retrieval_results_df.iterrows():
        # Get question index from the specified column
        question_idx = int(row[question_id_col])
        
        doc_ids_str = row[doc_ids_col]
        
        # Parse document IDs
        try:
            doc_ids = parse_doc_ids(doc_ids_str)
        except Exception as e:
            print(f"‚ö†Ô∏è  Warning: Could not parse doc_ids for question at index {question_idx}: {e}")
            continue
        
        # Show progress
        if show_progress and not verbose:
            if (row_idx + 1) % 10 == 0 or (row_idx + 1) == total:
                print(f"Progress: {row_idx + 1}/{total} questions evaluated...")
        
        # Get question and ground truth by index from qa_df
        try:
            qa_row = qa_df.iloc[question_idx]
            question = qa_row['question']
            ground_truth = qa_row['answer']
        except (IndexError, KeyError) as e:
            print(f"‚ö†Ô∏è  Warning: Could not find question at index {question_idx} in qa_df: {e}")
            continue
        
        # Retrieve documents
        documents = get_documents_by_ids(doc_df, doc_ids)
        
        if not documents:
            print(f"‚ö†Ô∏è  Warning: No documents retrieved for question {question_idx}")
            continue
        
        # Create prompt
        prompt = create_rag_prompt(question, documents)
        
        # Query LLM
        predicted_answer = query_llm(prompt)
        
        if predicted_answer is None:
            print(f"‚ö†Ô∏è  Warning: Failed to get LLM response for question {question_idx}")
            continue
        
        # Calculate text-based metrics
        exact_match = exact_match_score(predicted_answer, ground_truth)
        contains = contains_answer_score(predicted_answer, ground_truth)
        f1 = f1_score(predicted_answer, ground_truth)
        
        # Prepare results
        result = {
            'question_idx': question_idx,
            'question': question,
            'ground_truth': ground_truth,
            'predicted_answer': predicted_answer,
            'doc_ids': doc_ids,
            'num_docs': len(documents),
            'exact_match': exact_match,
            'contains_answer': contains,
            'f1_score': f1
        }
        
        # Calculate embedding-based metrics
        if use_embedding_metrics:
            embedding_metrics = compute_embedding_metrics(predicted_answer, ground_truth)
            result.update(embedding_metrics)
        
        # Calculate BERTScore
        if use_bertscore:
            bertscore_metrics = compute_bertscore(predicted_answer, ground_truth)
            result.update(bertscore_metrics)
        
        if verbose:
            print("="*80)
            print(f"Question Index: {question_idx}")
            print(f"Question: {question}")
            print(f"\nRetrieved Documents: {doc_ids}")
            print(f"Number of Documents: {len(documents)}")
            print(f"\nGround Truth: {ground_truth}")
            print(f"Predicted Answer: {predicted_answer}")
            print(f"\nText-based Metrics:")
            print(f"  Exact Match: {exact_match}")
            print(f"  Contains Answer: {contains}")
            print(f"  F1 Score: {f1:.3f}")
            
            if use_embedding_metrics:
                print(f"\nEmbedding-based Metrics:")
                print(f"  Cosine Similarity: {result['cosine_similarity']:.4f}")
                print(f"  Euclidean Distance: {result['euclidean_distance']:.4f}")
                print(f"  Dot Product Similarity: {result['dot_product_similarity']:.4f}")
                print(f"  Manhattan Distance: {result['manhattan_distance']:.4f}")
            
            if use_bertscore:
                print(f"\nBERTScore Metrics:")
                print(f"  Precision: {result['bertscore_precision']:.4f}")
                print(f"  Recall: {result['bertscore_recall']:.4f}")
                print(f"  F1: {result['bertscore_f1']:.4f}")
            
            print("="*80)
        
        results_list.append(result)
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results_list)
    
    if show_progress:
        print(f"\n‚úì Completed! Evaluated {len(results_df)} questions.")
    
    return results_df


### 8.1 Expected CSV Format

Your CSV file should have at least these two columns:

| question_id | doc_ids |
|-------------|---------|
| 0 | [1, 5, 10] |
| 2 | [3, 7] |
| 4 | [2, 4, 8, 12] |

**Supported doc_ids formats:**
- `[1, 5, 10]` - List format (with brackets)
- `1, 5, 10` - Comma-separated
- `1 5 10` - Space-separated
- `1` - Single document ID

**Column names can be different** - just specify them in the function call!


### 8.2 Load Your Retrieval Results

**Option 1: If you have a CSV file**


In [104]:
# Load your retrieval results CSV
# Replace 'your_file.csv' with your actual file path
retrieval_results = pd.read_csv('retrieval_results_heuristic_k5.csv')

# Preview the data
print(f"Loaded {len(retrieval_results)} retrieval results")
print("\nFirst few rows:")
print(retrieval_results.head())
print("\nColumn names:", retrieval_results.columns.tolist())


Loaded 400 retrieval results

First few rows:
   question_id                                      question_text  \
0          646  Is the smallest penguin species the Little Blu...   
1          646  Is the smallest penguin species the Little Blu...   
2          646  Is the smallest penguin species the Little Blu...   
3          646  Is the smallest penguin species the Little Blu...   
4          873  What does a citizen use to propose changes to ...   

              document_idx            embedding_method  
0  953,1182,1192,1204,2595           bert-base-uncased  
1      952,953,967,975,979  multi-qa-mpnet-base-dot-v1  
2  950,2467,2471,2596,2642     hkunlp-instructor-large  
3      951,952,953,979,981        intfloat-e5-small-v2  
4      25,656,657,658,2141           bert-base-uncased  

Column names: ['question_id', 'question_text', 'document_idx', 'embedding_method']


### 8.3 Run Batch Evaluation


In [109]:
# Run batch evaluation
# The 'question_id' column in your CSV contains the INDEX to look up in qa_df
# For example: question_id=646 will look up qa_df.iloc[646]
results_df = evaluate_rag_batch(
    retrieval_results_df=retrieval_results,
    qa_df=qa_df,
    doc_df=doc_df,
    question_id_col='question_id',   # Column in CSV containing the question index
    doc_ids_col='document_idx',      # Column containing the document IDs
    use_embedding_metrics=True,      # Set to False to skip embedding metrics
    use_bertscore=True,              # Set to True to enable BERTScore (slower)
    verbose=False,                   # Set to True to see details for each question
    show_progress=True               # Shows progress updates
)

# Display results
print("\n" + "="*80)
print("EVALUATION RESULTS SUMMARY")
print("="*80)
print(f"Total questions evaluated: {len(results_df)}")
print(f"\nAverage scores:")
print(f"  Exact Match: {results_df['exact_match'].mean():.3f}")
print(f"  F1 Score: {results_df['f1_score'].mean():.3f}")
if 'cosine_similarity' in results_df.columns:
    print(f"  Cosine Similarity: {results_df['cosine_similarity'].mean():.3f}")
if 'bertscore_f1' in results_df.columns:
    print(f"  BERTScore F1: {results_df['bertscore_f1'].mean():.3f}")


Progress: 10/400 questions evaluated...
Progress: 20/400 questions evaluated...
Progress: 30/400 questions evaluated...
Progress: 40/400 questions evaluated...
Progress: 50/400 questions evaluated...
Progress: 60/400 questions evaluated...
Progress: 70/400 questions evaluated...
Progress: 80/400 questions evaluated...
Progress: 90/400 questions evaluated...
Progress: 100/400 questions evaluated...
Progress: 110/400 questions evaluated...
Progress: 120/400 questions evaluated...
Progress: 130/400 questions evaluated...
Progress: 140/400 questions evaluated...
Progress: 150/400 questions evaluated...
Progress: 160/400 questions evaluated...
Progress: 170/400 questions evaluated...
Progress: 180/400 questions evaluated...
Progress: 190/400 questions evaluated...
Progress: 200/400 questions evaluated...
Progress: 210/400 questions evaluated...
Progress: 220/400 questions evaluated...
Progress: 230/400 questions evaluated...
Progress: 240/400 questions evaluated...
Progress: 250/400 questio

### 8.4 View and Save Results


In [110]:
results_df

Unnamed: 0,question_idx,question,ground_truth,predicted_answer,doc_ids,num_docs,exact_match,contains_answer,f1_score,cosine_similarity,euclidean_distance,dot_product_similarity,manhattan_distance,bertscore_precision,bertscore_recall,bertscore_f1
0,646,Is it not even known whether the gigantic pala...,It is not even known.,No.,"[953, 1182, 1192, 1204, 2595]",5,False,False,0.0,0.532425,0.967032,0.532425,21.478205,0.582180,0.639000,0.609268
1,646,Is it not even known whether the gigantic pala...,It is not even known.,It is not even known.,"[952, 953, 967, 975, 979]",5,True,True,1.0,1.000000,0.000000,1.000000,0.000000,1.000000,1.000000,1.000000
2,646,Is it not even known whether the gigantic pala...,It is not even known.,No.,"[950, 2467, 2471, 2596, 2642]",5,False,False,0.0,0.532425,0.967032,0.532425,21.478205,0.582180,0.639000,0.609268
3,646,Is it not even known whether the gigantic pala...,It is not even known.,Yes.,"[951, 952, 953, 979, 981]",5,False,False,0.0,0.539930,0.959239,0.539930,21.276486,0.591416,0.619107,0.604945
4,873,What religions are found in Uruguay?,"Roman Catholic, Protestant, Jewish, and nonpro...",None.,"[25, 656, 657, 658, 2141]",5,False,False,0.0,0.442526,1.055911,0.442526,23.180184,0.746263,0.617503,0.675805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,794,What shape are the eggs of the larest species ...,Spherical,Not mentioned.,"[2684, 2688, 2692, 2693, 2698]",5,False,False,0.0,0.301685,1.181791,0.301685,25.363775,0.626011,0.655157,0.640253
396,332,Did the Department of the Interior not charge ...,yes,Yes.,"[525, 699, 706, 1396, 2226]",5,True,True,1.0,0.965360,0.263210,0.965360,5.764243,0.815256,0.889402,0.850716
397,332,Did the Department of the Interior not charge ...,yes,Yes.,"[147, 525, 567, 1967, 2999]",5,True,True,1.0,0.965360,0.263210,0.965360,5.764243,0.815256,0.889402,0.850716
398,332,Did the Department of the Interior not charge ...,yes,Yes.,"[117, 289, 567, 2718, 2903]",5,True,True,1.0,0.965360,0.263210,0.965360,5.764243,0.815256,0.889402,0.850716


In [111]:
# View sample results
print("Sample results (first 5):")
print(results_df[['question_idx', 'question', 'ground_truth', 'predicted_answer', 
                  'exact_match', 'f1_score', 'bertscore_precision']].head())

# View all columns
print("\n\nAll available columns:")
print(results_df.columns.tolist())


Sample results (first 5):
   question_idx                                           question  \
0           646  Is it not even known whether the gigantic pala...   
1           646  Is it not even known whether the gigantic pala...   
2           646  Is it not even known whether the gigantic pala...   
3           646  Is it not even known whether the gigantic pala...   
4           873               What religions are found in Uruguay?   

                                        ground_truth       predicted_answer  \
0                              It is not even known.                    No.   
1                              It is not even known.  It is not even known.   
2                              It is not even known.                    No.   
3                              It is not even known.                   Yes.   
4  Roman Catholic, Protestant, Jewish, and nonpro...                  None.   

   exact_match  f1_score  bertscore_precision  
0        False       0.0      

In [112]:
# Save results to CSV
output_file = 'rag_evaluation_results.csv'
results_df.to_csv(output_file, index=False)
print(f"‚úì Results saved to: {output_file}")

# You can also save just the summary metrics
summary_file = 'rag_evaluation_summary.csv'
summary_cols = ['question_idx', 'exact_match', 'f1_score']
if 'cosine_similarity' in results_df.columns:
    summary_cols.append('cosine_similarity')
if 'bertscore_f1' in results_df.columns:
    summary_cols.append('bertscore_f1')
summary_df = results_df[summary_cols].copy()
summary_df.to_csv(summary_file, index=False)
print(f"‚úì Summary saved to: {summary_file}")


‚úì Results saved to: rag_evaluation_results.csv
‚úì Summary saved to: rag_evaluation_summary.csv


### 8.5 Analyze Results

Once you have the results, you can analyze them in various ways:


In [114]:
# 1. Find questions with perfect exact match
perfect_matches = results_df[results_df['exact_match'] == True]
print(f"Perfect matches: {len(perfect_matches)} / {len(results_df)} ({len(perfect_matches)/len(results_df)*100:.1f}%)")

# 2. Find questions with high F1 but not exact match
high_f1_not_exact = results_df[(results_df['f1_score'] > 0.5) & (results_df['exact_match'] == False)]
print(f"\nHigh F1 (>0.5) but not exact match: {len(high_f1_not_exact)}")
if len(high_f1_not_exact) > 0:
    print("Examples:")
    for _, row in high_f1_not_exact.head(3).iterrows():
        print(f"  Q{row['question_idx']}: GT='{row['ground_truth']}' | Pred='{row['predicted_answer']}' | F1={row['f1_score']:.3f}")

# 3. Find worst performing questions
worst = results_df.nsmallest(5, 'f1_score')
print(f"\nWorst performing questions (by F1):")
for _, row in worst.iterrows():
    print(f"  Q{row['question_idx']}: F1={row['f1_score']:.3f} | GT='{row['ground_truth']}' | Pred='{row['predicted_answer']}'")

# 4. Correlation between metrics (if BERTScore was computed)
if 'bertscore_f1' in results_df.columns:
    corr = results_df[['f1_score', 'bertscore_f1', 'cosine_similarity']].corr()
    print("\n\nMetric Correlations:")
    print(corr)


Perfect matches: 93 / 400 (23.2%)

High F1 (>0.5) but not exact match: 7
Examples:
  Q649: GT='At least one giant penguin.' | Pred='At least one giant penguin occurred in this region.' | F1=0.714
  Q895: GT='history and political science' | Pred='Political Science' | F1=0.667
  Q896: GT='German began unrestricted submarine warfare' | Pred='Germany's resumption of unrestricted submarine warfare.' | F1=0.545

Worst performing questions (by F1):
  Q646: F1=0.000 | GT='It is not even known.' | Pred='No.'
  Q646: F1=0.000 | GT='It is not even known.' | Pred='No.'
  Q646: F1=0.000 | GT='It is not even known.' | Pred='Yes.'
  Q873: F1=0.000 | GT='Roman Catholic, Protestant, Jewish, and nonprofessing.' | Pred='None.'
  Q873: F1=0.000 | GT='Roman Catholic, Protestant, Jewish, and nonprofessing.' | Pred='None (answer not found in provided documents)'


Metric Correlations:
                   f1_score  bertscore_f1  cosine_similarity
f1_score           1.000000      0.666879           0.575936
be