# RAG LLM Evaluation with Embedding Metrics

This notebook evaluates RAG performance by:
1. Taking a question and selected document IDs
2. Creating prompts with retrieved documents
3. Passing to an LLM (**Ollama - runs locally, no API limits!**)
4. Comparing generated answers with ground truth using:
   - **Text-based metrics** (exact match, F1 score)
   - **Embedding-based metrics** (cosine similarity, euclidean distance)


## 1. Setup and Imports


In [74]:
import pandas as pd
import numpy as np
import os
import ollama
from typing import List, Dict, Tuple
import json
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time


## 2. Load Data


In [75]:
# Load Q&A data
qa_df = pd.read_csv('data/rag-mini-wikipedia_q_and_a.csv')
print(f"Loaded {len(qa_df)} questions")
print(qa_df.head())


Loaded 918 questions
                                            question     answer  id
0  Was Abraham Lincoln the sixteenth President of...        yes   0
1  Did Lincoln sign the National Banking Act of 1...        yes   2
2                   Did his mother die of pneumonia?         no   4
3      How many long was Lincoln's formal education?  18 months   6
4       When did Lincoln begin his political career?       1832   8


In [76]:
# Load document data
doc_df = pd.read_csv('data/rag-mini-wikipedia_document.csv')
print(f"Loaded {len(doc_df)} documents")
print(doc_df.head())


Loaded 3200 documents
                                             passage  id
0  Uruguay (official full name in  ; pron.  , Eas...   0
1  It is bordered by Brazil to the north, by Arge...   1
2  Montevideo was founded by the Spanish in the e...   2
3  The economy is largely based in agriculture (m...   3
4  According to Transparency International, Urugu...   4


## 3. Setup Ollama (Local LLM)

**‚úÖ Ollama is already running on your system!**

You have these models installed:
- `llama3` (8B params) ‚Üê **Using this one**
- `gemma3` (3B params)

**To use a different model:**
```bash
# Change OLLAMA_MODEL in the cell below to one of:
OLLAMA_MODEL = "llama3"   # Currently selected
OLLAMA_MODEL = "gemma3"   # Alternative option
```




In [78]:
# Configure Ollama
OLLAMA_MODEL = "llama3"  # Using your installed model

# Test connection to Ollama
try:
    # List available models
    models_response = ollama.list()
    
    # Parse the response - it might be a dict with 'models' key or have different structure
    if isinstance(models_response, dict) and 'models' in models_response:
        models_list = models_response['models']
    else:
        models_list = models_response
    
    # Extract model names - handle different response formats
    available_models = []
    for model in models_list:
        if isinstance(model, dict):
            # Try different possible keys
            name = model.get('name') or model.get('model') or str(model)
            available_models.append(name)
        else:
            available_models.append(str(model))
    
    print(f"‚úì Ollama is running!")
    print(f"üì¶ Available models:")
    for model in available_models:
        print(f"   - {model}")
    
    # Check if selected model is available
    if any(OLLAMA_MODEL in str(model_name) for model_name in available_models):
        print(f"\n‚úì Using model: {OLLAMA_MODEL}")
    else:
        print(f"\n‚ö†Ô∏è  Model '{OLLAMA_MODEL}' not found.")
        print(f"Available: {available_models}")
        print(f"Run: ollama pull {OLLAMA_MODEL}")
        
except Exception as e:
    import traceback
    print(f"‚ùå Error connecting to Ollama: {e}")
    print(f"\nDebug info:")
    traceback.print_exc()
    print("\nMake sure Ollama is running.")
    print("\nTo start Ollama:")
    print("  1. Open the Ollama app from Applications, OR")
    print("  2. Run in terminal: ollama serve")


‚úì Ollama is running!
üì¶ Available models:
   - ('models', [Model(model='llama3:latest', modified_at=datetime.datetime(2025, 10, 7, 13, 18, 0, 391725, tzinfo=TzInfo(-14400)), digest='365c0bd3c000a25d28ddbf732fe1c6add414de7275464c4e4d1c3b5fcb5d8ad1', size=4661224676, details=ModelDetails(parent_model='', format='gguf', family='llama', families=['llama'], parameter_size='8.0B', quantization_level='Q4_0')), Model(model='gemma3:latest', modified_at=datetime.datetime(2025, 10, 7, 12, 9, 38, 308655, tzinfo=TzInfo(-14400)), digest='a2af6cc3eb7fa8be8504abaf9b04e88f17a119ec3f04a3addf55f92841195f5a', size=3338801804, details=ModelDetails(parent_model='', format='gguf', family='gemma3', families=['gemma3'], parameter_size='4.3B', quantization_level='Q4_K_M'))])

‚úì Using model: llama3


## 4. Setup Embedding Model


In [79]:
# Initialize embedding model for semantic similarity evaluation
# Using the SAME model as in Embedding_process.ipynb for consistency
embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
print(f"Embedding model loaded: multi-qa-mpnet-base-dot-v1")

def get_documents_by_ids(doc_df: pd.DataFrame, doc_ids: List[int]) -> List[str]:
    """
    Retrieve document passages by their IDs.
    
    Args:
        doc_df: DataFrame containing documents
        doc_ids: List of document IDs to retrieve
    
    Returns:
        List of document passages
    """
    documents = []
    for doc_id in doc_ids:
        doc = doc_df[doc_df['id'] == doc_id]
        if not doc.empty:
            documents.append(doc['passage'].values[0])
        else:
            print(f"Warning: Document ID {doc_id} not found")
    return documents


Embedding model loaded: multi-qa-mpnet-base-dot-v1


In [90]:
def create_rag_prompt(question: str, documents: List[str]) -> str:
    """
    Create a RAG prompt with question and retrieved documents.
    
    Args:
        question: The user's question
        documents: List of retrieved document passages
    
    Returns:
        Formatted prompt string
    """
    context = "\n\n".join([f"Document {i+1}:\n{doc}" for i, doc in enumerate(documents)])
    
    prompt = f"""You are a precise question-answering assistant. Based on the provided documents, answer the question with ONLY the direct answer. Do not include explanations, context, or additional information.

Context Documents:
{context}

Question: {question}

Instructions:
- Provide ONLY the direct answer to the question
- Do NOT add phrases like "Based on the documents..." or "According to..."
- Do NOT provide explanations or reasoning
- If the answer is a single word, number, or short phrase, return only that
- If the answer requires a sentence, make it as brief as possible

Answer:"""
    
    return prompt


In [91]:
def query_llm(prompt: str, temperature: float = 0.0, max_retries: int = 3, base_delay: float = 1.0) -> str:
    """
    Send prompt to Ollama (local LLM) and get response with retry logic.
    
    Args:
        prompt: The prompt to send
        temperature: Sampling temperature (0.0 for deterministic)
        max_retries: Maximum number of retry attempts  
        base_delay: Base delay between retries (will use exponential backoff)
    
    Returns:
        Generated answer from LLM, or None if all retries fail
    """
    for attempt in range(max_retries):
        try:
            response = ollama.generate(
                model=OLLAMA_MODEL,
                prompt=prompt,
                options={
                    'temperature': temperature,
                    'num_predict': 500,  # Max tokens
                }
            )
            
            # Extract the response text
            answer = response.get('response', '').strip()
            return answer if answer else None
            
        except Exception as e:
            error_msg = str(e)
            
            # Check if Ollama is not running
            if "connection" in error_msg.lower() or "refused" in error_msg.lower():
                print(f"‚ùå Cannot connect to Ollama. Make sure it's running: ollama serve")
                return None
            
            # Generic error with retry
            print(f"‚ö†Ô∏è  Error calling Ollama (attempt {attempt + 1}/{max_retries}): {error_msg}")
            if attempt < max_retries - 1:
                wait_time = base_delay * (2 ** attempt)
                print(f"   Retrying in {wait_time:.1f}s...")
                time.sleep(wait_time)
            else:
                print(f"‚ùå Failed after {max_retries} attempts")
                return None
    
    return None


In [92]:
def normalize_answer(text: str) -> str:
    """
    Normalize answer text for comparison.
    
    Args:
        text: Text to normalize
    
    Returns:
        Normalized text
    """
    # Convert to lowercase
    text = text.lower().strip()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text


In [93]:
def exact_match_score(predicted: str, ground_truth: str) -> bool:
    """
    Calculate exact match score (normalized).
    
    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer
    
    Returns:
        True if answers match exactly (after normalization)
    """
    return normalize_answer(predicted) == normalize_answer(ground_truth)


In [94]:
def contains_answer_score(predicted: str, ground_truth: str) -> bool:
    """
    Check if predicted answer contains the ground truth.
    
    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer
    
    Returns:
        True if predicted contains ground truth
    """
    return normalize_answer(ground_truth) in normalize_answer(predicted)


In [95]:
def f1_score(predicted: str, ground_truth: str) -> float:
    """
    Calculate token-level F1 score.
    
    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer
    
    Returns:
        F1 score (0.0 to 1.0)
    """
    pred_tokens = set(normalize_answer(predicted).split())
    gt_tokens = set(normalize_answer(ground_truth).split())
    
    if len(pred_tokens) == 0 or len(gt_tokens) == 0:
        return 0.0
    
    common_tokens = pred_tokens.intersection(gt_tokens)
    
    if len(common_tokens) == 0:
        return 0.0
    
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(gt_tokens)
    
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1


**Note:** You created embeddings with 4 different models in `Embedding_process.ipynb`:
- `bert-base-uncased`
- `multi-qa-mpnet-base-dot-v1` ‚Üê **Using this one**
- `intfloat/e5-small-v2`
- `hkunlp/instructor-large`

You can change the model above to test which embedding model gives the best evaluation correlation!


## 5. Embedding-based Evaluation Metrics


### Alternative: Use E5 or Instructor models (uncomment to use)


**‚ö†Ô∏è Important Note on Embedding Metrics:**

Embedding-based metrics work best for **longer text** (paragraphs, sentences). For **very short answers** like "yes" vs "no", they can give misleading results:

- ‚úÖ **Good for**: Comparing longer answers where semantic meaning is clear
- ‚ùå **Not ideal for**: Single-word answers, especially yes/no questions
- üí° **Recommendation**: For short answers, rely more on **text-based metrics** (exact match, F1 score)


In [96]:
# # Option 1: E5 model (requires "query: " prefix for answers)
# embedding_model = SentenceTransformer('intfloat/e5-small-v2')
# 
# def compute_embedding_metrics_e5(predicted: str, ground_truth: str) -> Dict[str, float]:
#     # Add "query: " prefix as done in Embedding_process.ipynb
#     emb1 = embedding_model.encode(["query: " + predicted])[0]
#     emb2 = embedding_model.encode(["query: " + ground_truth])[0]
#     
#     from sklearn.preprocessing import normalize
#     emb1 = normalize([emb1])[0]
#     emb2 = normalize([emb2])[0]
#     
#     return {
#         'cosine_similarity': float(np.dot(emb1, emb2)),
#         'euclidean_distance': float(np.linalg.norm(emb1 - emb2)),
#         'dot_product_similarity': float(np.dot(emb1, emb2)),
#         'manhattan_distance': float(np.sum(np.abs(emb1 - emb2)))
#     }

# # Option 2: Instructor model (requires instruction)
# from InstructorEmbedding import INSTRUCTOR
# embedding_model = INSTRUCTOR('hkunlp/instructor-large')
# 
# def compute_embedding_metrics_instructor(predicted: str, ground_truth: str) -> Dict[str, float]:
#     # Add instruction as done in Embedding_process.ipynb
#     emb1 = embedding_model.encode([["Represent the answer:", predicted]])[0]
#     emb2 = embedding_model.encode([["Represent the answer:", ground_truth]])[0]
#     
#     # Note: Instructor embeddings are NOT normalized in Embedding_process.ipynb
#     return {
#         'cosine_similarity': float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))),
#         'euclidean_distance': float(np.linalg.norm(emb1 - emb2)),
#         'dot_product_similarity': float(np.dot(emb1, emb2)),
#         'manhattan_distance': float(np.sum(np.abs(emb1 - emb2)))
#     }


In [97]:
def compute_embedding_metrics(predicted: str, ground_truth: str) -> Dict[str, float]:
    """
    Compute multiple embedding-based metrics between predicted and ground truth answers.
    
    Metrics:
    - Cosine Similarity: Measures angular similarity (0 to 1, higher is better)
    - Euclidean Distance: L2 distance between embeddings (lower is better)
    - Dot Product: Inner product of normalized embeddings (0 to 1, higher is better)
    - Manhattan Distance: L1 distance between embeddings (lower is better)
    
    Args:
        predicted: Predicted answer
        ground_truth: Ground truth answer
    
    Returns:
        Dictionary with embedding-based metrics
    """
    # Generate embeddings
    emb1 = embedding_model.encode([predicted])[0]
    emb2 = embedding_model.encode([ground_truth])[0]
    
    # Normalize embeddings (same as in Embedding_process.ipynb)
    from sklearn.preprocessing import normalize
    emb1 = normalize([emb1])[0]
    emb2 = normalize([emb2])[0]
    
    # Cosine similarity (for normalized vectors, this equals dot product)
    cos_sim = np.dot(emb1, emb2)
    
    # Euclidean distance
    euclidean_dist = np.linalg.norm(emb1 - emb2)
    
    # Dot product similarity (same as cosine for normalized embeddings)
    dot_product = np.dot(emb1, emb2)
    
    # Manhattan distance
    manhattan_dist = np.sum(np.abs(emb1 - emb2))
    
    return {
        'cosine_similarity': float(cos_sim),
        'euclidean_distance': float(euclidean_dist),
        'dot_product_similarity': float(dot_product),
        'manhattan_distance': float(manhattan_dist)
    }


### üìä Understanding the Metrics

**For SHORT answers (yes/no, single words):**
- ‚úÖ **Exact Match** - Most reliable! 0 or 1, no ambiguity
- ‚úÖ **F1 Score** - Good for token overlap
- ‚ùå **Embedding Metrics** - Can be misleading (e.g., "yes" vs "no" might have high similarity)

**For LONG answers (sentences, paragraphs):**
- ‚úÖ **Embedding Metrics** - Excellent! Captures semantic meaning
- ‚úÖ **F1 Score** - Good for word overlap
- ‚ö†Ô∏è **Exact Match** - Too strict, rarely matches

**Example of the issue:**
```
Ground Truth: "yes"
Predicted: "no"
Exact Match: 0.0 ‚úì (Correct - they're different)
Cosine Similarity: 0.933 ‚úó (Misleading - seems similar but they're opposite!)
```

**Why?** Embeddings capture that both are short, single-word boolean answers in similar contexts, not that they're semantically opposite.


### üí° Recommendations

**Option 1: Filter your analysis**
When analyzing results, separate short vs long answers:
```python
# After getting results_df
short_answers = results_df[results_df['ground_truth'].str.split().str.len() <= 2]
long_answers = results_df[results_df['ground_truth'].str.split().str.len() > 2]

# For short answers: focus on exact_match and f1_score
# For long answers: embedding metrics are more reliable
```

**Option 2: Use text metrics primarily**
For this RAG evaluation, **Exact Match** and **F1 Score** are your most reliable metrics across all answer lengths.


## 6. RAG Evaluation Pipeline


In [98]:
def evaluate_rag_single(
    question_id: int,
    doc_ids: List[int],
    qa_df: pd.DataFrame,
    doc_df: pd.DataFrame,
    use_embedding_metrics: bool = True,
    verbose: bool = True
) -> Dict:
    """
    Evaluate RAG for a single question.
    
    Args:
        question_id: ID of the question in qa_df
        doc_ids: List of document IDs to use for context
        qa_df: DataFrame with questions and answers
        doc_df: DataFrame with documents
        use_embedding_metrics: Whether to compute embedding-based metrics
        verbose: Whether to print detailed output
    
    Returns:
        Dictionary with evaluation results
    """
    # Get question and ground truth
    qa_row = qa_df[qa_df['id'] == question_id]
    if qa_row.empty:
        print(f"Error: Question ID {question_id} not found")
        return None
    
    question = qa_row['question'].values[0]
    ground_truth = qa_row['answer'].values[0]
    
    # Retrieve documents
    documents = get_documents_by_ids(doc_df, doc_ids)
    
    if not documents:
        print("Error: No documents retrieved")
        return None
    
    # Create prompt
    prompt = create_rag_prompt(question, documents)
    
    # Query LLM
    predicted_answer = query_llm(prompt)
    
    if predicted_answer is None:
        print("Error: Failed to get LLM response")
        return None
    
    # Calculate text-based metrics
    exact_match = exact_match_score(predicted_answer, ground_truth)
    contains = contains_answer_score(predicted_answer, ground_truth)
    f1 = f1_score(predicted_answer, ground_truth)
    
    # Prepare results
    results = {
        'question_id': question_id,
        'question': question,
        'ground_truth': ground_truth,
        'predicted_answer': predicted_answer,
        'doc_ids': doc_ids,
        'num_docs': len(documents),
        'exact_match': exact_match,
        'contains_answer': contains,
        'f1_score': f1
    }
    
    # Calculate embedding-based metrics
    if use_embedding_metrics:
        embedding_metrics = compute_embedding_metrics(predicted_answer, ground_truth)
        results.update(embedding_metrics)
    
    if verbose:
        print("="*80)
        print(f"Question ID: {question_id}")
        print(f"Question: {question}")
        print(f"\nRetrieved Documents: {doc_ids}")
        print(f"Number of Documents: {len(documents)}")
        print(f"\nGround Truth: {ground_truth}")
        print(f"Predicted Answer: {predicted_answer}")
        print(f"\nText-based Metrics:")
        print(f"  Exact Match: {exact_match}")
        print(f"  Contains Answer: {contains}")
        print(f"  F1 Score: {f1:.3f}")
        
        if use_embedding_metrics:
            print(f"\nEmbedding-based Metrics:")
            print(f"  Cosine Similarity: {results['cosine_similarity']:.4f}")
            print(f"  Euclidean Distance: {results['euclidean_distance']:.4f}")
            print(f"  Dot Product Similarity: {results['dot_product_similarity']:.4f}")
            print(f"  Manhattan Distance: {results['manhattan_distance']:.4f}")
        
        print("="*80)
    
    return results


## 7. Single Example Evaluation


In [102]:
# Example: Evaluate a single question with selected documents
# Replace question_id and doc_ids with your actual values

question_id = 0  # Question ID from qa_df
doc_ids = [1, 2, 3]  # Document IDs you determined are best for this question

result = evaluate_rag_single(
    question_id=question_id,
    doc_ids=doc_ids,
    qa_df=qa_df,
    doc_df=doc_df,
    use_embedding_metrics=True,  # Enable embedding-based metrics
    verbose=True
)


Question ID: 0
Question: Was Abraham Lincoln the sixteenth President of the United States?

Retrieved Documents: [1, 2, 3]
Number of Documents: 3

Ground Truth: yes
Predicted Answer: No.

Text-based Metrics:
  Exact Match: False
  Contains Answer: False
  F1 Score: 0.000

Embedding-based Metrics:
  Cosine Similarity: 0.9336
  Euclidean Distance: 0.3644
  Dot Product Similarity: 0.9336
  Manhattan Distance: 8.2146


## 8. Batch Evaluation

**‚úÖ Benefits of Using Ollama (Local LLM):**
- **No rate limits!** Run as many evaluations as you want
- **No API costs** - completely free
- **Privacy** - all processing happens locally
- **Fast** - no network latency

**Features:**
- **Checkpoint saving** every 5 questions (auto-resumes on restart)
- **Automatic retries** if something goes wrong
- **Progress tracking** with clear status messages


In [None]:
def evaluate_rag_batch(
    question_doc_pairs: List[Tuple[int, List[int]]],
    qa_df: pd.DataFrame,
    doc_df: pd.DataFrame,
    use_embedding_metrics: bool = True,
    verbose: bool = False,
    rate_limit_delay: float = 1.0,
    checkpoint_file: str = "data_processed/rag_evaluation_checkpoint.csv"
) -> pd.DataFrame:
    """
    Evaluate RAG for multiple questions with rate limiting and progress saving.
    
    Args:
        question_doc_pairs: List of (question_id, doc_ids) tuples
        qa_df: DataFrame with questions and answers
        doc_df: DataFrame with documents
        use_embedding_metrics: Whether to compute embedding-based metrics
        verbose: Whether to print detailed output for each question
        rate_limit_delay: Seconds to wait between API calls (helps avoid rate limits)
        checkpoint_file: File to save progress (in case of interruption)
    
    Returns:
        DataFrame with evaluation results
    """
    results = []
    
    # Try to load existing checkpoint
    import os
    checkpoint_exists = os.path.exists(checkpoint_file)
    processed_ids = set()
    
    if checkpoint_exists:
        try:
            checkpoint_df = pd.read_csv(checkpoint_file)
            results = checkpoint_df.to_dict('records')
            processed_ids = set(checkpoint_df['question_id'].values)
            print(f"üìÇ Loaded checkpoint: {len(processed_ids)} questions already processed")
        except Exception as e:
            print(f"‚ö†Ô∏è  Could not load checkpoint: {e}")
    
    for i, (question_id, doc_ids) in enumerate(question_doc_pairs):
        # Skip if already processed
        if question_id in processed_ids:
            print(f"‚è≠Ô∏è  Skipping question {i+1}/{len(question_doc_pairs)} (ID: {question_id}) - already processed")
            continue
        
        print(f"üîÑ Evaluating question {i+1}/{len(question_doc_pairs)} (ID: {question_id})...")
        
        # Add delay to avoid rate limits (except for first request)
        if len(results) > 0:
            time.sleep(rate_limit_delay)
        
        result = evaluate_rag_single(
            question_id=question_id,
            doc_ids=doc_ids,
            qa_df=qa_df,
            doc_df=doc_df,
            use_embedding_metrics=use_embedding_metrics,
            verbose=verbose
        )
        
        if result:
            results.append(result)
            processed_ids.add(question_id)
            
            # Save checkpoint every 5 questions
            if len(results) % 5 == 0:
                try:
                    checkpoint_df = pd.DataFrame(results)
                    os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
                    checkpoint_df.to_csv(checkpoint_file, index=False)
                    print(f"üíæ Checkpoint saved: {len(results)} questions completed")
                except Exception as e:
                    print(f"‚ö†Ô∏è  Could not save checkpoint: {e}")
    
    # Final save
    results_df = pd.DataFrame(results)
    if len(results) > 0:
        try:
            results_df.to_csv(checkpoint_file, index=False)
            print(f"üíæ Final checkpoint saved")
        except Exception as e:
            print(f"‚ö†Ô∏è  Could not save final checkpoint: {e}")
    
    # Calculate aggregate metrics
    if len(results_df) > 0:
        print("\n" + "="*80)
        print("AGGREGATE RESULTS")
        print("="*80)
        print(f"Total Questions Evaluated: {len(results_df)}")
        print(f"\nText-based Metrics:")
        print(f"  Exact Match Accuracy: {results_df['exact_match'].mean():.3f}")
        print(f"  Contains Answer Accuracy: {results_df['contains_answer'].mean():.3f}")
        print(f"  Average F1 Score: {results_df['f1_score'].mean():.3f}")
        
        if use_embedding_metrics and 'cosine_similarity' in results_df.columns:
            print(f"\nEmbedding-based Metrics:")
            print(f"  Average Cosine Similarity: {results_df['cosine_similarity'].mean():.4f}")
            print(f"  Average Euclidean Distance: {results_df['euclidean_distance'].mean():.4f}")
            print(f"  Average Dot Product Similarity: {results_df['dot_product_similarity'].mean():.4f}")
            print(f"  Average Manhattan Distance: {results_df['manhattan_distance'].mean():.4f}")
        
        print("="*80)
    else:
        print("‚ö†Ô∏è  No results to display")
    
    return results_df

In [None]:
# Example batch evaluation
# Format: List of (question_id, doc_ids) tuples
question_doc_pairs = [
    (0, [0, 1, 2]),      # Question 0 with documents [0, 1, 2]
    (2, [3, 4, 5]),      # Question 2 with documents [3, 4, 5]
    (4, [6, 7, 8]),      # Question 4 with documents [6, 7, 8]
]

results_df = evaluate_rag_batch(
    question_doc_pairs=question_doc_pairs,
    qa_df=qa_df,
    doc_df=doc_df,
    use_embedding_metrics=True,  # Enable embedding-based metrics
    verbose=False,
    rate_limit_delay=0.5,  # Small delay for stability (Ollama has no rate limits!)
    checkpoint_file="data_processed/rag_evaluation_checkpoint.csv"  # Save progress
)

### Clear Checkpoint (Optional)


In [None]:
# Uncomment to clear checkpoint and start fresh
# import os
# checkpoint_file = "data_processed/rag_evaluation_checkpoint.csv"
# if os.path.exists(checkpoint_file):
#     os.remove(checkpoint_file)
#     print("üóëÔ∏è  Checkpoint cleared")


## 9. View Results

In [None]:
# Display results table with all metrics
results_df[['question_id', 'question', 'ground_truth', 'predicted_answer', 
            'exact_match', 'contains_answer', 'f1_score',
            'cosine_similarity', 'euclidean_distance', 'dot_product_similarity']]

## 10. Save Results

In [None]:
# Save detailed results to CSV with embedding metrics
results_df.to_csv('data_processed/rag_evaluation_results_with_embeddings.csv', index=False)
print("Results saved to data_processed/rag_evaluation_results_with_embeddings.csv")

## 11. Analyze Embedding vs Text Metrics (Optional)

In [None]:
# Analyze correlation between embedding similarity and text-based metrics
import matplotlib.pyplot as plt

# Scatter plot: Cosine Similarity vs F1 Score
plt.figure(figsize=(10, 6))
plt.scatter(results_df['cosine_similarity'], results_df['f1_score'], alpha=0.6, s=100)
plt.xlabel('Cosine Similarity (Embedding-based)', fontsize=12)
plt.ylabel('F1 Score (Text-based)', fontsize=12)
plt.title('Embedding Similarity vs Text-based Similarity', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print correlation
correlation = results_df['cosine_similarity'].corr(results_df['f1_score'])
print(f"\nCorrelation between Cosine Similarity and F1 Score: {correlation:.3f}")