In [None]:
# Environment Setup for Hybrid RAG - The Meta-RAG System
# Hybrid RAG combines multiple RAG strategies in a single intelligent workflow
# Requires both OpenAI and Tavily API keys for comprehensive retrieval capabilities
from dotenv import load_dotenv
import os

load_dotenv()

# Read both credentials from the process environment (load_dotenv above has
# already merged the .env file into it).
#   OPENAI_API_KEY -> LLM calls (scoring, rewriting, synthesis)
#   TAVILY_API_KEY -> web search when local knowledge is insufficient
openai_api_key = os.environ.get("OPENAI_API_KEY")
tavily_api_key = os.environ.get("TAVILY_API_KEY")

# Fail fast on the first credential that is missing or empty.
# OpenAI is checked before Tavily, so it is reported first when both are absent.
_first_missing_key = next(
    (
        env_name
        for env_name, env_value in (
            ("OPENAI_API_KEY", openai_api_key),
            ("TAVILY_API_KEY", tavily_api_key),
        )
        if not env_value
    ),
    None,
)
if _first_missing_key is not None:
    raise ValueError(f"{_first_missing_key} not found in .env file.")

In [None]:
# Import Libraries for Hybrid RAG Implementation
# Hybrid RAG is the most comprehensive approach, combining all RAG strategies

# Core data processing and ML libraries
import pandas as pd
from transformers import pipeline

# LangChain document processing components
# Note: Some imports are duplicated - could be cleaned up in production
from langchain.document_loaders import DataFrameLoader  # Document loading
from langchain.text_splitter import CharacterTextSplitter  # Text chunking
from langchain.embeddings import HuggingFaceEmbeddings  # Legacy import
from langchain.vectorstores import FAISS  # Legacy import
from langchain_community.vectorstores import FAISS  # Current import
from langchain_community.document_loaders import DataFrameLoader  # Current import

# LangChain core workflow components
from langchain_core.prompts import ChatPromptTemplate  # Prompt templates
from langchain_core.runnables import RunnableParallel, RunnablePassthrough  # Chain composition
from langchain_core.output_parsers import StrOutputParser  # Response parsing
from langchain import hub  # Prompt hub access
from langchain_openai import ChatOpenAI  # OpenAI integration

# Additional ML and utility libraries
import os
import getpass
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from langchain.llms import HuggingFaceHub

In [None]:
# Setup Local Document Retriever for Hybrid RAG
# This creates the vector-based retrieval component of the hybrid system
def get_reteriver(
    csv_path: str = "passage.csv",
    content_column: str = "passage",
    chunk_size: int = 200,
    chunk_overlap: int = 50,
):
    """
    Build the local (vector-store) retriever used by the hybrid system.

    Pipeline: CSV -> DataFrame -> LangChain documents -> character chunks ->
    HuggingFace embeddings -> FAISS index -> retriever.

    Args:
        csv_path: CSV file holding the knowledge base (default "passage.csv").
        content_column: Column whose text becomes each document's page content.
        chunk_size: Target chunk size handed to CharacterTextSplitter.
        chunk_overlap: Overlap between neighbouring chunks.

    Returns:
        The object produced by ``FAISS.as_retriever()``.

    Note:
        The recorded run logs many "Created a chunk of size N, which is longer
        than the specified 200" warnings, so `chunk_size` is not a hard limit
        with this splitter - presumably because it only cuts at its separator.
        TODO(review): confirm, and consider a recursive splitter if strict
        chunk sizes are wanted (this would change retrieval results).
    """
    # One row of the CSV becomes one searchable document.
    passage_df = pd.read_csv(csv_path)
    passage_loader = DataFrameLoader(passage_df, page_content_column=content_column)
    passages = passage_loader.load()

    # Chunk the passages before embedding them.
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(passages)

    # Embed the chunks and index them for similarity search.
    embeddings = HuggingFaceEmbeddings()
    index = FAISS.from_documents(chunks, embeddings)

    return index.as_retriever()

# Initialize the local retriever with the default knowledge base.
# This is one component of the multi-source retrieval strategy.
retriever = get_reteriver()

Created a chunk of size 265, which is longer than the specified 200
Created a chunk of size 1411, which is longer than the specified 200
Created a chunk of size 294, which is longer than the specified 200
Created a chunk of size 395, which is longer than the specified 200
Created a chunk of size 298, which is longer than the specified 200
Created a chunk of size 324, which is longer than the specified 200
Created a chunk of size 470, which is longer than the specified 200
Created a chunk of size 662, which is longer than the specified 200
Created a chunk of size 451, which is longer than the specified 200
Created a chunk of size 245, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 580, which is longer than the specified 200
Created a chunk of size 1970, which is longer than the specified 200
Created a chunk of size 617, which is longer than the specified 200
Created a chunk of size 312, which is longer t

In [None]:
# Setup Web Search Component for Hybrid RAG
# This enables the system to search beyond local knowledge when needed
from langchain_tavily import TavilySearch
from langchain.schema import HumanMessage
from typing import List, TypedDict
import re

# Initialize the Tavily web-search tool used as the fallback when local
# retrieval is not confident enough.
# NOTE(review): the key is passed as `api_key`; confirm that this TavilySearch
# version honours that keyword (rather than e.g. `tavily_api_key`) and is not
# silently falling back to the TAVILY_API_KEY environment variable.
tavily = TavilySearch(
    max_results=5,          # Limit results for focused information
    search_depth="advanced", # Use advanced search for better quality
    include_answer=True,     # Include direct answers when available
    include_raw_content=True, # Include full content for context
    api_key=tavily_api_key   # Use API key from environment
)

# Import message types for LLM interactions
# These are used throughout the hybrid system for structured communication
from langchain.schema import HumanMessage, SystemMessage

In [None]:
# === HYBRID RAG HELPER FUNCTIONS ===
# This comprehensive collection of functions powers the meta-RAG system
# Each function handles a specific aspect of the intelligent retrieval workflow

# Shared chat model: every helper in this cell (scoring, query rewriting,
# decomposition, reranking, synthesis) calls `llm.invoke(...)` on this object.
# temperature=0.0 is requested to keep those calls as repeatable as possible.
llm = ChatOpenAI(temperature=0.0, model_name="gpt-3.5-turbo")

# === RETRIEVAL FUNCTIONS ===

def vector_search(query: str) -> List[str]:
    """
    Perform vector-based similarity search on the local knowledge base.

    Args:
        query: Text to search for.

    Returns:
        The `page_content` of each document returned by the module-level
        `retriever`, in the order the retriever produced them.
    """
    # `invoke` is the supported retriever entry point; the previous
    # `get_relevant_documents` call is deprecated and emitted a warning on
    # every query (visible in the recorded notebook output).
    matches = retriever.invoke(query)
    # Keep only the text; metadata is not used downstream.
    return [match.page_content for match in matches]
    

def web_search(query: str) -> List[str]:
    """
    Perform a web search with the module-level `tavily` tool and return text.

    The tool's raw return value is normalised so callers really get the
    `List[str]` this function promises (they enumerate and "\\n".join it):

    - dict payload: the non-empty "answer" (if any) followed by one string per
      entry of "results", taken from "content" or, failing that, "raw_content";
    - list payload: one string per element (dict elements handled as above);
    - plain string: a single-element list;
    - None / nothing usable: an empty list.

    Args:
        query: Search string sent to Tavily.

    Returns:
        List of non-empty result strings (possibly empty).
    """
    raw = tavily.invoke(query)

    if raw is None:
        return []
    if isinstance(raw, str):
        stripped = raw.strip()
        return [stripped] if stripped else []

    texts: List[str] = []
    if isinstance(raw, dict):
        # Iterating the dict itself would yield its keys, not the results.
        answer = raw.get("answer")
        if isinstance(answer, str) and answer.strip():
            texts.append(answer.strip())
        hits = raw.get("results") or []
    else:
        hits = raw

    for hit in hits:
        if isinstance(hit, dict):
            body = ""
            # Prefer the short snippet; fall back to the full page text.
            for field in ("content", "raw_content"):
                value = hit.get(field)
                if isinstance(value, str) and value.strip():
                    body = value.strip()
                    break
        else:
            body = str(hit).strip()
        if body:
            texts.append(body)
    return texts

# === EVALUATION AND SCORING FUNCTIONS ===

def score_docs(docs: List[str], query: str) -> float:
    """
    Use the LLM to rate how relevant a set of documents is to a UCSC query.

    Args:
        docs: Document texts to evaluate.
        query: The question the documents should help answer.

    Returns:
        Score clamped to [0.0, 1.0]. Returns 0.0 when `docs` is empty (no LLM
        call is made) or when no number can be read from the reply.

    This drives the adaptive behaviour of the pipeline: callers compare the
    score with a threshold to decide whether to retry or switch strategy.
    """
    # Nothing retrieved means nothing relevant; skip the LLM round-trip.
    if not docs:
        return 0.0

    # Ensure UCSC context is explicit in the evaluation
    refined_query = f"{query} (UCSC = University of California Santa Cruz)"

    # Format documents as numbered list for LLM evaluation
    doc_list = "\n".join(f"{i+1}. {d}" for i, d in enumerate(docs))
    prompt = (
        "You are evaluating how well each document helps answer a question about UCSC "
        "(University of California Santa Cruz). "
        "Rate relevance from 0.0 (irrelevant) to 1.0 (fully relevant).\n\n"
        f"Query: {refined_query}\n\n"
        f"Documents:\n{doc_list}\n\n"
        "Respond with ONLY a single number between 0.0 and 1.0."
    )

    resp = llm.invoke([HumanMessage(content=prompt)])
    text = resp.content.strip()

    try:
        score = float(text)
    except ValueError:
        # Tolerate near-misses such as "Score: 0.8." by taking the first number.
        found = re.search(r"[-+]?(?:\d+\.?\d*|\.\d+)", text)
        if found is None:
            return 0.0  # Default to no confidence if parsing fails
        score = float(found.group())

    # float() accepts "nan"; clamping would turn it into 1.0, so reject it.
    if score != score:
        return 0.0
    return max(0.0, min(1.0, score))

# === QUERY OPTIMIZATION FUNCTIONS ===

def rewrite_query(query: str) -> str:
    """
    Rewrite a question into a concise query for vector retrieval against the
    UCSC knowledge base.

    The prompt asks the LLM to add explicit UCSC context and academic
    keywords and to keep the result to a few key terms.

    Args:
        query: The question (or previous rewrite) to optimise.

    Returns:
        The LLM's rewritten query with surrounding whitespace removed, or the
        input `query` unchanged if the LLM returned only whitespace.
    """
    prompt = (
        "You're rewriting a student's question into a concise vector-search query "
        "targeting UCSC documents.  \n"
        # Single-quoted so the embedded double quotes do not end the literal.
        '- Always include "UCSC (University of California Santa Cruz)".\n'
        "- Add academic keywords like course codes, department names, document types.\n"
        "- Keep it to 3–6 key terms or short phrases.\n\n"
        f"Original question: {query}\n\n"
        "Vector-search query:"
    )
    resp = llm.invoke([HumanMessage(content=prompt)])
    rewritten = resp.content.strip()
    # An empty rewrite would make the next search meaningless; keep the input.
    return rewritten or query

def rewrite_web_query(query: str) -> str:
    """
    Rewrite a question into a short query suited to web search.

    The prompt asks the LLM to mention UCSC explicitly, to favour
    site-specific qualifiers or forums, and to keep the string short.

    Args:
        query: The question (or previous rewrite) to optimise.

    Returns:
        The LLM's rewritten query with surrounding whitespace removed, or the
        input `query` unchanged if the LLM returned only whitespace.
    """
    prompt = (
        "You're converting a student's question into an advanced web search query "
        "about UCSC (University of California Santa Cruz).  \n"
        # Single-quoted so the embedded double quotes do not end the literal.
        '- Mention "UCSC (University of California Santa Cruz)" explicitly.\n'
        "- Favor site-specific qualifiers (e.g., site:ucsc.edu) or student/departmental forums.\n"
        "- Keep it concise—aim for a 5–8 word search string.\n\n"
        f"Original question: {query}\n\n"
        "Web search query:"
    )
    resp = llm.invoke([HumanMessage(content=prompt)])
    rewritten = resp.content.strip()
    # An empty rewrite would make the next search meaningless; keep the input.
    return rewritten or query

# === ANSWER SYNTHESIS FUNCTIONS ===

def synthesize_answer(context: str, query: str) -> str:
    """
    Generate the final answer from retrieved context and the original query.

    A system message restricts the model to the supplied context and tells it
    to admit when it does not know; the user message carries the query and
    the context.

    Args:
        context: Retrieved text. A list/tuple of strings is also accepted and
            is joined with newlines, so the prompt never contains a Python
            list repr.
        query: The user's original question.

    Returns:
        The LLM's answer with surrounding whitespace removed.
    """
    if isinstance(context, (list, tuple)):
        context = "\n".join(str(piece) for piece in context)

    system_msg = SystemMessage(content=(
        "You are a UCSC assistant. Only use the provided context and the original query to answer. "
        # Inner double quotes are escaped so they stay part of the literal.
        "Do NOT hallucinate or invent facts. If you're not 100% sure, say \"I'm sorry, I don't know.\""
    ))

    user_prompt = (
        f"Original Query: {query}\n\n"
        f"Context from UCSC sources:\n{context}\n\n"
        "Based on this information and the query above, "
        "provide a clear and direct answer. "
        "If information is partial, focus on what you DO know rather than what you don't."
    )
    resp = llm.invoke([system_msg, HumanMessage(content=user_prompt)])
    return resp.content.strip()

# === CHAIN-OF-THOUGHT FUNCTIONS ===

import json

def break_into_cot_questions( query: str) -> List[str]:
    """
    Decompose a query into sub-questions for the Chain-of-Thought fallback.

    The LLM is asked for a JSON array of strings. Parsing is tolerant:

    1. the whole reply is tried as JSON;
    2. otherwise the outermost ``[...]`` span is tried (covers code-fenced
       replies and arrays wrapped in an object);
    3. otherwise the reply is read one question per line, dropping code-fence
       and bracket lines and stripping bullets, numbering, quotes and commas.

    Args:
        query: The question to decompose.

    Returns:
        Non-empty sub-question strings. If nothing usable can be extracted,
        ``[query]`` is returned so the caller still has something to retrieve
        for.
    """
    prompt = (
        "You are a UCSC (University of California Santa Cruz) assistant.  \n"
        "Break down this query into 3–5 logical subquestions.  \n"
        "Output MUST be valid JSON, a top-level array of strings.  \n\n"
        f"Query: {query}\n\n"
        "Example output:\n"
        '["What is …?", "How does …?"]\n\n'
        "Now your output:"
    )
    resp = llm.invoke([HumanMessage(content=prompt)])
    content = resp.content.strip()

    # Candidate JSON payloads: the full reply, then the outermost [...] span.
    candidates = [content]
    first_bracket, last_bracket = content.find("["), content.rfind("]")
    if 0 <= first_bracket < last_bracket:
        candidates.append(content[first_bracket:last_bracket + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            questions = [q.strip() for q in parsed if isinstance(q, str) and q.strip()]
            if questions:
                return questions

    # Fallback parsing if JSON fails: one question per line.
    questions = []
    for line in content.splitlines():
        cleaned = re.sub(r"^\s*(?:[-*•]|\d+[.)])\s*", "", line).strip()
        if cleaned.startswith("```") or cleaned in ("[", "]"):
            continue  # markup, not a question
        cleaned = cleaned.strip('",').strip()
        if cleaned:
            questions.append(cleaned)
    return questions or [query]

# === QUALITY ASSESSMENT FUNCTIONS ===

def score_answer(answer: str, query: str) -> float:
    """
    Ask the LLM how well an answer addresses the original query.

    Args:
        answer: The generated answer to grade.
        query: The question it is supposed to answer.

    Returns:
        Confidence clamped to [0.0, 1.0]. Returns 0.0 for an empty answer (no
        LLM call is made) or when no number can be read from the reply.
    """
    # An empty answer cannot address the query; skip the LLM round-trip.
    if not answer or not answer.strip():
        return 0.0

    prompt = (
        "You are grading the quality of an answer to a UCSC (University of California Santa Cruz) question.\n\n"
        f"Query: {query}\n\n"
        f"Answer: {answer}\n\n"
        "On a scale from 0.0 (not at all confident) to 1.0 (extremely confident), "
        "how well does this answer address the query accurately and completely? "
        "Respond with ONLY a single number."
    )
    resp = llm.invoke([HumanMessage(content=prompt)])
    text = resp.content.strip()

    try:
        score = float(text)
    except ValueError:
        # Tolerate near-misses such as "Confidence: 0.7." by taking the first number.
        found = re.search(r"[-+]?(?:\d+\.?\d*|\.\d+)", text)
        if found is None:
            return 0.0
        score = float(found.group())

    # float() accepts "nan"; clamping would turn it into 1.0, so reject it.
    if score != score:
        return 0.0
    return max(0.0, min(1.0, score))

def rerank_docs(docs: List[str], query: str, top_k: int = 3) -> List[str]:
    """
    Use the LLM to pick the most relevant documents for a query.

    The LLM is asked for a JSON list of 1-based indices. The reply is parsed
    tolerantly (the outermost ``[...]`` span is used, so code fences or
    surrounding prose do not break it). Out-of-range and duplicate indices
    are ignored, and at most `top_k` documents are returned.

    Args:
        docs: Candidate document texts.
        query: The question the documents should help answer.
        top_k: Maximum number of documents to return.

    Returns:
        Selected documents in the order the LLM ranked them. Falls back to
        ``docs[:top_k]`` when the reply cannot be parsed or names no usable
        index; returns ``[]`` when `docs` is empty, `top_k` is not positive,
        or the LLM explicitly returns an empty list.
    """
    # Nothing to rank: avoid an LLM call that cannot select anything.
    if not docs or top_k <= 0:
        return []

    numbered = "\n".join(f"{i+1}. {d}" for i, d in enumerate(docs))
    prompt = (
        f"You are selecting the {top_k} most relevant UCSC (University of California Santa Cruz) documents "
        f"for the query: {query}\n\n"
        f"Documents:\n{numbered}\n\n"
        "Respond with a JSON list of the 1-based indices of the top documents, e.g. [1,3,5]."
    )
    resp = llm.invoke([HumanMessage(content=prompt)])
    reply = resp.content.strip()

    # Isolate the JSON array even if the model wrapped it in fences or prose.
    first_bracket, last_bracket = reply.find("["), reply.rfind("]")
    payload = reply[first_bracket:last_bracket + 1] if 0 <= first_bracket < last_bracket else reply

    try:
        indices = json.loads(payload)
    except (TypeError, ValueError):
        return docs[:top_k]  # Fallback to first k documents
    if not isinstance(indices, list):
        return docs[:top_k]

    chosen: List[str] = []
    used = set()
    for raw_index in indices:
        # Accept ints, integral floats and digit strings; skip everything else.
        if isinstance(raw_index, bool):
            continue
        if isinstance(raw_index, str) and raw_index.strip().isdigit():
            index = int(raw_index.strip())
        elif isinstance(raw_index, float) and raw_index.is_integer():
            index = int(raw_index)
        elif isinstance(raw_index, int):
            index = raw_index
        else:
            continue
        if 1 <= index <= len(docs) and index not in used:
            used.add(index)
            chosen.append(docs[index - 1])
            if len(chosen) == top_k:
                break

    # Indices were supplied but none was usable: treat as a failed rerank.
    if indices and not chosen:
        return docs[:top_k]
    return chosen

In [None]:
# === LANGGRAPH-BASED HYBRID RAG AGENT ===
# This implementation uses LangGraph to create a stateful workflow
# that manages the complex decision-making process of the hybrid system

from langgraph.graph import StateGraph, START, END

# === STATE MANAGEMENT ===
# Define the state schema that tracks information through the workflow
class State(TypedDict):
    """
    Schema of the dictionary that is passed into, and returned from, the
    agent node of the LangGraph workflow.

    Attributes:
        query: Original user question
        cot_questions: Sub-questions from chain-of-thought decomposition
        docs: Retrieved document strings used for answer generation
        answer: Final generated answer; the literal "NO_ANSWER" marks a run in
            which no strategy reached the confidence threshold
        direct: Strategy indicator (0=initial value set by run_query,
            1=direct, 2=CoT, 3=failed)
    """
    query: str
    cot_questions: List[str]
    docs: List[str]
    answer: str
    direct: int  

# === CORE RETRIEVAL TOOL ===
def retriever_tool(query: str, use_web: bool = True) -> str:
    """
    Unified retrieval: local vector search first, optional web fallback.

    Strategy:
    1. Vector-search the local knowledge base with the query as given.
    2. While the relevance score is below the threshold, rewrite the search
       query (up to 3 times) and search again, keeping the best-scoring
       attempt rather than simply the last one.
    3. If still below the threshold and `use_web` is true, do the same with
       web search, starting again from the original question.
    4. Return the web results only if they score strictly higher than the
       best local results.

    Every attempt is scored against the ORIGINAL `query`, so scores from
    different rewrites and from both sources are comparable.

    Args:
        query: The question to retrieve context for.
        use_web: Whether to allow the web search fallback.

    Returns:
        The chosen documents joined with newlines (empty string if none).
    """
    confidence_threshold = 0.6
    max_rewrites = 3
    original_query = query

    # === PHASE 1: VECTOR SEARCH WITH QUERY OPTIMIZATION ===
    search_query = original_query
    best_docs = vector_search(search_query)
    best_score = score_docs(best_docs, original_query)
    rewrites = 0

    while best_score < confidence_threshold and rewrites < max_rewrites:
        # Each rewrite builds on the previous one so successive attempts differ.
        search_query = rewrite_query(search_query)
        candidate_docs = vector_search(search_query)
        candidate_score = score_docs(candidate_docs, original_query)
        # A worse rewrite must not replace a better earlier result.
        if candidate_score > best_score:
            best_docs, best_score = candidate_docs, candidate_score
        rewrites += 1

    # === PHASE 2: WEB SEARCH FALLBACK (if enabled and needed) ===
    if use_web and best_score < confidence_threshold:
        web_query = original_query
        best_web_docs = web_search(web_query)
        best_web_score = score_docs(best_web_docs, original_query)
        web_rewrites = 0

        while best_web_score < confidence_threshold and web_rewrites < max_rewrites:
            web_query = rewrite_web_query(web_query)
            candidate_docs = web_search(web_query)
            candidate_score = score_docs(candidate_docs, original_query)
            if candidate_score > best_web_score:
                best_web_docs, best_web_score = candidate_docs, candidate_score
            web_rewrites += 1

        # Use web results only if they beat the best local results.
        if best_web_score > best_score:
            best_docs = best_web_docs

    return "\n".join(best_docs)

# === MAIN AGENT NODE ===
def agent_node(state: State) -> State:
    """
    Main agent workflow implementing the complete Hybrid RAG strategy.

    Workflow:
    1. Direct retrieval from the local knowledge base only (no web fallback);
       accept the synthesized answer if its confidence reaches the threshold.
    2. Otherwise decompose the query into sub-questions (Chain-of-Thought).
    3. Retrieve for each sub-question with the full pipeline (web allowed),
       rerank, and pool the results without duplicates.
    4. Synthesize a final answer from the pooled context and grade it.

    Args:
        state: Workflow state; only `state["query"]` is read.

    Returns:
        A copy of `state` with `docs`, `answer` and `direct` updated
        (1=direct, 2=CoT, 3=failed with answer "NO_ANSWER" and empty docs).
        On the CoT path `cot_questions` holds the generated sub-questions.
    """
    confidence_threshold = 0.6

    # === STRATEGY 1: DIRECT RETRIEVAL ===
    print("Attempting direct retrieval...")

    # Get initial results using only local knowledge (no web fallback)
    raw = retriever_tool(state["query"], use_web=False)
    # Blank lines carry no information; drop them from the recorded docs.
    initial_docs = [line for line in raw.split("\n") if line.strip()]

    # Pass the text itself: handing over the list would put a Python list
    # repr into the prompt instead of the documents.
    initial_answer = synthesize_answer(raw, state["query"])
    conf_initial = score_answer(initial_answer, state["query"])

    # If direct retrieval is confident enough, use it
    if conf_initial >= confidence_threshold:
        print(f"Direct retrieval succeeded with confidence {conf_initial:.2f}")
        return {**state, "docs": initial_docs, "answer": initial_answer, "direct": 1}

    # === STRATEGY 2: CHAIN-OF-THOUGHT DECOMPOSITION ===
    print("Direct retrieval insufficient, trying Chain-of-Thought...")

    # Break complex query into simpler sub-questions
    qs = break_into_cot_questions(state["query"])
    all_docs: List[str] = []
    seen_docs = set()

    # Process each sub-question individually
    for i, q in enumerate(qs):
        print(f"  Processing sub-question {i+1}/{len(qs)}: {q}")

        # Use full retrieval pipeline (including web search) for sub-questions
        part = retriever_tool(q, use_web=True)
        chunk_docs = [line for line in part.split("\n") if line.strip()]

        # Select the best documents for this sub-question (skip when empty)
        if chunk_docs:
            chunk_docs = rerank_docs(chunk_docs, q, top_k=5)

        # Sub-questions often retrieve the same passage; keep one copy.
        for doc in chunk_docs:
            if doc not in seen_docs:
                seen_docs.add(doc)
                all_docs.append(doc)

    # === FINAL SYNTHESIS ===
    # Combine all retrieved documents into comprehensive context
    context = "\n".join(all_docs)
    final_answer = synthesize_answer(context, state["query"])
    conf_final = score_answer(final_answer, state["query"])

    # Check if final answer meets minimum confidence threshold
    if conf_final < confidence_threshold:
        print("All strategies failed to meet confidence threshold")
        return {**state, "cot_questions": qs, "docs": [], "answer": "NO_ANSWER", "direct": 3}

    print(f"Chain-of-Thought succeeded with confidence {conf_final:.2f}")
    return {**state, "cot_questions": qs, "docs": all_docs, "answer": final_answer, "direct": 2}

# === GRAPH CONSTRUCTION ===
# Build the LangGraph workflow: a single "agent" node wired START -> agent -> END.
print("Building Hybrid RAG workflow graph...")

graph = StateGraph(State)
graph.add_node("agent", agent_node)
# Make both ends of the workflow explicit instead of relying on a node with
# no outgoing edge being treated as the finish point.
graph.add_edge(START, "agent")
graph.add_edge("agent", END)

In [None]:
# === COMPILE AND TEST THE HYBRID RAG GRAPH ===
# Compile the LangGraph workflow into an executable application
app = graph.compile()

# Test the system with a sample UCSC-specific query
print("Testing Hybrid RAG with sample query...")

# Every key declared on State is supplied, including `direct`, so the initial
# state matches the schema and the one built by run_query.
init_state = {
    "query": "What classes does UCSC Extension Silicon Valley Campus provide?",
    "cot_questions": [],  # Sub-questions, if the CoT strategy is used
    "docs": [],           # Will contain retrieved documents
    "answer": "",         # Will contain final generated answer
    "direct": 0           # Strategy indicator; overwritten by the agent node
}

# Execute the workflow and get results
result = app.invoke(init_state)
print("Final Answer:")
print(result["answer"])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  docs: List[Document] = retriever.get_relevant_documents(query)


UCSC Extension Silicon Valley Campus provides UC-approved courses, professional certificate programs, skills-based intensives, and boot camps to students and professionals at every stage of their careers.


In [None]:
# Check which strategy produced the answer stored in `result`
# 1 = Direct retrieval succeeded
# 2 = Chain-of-Thought strategy succeeded  
# 3 = All strategies failed
# The value is shown twice: once by print() and once as the cell's output,
# because the bare expression on the last line is displayed as well.
print(f"Strategy used: {result['direct']}")
result["direct"]

1

In [None]:
# === COMPREHENSIVE EVALUATION SETUP ===
# Set up RAGAS (RAG Assessment) framework for systematic evaluation
# This provides standardized metrics for RAG system performance

import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.run_config import RunConfig
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision

# === EVALUATION DATA STORAGE ===
# Module-level accumulators filled by the batch-evaluation loop.
# They are appended to in place, so re-running that loop without re-running
# this cell adds to whatever is already stored.
question_list: List[str] = []   # User queries
gen_ans_list: List[str]  = []   # Generated answers from Hybrid RAG
searched_docs: List[List[str]] = []   # Retrieved documents used for answers (one list per query)
src_ans_list: List[str]  = []   # Ground truth answers (if available)

def run_query(query: str) -> dict:
    """
    Run one question through the compiled Hybrid RAG workflow.

    A fresh state is created for every call (empty sub-questions, documents
    and answer, strategy indicator 0) and handed to the module-level `app`.

    Args:
        query: User question to process

    Returns:
        Whatever `app.invoke` returns for that state: the final state
        dictionary with answer, docs and strategy indicator
        (1=direct, 2=CoT, 3=failed).
    """
    fresh_state = dict(
        query=query,
        cot_questions=[],   # filled during CoT if needed
        docs=[],            # retrieved context documents
        answer="",          # final generated answer
        direct=0,           # strategy indicator, not yet decided
    )
    outcome = app.invoke(fresh_state)
    return outcome

In [None]:
# === BATCH EVALUATION PROCESS ===
# Process the complete evaluation dataset through Hybrid RAG
# This systematically tests the system on multiple queries

print("Starting batch evaluation of Hybrid RAG system...")

# Load evaluation dataset with questions and ground truth answers
df = pd.read_csv("new_qa.csv")  # expects columns "questions" and "answers"

# Start from empty accumulators so re-running this cell does not append a
# second copy of every result.
for accumulator in (question_list, gen_ans_list, searched_docs, src_ans_list):
    accumulator.clear()

# Process each question in the dataset
for idx, row in df.iterrows():
    q = row["questions"]         # User query
    a = row["answers"]           # Ground truth answer
    print(f"Processing #{idx+1}: {q}")

    # Execute the query through our Hybrid RAG system
    final_state = run_query(q)

    # Skip questions that couldn't be answered. Nothing is recorded for them,
    # so all four lists stay index-aligned.
    if final_state["answer"] == "NO_ANSWER":
        print(f" No answer generated for query #{idx+1}")
        continue

    # Collect successful results for evaluation. The ground truth is stored
    # here, not before the skip, so it always pairs with a generated answer.
    question_list.append(q)
    gen_ans_list.append(final_state["answer"])
    searched_docs.append(final_state["docs"])
    src_ans_list.append(a)
    print(f" Answer generated using strategy {final_state['direct']}")

print(f"\n Evaluation complete: {len(gen_ans_list)} answers generated from {len(df)} questions")

Processing #1: Who can be a member of the Boating Club?
Processing #2: What classes does the boating center offer to community members?
Processing #3: When is the boating club open?
Processing #4: What boats are available to members?
Processing #5: How much does a boating membership cost?
Processing #6: How do I sign up for a boating membership?
Processing #7: How can I join the Boating Club?
Processing #8: Are there additional fees to take out the boats after I join the boating club?
Processing #9: Do the boating club rent Kayaks?
Processing #10: How can I get involved with the Boating Club?
Processing #11: Why is health insurance required at UCSC?
Processing #12: How do I get UC SHIP?
Processing #13: How do I get my UC SHIP ID card?
Processing #14: Am I eligible if I opted out of UC SHIP last year?
Processing #15: Am I eligible for UC SHIP if I've graduated? 
Processing #16: How much does UC SHIP cost?
Processing #17: Will financial aid pay for my UC SHIP?
Processing #18: Will Medi-C

In [None]:
# Inspect a specific question from the dataset for analysis
# This helps understand the types of queries being evaluated
# Note: this re-reads the CSV and rebinds `df`; label 43 is the 44th row
# because read_csv is called without an index column.
df = pd.read_csv("new_qa.csv")
print("Sample question #44:")
df["questions"][43]

'What classes does UCSC Extension Silicon Valley provide?'

In [None]:
# === EXPORT EVALUATION RESULTS ===
# Persist the answered questions, the generated answers and their supporting
# documents so they can be compared with other RAG approaches later.

print("Exporting evaluation results...")

export_columns = {
    "question": question_list,          # queries that received answers
    "generated_answer": gen_ans_list,   # Hybrid RAG responses
    "supporting_docs": searched_docs,   # documents used for each answer
}
df_out = pd.DataFrame(export_columns)

# Written without the index column.
output_csv = "HybridRAG_generated_outputsFinal.csv"
df_out.to_csv(output_csv, index=False)
print(f"Exported {len(df_out)} results to {output_csv}")

In [None]:
# Report the size of each evaluation list before building the RAGAS dataset.
# The four lists are later used as parallel columns, so their lengths need to
# match; this cell only prints the lengths, it does not enforce equality.
print("Data consistency check:")
print(f"Generated answers: {len(gen_ans_list)}")
print(f"Questions: {len(question_list)}")
print(f"Supporting docs: {len(searched_docs)}")
print(f"Ground truth answers: {len(src_ans_list)}")

# Same four lengths as a tuple, shown as the cell's output.
len(gen_ans_list), len(question_list), len(searched_docs), len(src_ans_list)

(107, 107, 107, 112)

In [None]:
# === ALIGN GROUND TRUTH WITH GENERATED ANSWERS ===
# Rebuild the ground-truth list so it contains exactly one entry per answered
# question, in the same order as `question_list`.

print("Aligning ground truth answers with successful generations...")

# Lookup table: question text -> ground truth answer.
# If a question text occurs more than once, the last row wins.
gt_map = {
    question_text: reference_answer
    for question_text, reference_answer in zip(df["questions"], df["answers"])
}

# One lookup per answered question keeps the order RAGAS will see.
src_ans_list = list(map(gt_map.__getitem__, question_list))

print(f"Aligned {len(src_ans_list)} ground truth answers with generated answers")

In [None]:
# Final verification: all data must now be properly aligned.
# The check is enforced, not just printed, so a misalignment stops the
# notebook here instead of surfacing later inside the RAGAS evaluation.
print("Final alignment verification:")
print(f"All lists should be equal: {len(gen_ans_list)} = {len(question_list)} = {len(searched_docs)} = {len(src_ans_list)}")

list_lengths = (len(gen_ans_list), len(question_list), len(searched_docs), len(src_ans_list))
assert len(set(list_lengths)) == 1, f"Evaluation lists are misaligned: {list_lengths}"

list_lengths

(107, 107, 107, 107)

In [None]:
# === CREATE RAGAS EVALUATION DATASET ===
# Build a Hugging Face `Dataset` with one row per answered question.
# The four lists are used as parallel columns and must have equal length.
# NOTE(review): the column names mix two conventions ("question", "answer",
# "ground_truth" next to "retrieved_contexts"). The evaluation output further
# down shows a `user_input` column, so the installed RAGAS appears to rename
# the older names itself - confirm against the installed RAGAS version.

print("Creating RAGAS evaluation dataset...")

ragas_dataset = Dataset.from_dict({
    "question":           question_list,    # User queries
    "ground_truth":       src_ans_list,     # Expected correct answers
    "answer":             gen_ans_list,     # Hybrid RAG generated answers
    "retrieved_contexts": searched_docs     # Supporting documents used (list of strings per row)
})

print(f"RAGAS dataset created with {len(question_list)} examples")
print("Dataset ready for comprehensive evaluation with multiple metrics")

In [None]:
# === COMPREHENSIVE RAGAS EVALUATION ===
# Score every row of `ragas_dataset` with four RAGAS metrics and show the
# per-row results as a DataFrame.

print("Starting comprehensive RAGAS evaluation...")

# Define evaluation metrics:
# - faithfulness: How well answers are grounded in retrieved context
# - answer_relevancy: How relevant answers are to the questions
# - context_recall: How well retrieval captures relevant information
# - context_precision: How precise the retrieved context is
metrics = [faithfulness, answer_relevancy, context_recall, context_precision]

# Configure evaluation settings for robust results.
# Only max_retries is overridden; the commented-out options are left at
# whatever RunConfig uses by default.
run_cfg = RunConfig(
    # timeout=60,        # Maximum time per evaluation
    max_retries=10       # Retry failed evaluations for robustness
    # max_wait=180,      # Maximum wait time
    # max_workers=2      # Parallel evaluation workers
)

print("Running evaluation (this may take several minutes)...")

# Execute the comprehensive evaluation.
# The recorded progress bar counts 428 steps, i.e. 107 rows x 4 metrics.
results = evaluate(
    dataset=ragas_dataset,
    metrics=metrics,
    run_config=run_cfg,
    raise_exceptions=False  # Continue evaluation even if some items fail
)

# Display detailed results (print() shows the truncated plain-text form)
results_df = results.to_pandas()
print("\nDetailed Evaluation Results:")
print(results_df)

Evaluating: 100%|█████████████████████████████| 428/428 [05:22<00:00,  1.33it/s]


                                            user_input  \
0             Who can be a member of the Boating Club?   
1    What classes does the boating center offer to ...   
2                       When is the boating club open?   
3                 What boats are available to members?   
4             How much does a boating membership cost?   
..                                                 ...   
102  What if I don't fulfill the general education ...   
103                                    What is UC TAP?   
104              When are acceptance notices sent out?   
105  What are the Cross-Campus and Simultaneous Enr...   
106    Can I talk to an adviser during a campus visit?   

                                    retrieved_contexts  \
0    [How can I join the Boating Club?, The boating...   
1    [UC Santa Cruz Community Boating Center is a c...   
2    [Boating Club Hours, The Community Boating Cen...   
3    [Our boats:, Sailing vessels for weekend use c...   
4    [Cost, C

In [None]:
# === CALCULATE AND DISPLAY AVERAGE PERFORMANCE ===
# Average each RAGAS metric over all evaluated rows and print a short report.

print("Computing average performance metrics...")

# Columns of results_df that hold the per-row metric scores.
metric_columns = [
    "faithfulness",        # grounding in retrieved context
    "answer_relevancy",    # relevance to user questions
    "context_recall",      # retrieval completeness
    "context_precision",   # retrieval accuracy
]
avg_scores = results_df[metric_columns].mean()

divider = "=" * 40
print("\nHybrid RAG Average Performance:")
print(divider)
for metric, score in avg_scores.items():
    print(f"{metric:20s}: {score:.3f}")
print(divider)

print("\nMetric Interpretation:")
for explanation in (
    "• Faithfulness: How well answers stick to retrieved facts",
    "• Answer Relevancy: How well answers address the questions",
    "• Context Recall: How completely relevant info is retrieved",
    "• Context Precision: How accurately irrelevant info is filtered",
):
    print(explanation)

# Leave the Series as the cell's displayed value.
avg_scores

Average Scores:
faithfulness         0.881475
answer_relevancy     0.827332
context_recall       0.535358
context_precision    0.468474
dtype: float64
