In [1]:
"""
Evaluating Agents with RAGAS - AI Makerspace

This script demonstrates:
1. Setting up RAGAS for agent evaluation
2. Creating synthetic test data
3. Evaluating RAG chains with different metrics
4. Comparing agent performance

Based on the AI Makerspace notebook for evaluating RAG systems.
"""

import os
import sys
import getpass
from uuid import uuid4
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Add backend to path for RAG system integration
backend_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'backend'))

if backend_path not in sys.path:
    sys.path.insert(0, backend_path)

# Set up environment variables
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY", "")
os.environ["LANGCHAIN_PROJECT"] = f"RAGAS-Evaluation-{uuid4().hex[0:8]}"

In [None]:
 
def install_dependencies():
    """Install required dependencies"""
    print("Installing dependencies...")
    # Note: In a real environment, these would be installed via pip
    # !pip install -qU ragas==0.2.10
    # !pip install -qU langchain-community==0.3.14 langchain-openai==0.2.14
    # !pip install -qU unstructured==0.16.12 langgraph==0.2.61 langchain-qdrant==0.2.0
    print("Dependencies would be installed here")

def setup_ragas_components():
    """Set up RAGAS components for evaluation using Ollama"""
    from ragas.llms import LangchainLLMWrapper
    from ragas.embeddings import LangchainEmbeddingsWrapper
    from langchain_community.llms import Ollama
    from langchain_ollama import OllamaEmbeddings
    
    # Use Ollama models from your current project - same as RAG system
    #generator_llm = LangchainLLMWrapper(Ollama(model="qwen:latest"))
    from langchain_openai import ChatOpenAI
    from langchain_openai import OpenAIEmbeddings
    generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-nano"))
    generator_embeddings = LangchainEmbeddingsWrapper(OllamaEmbeddings(model="nomic-embed-text:latest"))
    
    print("✅ RAGAS components configured to use same models as RAG system")
    print(f"   - LLM: qwen:latest")
    print(f"   - Embeddings: nomic-embed-text:latest")
    
    return generator_llm, generator_embeddings

In [3]:
def load_documents():
    """Load documents using the existing RAG system"""
    from rag_system import NPTERAGSystem
    
    # Initialize the RAG system (this will connect to existing Qdrant collection)
    # The NPTERAGSystem already uses the correct path "./qdrant_data" relative to backend
    rag_system = NPTERAGSystem()
    print("✅ RAG system initialized with existing Qdrant collection")
    rag_system.setup_collection()
    
    # Check if documents are already in the vector store
    try:
        # Try to retrieve some documents to verify the vector store is populated
        test_docs = rag_system.retrieve_relevant_context("therapy", k=5)
        if test_docs:
            print(f"✅ Connected to existing vector store with {len(test_docs)} sample documents")
            print("Using pre-embedded documents from Qdrant collection")
            return rag_system
        else:
            print("⚠️ Vector store appears to be empty, but continuing...")
            return rag_system
    except Exception as e:
        print(f"⚠️ Error checking vector store: {e}")
        print("Continuing with RAG system...")
        return rag_system

In [4]:
# ============================================================================
# Synthetic Data Generation
# ============================================================================

def create_knowledge_graph(docs, generator_llm, generator_embeddings):
    """Create knowledge graph from documents"""
    from ragas.testset.graph import KnowledgeGraph, Node, NodeType
    
    print("Creating knowledge graph...")
    kg = KnowledgeGraph()
    
    # Use a subset of data for cost/time efficiency
    for doc in docs[:20]:
        kg.nodes.append(
            Node(
                type=NodeType.DOCUMENT,
                properties={"page_content": doc.page_content, "document_metadata": doc.metadata}
            )
        )
    
    print(f"Added {len(kg.nodes)} nodes to knowledge graph")
    return kg

def apply_transformations(kg, docs, generator_llm, generator_embeddings):
    """Apply default transformations to knowledge graph"""
    from ragas.testset.transforms import default_transforms, apply_transforms
    
    print("Applying transformations...")
    transformer_llm = generator_llm
    embedding_model = generator_embeddings
    
    default_transforms = default_transforms(documents=docs, llm=transformer_llm, embedding_model=embedding_model)
    apply_transforms(kg, default_transforms)
    
    return kg

def generate_synthetic_data(docs, generator_llm, generator_embeddings, testset_size=10):
    """Generate synthetic data using RAGAS"""
    from ragas.testset import TestsetGenerator
    
    print("Generating synthetic data using RAGAS...")
    generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
    
    # Use a smaller subset and smaller test size to avoid hanging
    dataset = generator.generate_with_langchain_docs(docs[:5], testset_size=testset_size)
    return dataset


In [None]:
def setup_basic_rag_chain(rag_system):
    """Set up RAG chain that matches the existing system for RAGAS evaluation"""
    from langchain.prompts import ChatPromptTemplate
    from langchain_openai import ChatOpenAI
    from langchain_core.runnables import RunnablePassthrough
    from langchain.schema import StrOutputParser
    from operator import itemgetter
    
    print("Setting up RAG chain for evaluation...")
    
    # Use the existing retriever from RAG system
    retriever = rag_system.retriever
    
    # Create a prompt that matches your system's style
    RAG_PROMPT = """\
You are an NPTE-PT exam tutor assistant. Answer the question based ONLY on the provided context.

If you cannot answer the question based on the context, say "I don't have enough information to answer this question."

Context: {context}
Question: {question}

Answer:"""

    rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
    
    # Create LLM using the same model as your system
    llm = ChatOpenAI(model="gpt-4.1-nano")
    
    # Create RAG chain
    rag_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | rag_prompt | llm | StrOutputParser()
    )
    
    return rag_chain

In [40]:
def setup_improved_rag_chain(rag_system):
    """Set up improved RAG chain with multi-query retriever - FIXED VERSION"""
    from langchain.prompts import ChatPromptTemplate
    from langchain_openai import ChatOpenAI
    from langchain_core.runnables import RunnablePassthrough
    from langchain.schema import StrOutputParser
    from operator import itemgetter
    from langchain.retrievers import MultiQueryRetriever
    
    print("Setting up improved RAG chain with multi-query retriever...")
    
    # Use the existing retriever from RAG system as base
    base_retriever = rag_system.retriever
    
    # Create LLM for query generation
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
    
    # Create multi-query retriever
    multi_query_retriever = MultiQueryRetriever.from_llm(
        retriever=base_retriever,
        llm=llm
    )
    
    print("✅ Multi-query retriever created successfully")
    
    # Create an improved prompt
    IMPROVED_RAG_PROMPT = """\
You are an expert NPTE-PT exam tutor assistant. Answer the question based ONLY on the provided context.

IMPORTANT INSTRUCTIONS:
1. Use ONLY the information provided in the context below
2. If the context doesn't contain enough information to answer the question, say "I don't have enough information to answer this question based on the provided context."
3. Do NOT add any information that is not in the context
4. Be specific and direct in your answer
5. If the context contains multiple relevant pieces of information, synthesize them clearly

Context Information:
{context}

Question: {question}

Answer (based only on the context above):"""

    improved_prompt = ChatPromptTemplate.from_template(IMPROVED_RAG_PROMPT)
    
    # Create improved RAG chain with multi-query retriever
    improved_rag_chain = (
        {
            "context": itemgetter("question") | multi_query_retriever,  # ✅ FIXED: using multi_query_retriever
            "question": itemgetter("question")
        }
        | improved_prompt | llm | StrOutputParser()
    )
    
    return improved_rag_chain

In [None]:
def setup_evaluators():
    """Set up evaluation components with specific RAGAS metrics"""
    from ragas.metrics import (
        answer_relevancy,
        context_precision,
        context_recall,
        faithfulness
    )
    from ragas.llms import LangchainLLMWrapper
    from langchain_openai import ChatOpenAI
    
    print("Setting up RAGAS evaluators with updated metrics...")
    
    # Create LLM for evaluation
    eval_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
    
    # RAGAS evaluators with the specific metrics you requested
    evaluators = [
        answer_relevancy,
        context_precision,     # How precise the retrieved context is
        context_recall,        # How much relevant context was retrieved
        faithfulness          # How faithful the response is to the context
    ]
    
    print("✅ RAGAS evaluators configured with:")
    print("   - response_relevancy: Measures how relevant the response is to the question")
    print("   - context_precision: Measures how precise the retrieved context is")
    print("   - context_recall: Measures how much relevant context was retrieved")
    print("   - faithfulness: Measures how faithful the response is to the context")
    
    return evaluators

In [30]:
def evaluate_rag_chain(rag_chain, dataset, evaluators):
    """Evaluate RAG chain using RAGAS 0.3.0 - FIXED VERSION"""
    from ragas import evaluate
    from langchain_openai import ChatOpenAI
    from datasets import Dataset
    import pandas as pd
    
    print("Evaluating RAG chain with RAGAS 0.3.0...")
    
    # Convert RAGAS TestsetSample to a proper evaluation dataset
    df = dataset.to_pandas()
    
    # Create a new dataset with the required format for RAGAS evaluation
    # We need to run the RAG chain on the questions to get answers
    print("Running RAG chain on test questions...")
    
    questions = []
    ground_truths = []
    contexts = []
    
    for idx, row in df.iterrows():
        question = row.get('user_input', row.get('question', ''))
        ground_truth = row.get('reference', row.get('answer', ''))
        context = row.get('reference_contexts', [])
        
        if question and ground_truth:
            questions.append(question)
            ground_truths.append(ground_truth)
            contexts.append(context)
    
    # Run the RAG chain to get predictions
    predictions = []
    for question in questions:
        try:
            answer = rag_chain.invoke({"question": question})
            predictions.append(answer)
        except Exception as e:
            print(f"Error running RAG chain on question: {e}")
            predictions.append("Error generating answer")
    
    # Create the evaluation dataset
    eval_data = {
        "question": questions,
        "answer": predictions,
        "ground_truth": ground_truths,
        "contexts": contexts
    }
    
    eval_dataset = Dataset.from_dict(eval_data)
    print(f"Created evaluation dataset with {len(eval_dataset)} examples")
    
    # For RAGAS 0.3.0, we need to pass the LLM and embeddings separately
    eval_llm = ChatOpenAI(model="gpt-4.1-nano")
    
    # The correct order for RAGAS 0.3.0 is: evaluate(dataset, metrics, llm, embeddings, ...)
    result = evaluate(
        eval_dataset,      # dataset comes first
        evaluators,        # metrics come second
        llm=eval_llm,      # llm parameter
        # embeddings parameter is optional and will use default if not provided
    )
    return result

In [23]:
def create_langsmith_dataset(dataset, dataset_name="NPTE Evaluation Data"):
    """Create LangSmith dataset from RAGAS dataset - FIXED VERSION"""
    from langsmith import Client
    
    client = Client()
    
    # Check if dataset already exists
    try:
        existing_datasets = client.list_datasets()
        existing_dataset = None
        
        for ds in existing_datasets:
            if ds.name == dataset_name:
                existing_dataset = ds
                break
        
        if existing_dataset:
            print(f"✅ Dataset '{dataset_name}' already exists, using existing dataset")
            langsmith_dataset = existing_dataset
        else:
            print(f"Creating new LangSmith dataset: {dataset_name}")
            langsmith_dataset = client.create_dataset(
                dataset_name=dataset_name,
                description="NPTE Evaluation Data"
            )
            print(f"✅ Created new dataset with ID: {langsmith_dataset.id}")
    
    except Exception as e:
        print(f"❌ Error checking/creating dataset: {e}")
        print("Creating new dataset anyway...")
        langsmith_dataset = client.create_dataset(
            dataset_name=dataset_name,
            description="NPTE Evaluation Data"
        )
    
    # First, let's check what columns are actually available
    df = dataset.to_pandas()
    print(f"Available columns: {list(df.columns)}")
    print(f"Dataset shape: {df.shape}")
    
    # Check if dataset already has examples
    try:
        existing_examples = list(client.list_examples(dataset_id=langsmith_dataset.id))
        if existing_examples:
            print(f"⚠️ Dataset already has {len(existing_examples)} examples")
            print("Skipping example creation to avoid duplicates")
            return langsmith_dataset
    except Exception as e:
        print(f"⚠️ Could not check existing examples: {e}")
    
    # Add examples to dataset with proper error handling
    examples_added = 0
    for idx, data_row in df.iterrows():
        try:
            # Handle different possible column names (including RAGAS specific ones)
            question = None
            answer = None
            context = None
            
            # Try different possible column names for question
            for col in ['question', 'questions', 'query', 'queries', 'user_input']:
                if col in data_row:
                    question = data_row[col]
                    break
            
            # Try different possible column names for answer
            for col in ['answer', 'answers', 'response', 'responses', 'reference']:
                if col in data_row:
                    answer = data_row[col]
                    break
            
            # Try different possible column names for context
            for col in ['context', 'contexts', 'documents', 'docs', 'reference_contexts']:
                if col in data_row:
                    context = data_row[col]
                    break
            
            # If we found the required fields, create the example
            if question and answer:
                client.create_example(
                    inputs={"question": question},
                    outputs={"answer": answer},
                    metadata={"context": context} if context else {},
                    dataset_id=langsmith_dataset.id
                )
                examples_added += 1
                print(f"✅ Added example {idx + 1}")
            else:
                print(f"⚠️ Skipping example {idx + 1} - missing required fields")
                print(f"   Question found: {question is not None}")
                print(f"   Answer found: {answer is not None}")
                
        except Exception as e:
            print(f"❌ Error adding example {idx + 1}: {e}")
            continue
    
    if examples_added > 0:
        print(f"✅ Successfully added {examples_added} examples to LangSmith dataset")
    else:
        print("⚠️ No examples were added to the dataset")
    
    return langsmith_dataset

In [10]:

    # Step 1: Load documents and RAG system
print("\n1. Loading documents and RAG system...")
rag_system = load_documents()
    
    # Verify we're using existing embeddings
print(f"✅ Using existing Qdrant collection: {rag_system.collection_name}")
print(f"✅ Using existing embeddings model: nomic-embed-text:latest")
print(f"✅ Vector store path: ../backend/qdrant_data")


1. Loading documents and RAG system...


INFO:rag_system:Collection npte_materials already exists


✅ RAG system initialized with existing Qdrant collection


  self.vector_store = Qdrant(
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:rag_system:Retrieved 5 relevant documents for query: therapy


✅ Connected to existing vector store with 5 sample documents
Using pre-embedded documents from Qdrant collection
✅ Using existing Qdrant collection: npte_materials
✅ Using existing embeddings model: nomic-embed-text:latest
✅ Vector store path: ../backend/qdrant_data


In [11]:

# Step 2: Set up RAGAS components
print("\n2. Setting up RAGAS components...")
generator_llm, generator_embeddings = setup_ragas_components()
    


2. Setting up RAGAS components...
✅ RAGAS components configured to use same models as RAG system
   - LLM: qwen:latest
   - Embeddings: nomic-embed-text:latest


In [12]:
# Step 3: Retrieve documents from existing vector store
print("\n3. Retrieving documents from existing vector store...")
docs = rag_system.retrieve_relevant_context("therapy", k=20)
print(f"Retrieved {len(docs)} documents from existing vector store")

if len(docs) == 0:
    print("❌ No documents found in vector store. Please ensure documents are loaded.")
    print("💡 Try running: cd ../backend && python check_qdrant.py")
    sys.exit(0)

print(f"✅ Successfully using existing embeddings with {len(docs)} documents")
    
    # Save documents to JSON file for inspection
import json
docs_as_dicts = [{"content": doc.page_content, "metadata": doc.metadata} for doc in docs]

with open("docs_dump.json", "w") as f:
    json.dump(docs_as_dicts, f, indent=2)
print(f"✅ Saved {len(docs)} documents to docs_dump.json")
    

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:rag_system:Retrieved 20 relevant documents for query: therapy



3. Retrieving documents from existing vector store...
Retrieved 20 documents from existing vector store
✅ Successfully using existing embeddings with 20 documents
✅ Saved 20 documents to docs_dump.json


In [13]:

# Step 4: Create knowledge graph
print("\n4. Creating knowledge graph...")
kg = create_knowledge_graph(docs, generator_llm, generator_embeddings)


4. Creating knowledge graph...
Creating knowledge graph...
Added 20 nodes to knowledge graph


In [14]:
# Step 5: Apply transformations (skipping for now due to performance issues)
print("\n5. applying transformations ...")
kg = apply_transformations(kg, docs, generator_llm, generator_embeddings)
print("✅ Knowledge graph ready for synthetic data generation")


5. applying transformations ...
Applying transformations...


Applying SummaryExtractor:   0%|          | 0/20 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

Applying CustomNodeFilter:   0%|          | 0/20 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/60 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Knowledge graph ready for synthetic data generation


In [15]:
# Step 6: Generate synthetic data
print("\n6. Generating synthetic data...")
dataset = generate_synthetic_data(docs, generator_llm, generator_embeddings, testset_size=2)
print(f"✅ Generated {len(dataset.to_pandas())} synthetic test examples")


6. Generating synthetic data...
Generating synthetic data using RAGAS...


Applying SummaryExtractor:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Applying CustomNodeFilter:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/15 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:ragas.testset.synthesizers.multi_hop.abstract:found 20 clusters
INFO:ragas.testset.synthesizers.multi_hop.specific:found 1 clusters


Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:ragas.testset.synthesizers.multi_hop.abstract:found 20 clusters
INFO:ragas.testset.synthesizers.multi_hop.specific:found 1 clusters
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generating Samples:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


✅ Generated 3 synthetic test examples


In [24]:
# Step 7: Create LangSmith dataset
print("\n7. Creating LangSmith dataset...")
dataset_name = "NPTE Evaluation Data"
langsmith_dataset = create_langsmith_dataset(dataset, dataset_name)



7. Creating LangSmith dataset...
Creating new LangSmith dataset: NPTE Evaluation Data
✅ Created new dataset with ID: f4739ba4-c31a-43b0-8928-601290f7684e
Available columns: ['user_input', 'reference_contexts', 'reference', 'synthesizer_name']
Dataset shape: (3, 4)
✅ Added example 1
✅ Added example 2
✅ Added example 3
✅ Successfully added 3 examples to LangSmith dataset


In [25]:

# Step 8: Set up RAG chains
print("\n8. Setting up RAG chains...")
basic_rag_chain = setup_basic_rag_chain(rag_system)
improved_rag_chain = setup_improved_rag_chain(rag_system)


8. Setting up RAG chains...
Setting up RAG chain for evaluation...
Setting up improved RAG chain for evaluation...


In [47]:
# Step 9: Set up evaluators
print("\n9. Setting up evaluators...")
evaluators = setup_evaluators()


9. Setting up evaluators...
Setting up RAGAS evaluators with updated metrics...
✅ RAGAS evaluators configured with:
   - response_relevancy: Measures how relevant the response is to the question
   - context_precision: Measures how precise the retrieved context is
   - context_recall: Measures how much relevant context was retrieved
   - faithfulness: Measures how faithful the response is to the context


In [None]:
# Step 10: Evaluate basic chain
print("\n10. Evaluating basic RAG chain...")
basic_results = evaluate_rag_chain(basic_rag_chain, dataset, evaluators)


10. Evaluating basic RAG chain...
Evaluating RAG chain with RAGAS 0.3.0...
Running RAG chain on test questions...


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Created evaluation dataset with 3 examples


AttributeError: 'property' object has no attribute 'get'

In [41]:
# Step 11: Evaluate improved chain
print("\n11. Evaluating improved RAG chain...")
improved_results = evaluate_rag_chain(improved_rag_chain, dataset, evaluators)

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"



11. Evaluating improved RAG chain...
Evaluating RAG chain with RAGAS 0.3.0...
Running RAG chain on test questions...


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Created evaluation dataset with 3 examples


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:

In [43]:
# Step 12: Display results
print("\n" + "=" * 60)
print("EVALUATION RESULTS")
print("=" * 60)
print("Basic RAG Chain Results:")
print(basic_results)
print("\nImproved RAG Chain Results:")
print(improved_results)

print("\n" + "=" * 60)
print("EVALUATION COMPLETE")


EVALUATION RESULTS
Basic RAG Chain Results:
{'answer_relevancy': 0.0000, 'faithfulness': 0.0833, 'context_recall': 1.0000, 'context_precision': 0.8333}

Improved RAG Chain Results:
{'answer_relevancy': 0.0000, 'faithfulness': 0.0000, 'context_recall': 1.0000, 'context_precision': 0.8333}

EVALUATION COMPLETE
