### Chapter 5: RAG Evaluation

#### Load the OpenAI API key

In [1]:
from getpass import getpass
import os

# Prompt interactively for the OpenAI API key so the secret is never written into the notebook source.
OPENAI_API_KEY = getpass()

In [2]:
# Expose the key through the process environment as well: the DeepEval metrics
# created later in this notebook are not handed a key explicitly.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

### Recipe 5.1 String evaluator with an OpenAI model acting as a judge

##### Segment 1: Set up the variables and the LLM model

In [3]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
 
## Step 1: Define the components
# The three names below match the placeholders used by the judge prompt in Step 2:
#   query  -> the user question
#   result -> the RAG response being graded
#   answer -> the ground truth it is graded against
query = "What are the battery life specifications and charging capabilities of the Zonic Earbuds Pro, including quick charge features"
 
result = "Battery Life: 8 hours per charge + 24 hours from case (32 hours total). Charging: 1.5 hours (earbuds), 2 hours (case). Quick Charge: 10 minutes = 1 hour playback."
 
answer = """
The battery life specifications and charging capabilities of the Zonic Earbuds Pro are as follows:
 
- Battery Life (Earbuds): 8 hours
- Battery Life (Case): 24 hours (additional)
- Charging Time: 
  - Earbuds: 1.5 hours
  - Case: 2 hours
- Quick Charge: 10 minutes of charging provides 1 hour of playback.
 
"""
 
## Step 2: Define the prompt template with instruction
# from_template() works out the input variables ({query}, {result}, {answer}) from the
# placeholders in the template text, so they are not listed separately. The previous
# `input_variable=[...]` keyword was a misspelling that is not a PromptTemplate field
# and had no effect.
prompt_template = PromptTemplate.from_template(
    template = """You are an expert evaluator judging RAG system responses.
    Question: {query}
    RAG Response: {result}
    Ground Truth: {answer}
 
Evaluate based on accuracy, completeness, and hallucination.
 
Respond with CORRECT or INCORRECT:
Grade: 
 
"""
)
 
 
## Step 3: Specify the LLM
# The judge model; temperature is pinned to 0 for the grading call.
judge_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    api_key=OPENAI_API_KEY,
)


##### Segment 2: Instantiate the string evaluator

In [4]:
from langsmith.evaluation import LangChainStringEvaluator
 
## Step 4: Instantiate the evaluator
# Wrap the "qa" string evaluator, configured with the judge model and the
# grading prompt defined in the previous cell.
questionAnswer_evaluator = LangChainStringEvaluator(
    "qa",
    config={"llm": judge_llm, "prompt": prompt_template},
)

## Step 5: Call the evaluate string method
# prediction = the RAG response, reference = the ground truth, input = the user question
score = questionAnswer_evaluator.evaluator.evaluate_strings(
    prediction=result,
    reference=answer,
    input=query,
)

## Step 6: Print the score
# First only the values, then the full dict (keys: reasoning, value, score).
print(f"Score: {score.values()}")
print(f"Score: {score}")


Score: dict_values(['CORRECT\n\nGrade: The RAG response accurately reflects the battery life specifications and charging capabilities of the Zonic Earbuds Pro as provided in the ground truth. There are no inaccuracies or hallucinations, and all relevant details are included.', 'CORRECT', 1])
Score: {'reasoning': 'CORRECT\n\nGrade: The RAG response accurately reflects the battery life specifications and charging capabilities of the Zonic Earbuds Pro as provided in the ground truth. There are no inaccuracies or hallucinations, and all relevant details are included.', 'value': 'CORRECT', 'score': 1}


### Recipe 5.2 Getting started with evaluating faithfulness with DeepEval

#### Complete RAG pipeline from Chapter 4

In [30]:
##### Document loading for building the knowledge base #####
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
 
## Step 1: Define the path (a directory) that contains our files
path = "./rag_resources"

## Step 2: Instantiate the directory loader with the path defined above
# The glob matches PDFs in `path` and all of its sub-directories; each match is
# read with PyPDFLoader, which receives {"mode": "page"} as its keyword arguments.
dir_loader = DirectoryLoader(
    path,
    glob="**/*.pdf",
    loader_cls=PyPDFLoader,
    loader_kwargs={"mode": "page"},
    show_progress=True,
)

## Step 3: Invoke the load method to load the files into memory
dir_pdf_docs = dir_loader.load()

## Step 4: Define a text splitter that splits recursively through the documents
## loaded into memory
# Sizes are measured with len(), i.e. in characters: chunks of up to 200
# characters with a 20-character overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)

## Step 5: Split the documents using text_splitter
doc_chunks = text_splitter.split_documents(dir_pdf_docs)


100%|██████████| 3/3 [00:00<00:00,  4.26it/s]


In [31]:
##### Vector store and retriever #####

from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
 
## Step 6: Specify an embedding model
embedding_model = OpenAIEmbeddings(
    api_key=OPENAI_API_KEY,
    model="text-embedding-3-small",
)

## Step 7: Create a Chroma instance and instantiate it with an embedding model and the document chunks
# NOTE(review): no collection name or persist directory is given; re-running this
# cell in the same kernel may add the same chunks again - confirm before relying
# on repeated runs.
vector_store = Chroma.from_documents(
    documents=doc_chunks,
    embedding=embedding_model,
)
# Keep the original (misspelled) name bound so any cell still using it keeps working.
vectore_store = vector_store

## Step 8: Transform the vector store into a retriever
# Similarity search returning the 2 closest chunks for each query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

In [33]:
##### Prompt template and setting up the generator component #####

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
 

## Step 9: Define a prompt template
# from_template() derives the input variables ({context}, {question}) from the
# placeholders in the template text. The previous `input_variable=[...]` keyword
# was a misspelling that is not a prompt field and had no effect, so it is dropped.
# NOTE: this rebinds `prompt_template`, which Recipe 5.1 used for the judge prompt;
# re-run the Recipe 5.1 cells before re-running its evaluator.
prompt_template = ChatPromptTemplate.from_template(
    template = 
    """
    Use the following piece of context to answer the question at the end. 
    Instruction: Stay loyal to the context as close as possible. Don't add your opinion. If the context is not enough to answer the question, state this.
    Context: {context} 
    Question: {question}. 
    
    """
)
 
## Step 10: Define a chat model for response generation
# `api_key=` is used here to match the other OpenAI client constructions in this notebook.
llm = ChatOpenAI(
    api_key = OPENAI_API_KEY,
    model = "gpt-4o-mini",
    temperature = 0,
)

In [34]:
#####   The RAG chain #####
## Step 11: Define the chain
from langchain_core.runnables import RunnablePassthrough

# The incoming query is passed through unchanged as "question" and is also sent to
# the retriever to produce "context"; both fill the prompt, whose output goes to
# the chat model.
rag_chain = (
    {
        "question": RunnablePassthrough(),
        "context": retriever,
    }
    | prompt_template
    | llm
)

#### Evaluating faithfulness with DeepEval

In [10]:
from deepeval.test_case import LLMTestCase
from deepeval.evaluate import evaluate
from deepeval.metrics import FaithfulnessMetric

# The RAG response to be checked for faithfulness (hard-coded here rather than
# generated by rag_chain).
rag_output = """
The battery life specifications and charging capabilities of the Zonic Earbuds Pro are as follows:

- Battery Life (Earbuds): 8 hours
- Battery Life (Case): 24 hours (additional)
- Charging Time: 
  - Earbuds: 1.5 hours
  - Case: 2 hours
- Quick Charge: 10 minutes of charging provides 1 hour of playback.

"""

# NOTE: `answer` is not passed to the test case below (no expected_output is set),
# and this assignment rebinds the `answer` used as ground truth in Recipe 5.1.
answer = """
The battery life for the Zonic Earbuds Pro is 8 hours, and the battery life for the case is 24 hours (additional). The charging time for the earbuds is 1.5 hours, and the charging time for the case is 2 hours. The earbuds also have a quick charge feature where 10 minutes of charging provides 1 hour of playback. These specifications are provided in the context provided.

"""

query = "What are the battery life specifications and charging capabilities of the Zonic Earbuds Pro, including quick charge features"

# Requires `retriever` from the RAG pipeline cells above; keeps only the text of
# each retrieved chunk.
retrieved_chunks = [chunk.page_content for chunk in retriever.invoke(query)]

# The test case carries the question, the response under test and the retrieved
# context the response is compared against.
test_case = LLMTestCase(
    input = query,
    actual_output = rag_output,
    retrieval_context = retrieved_chunks
)


# Pass mark is a score of 0.7; gpt-4o-mini is the evaluation model and no
# textual reason is requested.
metric = FaithfulnessMetric(
    threshold=0.7,
    model="gpt-4o-mini",
    include_reason=False
)

evaluate(test_cases=[test_case], metrics=[metric])

Output()



Metrics Summary

  - ✅ Faithfulness (score: 0.8, threshold: 0.7, strict: False, evaluation model: gpt-4o-mini, reason: None, error: None)

For test case:

  - input: What are the battery life specifications and charging capabilities of the Zonic Earbuds Pro, including quick charge features
  - actual output: 
The battery life specifications and charging capabilities of the Zonic Earbuds Pro are as follows:

- Battery Life (Earbuds): 8 hours
- Battery Life (Case): 24 hours (additional)
- Charging Time: 
  - Earbuds: 1.5 hours
  - Case: 2 hours
- Quick Charge: 10 minutes of charging provides 1 hour of playback.


  - expected output: None
  - context: None
  - retrieval context: ['Product Specification Sheet – Zonic Earbuds Pro \n \nProduct Introduction \nThe Zonic Earbuds Pro represent the next generation of true wireless audio for consumers \nwho value performance, reliability, and convenience. Developed for busy professionals and', 'Battery Life (Earbuds) 8 hours \nBattery Life (Cas

EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Faithfulness', threshold=0.7, success=True, score=0.8, reason=None, strict_mode=False, evaluation_model='gpt-4o-mini', error=None, evaluation_cost=0.0005141999999999999, verbose_logs='Truths (limit=None):\n[\n    "The Zonic Earbuds Pro are true wireless audio devices.",\n    "The Zonic Earbuds Pro are designed for busy professionals.",\n    "The battery life of the earbuds is 8 hours.",\n    "The battery life of the charging case is 24 hours.",\n    "The charging time for the earbuds is 1.5 hours.",\n    "The charging time for the case is 2 hours.",\n    "A quick charge of 10 minutes provides 1 hour of playback.",\n    "The wireless range of the Zonic Earbuds Pro is up to 15 meters.",\n    "The Zonic Earbuds Pro use Bluetooth version 5.2.",\n    "The audio codecs supported by the Zonic Earbuds Pro are SBC, AAC, and aptX.",\n    "The Zonic Earbuds Pro have an IPX5 water resistance 

### Recipe 5.3 Evaluating multiple metrics on a dataset of test cases with DeepEval

##### Import the ground truth data

In [37]:
import pandas as pd

# Ground-truth question/answer pairs; the file is decoded as Windows-1252 (cp1252).
eval_data = pd.read_csv("./rag_resources/eval_qa.csv", encoding="cp1252")
print(eval_data.head())

# Pull the two columns out as plain Python lists, in row order.
questions, answers = (eval_data[column].tolist() for column in ("question", "answer"))

                                            question  \
0  What are the battery life and charging specs o...   
1  How do I claim warranty for Bluetooth issues a...   
2           How do I return defective earbuds (RMA)?   

                                              answer  
0  Battery: 8 hrs + 24 hrs (case). Charging: 1.5 ...  
1  Coverage: 12-month warranty. Docs Needed: Rece...  
2  Steps: (1) Request RMA at zonic-support.com/rm...  


In [38]:
# Show the questions that will be sent to the RAG chain.
print(questions)

['What are the battery life and charging specs of the Zonic Earbuds Pro?', 'How do I claim warranty for Bluetooth issues after 8 months?']


##### Invoke the RAG chain with the questions

    > NOTE: Ensure you run the complete RAG pipeline under Recipe 5.2 above before the sections below

In [14]:
## Step 2: Invoke the chain with the questions via the .batch() method
# One chain run per question; later cells pair the responses with `questions` by position.
batch_responses = rag_chain.batch(questions)

In [15]:
## Step 3: Retrieve the chunks for every question via the .batch() method
# The chain output does not expose what it retrieved, so the retriever is queried
# again on its own to capture the context for each question.
batch_retrieved_chunks = retriever.batch(questions)

In [39]:
## Step 4: Bundle the batched responses from our LLM into a list
# Keep only the text content of each chat-model message.
rag_responses = [message.content for message in batch_responses]

## Step 5: Bundle the retrieved context for each query into a list of lists
# Outer list: one entry per question. Inner list: the text of each retrieved chunk.
retrieval_contexts = [
    [doc.page_content for doc in docs_for_question]
    for docs_for_question in batch_retrieved_chunks
]


##### Specifying the ingredients for DeepEval

In [23]:

from deepeval.test_case import LLMTestCase
from deepeval.evaluate import evaluate
from deepeval.dataset import EvaluationDataset

from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric
)

## Step 6: Compile our test cases
# The four lists are paired up by position; zip() stops at the shortest one.
test_cases = [
    LLMTestCase(
        input=question,
        actual_output=response,
        expected_output=expected,
        retrieval_context=context_chunks,
    )
    for question, expected, response, context_chunks in zip(
        questions, answers, rag_responses, retrieval_contexts
    )
]

## Step 7: Bundle the test cases into an instance of the EvaluationDataset class
dataset = EvaluationDataset()
dataset.test_cases = test_cases

In [22]:
## Step 8: Define the parameters for the precision metric
# Both metrics share the same settings: pass mark 0.7, gpt-4o-mini as the
# evaluation model, and no textual reason requested.
precision_metric = ContextualPrecisionMetric(
    model="gpt-4o-mini",
    threshold=0.7,
    include_reason=False,
)

## Step 9: Define the parameters for the recall metric
recall_metric = ContextualRecallMetric(
    model="gpt-4o-mini",
    threshold=0.7,
    include_reason=False,
)

## Step 10: Execute the test by calling evaluate on the dataset.
evaluation_output = dataset.evaluate([precision_metric, recall_metric])

Output()



Metrics Summary

  - ❌ Contextual Precision (score: 0.0, threshold: 0.7, strict: False, evaluation model: gpt-4o-mini, reason: None, error: None)
  - ❌ Contextual Recall (score: 0.0, threshold: 0.7, strict: False, evaluation model: gpt-4o-mini, reason: None, error: None)

For test case:

  - input: How do I claim warranty for Bluetooth issues after 8 months?
  - actual output: The provided context does not contain information on how to claim warranty for Bluetooth issues after 8 months. Therefore, I cannot answer the question.
  - expected output: Coverage: 12-month warranty. Docs Needed: Receipt, serial no., issue description, contact. Steps: Submit form at zonic-support.com/warranty; wait 3 business days. Timeline: 7–14 days post-receipt.
  - context: None
  - retrieval context: ['●  Faulty  Bluetooth  connectivity  or  hardware  integration  ●  Manufacturing  defects  not  resulting  from  improper  usage  \nWhat  is  Not  Covered  \nThis  warranty  does  not  apply  in  the  foll

Output()



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o-mini, reason: None, error: None)
  - ✅ Contextual Recall (score: 0.8333333333333334, threshold: 0.7, strict: False, evaluation model: gpt-4o-mini, reason: None, error: None)

For test case:

  - input: What are the battery life and charging specs of the Zonic Earbuds Pro?
  - actual output: The battery life and charging specifications of the Zonic Earbuds Pro are as follows:

- Battery Life (Earbuds): 8 hours
- Battery Life (Case): 24 hours (additional)
- Charging Time: 1.5 hours (earbuds), 2 hours (case)
- Quick Charge: 10 minutes = 1 hour playback
  - expected output: Battery: 8 hrs + 24 hrs (case). Charging: 1.5 hrs (buds), 2 hrs (case). Quick Charge: 10 min = 1 hr playback. Options: USB-C, Qi wireless.
  - context: None
  - retrieval context: ['Battery Life (Earbuds) 8 hours \nBattery Life (Case) 24 hours (additional) \nCharging Time 1.5 hours (earbuds), 2 hours (case)

In [35]:
# Re-run both metrics on the second test case only (index 1) and read the result
# back from the metric objects.
# NOTE: both metrics were created with include_reason=False, so `.reason` prints None.
precision_metric.measure(test_cases[1])
print("Score: ", precision_metric.score)
print("Reason: ", precision_metric.reason)

recall_metric.measure(test_cases[1])
print("Score: ", recall_metric.score)
print("Reason: ", recall_metric.reason)

Output()

Output()

Score:  0
Reason:  None


Score:  1.0
Reason:  None


### 5.5: Running the metrics one by one for sensitivity analysis

    > "You can also run the FaithfulnessMetric (or any metric) on a single test case as a standalone, one-off execution."

##### Import the eval data

In [46]:
import pandas as pd

# Same ground-truth file as in Recipe 5.3, re-loaded so this section can be run on
# its own once the RAG pipeline cells have been executed.
eval_data = pd.read_csv("./rag_resources/eval_qa.csv", encoding="cp1252")
print(eval_data.head())

# Pull the two columns out as plain Python lists, in row order.
questions, answers = (eval_data[column].tolist() for column in ("question", "answer"))

                                            question  \
0  What are the battery life and charging specs o...   
1  How do I claim warranty for Bluetooth issues a...   
2           How do I return defective earbuds (RMA)?   

                                              answer  
0  Battery: 8 hrs + 24 hrs (case). Charging: 1.5 ...  
1  Coverage: 12-month warranty. Docs Needed: Rece...  
2  Steps: (1) Request RMA at zonic-support.com/rm...  


##### The RAG variables

In [47]:
## Step 2: Invoke the chain with the questions via the .batch() method
batch_responses = rag_chain.batch(questions)

## Step 3: Retrieve the chunks for every question via the .batch() method
# Queried separately from the chain so the retrieved context can be inspected.
batch_retrieved_chunks = retriever.batch(questions)

## Step 4: Bundle the batched responses from our LLM into a list
rag_responses = [message.content for message in batch_responses]

## Step 5: Bundle the retrieved context for each query into a list of lists
# Outer list: one entry per question. Inner list: the text of each retrieved chunk.
retrieval_contexts = [
    [doc.page_content for doc in docs_for_question]
    for docs_for_question in batch_retrieved_chunks
]


##### The evaluation variables

In [48]:
from deepeval.evaluate import evaluate
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
 

## Step 6: Compile our test cases
# The four lists are paired up by position; zip() stops at the shortest one.
test_cases = [
    LLMTestCase(
        input=question,
        actual_output=response,
        expected_output=expected,
        retrieval_context=context_chunks,
    )
    for question, expected, response, context_chunks in zip(
        questions, answers, rag_responses, retrieval_contexts
    )
]

## Step 7: Instantiate the faithfulness metric
# Note the evaluation model here is gpt-4.1-mini, unlike the gpt-4o-mini used in
# the earlier recipes.
faithfulness_metric = FaithfulnessMetric(
    model="gpt-4.1-mini",
    threshold=0.7,
    include_reason=False,
)

## Step 8: Instantiate the answer relevancy metric
relevancy_metric = AnswerRelevancyMetric(
    model="gpt-4.1-mini",
    threshold=0.7,
    include_reason=False,
)
 


In [49]:
## For sensitivity analysis of the pipeline over multiple sessions
# Each metric is run on each test case individually; the value returned by
# measure() is collected, giving one score per test case per metric.
# NOTE: the loop variable rebinds the notebook-level name `test_case` used in Recipe 5.2.
faithfulness_scores = []
relevancy_scores = []
for test_case in test_cases:
    faithfulness_scores.append(faithfulness_metric.measure(test_case))
    relevancy_scores.append(relevancy_metric.measure(test_case))

print(faithfulness_scores)
print(relevancy_scores)

Output()

Output()

Output()

Output()

Output()

Output()

[1.0, 1.0, 1.0]
[1.0, 0.0, 1.0]
