# How to evaluate a RAG application

In [1]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment so the
# API key never has to be written into the notebook itself.
load_dotenv()

# Chat model name handed to ChatOpenAI when the chain is built.
MODEL = "gpt-3.5-turbo"
# Resolves to None when the variable is not defined.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Scrape the Website and Split the Content

In [2]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split into chunks of at most 1000 units (as measured by the splitter's length
# function) with a small 20-unit overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)

loader = WebBaseLoader("https://www.ml.school/")

# Load the page first, then split it explicitly. This is what `load_and_split`
# does internally, but LangChain's docs say `load_and_split` should be
# considered deprecated, so call the two steps directly.
documents = text_splitter.split_documents(loader.load())
documents

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://www.ml.school/', 'title': "Building Machine Learning Systems That Don't Suck", 'description': "A live, interactive program that'll help you build production-ready machine learning systems from the ground up.", 'language': 'en'}, page_content='Building Machine Learning Systems That Don\'t Suck"This is the best machine learning course I\'ve done. Worth every cent."Jose Reyes, AI/ML at Cevo AustraliaBuilding Machine Learning Systems (That Don\'t Suck)A live, interactive program that\'ll help you build production-ready systems from the ground up.This program is for anyone who wants to use Machine Learning and Artificial Intelligence to solve real-world problems.This practical, hands-on course will teach you the skills you need for building production systems that work.The cohort will take you through the entire lifecycle of a project, from selling, planning, and structuring it to using open-source tools to build a system that runs anywhere.This is the

# Load the Content in a Vector Store

In [3]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch

# Embed every chunk with OpenAIEmbeddings and index the resulting vectors in a
# DocArrayInMemorySearch store.
vectorstore = DocArrayInMemorySearch.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
)



# Create a Knowledge Base

In [4]:
import pandas as pd

# One row per chunk, with the chunk's raw text in a single "text" column.
df = pd.DataFrame({"text": [chunk.page_content for chunk in documents]})
df.head(10)

Unnamed: 0,text
0,Building Machine Learning Systems That Don't S...
1,where you'll learn from years of experience an...
2,software for over 30 years in the industry.Day...
3,"backtesting, invariance, and behavioral testin..."
4,drift. You'll learn how to use adversarial val...
5,to accommodate your needs.Office HoursEvery we...
6,are the prerequisites to succeed in the progra...
7,is recorded. You can attend live or watch the ...
8,"session, you can catch up asynchronously later..."
9,"like Disney, Boston Dynamics, IBM, Dell, G4S, ..."


In [5]:
from giskard.rag import KnowledgeBase

# Wrap the chunk dataframe in a Giskard knowledge base. No embedding model is
# configured in this notebook; per the log printed by this cell, Giskard then
# defaults to openai/text-embedding-3-small because OPENAI_API_KEY is set.
knowledge_base = KnowledgeBase(df)

  from .autonotebook import tqdm as notebook_tqdm


2025-02-12 10:16:29,740 pid:51077 MainThread giskard.llm.embeddings INFO     No embedding model set though giskard.llm.set_embedding_model. Defaulting to openai/text-embedding-3-small since OPENAI_API_KEY is set.


# Generate the Test Set

In [6]:
from giskard.rag import generate_testset

# Generate 60 evaluation questions from the knowledge base. The agent
# description tells the generator what kind of assistant is being tested.
# NOTE: this is slow -- the recorded run took roughly 3 min 42 s for 60 questions.
testset = generate_testset(
    knowledge_base,
    num_questions=60,
    agent_description="A chatbot answering questions about the Machine Learning School Website",
)

2025-02-12 10:16:29,865 pid:51077 MainThread giskard.rag  INFO     Finding topics in the knowledge base.


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
  warn(


2025-02-12 10:16:33,895 pid:51077 MainThread giskard.rag  INFO     Found 1 topics in the knowledge base.


Generating questions: 100%|██████████| 60/60 [03:42<00:00,  3.71s/it]


In [7]:
test_set_df = testset.to_pandas()

# Print the first three generated samples: the question, its reference answer
# and the reference context, each followed by a separator line.
for position, (sample_id, sample) in enumerate(test_set_df.head(3).iterrows(), start=1):
    print(f"Question {position}: {sample['question']}")
    print(f"Reference answer: {sample['reference_answer']}")
    print("Reference context:")
    print(sample["reference_context"])
    print("******************", end="\n\n")

Question 1: What should I do if I can't attend a live session of the program?
Reference answer: Every live session is recorded. If you can't attend a live session, you can watch the recorded version later.
Reference context:
Document 7: is recorded. You can attend live or watch the recorded version later.Here are the upcoming cohorts:Cohort 18: May 5 - May 22, 2025. 2:00 PM EDTDo not wait for a specific cohort to join the program. You have lifetime access, so you can join any time to lock in the current price."This is one of the best classes I've ever purchased over the internet. Santiago is a terrific teacher. The ability he has to share knowledge is fantastic. I recommend this course. Worth 10x what he's charging."Sal DiStefanoFrequently Asked QuestionsIf you can't find the answer to your question, please reach out and I'll be happy to help.How long will it take to complete the program?Set aside a minimum of 4 hours every week during the three weeks of the program to attend the live 

In [39]:
test_set_df.shape

(60, 5)

In [8]:
testset.save("test-set.jsonl")

# Prepare the Prompt Template


In [9]:
from langchain.prompts import PromptTemplate

# Prompt that asks the model to answer from the supplied context and gives it an
# explicit fallback reply ("I don't know") when it cannot answer.
# {context} and {question} are the two template variables filled at run time.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
# Render the template with placeholder values to check the final layout.
print(prompt.format(context="Here is some context", question="Here is a question"))


Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: Here is some context

Question: Here is a question



# Create the RAG Chain

In [10]:
# Create a retriever from the vector store so we can fetch the documents most
# similar to a given question.
retriever = vectorstore.as_retriever()

# Retrievers are Runnables: `invoke` is the supported entry point.
# `get_relevant_documents` is deprecated and emits a deprecation warning.
retriever.invoke("What is the Machine Learning School?")

  retriever.get_relevant_documents("What is the Machine Learning School?")


[Document(metadata={'source': 'https://www.ml.school/', 'title': "Building Machine Learning Systems That Don't Suck", 'description': "A live, interactive program that'll help you build production-ready machine learning systems from the ground up.", 'language': 'en'}, page_content="where you'll learn from years of experience and real-world examples.Here is a summary of what makes this program unique:You'll join 20+ hours of live classes to discuss the fundamental ideas of building systems that work in the real world.You'll learn best practices to tackle the most significant challenges engineers face when building, evaluating, running, monitoring, and maintaining systems in production.You'll get hands-on access and a complete walkthrough of the implementation of an end-to-end machine learning system built using open-source tools.You'll learn how to build systems once and deploy them anywhere using some of the most popular techniques in the field.You'll get lifetime access to every future

In [11]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

# Chat model that generates the final answer.
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)

# The chain takes an input of the form {"question": ...} and runs left to right:
#   1. build the prompt inputs: the question is piped into the retriever to
#      produce "context", and is also forwarded unchanged as "question";
#   2. fill the prompt template with those two values;
#   3. send the rendered prompt to the chat model;
#   4. parse the model response into a plain string.
# NOTE(review): the retriever output is handed to the prompt as-is, with no
# explicit formatting of the retrieved documents -- confirm this is intended.
chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | StrOutputParser()
)

In [12]:
chain.invoke({"question": "What is the Machine Learning School?"})

'The Machine Learning School is a live, interactive program that helps individuals build production-ready machine learning systems from the ground up.'

# Evaluating the Model on the Test Set

In [13]:
def answer_fn(question: str, history=None) -> str:
    """Answer a single question with the RAG chain.

    Args:
        question: The question text forwarded to the chain.
        history: Accepted but not used. Every question is answered on its
            own, without any earlier conversation turns.

    Returns:
        Whatever ``chain.invoke`` returns for the question.
    """
    payload = {"question": question}
    return chain.invoke(payload)

In [14]:
from giskard.rag import evaluate

report = evaluate(answer_fn, testset=testset, knowledge_base=knowledge_base)

Asking questions to the agent: 100%|██████████| 60/60 [01:14<00:00,  1.24s/it]
CorrectnessMetric evaluation: 100%|██████████| 60/60 [01:11<00:00,  1.20s/it]


Let's now display the report.

Here are the five components of our RAG application:

- Generator: This is the LLM used in the chain to generate the answers.
- Retriever: This is the retriever that fetches relevant documents from the knowledge base according to a query.
- Rewriter: This is a component that rewrites the user query to make it more relevant to the knowledge base or to account for chat history.
- Router: This is a component that filters the user's query based on their intentions.
- Knowledge Base: This is the set of documents given to the RAG to generate the answers.

In [15]:
display(report)

In [16]:
report.to_html("report.html")

In [17]:
report.correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,0.9
conversational,0.2
distracting element,0.6
double,0.9
simple,0.9
situational,0.7


In [18]:
report.get_failures()

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata,agent_answer,correctness,correctness_reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
b4b333c6-c10e-439d-b9ff-54fd369c7643,What topics are covered on Day 6 of the course?,"On Day 6, you'll learn how to automate the end...",Document 4: drift. You'll learn how to use adv...,[],"{'question_type': 'simple', 'seed_document_id'...",How To Build Continual Learning Systems,False,The agent provided a vague and incomplete answ...
a0478427-969f-46e3-b623-579058c025ae,Could you enumerate certain distinctive charac...,The program includes 20+ hours of live classes...,Document 1: where you'll learn from years of e...,[],"{'question_type': 'complex', 'seed_document_id...",Some distinctive characteristics of the machin...,False,The agent provided a more detailed and extensi...
f4f69169-ccac-42dd-a350-20f1751b308e,Could you provide the enrollment fee for the '...,$500,Document 0: Building Machine Learning Systems ...,[],"{'question_type': 'distracting element', 'seed...",I don't know.,False,"The agent stated 'I don't know,' but should ha..."
f84b0b78-a871-4397-9b94-59bc243d2d9b,Could you provide the starting date for the pr...,The program started in March 2023.,"Document 9: like Disney, Boston Dynamics, IBM,...",[],"{'question_type': 'distracting element', 'seed...",I don't know.,False,The agent stated that it doesn't know the star...
555bcb5f-7934-4ac4-889d-b8b0a027590e,Considering the program's emphasis on real-wor...,"The instructor of the program is Santiago, a m...","Document 8: session, you can catch up asynchro...",[],"{'question_type': 'distracting element', 'seed...",The instructor of the machine learning program...,False,The agent provided the instructor's name and g...
e9e7ac3e-f6fc-4c34-a37f-293b63d72fdb,Considering the high-profile companies like Di...,The prerequisites include not being afraid of ...,Document 6: are the prerequisites to succeed i...,[],"{'question_type': 'distracting element', 'seed...",The specific prerequisites for success in the ...,False,The agent's answer is missing the mention of D...
0b96f17f-e7db-46c0-a295-c415226e7915,"Hi, I'm considering enrolling and I'm really i...",The program includes 20+ hours of live classes...,Document 1: where you'll learn from years of e...,[],"{'question_type': 'situational', 'seed_documen...",The machine learning program offers hands-on a...,False,"The agent's answer mentions hands-on access, b..."
2d497dbc-3789-43e5-af47-f325ae2e68aa,"Hi, I'm considering enrolling in the Machine L...",The program started in March 2023.,"Document 9: like Disney, Boston Dynamics, IBM,...",[],"{'question_type': 'situational', 'seed_documen...",I don't know.,False,The agent stated that it doesn't know when the...
5f566370-847c-4b8e-91f5-bbf974a99bf1,"Hi, I'm trying to deepen my understanding of m...","On Day 4, participants will learn how to versi...","Document 3: backtesting, invariance, and behav...",[],"{'question_type': 'situational', 'seed_documen...",On Day 4 of the Machine Learning School progra...,False,The agent's answer is missing the mention of '...
305076cf-cd10-4ad1-bf4c-2ffe3a91db3c,What are the prerequisites to succeed in the M...,"To succeed in the program, you should not be a...",Document 6: are the prerequisites to succeed i...,[],"{'question_type': 'double', 'original_question...",The prerequisites to succeed in the Machine Le...,False,The agent's answer is mostly correct but misse...


# Creating a Test Suite

In [32]:
!pip install "giskard[llm]" -U




In [33]:
from giskard.rag import QATestset

testset = QATestset.load("test-set.jsonl")

In [34]:
test_suite = testset.to_test_suite("Machine Learning School Test Suite")

In [35]:
import giskard


def batch_prediction_fn(df: pd.DataFrame):
    """Run the RAG chain over every question in a dataframe.

    Args:
        df: Dataframe with a "question" column; each value becomes one
            chain input.

    Returns:
        The result of ``chain.batch`` for the questions, in row order.
    """
    payloads = [{"question": text} for text in df["question"].values]
    return chain.batch(payloads)

In [36]:
# Wrap the batch prediction function in a Giskard model so the test suite can
# call it. The model is declared as text generation with a single input
# feature, "question"; name and description are free-text metadata.
giskard_model = giskard.Model(
    model=batch_prediction_fn,
    model_type="text_generation",
    name="Machine Learning School Question and Answer Model",
    description="This model answers questions about the Machine Learning School website.",
    feature_names=["question"], 
)

2025-02-12 11:10:32,696 pid:51077 MainThread giskard.models.automodel INFO     Your 'prediction_function' is successfully wrapped by Giskard's 'PredictionFunctionModel' wrapper class.


In [37]:

test_suite_results = test_suite.run(model=giskard_model)

2025-02-12 11:10:33,660 pid:51077 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
2025-02-12 11:10:41,431 pid:51077 MainThread giskard.utils.logging_utils INFO     Predicted dataset with shape (60, 5) executed in 0:00:07.781764
2025-02-12 11:11:40,720 pid:51077 MainThread root         ERROR    An error happened during test execution for test: TestsetCorrectnessTest
Traceback (most recent call last):
  File "/Users/kiranbele/Downloads/RAG-Evaluation/rgenv/lib/python3.12/site-packages/giskard/core/suite.py", line 522, in run
    result = test_partial.giskard_test(**test_params).execute()
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kiranbele/Downloads/RAG-Evaluation/rgenv/lib/python3.12/site-packages/giskard/registry/giskard_test.py", line 195, in execute
    return configured_validate_arguments(self.test_fn)(*self.args, **self.kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [28]:
display(test_suite_results)