In [5]:
# Define the three inputs needed for any RAG evaluation:
# a source document, a query, and the response to be scored.

# Source document: the reference passage the response is evaluated against.
wikipedia_text = """
A large language model (LLM) is a machine learning model designed for natural language processing tasks, especially language generation.
LLMs are language models with many parameters, and are trained with self-supervised learning on a vast amount of text.

The largest and most capable LLMs are generative pretrained transformers (GPTs), which are largely used in generative chatbots such as ChatGPT or Gemini.
LLMs can be fine-tuned for specific tasks or guided by prompt engineering. These models acquire predictive power regarding syntax, semantics, and
ontologies inherent in human language corpora, but they also inherit inaccuracies and biases present in the data they are trained in.
"""


# Query: the question the response is supposed to answer.
question = "Which large language model is currently the largest and most capable?"


# Response: the answer text that will be scored against the query and document.
llm_response = """
The largest and most capable LLMs are the generative pretrained transformers (GPTs). These models are
designed to handle complex language tasks, and their vast number of parameters gives them the ability to
understand and generate human-like text.
"""

# Environment Setup and API Key Configuration

In [None]:
!pip install -q python-dotenv

from dotenv import load_dotenv
load_dotenv()
import os


# Set the OpenAI API key env variable manually
os.environ["OPENAI_API_KEY"] = "###"

print(os.environ["OPENAI_API_KEY"])

# Import RAG_Evaluation package

In [None]:
!pip install -q rag_evaluation

# import evaluate_response
from rag_evaluation.evaluator import evaluate_response

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25h

# Examples with different OpenAI models

In [6]:
# Score the original question/answer pair with rag-evaluation.

# Bind the three evaluation inputs in a single step.
query, response, document = question, llm_response, wikipedia_text

# No `model` argument is passed here, so the default model (`gpt-4o-mini`) is used.
# The call is the cell's final expression so the notebook renders its result.
evaluate_response(query, response, document)


Unnamed: 0,Metric,Score (Normalized),Score (%)
0,Query Relevance,1.0,100.0
1,Factual Accuracy,1.0,100.0
2,Coverage,0.8,80.0
3,Coherence,1.0,100.0
4,Fluency,1.0,100.0
5,Overall Accuracy,0.95,95.0


In [8]:
# Same inputs as before, but with the model named explicitly.

# Bind the three evaluation inputs in a single step.
query, response, document = question, llm_response, wikipedia_text

# Final expression of the cell, so the result is rendered below it.
evaluate_response(query, response, document, model='gpt-4o')

Unnamed: 0,Metric,Score (Normalized),Score (%)
0,Query Relevance,1.0,100.0
1,Factual Accuracy,1.0,100.0
2,Coverage,0.8,80.0
3,Coherence,1.0,100.0
4,Fluency,1.0,100.0
5,Overall Accuracy,0.95,95.0


In [10]:
# Same inputs once more, this time evaluated with `gpt-4.1`.

# Bind the three evaluation inputs in a single step.
query, response, document = question, llm_response, wikipedia_text

# Final expression of the cell, so the result is rendered below it.
evaluate_response(query, response, document, model='gpt-4.1')

Unnamed: 0,Metric,Score (Normalized),Score (%)
0,Query Relevance,1.0,100.0
1,Factual Accuracy,1.0,100.0
2,Coverage,0.8,80.0
3,Coherence,1.0,100.0
4,Fluency,1.0,100.0
5,Overall Accuracy,0.95,95.0


# Example with a Different Question and Answer

In [17]:
# Second query, asking what LLMs inherit from their training data.
question2 = "What do large language models inherit from the data they are trained on?"

# Answer paired with question2 (presumably human-written, going by the name).
human_response = """
Some inheritance in large language models related to the data they are trained on include inheriting inaccuracies and biases present in the training data.
"""

In [21]:
# Score the second question together with its own answer.

# Bind the three evaluation inputs in a single step.
query, response, document = question2, human_response, wikipedia_text

# Final expression of the cell, so the result is rendered below it.
evaluate_response(query, response, document, model='gpt-4.1')

Unnamed: 0,Metric,Score (Normalized),Score (%)
0,Query Relevance,0.8,80.0
1,Factual Accuracy,1.0,100.0
2,Coverage,0.6,60.0
3,Coherence,1.0,100.0
4,Fluency,1.0,100.0
5,Overall Accuracy,0.85,85.0


## Example with Query-Response Mismatch


The question that matches the response is Question 1, as seen in the initial sample. Here we deliberately pair that response with Question 2 instead, to see how the metrics score a response that does not answer the query.


In [22]:
# Deliberate mismatch: question2 is paired with the answer written for the first question.
query, response, document = question2, llm_response, wikipedia_text

# Final expression of the cell, so the result is rendered below it.
evaluate_response(query, response, document, model='gpt-4.1')

Unnamed: 0,Metric,Score (Normalized),Score (%)
0,Query Relevance,0.4,40.0
1,Factual Accuracy,0.6,60.0
2,Coverage,0.4,40.0
3,Coherence,0.6,60.0
4,Fluency,1.0,100.0
5,Overall Accuracy,0.55,55.0
