## Import Rag Evaluation package

In [1]:
# Install the rag_evaluation package into the kernel's environment, upgrading
# it if an older release is already present (-q keeps pip's output quiet).
# NOTE(review): no version is pinned, so a re-run may pick up a newer release.
%pip install -q --upgrade rag_evaluation

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import package
import rag_evaluation as rag_eval

### Set API Key 

In [None]:
# Keys can alternatively be supplied through the environment or a .env file.

# Register each provider's key in memory; this is a one-time setup that only
# applies to the current Python session.
provider_keys = {
    "openai": "sk-###",
    "gemini": "AIza####",
}
for provider, api_key in provider_keys.items():
    rag_eval.set_api_key(provider, api_key)

### Usage
#### For API-based Models (GPT and Gemini)

In [None]:
# Define the three inputs needed for any RAG evaluation:
#   - wikipedia_text: the source document, passed later as `document`
#   - question:       the user query, passed later as `query`
#   - llm_response:   the generated answer to be scored, passed later as `response`

# Source document the response is evaluated against.
wikipedia_text = """
A large language model (LLM) is a machine learning model designed for natural language processing tasks, especially language generation.
LLMs are language models with many parameters, and are trained with self-supervised learning on a vast amount of text.

The largest and most capable LLMs are generative pretrained transformers (GPTs), which are largely used in generative chatbots such as ChatGPT or Gemini.
LLMs can be fine-tuned for specific tasks or guided by prompt engineering. These models acquire predictive power regarding syntax, semantics, and
ontologies inherent in human language corpora, but they also inherit inaccuracies and biases present in the data they are trained in.
"""


# Query the response is supposed to answer.
question = "Which large language model is currently the largest and most capable?"


# Model-generated answer under evaluation.
llm_response = """
The largest and most capable LLMs are the generative pretrained transformers (GPTs). These models are
designed to handle complex language tasks, and their vast number of parameters gives them the ability to
understand and generate human-like text.
"""

In [8]:
from rag_evaluation.evaluator import evaluate_response

# Inputs shared by both API-backed evaluations below.
rag_inputs = {
    "query": question,
    "response": llm_response,
    "document": wikipedia_text,
}

# OpenAI usage (no metric_weights passed)
print('OpenAI Evaluation')
report = evaluate_response(model_type="openai", model_name='gpt-4.1', **rag_inputs)
print(report)

# Gemini usage, with the optional metric_weights supplied.
# Weight order: [Query Relevance, Factual Accuracy, Coverage, Coherence, Fluency]
gemini_weights = [0.1, 0.4, 0.5, 0., 0.]
print('\n \n  Gemini Evaluation')
report = evaluate_response(
    model_type="gemini",
    model_name='gemini-2.5-flash',
    metric_weights=gemini_weights,
    **rag_inputs,
)
print(report)

OpenAI Evaluation


             Metric  Score (Normalized)  Score (%)
0   Query Relevance                1.00      100.0
1  Factual Accuracy                1.00      100.0
2          Coverage                0.80       80.0
3         Coherence                1.00      100.0
4           Fluency                1.00      100.0
5  Overall Accuracy                0.95       95.0

 
  Gemini Evaluation
             Metric  Score (Normalized)  Score (%)
0   Query Relevance                 1.0      100.0
1  Factual Accuracy                 1.0      100.0
2          Coverage                 0.8       80.0
3         Coherence                 1.0      100.0
4           Fluency                 1.0      100.0
5  Overall Accuracy                 0.9       90.0


### Evaluate directly from file

In [2]:
import pandas as pd 
from rag_evaluation import evaluate_df

# Load the evaluation data from a CSV file in the working directory
df = pd.read_csv("rag_evaluation_data.csv")

# Evaluation settings for the whole frame; metric_weights presumably follows the
# same metric order as evaluate_response's (TODO confirm against the package docs).
eval_options = {
    "model_type": "openai",
    "model_name": "gpt-4o",
    "metric_weights": [0.1, 0.4, 0.5, 0., 0.],
}
report = evaluate_df(df, **eval_options)

# Bare last expression so the notebook renders the report
report

Unnamed: 0,Metric,Score (Normalized),Score (%)
0,Query Relevance,0.6,60.0
1,Factual Accuracy,0.5,50.0
2,Coverage,0.5,50.0
3,Coherence,0.6,60.0
4,Fluency,1.0,100.0
5,Overall Accuracy,0.51,51.0


#### Usage with Open-Source Models (Ollama models)

In [None]:

# Define the inputs for the Ollama-backed evaluations: the user query, the
# generated answer to score, and the source document it is checked against.
# The continuation lines of the triple-quoted strings keep their leading
# indentation, so that whitespace is part of the text sent for evaluation.
query = "Which large language model is currently the largest and most capable?"

# Generated answer under evaluation (passed as `response`).
response_text = """The largest and most capable LLMs are the generative pretrained transformers (GPTs). These models are 
                designed to handle complex language tasks, and their vast number of parameters gives them the ability to 
                understand and generate human-like text."""
                 
# Source document (passed as `document`).
document = """A large language model (LLM) is a type of machine learning model designed for natural language processing 
            tasks such as language generation. LLMs are language models with many parameters, and are trained with 
            self-supervised learning on a vast amount of text. The largest and most capable LLMs are 
            generative pretrained transformers (GPTs). Modern models can be fine-tuned for specific tasks or guided 
            by prompt engineering. These models acquire predictive power regarding syntax, semantics, and ontologies 
            inherent in human language corpora, but they also inherit inaccuracies and biases present in the data they are trained in."""

# One entry per local model: (banner to print, Ollama model tag, extra keyword arguments).
# Download the models beforehand from a terminal:
#   'ollama pull llama3.2:1b', 'ollama pull mistral', 'ollama pull qwen'
ollama_runs = [
    ('Llama Evaluation', 'llama3.2:1b', {}),
    # optional metric_weights [Query Relevance, Factual Accuracy, Coverage, Coherence, Fluency]
    ('\n \n  Mistral Evaluation', 'mistral:latest', {"metric_weights": [0.1, 0., 0.9, 0., 0.]}),
    ('\n \n  Qwen Evaluation', 'qwen:latest', {}),
]

# Run the evaluations in order, printing each banner followed by its report.
for banner, model_tag, extra_kwargs in ollama_runs:
    print(banner)
    report = evaluate_response(
        query=query,
        response=response_text,
        document=document,
        model_type="ollama",
        model_name=model_tag,
        **extra_kwargs,
    )
    print(report)


Llama Evaluation
             Metric  Score (Normalized)  Score (%)
0   Query Relevance                0.20       20.0
1  Factual Accuracy                0.80       80.0
2          Coverage                0.80       80.0
3         Coherence                0.80       80.0
4           Fluency                0.80       80.0
5  Overall Accuracy                0.65       65.0

 
  Mistral Evaluation
             Metric  Score (Normalized)  Score (%)
0   Query Relevance                1.00      100.0
1  Factual Accuracy                1.00      100.0
2          Coverage                0.80       80.0
3         Coherence                0.80       80.0
4           Fluency                1.00      100.0
5  Overall Accuracy                0.82       82.0

 
  Qwen Evaluation
             Metric  Score (Normalized)  Score (%)
0   Query Relevance                 0.8       80.0
1  Factual Accuracy                 0.8       80.0
2          Coverage                 0.8       80.0
3         Coherence 

### Example with Query-Response Mismatch


We replace the question while keeping the response and the document the same, to see how the metrics score a response that no longer matches the query.

In [21]:
# Inputs for the mismatch scenario: only the question changes; the response and
# the document are the ones defined for the API-based example.
query = 'List the most popular large language models in 2025'
response = llm_response
document = wikipedia_text

# Get evaluation results (OpenAI usage).
# The call passes the variables declared above, not the originals they alias,
# so editing any of the three inputs in this cell changes what is scored.
print('OpenAI Evaluation')
report = evaluate_response(
    query=query,
    response=response,
    document=document,
    model_type="openai",
    model_name='gpt-4o',
)
print(report)

OpenAI Evaluation
             Metric  Score (Normalized)  Score (%)
0   Query Relevance                 0.4       40.0
1  Factual Accuracy                 0.8       80.0
2          Coverage                 0.4       40.0
3         Coherence                 0.6       60.0
4           Fluency                 1.0      100.0
5  Overall Accuracy                 0.6       60.0
