# RAGAS Evaluation

Streamlined notebook for loading a lightweight evaluation dataset, indexing a small knowledge base, and computing RAGAS metrics against the project workflow.

In [None]:
from datasets import load_dataset
from tqdm.auto import tqdm
import json
import numpy as np

from app.models.models import Document
from app.db.vector_db import VectorDB
from app.workflow.rag_workflow import RAGWorkflow
from app.workflow.reranker import Reranker
from app.workflow.router import LLMClient
from app.core.config import settings

from qdrant_client import models as qdrant_models

from ragas import EvaluationDataset, evaluate
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    LLMContextPrecisionWithReference,
    ResponseRelevancy,
)
from ragas.llms import LangchainLLMWrapper
from langchain.chat_models import init_chat_model
from ragas.run_config import RunConfig
from ragas.embeddings import OpenAIEmbeddings
from openai import OpenAI

In [None]:
# Name of the Qdrant collection used by this notebook: the configured
# collection name with a "-ragas" suffix appended.
EVAL_COLLECTION_NAME = f"{settings.qdrant_collection_name}-ragas"
# Upper bound on the number of knowledge-base chunks selected for indexing.
KNOWLEDGE_SAMPLE_SIZE = 1000
# Upper bound on the number of evaluation questions selected for the run.
EVALUATION_SAMPLE_SIZE = 50

In [None]:
# Derive evaluation settings from the project settings, overriding only the
# target collection name and the reranker switch.
_eval_overrides = {
    "qdrant_collection_name": EVAL_COLLECTION_NAME,
    "enable_reranker": False,
}
eval_settings = settings.model_copy(update=_eval_overrides)

# Vector store wrapper built from the evaluation settings.
vector_db = VectorDB(eval_settings)

In [None]:
# Start every run from an empty collection. `recreate_collection` is deprecated
# in qdrant-client, so the drop-and-create sequence is spelled out explicitly.
if vector_db.client.collection_exists(EVAL_COLLECTION_NAME):
    vector_db.client.delete_collection(EVAL_COLLECTION_NAME)

vector_db.client.create_collection(
    collection_name=EVAL_COLLECTION_NAME,
    # Named dense vector, sized from the configured embedding dimension.
    vectors_config={
        "dense": qdrant_models.VectorParams(
            size=eval_settings.embeddings_dim,
            distance=qdrant_models.Distance.COSINE,
        )
    },
    # Named sparse vector with the IDF modifier applied by Qdrant.
    sparse_vectors_config={
        "bm25": qdrant_models.SparseVectorParams(modifier=qdrant_models.Modifier.IDF)
    },
)

print(f"Collection '{EVAL_COLLECTION_NAME}' ready for evaluation.")

In [None]:
# Load the chunk corpus and keep at most KNOWLEDGE_SAMPLE_SIZE leading rows.
knowledge_dataset = load_dataset("jamescalam/ai-arxiv2-chunks", split="train")
sample_count = min(KNOWLEDGE_SAMPLE_SIZE, len(knowledge_dataset))
knowledge_sample = knowledge_dataset.select(range(sample_count))

# Wrap every sampled chunk in a Document. Source and title default to an empty
# string when absent and are nested under a "metadata" key.
knowledge_docs = []
for row in knowledge_sample:
    chunk_metadata = {
        "source": row.get("source", ""),
        "title": row.get("title", ""),
    }
    knowledge_docs.append(
        Document(text=row["chunk"], metadata={"metadata": chunk_metadata})
    )

In [None]:
# Bare expression: the notebook renders the sampled dataset as the cell output.
knowledge_sample

In [None]:
# Report which distinct paper titles are present in the sampled chunks.
unique_titles = set(knowledge_sample["title"])
print(f"Number of unique titles: {len(unique_titles)}")
# A set is directly iterable; no intermediate list copy is needed.
for title in unique_titles:
    print(title)

In [None]:
# Number of sampled chunks whose title is exactly "Mixtral of Experts".
mixtral_titles = [
    title for title in knowledge_sample["title"] if title == "Mixtral of Experts"
]
count = len(mixtral_titles)
print(count)

In [None]:
# Parallel lists: entry i of each list belongs to the same document.
kept_docs = []
dense_embeddings = []
sparse_embeddings = []

for doc in tqdm(knowledge_docs, desc="Embedding knowledge chunks"):
    # Compute BOTH embeddings before touching any list. Appending the dense
    # vector first and then failing on the sparse one would leave the lists
    # with different lengths and misalign every subsequent document.
    try:
        dense_vector = vector_db.get_embeddings(doc.text)
        sparse_vector = next(vector_db.sparse_model.embed([doc.text]))
    except Exception as exc:
        # Best-effort indexing: report the failure and move on to the next chunk.
        print(f"Skipping chunk due to embedding error: {exc}")
        continue

    kept_docs.append(doc)
    dense_embeddings.append(dense_vector)
    sparse_embeddings.append(sparse_vector)

if not kept_docs:
    raise RuntimeError("No chunks were indexed; abort evaluation.")

# Dense vectors are stacked into a single array; sparse ones are passed as a list.
vector_db.add_documents(
    kept_docs,
    np.vstack(dense_embeddings),
    sparse_embeddings,
)

print(f"Indexed {len(kept_docs)} knowledge chunks.")

In [None]:
# Load the question/ground-truth dataset and keep at most
# EVALUATION_SAMPLE_SIZE leading rows.
evaluation_dataset = load_dataset("aurelio-ai/ai-arxiv2-ragas-mixtral", split="train")
question_count = min(EVALUATION_SAMPLE_SIZE, len(evaluation_dataset))
evaluation_sample = evaluation_dataset.select(range(question_count))

print(f"Prepared {len(evaluation_sample)} evaluation questions.")

In [None]:
# Build the project workflow, then point its components at the evaluation
# settings and the evaluation vector store before compiling the graph.
workflow = RAGWorkflow()
workflow.config = eval_settings
workflow.vector_db = vector_db
workflow.llm = LLMClient(eval_settings)
# NOTE(review): eval_settings sets "enable_reranker" to False, yet a Reranker is
# still constructed here; presumably the workflow consults the flag — confirm.
workflow.reranker = Reranker(eval_settings)

# `graph` is what the evaluation loop invokes once per question.
graph = workflow.build()
print("Workflow compiled for evaluation collection.")

In [None]:
# One record per question, using the field names that are later passed to
# EvaluationDataset.from_list.
evaluation_records = []

for idx, row in enumerate(tqdm(evaluation_sample, desc="Running workflow")):
    question = row["question"]

    # The ground truth may be a single value or a list; use the first entry of
    # a list. An empty list yields no reference instead of raising IndexError.
    ground_truth = row["ground_truth"]
    if isinstance(ground_truth, list):
        reference = ground_truth[0] if ground_truth else None
    else:
        reference = ground_truth

    if not reference:
        # Reference-based metrics cannot score this row, so skip it before
        # spending a workflow invocation on it.
        print(f"Skipping question {idx}: no ground-truth reference.")
        continue

    state = {"question": question}
    # A distinct thread id per question is passed in the invocation config.
    config_dict = {"configurable": {"thread_id": f"ragas_eval_{idx}"}}
    result = graph.invoke(state, config=config_dict)

    # Treat a missing or None answer/context as empty rather than failing.
    answer = result.get("answer") or ""
    retrieved_docs = result.get("context") or []
    contexts = [doc.text for doc in retrieved_docs if hasattr(doc, "text")]

    evaluation_records.append(
        {
            "user_input": question,
            "retrieved_contexts": contexts,
            "response": answer,
            "reference": reference,
        }
    )

print(f"Collected {len(evaluation_records)} responses.")

In [None]:
# Serialize BEFORE opening the file: opening with "w" truncates it, so a
# serialization error would otherwise destroy previously saved results.
serialized_records = json.dumps(evaluation_records, ensure_ascii=False, indent=4)

with open("evaluation_records.json", "w", encoding="utf-8") as f:
    f.write(serialized_records)

print("Saved evaluation records to evaluation_records.json")

## Load Evaluation Data

In [None]:
# Reload the saved workflow outputs so the metric computation can be run
# without re-invoking the workflow.
with open("evaluation_records.json", "r", encoding="utf-8") as f:
    loaded_records = json.load(f)

# Preview the first record. Guard against an empty file so that the explicit
# emptiness check before evaluation reports the problem, not an IndexError here.
if loaded_records:
    print(loaded_records[0])
else:
    print("evaluation_records.json contains no records.")

evaluation_records = loaded_records

In [None]:
# Refuse to evaluate an empty record set.
if not evaluation_records:
    raise RuntimeError("No evaluation records created.")

evaluation_data = EvaluationDataset.from_list(evaluation_records)

# The judge LLM and the embeddings client use the same timeout/retry settings.
client_options = {"timeout": 180, "max_retries": 10}

# Judge model used by the LLM-based metrics, wrapped for RAGAS.
llm = init_chat_model(
    "deepseek-ai/DeepSeek-V3.1",
    model_provider="together",
    api_key=settings.llm_api_key,
    **client_options,
)
evaluator_llm = LangchainLLMWrapper(llm)

# Embeddings served through an OpenAI-style client pointed at the configured
# base URL.
embeddings_client = OpenAI(
    base_url=settings.embeddings_base_url,
    api_key=settings.embeddings_api_key,
    **client_options,
)
embeddings = OpenAIEmbeddings(client=embeddings_client, model=settings.embeddings_model)

# Only the worker count is set on RunConfig; timeout and retries are configured
# on the two clients above.
run_config = RunConfig(max_workers=8)

# ResponseRelevancy() is currently disabled and not part of this run.
selected_metrics = [
    LLMContextRecall(),
    Faithfulness(),
    FactualCorrectness(),
    LLMContextPrecisionWithReference(),
]

result = evaluate(
    dataset=evaluation_data,
    metrics=selected_metrics,
    llm=evaluator_llm,
    embeddings=embeddings,
    run_config=run_config,
)

# Bare expression: the notebook renders the evaluation result as the cell output.
result

#### Faithfulness
The Faithfulness metric measures how factually consistent a response is with the retrieved context. It ranges from 0 to 1, with higher scores indicating better consistency.
A response is considered faithful if all its claims can be supported by the retrieved context. 

#### Context Precision
Context Precision is a metric that evaluates the retriever’s ability to rank relevant chunks higher than irrelevant ones for a given query in the retrieved context. Specifically, it assesses the degree to which relevant chunks in the retrieved context are placed at the top of the ranking.
It is calculated as the mean of precision@k over the ranks k at which a relevant chunk appears in the retrieved context. Precision@k is the ratio of the number of relevant chunks among the top k retrieved chunks to k.

#### Context Recall
Context Recall measures how many of the relevant documents (or pieces of information) were successfully retrieved. It focuses on not missing important results: higher recall means fewer relevant documents were left out. Because recall is defined relative to everything that should have been retrieved, calculating context recall always requires a reference to compare against.

#### Response Relevancy
The ResponseRelevancy metric measures how relevant a response is to the user input. Higher scores indicate better alignment with the user input, while lower scores are given if the response is incomplete or includes redundant information. 

#### Factual Correctness
FactualCorrectness is a metric that compares and evaluates the factual accuracy of the generated response with the reference. This metric is used to determine the extent to which the generated response aligns with the reference. The factual correctness score ranges from 0 to 1, with higher values indicating better performance. 

{'context_recall': 0.8553, 'faithfulness': 0.8710, 'factual_correctness(mode=f1)': 0.3626, 'llm_context_precision_with_reference': 0.7379}