# RAGAS Evaluation for MeTTa AI Assistant

This notebook loads logged RAG interactions from the backend, builds a dataset, and evaluates it using RAGAS metrics.

In [1]:
%pip install ipykernel
%pip install ragas datasets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
from pathlib import Path
from dotenv import load_dotenv

project_root = Path.cwd().parent 
backend_env = project_root / "Backend" / ".env"
print("Backend .env path:", backend_env)

os.environ.pop("OPENAI_API_KEY", None)
load_dotenv(backend_env, override=False)

key = os.getenv("OPENAI_API_KEY")
print("Has OPENAI_API_KEY?", bool(key))
print("Value:", repr(key)[0])

Backend .env path: c:\Users\yonat\Videos\GDSC\MeTTa-AI-Assistant\Backend\.env
Has OPENAI_API_KEY? True
Value: '


In [None]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_precision,
    context_recall,
)
import json
import os

# Path to the JSONL log file produced by the backend
log_path_default = os.path.join(
    os.getcwd(),  
    "rag_interactions.jsonl",
)
log_path = os.environ.get("RAG_LOG_PATH", log_path_default)
log_path

In [None]:
records = []
if os.path.exists(log_path):
    with open(log_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                continue

len(records)

In [None]:
# Build RAGAS dataset columns
questions = []
answers = []
contexts_list = []
ground_truths = [] 

for r in records:
    questions.append(r.get('question', ''))
    answers.append(r.get('answer', ''))
    ctx = r.get('contexts', [])
    if isinstance(ctx, list):
        contexts_list.append([str(c) for c in ctx])
    else:
        contexts_list.append([str(ctx)])
    # For now we leave ground_truth empty
    ground_truths.append(r.get('ground_truth', ''))

dataset = Dataset.from_dict({
    'question': questions,
    'answer': answers,
    'contexts': contexts_list,
    'ground_truth': ground_truths,
})
dataset


In [None]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


metrics = [answer_relevancy, faithfulness, context_precision, context_recall]


if 'contexts' in dataset.features:
    non_empty = dataset.filter(lambda x: len(x['contexts']) > 0)
else:
    non_empty = dataset

print("Total samples:", dataset.num_rows)
print("Samples with non-empty contexts:", non_empty.num_rows)


evaluator_llm = ChatOpenAI(model="gpt-4o")
evaluator_embeddings = OpenAIEmbeddings()

if non_empty.num_rows == 0:
    print("No samples with retrieved contexts; check that your RAG pipeline is returning documents and logging them.")
else:

    result = evaluate(
        non_empty,
        metrics=metrics,
        llm=evaluator_llm, 
        embeddings=evaluator_embeddings
    )
    
    print(result)
    result.to_pandas()