## RAG Evaluation

### Setup 

In [25]:
import json
import os

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm.auto import tqdm
from pathlib import Path

from meal_mentor import ingest

In [2]:
# Load environment variables from the .env file
load_dotenv()

# Read the API key the OpenAI client is constructed with
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Fail fast: if the key is missing (or empty) `client` would never be bound and
# the first llm() call further down would die with a NameError that hides the
# real cause. Stop here with the actionable message instead.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "Something went wrong; Make sure you have added the OPENAI_API_KEY in the .env file"
    )

client = OpenAI(api_key=OPENAI_API_KEY)
print("Loaded OPENAI_API_KEY")


Loaded OPENAI_API_KEY


In [3]:
# Columns handed to the index builder as `keyword_fields` / `text_fields`
keyword_fields = ["id"]
text_fields = [
    "recipe_name", "diet_type", "cuisine_type",
    "protein(g)", "carbs(g)", "fat(g)",
]

# Read the recipe table, then build the search index used by search() below
documents = ingest.ingest_data(file_path=Path("../data/data.csv"))
index = ingest.load_index(
    documents=documents,
    text_fields=text_fields,
    keyword_fields=keyword_fields,
)

   id diet_type                                  recipe_name cuisine_type  \
0   0      dash  Potato Latkes Made Simple: A Twitter Recipe       kosher   
1   1      dash                             Avocado Dressing     american   
2   2      dash                                  Bread Salad     american   
3   3      dash                             Ultimat Sparkler        world   
4   4      dash                                  Yogurt Pops     american   

   protein(g)  carbs(g)  fat(g)  
0       31.55    110.84  118.28  
1        4.18     18.97  118.42  
2       44.09    153.84   86.03  
3        0.32     16.32    0.08  
4        9.07     34.05    7.90  


In [4]:
# Read the ground-truth questions and keep them as a list of row dicts
df_question = pd.read_csv("../data/ground-truth-retrieval_4o_mini.csv")
ground_truth = df_question.to_dict("records")

# Show the first entry so the record layout is visible in the notebook
record = ground_truth[0]
record

{'id': 0,
 'question': 'Can you provide the nutritional breakdown for the Potato Latkes recipe, including protein, carbs, and fat content?'}

In [5]:
# Evaluate on a fixed 200-question subset; random_state pins which rows are drawn
df_sample = df_question.sample(
    n=200,
    random_state=1,
)
sample = df_sample.to_dict("records")

In [7]:
# Prompt sent to the answering model. `{question}` and `{context}` are filled in
# by build_prompt(); .strip() removes the newlines that the triple-quoted
# literal adds at both ends.
prompt_template = """
You're a healthy recipe recommender. Answer the QUESTION based on the CONTEXT from our recipe database.
Use only the facts from the CONTEXT when answering the QUESTION. Do not ask any follow up questions.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# Text rendering of a single search result. build_prompt() formats it with
# `**doc`, so each placeholder (including the literal `protein(g)`, `carbs(g)`
# and `fat(g)` names) must be a key of the document dict.
entry_template = """
recipe_name: {recipe_name}
diet_type: {diet_type}
protein: {protein(g)}g
carbs: {carbs(g)}g
fat: {fat(g)}g
cuisine_type: {cuisine_type}
""".strip()


def build_prompt(query, search_results):
    """Render the prompt for `query` from the retrieved recipes.

    Every item of `search_results` is formatted with `entry_template`, so each
    one must be a mapping that provides all placeholders named there. The
    rendered entries are separated by a blank line and inserted into the
    CONTEXT slot of `prompt_template`.

    Returns the filled-in prompt with surrounding whitespace stripped.
    """
    rendered_entries = [entry_template.format(**doc) for doc in search_results]
    context = "\n\n".join(rendered_entries)

    # The final strip() also removes the dangling newline left after
    # "CONTEXT:" when there are no search results.
    return prompt_template.format(question=query, context=context).strip()

In [9]:
def search(query):
    """Look up `query` in the module-level `index`.

    The search is issued with an empty filter dict and an empty boost dict and
    asks for 10 results. Whatever `index.search` returns is passed back
    unchanged.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )


def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message to `model` and return the reply text.

    Uses the module-level OpenAI `client`; only the content of the first
    returned choice is used.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)

    first_choice = completion.choices[0]
    return first_choice.message.content


def rag(query, model='gpt-4o-mini'):
    """Answer `query` with the retrieve-then-generate pipeline.

    Steps: search the index for `query`, render the prompt from the hits with
    build_prompt(), and ask the LLM for an answer.

    Args:
        query: The user question.
        model: Name of the model that generates the answer. Defaults to
            'gpt-4o-mini', the same default llm() uses, so existing
            `rag(question)` calls behave exactly as before.

    Returns:
        The answer text returned by llm().
    """
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt, model=model)


In [10]:
# Prompt for the LLM judge. `{question}` and `{answer_llm}` are filled in with
# str.format(); the doubled braces `{{ ... }}` are escapes that render as the
# literal braces of the JSON object the judge is asked to return. The reply is
# later parsed with json.loads and read through its "Relevance" and
# "Explanation" keys.
rag_evaluation_prompt_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

### Evaluation with GPT-4o-mini as the judge (answers generated by GPT-4o-mini)

In [20]:
# Judge every sampled question with gpt-4o-mini.
# rag() is called with its defaults, so the answer itself comes from llm()'s
# default model; only the judge model is chosen explicitly here.
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question)

    prompt = rag_evaluation_prompt_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation_raw = llm(prompt, model='gpt-4o-mini')

    # The judge replies with free-form text; one malformed reply must not abort
    # the whole run and throw away the results collected so far. Anything that
    # is not a JSON object is recorded as a PARSE_ERROR row (with the raw reply
    # as explanation) so it stays visible in the relevance counts.
    try:
        evaluation = json.loads(evaluation_raw)
    except (json.JSONDecodeError, TypeError):
        evaluation = None

    if not isinstance(evaluation, dict):
        evaluation = {
            'Relevance': 'PARSE_ERROR',
            'Explanation': evaluation_raw,
        }

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [21]:
# One row per judged question, built from the (record, answer, evaluation) tuples
df_eval = pd.DataFrame(
    data=evaluations,
    columns=['record', 'answer', 'evaluation'],
)

# Lift the ground-truth fields out of the per-row record dict
df_eval['id'] = df_eval['record'].map(lambda rec: rec['id'])
df_eval['question'] = df_eval['record'].map(lambda rec: rec['question'])

# Lift the judge's verdict out of the parsed evaluation dict
df_eval['relevance'] = df_eval['evaluation'].map(lambda verdict: verdict['Relevance'])
df_eval['explanation'] = df_eval['evaluation'].map(lambda verdict: verdict['Explanation'])

In [22]:
# Number of answers per relevance label
df_eval['relevance'].value_counts()

relevance
RELEVANT           151
PARTLY_RELEVANT     29
NON_RELEVANT        20
Name: count, dtype: int64

#### Results:

| Relevance       | Count |
|-----------------|-------|
| RELEVANT        | 151   |
| PARTLY_RELEVANT | 29    |
| NON_RELEVANT    | 20    |

In [23]:
# Share of answers per relevance label (counts divided by the total)
df_eval['relevance'].value_counts(normalize=True)

relevance
RELEVANT           0.755
PARTLY_RELEVANT    0.145
NON_RELEVANT       0.100
Name: proportion, dtype: float64

#### Results:

| Relevance       | Proportion |
|-----------------|------------|
| RELEVANT        | 0.755      |
| PARTLY_RELEVANT | 0.145      |
| NON_RELEVANT    | 0.100      |

In [24]:
# Save this run's results to CSV (without the index) before df_eval is reused
df_eval.to_csv(
    '../data/rag-eval-gpt-4o-mini.csv',
    index=False,
)

### Evaluation with GPT-4o as the judge (answers still generated by GPT-4o-mini)

In [15]:
# Judge every sampled question with gpt-4o.
# rag() is called with its defaults, so the answer itself comes from llm()'s
# default model; only the judge model is chosen explicitly here.
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question)

    prompt = rag_evaluation_prompt_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation_raw = llm(prompt, model='gpt-4o')

    # The judge replies with free-form text; one malformed reply must not abort
    # the whole run and throw away the results collected so far. Anything that
    # is not a JSON object is recorded as a PARSE_ERROR row (with the raw reply
    # as explanation) so it stays visible in the relevance counts.
    try:
        evaluation = json.loads(evaluation_raw)
    except (json.JSONDecodeError, TypeError):
        evaluation = None

    if not isinstance(evaluation, dict):
        evaluation = {
            'Relevance': 'PARSE_ERROR',
            'Explanation': evaluation_raw,
        }

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [16]:
# One row per judged question, built from the (record, answer, evaluation) tuples
df_eval = pd.DataFrame(
    data=evaluations,
    columns=['record', 'answer', 'evaluation'],
)

# Lift the ground-truth fields out of the per-row record dict
df_eval['id'] = df_eval['record'].map(lambda rec: rec['id'])
df_eval['question'] = df_eval['record'].map(lambda rec: rec['question'])

# Lift the judge's verdict out of the parsed evaluation dict
df_eval['relevance'] = df_eval['evaluation'].map(lambda verdict: verdict['Relevance'])
df_eval['explanation'] = df_eval['evaluation'].map(lambda verdict: verdict['Explanation'])

In [17]:
# Number of answers per relevance label
df_eval['relevance'].value_counts()

relevance
RELEVANT           95
PARTLY_RELEVANT    76
NON_RELEVANT       29
Name: count, dtype: int64

#### Results:

| Relevance       | Count |
|-----------------|-------|
| RELEVANT        | 95    |
| PARTLY_RELEVANT | 76    |
| NON_RELEVANT    | 29    |

In [18]:
# Share of answers per relevance label (counts divided by the total)
df_eval['relevance'].value_counts(normalize=True)

relevance
RELEVANT           0.475
PARTLY_RELEVANT    0.380
NON_RELEVANT       0.145
Name: proportion, dtype: float64

#### Results:

| Relevance       | Proportion |
|-----------------|------------|
| RELEVANT        | 0.475      |
| PARTLY_RELEVANT | 0.380      |
| NON_RELEVANT    | 0.145      |

In [19]:
# Save this run's results to CSV (without the index)
df_eval.to_csv(
    '../data/rag-eval-gpt-4o.csv',
    index=False,
)