# Evaluate RAG System

In [14]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [26]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from groq import Client
from src.data_loader import extract_text_from_pdf, chunk_text, chunk_per_row, retrieve_chunks
import os

# Load data
pdf_text = extract_text_from_pdf("../doc/makanan-sehat.pdf")
pdf_chunks = chunk_text(pdf_text)

df = pd.read_csv("../doc/fast-food.csv")
csv_chunks = chunk_per_row(df)

chunks = pdf_chunks + csv_chunks

# Load FAISS Index
vector_store_path = "../doc/vector_store.index"
model = SentenceTransformer('all-MiniLM-L6-v2')

if os.path.exists(vector_store_path):
    index = faiss.read_index(vector_store_path)
else:
    chunk_embeddings = model.encode(chunks)
    faiss.normalize_L2(chunk_embeddings)

    dimension = chunk_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(chunk_embeddings))
    faiss.write_index(index, vector_store_path)

# Initialize LLM API
client = Client(api_key="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")

In [31]:
questions = [
    "Apa yang dimaksud dengan fast food?",
    "Berapa kalori yang dimiliki oleh egg mcmuffin mcdollops?",
    "Apa saja jenis superfood?",
    "Bagaimana perbandingan realfood dan makanan instan?",
    "Apa jenis makanan yang cocok untuk olahraga?",
    "Bagaimana cara mengatasi emotional eating?",
    "Apa itu diet mediterania?",
    "Gejala apa yang muncul karena alergi makanan?",
    "Apa tujuan zat aditif pada makanan?",
    "Apa standar label makanan di Indonesia?",
    "Apa makanan dari raja burger yang paling tinggi kalori?",
    "Berapa total lemak tak jenuh yang dimiliki Blue Raspberry Freeze?"
]

In [32]:
# Generate answer using Groq API
def generate_answer(client, query, context):
    prompt = f"Gunakan informasi ini untuk menjawab pertanyaa, apabila jawaban tidak ada maka katakan saya tidak tahu.\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

# RAG pipeline
def rag_answer(client,model, index, chunks, query):
    context_chunks = retrieve_chunks(model, index, chunks, query)
    context = " ".join(context_chunks)
    return generate_answer(client, query, context)

In [33]:
rag_answers = []
for q in questions:
    answer = rag_answer(client, model, index, chunks, q)
    rag_answers.append(answer)

rag_df = pd.DataFrame({"Question": questions, "RAG_Answer": rag_answers})

In [34]:
rag_df

Unnamed: 0,Question,RAG_Answer
0,Apa yang dimaksud dengan fast food?,"Saya tidak tahu apa yang dimaksud dengan ""fast..."
1,Berapa kalori yang dimiliki oleh egg mcmuffin ...,Saya tahu bahwa kalori yang dimiliki oleh Egg ...
2,Apa saja jenis superfood?,Berikut adalah beberapa jenis superfood yang d...
3,Bagaimana perbandingan realfood dan makanan in...,Berikut adalah perbandingan antara Real Food d...
4,Apa jenis makanan yang cocok untuk olahraga?,Berikut beberapa jenis makanan yang cocok untu...
5,Bagaimana cara mengatasi emotional eating?,Cara mengatasi emotional eating meliputi:\n\n1...
6,Apa itu diet mediterania?,Diet Mediterania adalah pola makan yang berfok...
7,Gejala apa yang muncul karena alergi makanan?,"Ruam kulit atau gatal-gatal, pembengkakan di w..."
8,Apa tujuan zat aditif pada makanan?,Zat aditif pada makanan memiliki tujuan untuk ...
9,Apa standar label makanan di Indonesia?,Saya tidak tahu tentang standar label makanan ...


In [40]:
def llm_answer_with_hierarchy(client, question, document_sections):
    # Step 1: Identify the most relevant section
    toc_prompt = f"Based on the following sections, which section is most relevant to the question?\n\nSections:\n{document_sections}\n\nQuestion: {question}\nRelevant Section:"
    toc_response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": toc_prompt}],
        max_tokens=500  # Limit tokens for this step
    )
    relevant_section = toc_response.choices[0].message.content.strip()

    # Step 2: Summarize the relevant section to reduce size
    summary_prompt = f"Summarize the following section in 300 tokens or less:\n\nSection: {relevant_section}"
    summary_response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": summary_prompt}],
        max_tokens=300  # Limit tokens for summarization
    )
    summarized_section = summary_response.choices[0].message.content.strip()

    # Step 3: Answer the question using the summarized section
    answer_prompt = f"Answer the following question based on the document. If the answer is not in the document, say 'I do not know.'\n\nDocument: {summarized_section}\n\nQuestion: {question}\nAnswer:"
    answer_response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": answer_prompt}],
        max_tokens=200  # Limit tokens for the final answer
    )
    return answer_response.choices[0].message.content.strip()

# Example usage
document_sections = "\n\n".join([f"Section {i}: {section}" for i, section in enumerate(chunks)])
answer = llm_answer_with_hierarchy(client, "What is the main topic of the document?", document_sections)

APIStatusError: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01jm28nzg4e9k9h7dxyfs3fhfm` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 114787, please reduce your message size and try again. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [None]:
from sentence_transformers import util
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

similarity_scores = []
rouge_scores = []
bleu_scores = []

for i in range(len(questions)):
    embedding1 = model.encode(rag_df["RAG_Answer"][i])
    embedding2 = model.encode(rag_df["LLM_FullDoc_Answer"][i])

    # Cosine Similarity
    similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
    similarity_scores.append(similarity)

    # ROUGE-L Score
    rouge = scorer.score(rag_df["RAG_Answer"][i], rag_df["LLM_FullDoc_Answer"][i])["rougeL"].fmeasure
    rouge_scores.append(rouge)

    # BLEU Score
    reference = rag_df["LLM_FullDoc_Answer"][i].split()
    candidate = rag_df["RAG_Answer"][i].split()
    bleu = sentence_bleu([reference], candidate)
    bleu_scores.append(bleu)

rag_df["Cosine_Similarity"] = similarity_scores
rag_df["ROUGE_L"] = rouge_scores
rag_df["BLEU"] = bleu_scores