# RAGAs (Retrieval-Augmented Generation Assessment) Implementation

### Import Necessary Libraries

In [1]:
import json
import os
import re
import time
from datetime import datetime
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import Dataset
from dotenv import load_dotenv
load_dotenv()

# Langchain and SQL Imports
from langchain_community.utilities import SQLDatabase
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_anthropic import ChatAnthropic
from langchain_huggingface import HuggingFaceEmbeddings


# RAGAS Evaluation Framework
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas import evaluate
from ragas.metrics import (
    AspectCritic,
    RubricsScore,
    ContextPrecision,
    Faithfulness
)
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
    answer_similarity
)
from ragas.dataset_schema import SingleTurnSample, EvaluationDataset



### LLM and Database Initialisation

In [2]:
# LLM Setup (Google Gemini 2.0 Flash)
llm = ChatGoogleGenerativeAI(
    model='gemini-2.0-flash',
    api_key=os.environ.get("GOOGLE_API_KEY")
)

# Database Connection (same as app.py)
db_password = os.environ.get("DB_PASSWORD")
database_schema = os.environ.get("DB_SCHEMA")

# Fail fast with a readable message: an unset variable would otherwise be
# interpolated into the URI as the literal text "None".
_missing_env = [
    env_name
    for env_name, env_value in (("DB_PASSWORD", db_password), ("DB_SCHEMA", database_schema))
    if env_value is None
]
if _missing_env:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

host = 'localhost'
port = '3306'
username = 'root'
# Percent-encode the password so characters such as '@', '/' or ':' cannot be
# mistaken for URI delimiters. safe="" also encodes '/', and spaces become %20.
mysql_uri = (
    f"mysql+mysqlconnector://{username}:{quote(db_password, safe='')}"
    f"@{host}:{port}/{database_schema}"
)
db = SQLDatabase.from_uri(mysql_uri, sample_rows_in_table_info=2)

# LLM Setup for SQL Generation (same as app.py)
# The prompt has two placeholders: {schema} and {question}.
template = """Based on the table schema below, write a SQL query that would answer the user's question:
Remember : Only provide me the sql query do not include anything else. Provide me sql query in a single line do not add line breaks.
Table Schema: {schema}
Question: {question}
SQL Query:
"""
prompt = ChatPromptTemplate.from_template(template)

def get_schema_from_db(database: "SQLDatabase") -> str:
    """Return the database schema description as a single string.

    Args:
        database: Object exposing ``get_table_info()`` (a LangChain
            ``SQLDatabase`` in this notebook).

    Returns:
        The table-info text. A string result is returned unchanged; any other
        iterable of strings is joined with newlines.
    """
    table_info = database.get_table_info()
    # A str is itself iterable, so joining it would insert a newline between
    # every single character and corrupt the schema sent to the LLM.
    if isinstance(table_info, str):
        return table_info
    return "\n".join(table_info)

def _attach_schema(_chain_input: dict) -> str:
    """Return the schema text for `db`; the chain input itself is not used."""
    return get_schema_from_db(db)


# Pipeline: add `schema` to the input mapping, fill the prompt, call the LLM
# with "\nSQLResult:" as a stop sequence, then parse the reply to a string.
sql_chain = (
    RunnablePassthrough.assign(schema=_attach_schema)
    | prompt
    | llm.bind(stop=["\nSQLResult:"])
    | StrOutputParser()
)

def get_sql_query_from_question(question: str) -> str:
    """Translate a natural-language question into a SQL query string.

    Invokes ``sql_chain`` with the question and cleans up the reply.

    Args:
        question: The user's question in plain language.

    Returns:
        The SQL text with surrounding whitespace removed. If the reply
        contains a Markdown code fence (with or without a ``sql`` tag), only
        the fenced content is returned.
    """
    raw_output = sql_chain.invoke({"question": question})
    # The language tag is optional so a bare ``` fence is unwrapped too;
    # previously such a reply was returned with its backticks intact.
    fenced = re.search(
        r"```(?:sql\b)?\s*(.*?)\s*```", raw_output, re.DOTALL | re.IGNORECASE
    )
    if fenced:
        return fenced.group(1).strip()
    # No fence: still trim stray whitespace and newlines around the query.
    return raw_output.strip()

def execute_sql_query(sql_query: str):
    """Run `sql_query` through `db` and return whatever `db.run` gives back.

    Any exception is caught and reported as the string ``"Error: <message>"``
    instead of being raised, so callers always receive a value.
    """
    try:
        return db.run(sql_query)
    except Exception as exc:
        return "Error: " + str(exc)

print("Database and LLM setup complete!")

Database and LLM setup complete!


### Prepare Evaluation Dataset

In [3]:
# Create your test cases.
# Every list below is index-aligned: entry i of each list belongs to question i.
test_data = {
    'question': [
        "What was the budget of Product 12?",
        "What are the names of top 5 selling products?",
        "List 3 customer names and order quantity with the lowest Order Quantity.",
        "Find the total number of states",
        "What is the name of the customer with Customer Index = 1"
    ],
    'ground_truth_sql': [
        "SELECT `2017 Budgets` FROM `2017_budgets` WHERE `Product Name` = 'Product 12'",
        "SELECT T2.`Product Name` FROM `sales_order` AS T1 INNER JOIN products AS T2 ON T1.`Product Description Index` = T2.`Index` GROUP BY T2.`Product Name` ORDER BY SUM(T1.`Line Total`) DESC LIMIT 5;",
        # The trailing comma on the next line matters: without it Python
        # concatenates this string with the following one and the list
        # ends up one entry short.
        "SELECT T1.`Customer Names`, SUM(T2.`Order Quantity`) AS TotalQuantity FROM customers AS T1 JOIN sales_order AS T2 ON T1.`Customer Index` = T2.`Customer Name Index` GROUP BY T1.`Customer Names` ORDER BY TotalQuantity ASC LIMIT 3",
        "SELECT count(DISTINCT `State`) FROM state_regions",
        "SELECT `Customer Names` FROM customers WHERE `Customer Index` = 1"
    ],
    'ground_truth_answer': [
        "1356976.996",
        [('Product 26',), ('Product 25',), ('Product 13',), ('Product 14',), ('Product 5',)],
        [('Amerisourc Corp', '2022'), ('Mycone Ltd', '2208'), ('Voonyx Group', '2226')],
        "48",
        "Geiss Company"
    ],
    'contexts': [  # Database schema information
        ["Table: `2017_budgets` (`Product Name`, `2017 Budgets`)"],
        ["Table: `sales_order` (`Line Total`, `Product Description Index`)", "Table: `products` (`Index`, `Product Name`)"],
        ["Table: `customers` (`Customer Index`, `Customer Names`)", "Table: `sales_order` (`Order Quantity`, `Customer Name Index`)"],
        # NOTE(review): the reference SQL for this question reads `State` from
        # `state_regions`, but this context names `regions` - confirm against
        # the real schema.
        ["Table: `regions` (`name`, `state`)"],
        ["Table: `customers` (`Customer Index`, `Customer Names`)"]
    ]
}

# Guard against silently misaligned test cases (e.g. a missing comma).
_n_questions = len(test_data['question'])
_misaligned = {key: len(values) for key, values in test_data.items() if len(values) != _n_questions}
assert not _misaligned, f"test_data lists differ in length from 'question': {_misaligned}"

print("Test data loaded!")
print(f"Number of test questions: {len(test_data['question'])}")

Test data loaded!
Number of test questions: 5


### Generate predictions for our test questions

In [4]:
# Run each test question through the text-to-SQL pipeline and collect both
# the generated query and the stringified execution result.
generated_queries, generated_answers = [], []
total_questions = len(test_data['question'])

print("Generating predictions...\n")

for i, question in enumerate(test_data['question'], start=1):
    print(f"Processing question {i}/{total_questions}: {question}")

    # Step 1: natural language -> SQL
    sql_query = get_sql_query_from_question(question)
    generated_queries.append(sql_query)
    print(f"Generated SQL: {sql_query}")

    # Step 2: SQL -> result; stored as text, which is also how an
    # "Error: ..." string from execute_sql_query ends up in the answers.
    result = execute_sql_query(sql_query)
    generated_answers.append(str(result))
    print(f"Generated Answer: {result}\n")

    # Pause to avoid rate limits
    time.sleep(10)

print("Prediction generation complete!")

Generating predictions...

Processing question 1/5: What was the budget of Product 12?
Generated SQL: SELECT `2017 Budgets` FROM `2017_budgets` WHERE `Product Name` = 'Product 12';
Generated Answer: [(1356976.996,)]

Processing question 2/5: What are the names of top 5 selling products?
Generated SQL: SELECT p.`Product Name` FROM sales_order so JOIN products p ON so.`Product Description Index` = p.`Index` GROUP BY p.`Product Name` ORDER BY SUM(so.`Line Total`) DESC LIMIT 5
Generated Answer: [('Product 26',), ('Product 25',), ('Product 13',), ('Product 14',), ('Product 5',)]

Processing question 3/5: List 3 customer names and order quantity with the lowest Order Quantity.
Generated SQL: SELECT T1.`Customer Names`, T2.`Order Quantity` FROM customers AS T1 JOIN sales_order AS T2 ON T1.`Customer Index` = T2.`Customer Name Index` ORDER BY T2.`Order Quantity` ASC LIMIT 3
Generated Answer: [('Ascend Ltd', 5), ('Ascend Ltd', 5), ('Skidoo Company', 5)]

Processing question 4/5: Find the total n

### Prepare Dataset for RAGAs

In [5]:
# Assemble the evaluation columns. Ground-truth answers are a mix of strings
# and lists of tuples, so each one is converted to its string form first.
ground_truth_texts = list(map(str, test_data['ground_truth_answer']))

ragas_data = dict(
    question=test_data['question'],
    contexts=test_data['contexts'],
    answer=generated_answers,
    ground_truth=ground_truth_texts,
)

# Wrap the columns in a HuggingFace Dataset
ragas_dataset = Dataset.from_dict(ragas_data)

print("RAGAS dataset created!")
print("\nDataset structure:")
print(ragas_dataset)
print("\nFirst example:")
print(ragas_dataset[0])

RAGAS dataset created!

Dataset structure:
Dataset({
    features: ['question', 'contexts', 'answer', 'ground_truth'],
    num_rows: 5
})

First example:
{'question': 'What was the budget of Product 12?', 'contexts': ['Table: `2017_budgets` (`Product Name`, `2017 Budgets`)'], 'answer': '[(1356976.996,)]', 'ground_truth': '1356976.996'}


### Initialize Evaluator LLM and Embeddings model

In [6]:
# Judge LLM for the evaluation (Claude via ChatAnthropic), temperature 0.
# "claude-3-5-sonnet-20241022" was the alternative model considered here.
evaluator_llm = ChatAnthropic(
    model="claude-3-5-haiku-latest",
    temperature=0,
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
)

print("Evaluator LLM (Claude) initialized!")


# HuggingFace sentence-embedding model for the evaluation: a lighter model,
# cached under ./models, with normalized embeddings.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="./models",
    encode_kwargs={"normalize_embeddings": True},
)

print("Evaluator Embeddings model initialized!")

Evaluator LLM (Claude) initialized!
Evaluator Embeddings model initialized!


### Run RAGAs Evaluation

In [8]:
from ragas.run_config import RunConfig


# Conservative execution settings: reduce parallelism, lengthen timeouts and
# allow retries.
run_config = RunConfig(
    timeout=240,        # seconds per LLM call
    max_retries=2,      # number of retry attempts on transient errors
    max_wait=120,       # upper bound (seconds) on the wait between retries
    max_workers=1,      # limit parallel LLM requests
    log_tenacity=True,  # log retry attempts
)

In [15]:
# Define metrics to evaluate
metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
    answer_similarity
]

# Seconds to wait between two metric runs.
PAUSE_BETWEEN_METRICS_S = 300

# Mean score per metric name, kept so the results remain available after the
# loop instead of being overwritten on every iteration.
metric_scores = {}

print("Starting RAGAS evaluation...")
print(f"Metrics to evaluate: {[m.name for m in metrics]}\n")

# Metrics are evaluated one at a time, with a pause in between.
for position, metric in enumerate(metrics, start=1):
    print(f"Preparing to evaluate metric: {metric.name}")
    # Run evaluation
    results = evaluate(
        dataset=ragas_dataset,
        metrics=[metric],
        llm=evaluator_llm,
        embeddings=embeddings_model,
        run_config=run_config,
        raise_exceptions=False
    )

    results_df = results.to_pandas()
    per_sample_scores = results_df[metric.name]
    # With raise_exceptions=False a failed job has no score (NaN), and
    # .mean() skips NaN silently - so count the gaps and report them.
    n_failed = int(per_sample_scores.isna().sum())
    score = per_sample_scores.mean()
    metric_scores[metric.name] = score

    print("=" * 60)
    print(f"\nOverall results for {metric.name}:")
    print(f"{metric.name:25s}: {score:.4f}")
    if n_failed:
        print(
            f"WARNING: {n_failed}/{len(per_sample_scores)} samples have no score "
            f"for {metric.name}; the mean above covers the remaining samples only."
        )

    # Pause between evaluations; nothing follows the last one, so skip it there.
    if position < len(metrics):
        time.sleep(PAUSE_BETWEEN_METRICS_S)

print("=" * 60)
print("\n Evaluation complete!")

Starting RAGAS evaluation...
Metrics to evaluate: ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall', 'answer_correctness', 'answer_similarity']

Preparing to evaluate metric: faithfulness


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]


Overall results for faithfulness:
faithfulness             : 0.0000
Preparing to evaluate metric: answer_relevancy


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

Exception raised in Job[2]: RateLimitError(Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (5869db6b-d040-4b1e-a7da-994c139b3d65) of 5 requests per minute. For details, refer to: https://docs.claude.com/en/api/rate-limits. You can see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}, 'request_id': 'req_011CUbyP8CoJK4wLjk9aY1tK'})
Exception raised in Job[3]: RateLimitError(Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (5869db6b-d040-4b1e-a7da-994c139b3d65) of 5 requests per minute. For details, refer to: https://docs.claude.com/en/api/rate-limits. You can see the response headers for current


Overall results for answer_relevancy:
answer_relevancy         : 0.4976
Preparing to evaluate metric: context_precision


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]


Overall results for context_precision:
context_precision        : 0.2000
Preparing to evaluate metric: context_recall


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]


Overall results for context_recall:
context_recall           : 0.2000
Preparing to evaluate metric: answer_correctness


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]


Overall results for answer_correctness:
answer_correctness       : 0.7729
Preparing to evaluate metric: answer_similarity


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]


Overall results for answer_similarity:
answer_similarity        : 0.6918

 Evaluation complete!
