In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_TOKEN")

from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,  # Ensures deterministic results
    max_tokens=512,  # Keeps responses concise but informative
    timeout=20,  # Avoids long hangs
    max_retries=3,  # Handles transient failures
    verbose=True  # Enables debugging info
)

In [None]:
import pandas as pd

df = pd.read_csv("test_queries.csv")

nl_queries = df["nl_query"].tolist()
expected_outputs = df["expected_output"].tolist()
rag_v1_outputs = df["rag_v1_output"].tolist()
rag_v2_outputs = df["rag_v2_output"].tolist()
rag_v3_outputs = df["rag_v3_output"].tolist()

In [3]:
from langchain.prompts import PromptTemplate

promql_evaluation_template = """
You are an expert in PromQL and query evaluation. Your task is to evaluate the correctness of a model-generated query compared to the expected output based on a natural language query. Provide detailed feedback and assign a score (on a scale of 0 to 10) based on the following criteria:

### Scoring Criteria:
1. **Semantic Equivalence** (0.4 points):
   - Is the model output semantically equivalent to the expected output? Evaluate how closely it matches in terms of functionality.

2. **Correctness** (0.3 points):
   - Are all key components of the query (e.g., aggregation functions, filters, time ranges) accurate and relevant to the natural language query?

3. **Optimization** (0.2 points):
   - Is the output efficient and aligned with best practices for PromQL queries?

4. **Overall Clarity** (0.1 point):
   - Is the query clean and easy to understand?

### Inputs:
- **NL Query**: {nl_query}  
- **Expected Output**: {expected_output}  
- **Model Output**: {rag_output}  

### Example Response Format:
1. **Semantic Equivalence**:
   - Score: X/0.4
   - Explanation: [Explanation of the alignment]

2. **Correctness**:
   - Score: X/0.3
   - Explanation: [Analysis of key components]

3. **Optimization**:
   - Score: X/0.2
   - Explanation: [Improvement suggestions]

4. **Overall Clarity**:
   - Score: X/0.1
   - Explanation: [Comment on query clarity]

5. **Total Score**:
   - X (in decimal format, e.g., 0.7)

Evaluate and provide the scores with explanations.
"""

promql_evaluation_prompt = PromptTemplate.from_template(promql_evaluation_template)

In [4]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

evaluation_chain = (
    {
        "nl_query": RunnablePassthrough(),
        "expected_output": RunnablePassthrough(),
        "rag_output": RunnablePassthrough(),
    }
    | promql_evaluation_prompt
    | llm
    | StrOutputParser()
)

In [5]:
from langchain.prompts import PromptTemplate

score_extraction_template = """
From the input provided, extract and return only the total score as a decimal number. Do not include any additional text, symbols, or formatting, just the score in float form.

input: {query}
"""

score_extraction_prompt = PromptTemplate.from_template(score_extraction_template)

score_extraction_chain = (
    {"query": RunnablePassthrough()}
    | score_extraction_prompt
    | llm
    | StrOutputParser()
)

In [7]:
import time

rag_v1_scores = []
batch_size = 10  # Reduce batch size to avoid API pressure
delay_per_request = 5  # Add a small delay per request to avoid spikes

for i in range(0, len(nl_queries), batch_size):
    batch_queries = nl_queries[i : i + batch_size]
    batch_expected_outputs = expected_outputs[i : i + batch_size]
    batch_rag_outputs = rag_v1_outputs[i : i + batch_size]

    for j in range(len(batch_queries)):
        nl_query = batch_queries[j]
        expected_output = batch_expected_outputs[j]
        rag_output = batch_rag_outputs[j]

        inputs = {
            "nl_query": nl_query,
            "expected_output": expected_output,
            "model_output": rag_output,
        }

        try:
            result = evaluation_chain.invoke(inputs)
            score = score_extraction_chain.invoke(result)
            rag_v1_scores.append(float(score))
        except Exception as e:
            print(f"Error processing query {i + j + 1}: {e}")
            rag_v1_scores.append(None)

        time.sleep(delay_per_request)  # Short delay between requests to avoid 429 errors

    print(f"Completed batch {i // batch_size + 1}, sleeping for 60 seconds...")
    time.sleep(60)  # Ensure we stay under the 15 RPM limit

print(rag_v1_scores)

Completed batch 1, sleeping for 60 seconds...
Completed batch 2, sleeping for 60 seconds...
Completed batch 3, sleeping for 60 seconds...
Completed batch 4, sleeping for 60 seconds...
Completed batch 5, sleeping for 60 seconds...
Error processing query 51: could not convert string to float: '0.  1'
Completed batch 6, sleeping for 60 seconds...
Error processing query 63: could not convert string to float: '1.  0'
Completed batch 7, sleeping for 60 seconds...
Completed batch 8, sleeping for 60 seconds...
Completed batch 9, sleeping for 60 seconds...
Error processing query 92: could not convert string to float: '0.  0'
Error processing query 93: could not convert string to float: '0.  0'
Completed batch 10, sleeping for 60 seconds...
Error processing query 102: could not convert string to float: '0.  0'
Completed batch 11, sleeping for 60 seconds...
Completed batch 12, sleeping for 60 seconds...
Error processing query 122: could not convert string to float: '0.  0'
Completed batch 13, sle

In [13]:
valid_scores = [score for score in rag_v1_scores if score is not None]  # Remove None values
average_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0  # Avoid division by zero

print(f"📊 Average Score for rag_v1_output: {average_score:.4f}")

📊 Average Score for rag_v1_output: 0.4724


In [14]:
rag_v2_scores = []
batch_size = 10  # Reduce batch size to avoid API pressure
delay_per_request = 5  # Add a small delay per request to avoid spikes

for i in range(0, len(nl_queries), batch_size):
    batch_queries = nl_queries[i : i + batch_size]
    batch_expected_outputs = expected_outputs[i : i + batch_size]
    batch_rag_outputs = rag_v2_outputs[i : i + batch_size]

    for j in range(len(batch_queries)):
        nl_query = batch_queries[j]
        expected_output = batch_expected_outputs[j]
        rag_output = batch_rag_outputs[j]

        inputs = {
            "nl_query": nl_query,
            "expected_output": expected_output,
            "model_output": rag_output,
        }

        try:
            result = evaluation_chain.invoke(inputs)
            score = score_extraction_chain.invoke(result)
            rag_v2_scores.append(float(score))
        except Exception as e:
            print(f"Error processing query {i + j + 1}: {e}")
            rag_v2_scores.append(None)

        time.sleep(delay_per_request)  # Short delay between requests to avoid 429 errors

    print(f"Completed batch {i // batch_size + 1}, sleeping for 60 seconds...")
    time.sleep(60)  # Ensure we stay under the 15 RPM limit

print(rag_v2_scores)

Completed batch 1, sleeping for 60 seconds...
Error processing query 12: could not convert string to float: '1.  0'
Completed batch 2, sleeping for 60 seconds...
Completed batch 3, sleeping for 60 seconds...
Completed batch 4, sleeping for 60 seconds...
Completed batch 5, sleeping for 60 seconds...
Completed batch 6, sleeping for 60 seconds...
Error processing query 63: could not convert string to float: '1.  0'
Completed batch 7, sleeping for 60 seconds...
Completed batch 8, sleeping for 60 seconds...
Error processing query 81: could not convert string to float: '0.  3'
Completed batch 9, sleeping for 60 seconds...
Completed batch 10, sleeping for 60 seconds...
Completed batch 11, sleeping for 60 seconds...
Error processing query 112: could not convert string to float: '1.  0'
Completed batch 12, sleeping for 60 seconds...
Completed batch 13, sleeping for 60 seconds...
Completed batch 14, sleeping for 60 seconds...
Error processing query 142: could not convert string to float: '0.  0'

In [15]:
valid_scores = [score for score in rag_v2_scores if score is not None]  # Remove None values
average_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0  # Avoid division by zero

print(f"📊 Average Score for rag_v2_output: {average_score:.4f}")

📊 Average Score for rag_v2_output: 0.4772


In [16]:
rag_v3_scores = []
batch_size = 10  # Reduce batch size to avoid API pressure
delay_per_request = 5  # Add a small delay per request to avoid spikes

for i in range(0, len(nl_queries), batch_size):
    batch_queries = nl_queries[i : i + batch_size]
    batch_expected_outputs = expected_outputs[i : i + batch_size]
    batch_rag_outputs = rag_v3_outputs[i : i + batch_size]

    for j in range(len(batch_queries)):
        nl_query = batch_queries[j]
        expected_output = batch_expected_outputs[j]
        rag_output = batch_rag_outputs[j]

        inputs = {
            "nl_query": nl_query,
            "expected_output": expected_output,
            "model_output": rag_output,
        }

        try:
            result = evaluation_chain.invoke(inputs)
            score = score_extraction_chain.invoke(result)
            rag_v3_scores.append(float(score))
        except Exception as e:
            print(f"Error processing query {i + j + 1}: {e}")
            rag_v3_scores.append(None)

        time.sleep(delay_per_request)  # Short delay between requests to avoid 429 errors

    print(f"Completed batch {i // batch_size + 1}, sleeping for 60 seconds...")
    time.sleep(60)  # Ensure we stay under the 15 RPM limit

print(rag_v3_scores)

Completed batch 1, sleeping for 60 seconds...
Completed batch 2, sleeping for 60 seconds...
Completed batch 3, sleeping for 60 seconds...
Completed batch 4, sleeping for 60 seconds...
Completed batch 5, sleeping for 60 seconds...
Completed batch 6, sleeping for 60 seconds...
Error processing query 63: could not convert string to float: '1.  0'
Completed batch 7, sleeping for 60 seconds...
Completed batch 8, sleeping for 60 seconds...
Error processing query 89: could not convert string to float: '0.  0'
Completed batch 9, sleeping for 60 seconds...
Completed batch 10, sleeping for 60 seconds...
Completed batch 11, sleeping for 60 seconds...
Completed batch 12, sleeping for 60 seconds...
Completed batch 13, sleeping for 60 seconds...
Completed batch 14, sleeping for 60 seconds...
Completed batch 15, sleeping for 60 seconds...
Completed batch 16, sleeping for 60 seconds...
Completed batch 17, sleeping for 60 seconds...
Completed batch 18, sleeping for 60 seconds...
Error processing query 

In [17]:
valid_scores = [score for score in rag_v3_scores if score is not None]  # Remove None values
average_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0  # Avoid division by zero

print(f"📊 Average Score for rag_v2_output: {average_score:.4f}")

📊 Average Score for rag_v2_output: 0.4611


In [18]:
import pandas as pd

rag_v1_valid_scores = [score for score in rag_v1_scores if score is not None]
rag_v2_valid_scores = [score for score in rag_v2_scores if score is not None]
rag_v3_valid_scores = [score for score in rag_v3_scores if score is not None]


# Calculate the average score for each RAG version
rag_v1_avg = sum(rag_v1_valid_scores) / len(rag_v1_valid_scores)
rag_v2_avg = sum(rag_v2_valid_scores) / len(rag_v2_valid_scores)
rag_v3_avg = sum(rag_v3_valid_scores) / len(rag_v3_valid_scores)

# Create a DataFrame to display the averages as a table
data = {
    'RAG Version': ['rag_v1', 'rag_v2', 'rag_v3'],
    'Average Score': [rag_v1_avg, rag_v2_avg, rag_v3_avg]
}

# Create the DataFrame and display it
df = pd.DataFrame(data)
print(df)

  RAG Version  Average Score
0      rag_v1       0.472402
1      rag_v2       0.477238
2      rag_v3       0.461093
