In [1]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Both API keys are expected to come from the environment / .env file.
# Copying ``os.getenv(...)`` straight into ``os.environ`` raises an opaque
# ``TypeError: str expected, not NoneType`` when a key is absent (and is a
# no-op when it is present), so check explicitly and name what is missing.
_missing_keys = [
    name for name in ("LANGCHAIN_API_KEY", "GROQ_API_KEY") if os.getenv(name) is None
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the environment or in a .env file."
    )

# Runtime flags for downstream libraries; they are set before the LangChain
# import below so they are already in place when that package loads.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["USER_AGENT"] = "myagent"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain_groq import ChatGroq

# Chat model shared by every chain defined later in the notebook.
llm = ChatGroq(model="llama-3.3-70b-versatile")

In [2]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate

# Prompt text for the syntax-fixing step. `{query}` is the only placeholder and
# receives the PromQL string to check. The model is told to return a valid
# query unchanged, otherwise the corrected query, and nothing else; the two
# examples show one unchanged query and one corrected query.
fix_syntax_template = """
You are a PromQL syntax expert. Analyze the following PromQL query:

{query}

The query might contain syntax errors, including mismatches between scalars and vectors 
(for example, using a function that expects a scalar but receiving a vector, or vice versa),
or operator/function usage issues. If the query is syntactically correct, return it unchanged.
Otherwise, fix all syntax errors and return only the corrected query.

Examples:
- Input: "sum_over_time(active_users[5m])"
  Output: "sum_over_time(active_users[5m])"
- Input: "sum(active_users[5m])"
  Output: "sum_over_time(active_users[5m])"

Output only the final query.
"""

fix_syntax_prompt = PromptTemplate.from_template(fix_syntax_template)

# Build the pipeline one stage at a time, in execution order:
#   1. wrap the raw input string as the prompt's `query` variable,
#   2. render the prompt,
#   3. send it to the chat model,
#   4. reduce the model's reply to a plain string.
_query_as_prompt_vars = {"query": RunnablePassthrough()}
_rendered_prompt = _query_as_prompt_vars | fix_syntax_prompt
_model_reply = _rendered_prompt | llm
fix_syntax_chain = _model_reply | StrOutputParser()


In [3]:
# Quick manual check of the chain: an aggregation applied to a `[5m]` range
# selector, the same shape of input as the second example in the prompt.
sample_query = "avg(db_query_response_time[5m])"
fix_syntax_chain.invoke(sample_query)

'avg_over_time(db_query_response_time[5m])'

In [None]:
import pandas as pd
from tqdm import tqdm

# Run every query in the evaluation CSV through `fix_syntax_chain` and store the
# result in a new column. The CSV is rewritten after each batch, so the cell can
# be re-run after an interruption and will pick up where it left off.
CSV_PATH = "../evaluation/test_queries.csv"
INPUT_COL = 'rag_v8_output'    # column holding the queries to check
OUTPUT_COL = 'rag_v8a_output'  # column receiving the syntax-fixed queries
ERROR_MARKER = "ERROR"         # sentinel stored when the chain call fails

df = pd.read_csv(CSV_PATH)

if OUTPUT_COL not in df.columns:
    df[OUTPUT_COL] = [''] * len(df)
# An output column that is entirely empty in the CSV is loaded as float NaN;
# cast to object so string results can be stored without a dtype upcast.
df[OUTPUT_COL] = df[OUTPUT_COL].astype(object)


def _already_processed(value):
    """Return True when ``value`` is a usable result from an earlier run.

    Missing values, blank strings and the ``ERROR_MARKER`` sentinel all count
    as "not processed", so rows that failed previously are retried on re-run
    instead of being skipped forever.
    """
    if pd.isna(value):
        return False
    text = str(value).strip()
    return bool(text) and text != ERROR_MARKER


batch_size = 20
num_batches = (len(df) + batch_size - 1) // batch_size  # ceiling division

for batch_num in range(num_batches):
    start_idx = batch_num * batch_size
    end_idx = min(start_idx + batch_size, len(df))

    # Row labels are used as positions; this relies on the default integer
    # index that `pd.read_csv` produces.
    for idx in tqdm(range(start_idx, end_idx), desc=f"Processing Batch {batch_num + 1}/{num_batches}"):
        if _already_processed(df.at[idx, OUTPUT_COL]):
            continue

        try:
            fixed_output = fix_syntax_chain.invoke(df.at[idx, INPUT_COL])
        except Exception as exc:
            # Best effort: one failing row must not abort the whole run, but
            # report it (via tqdm so the progress bar is not garbled) and
            # store the sentinel so the row is retried next time.
            tqdm.write(f"Row {idx}: fix_syntax_chain failed: {exc!r}")
            fixed_output = ERROR_MARKER

        # Write immediately so results gathered so far survive an interruption
        # in the middle of a batch.
        df.at[idx, OUTPUT_COL] = fixed_output

    # Save progress after each batch
    df.to_csv(CSV_PATH, index=False)

print("Processing complete!")


Processing Batch 1/10: 100%|██████████| 20/20 [00:00<00:00, 30045.16it/s]
Processing Batch 2/10: 100%|██████████| 20/20 [00:00<00:00, 30164.00it/s]
Processing Batch 3/10: 100%|██████████| 20/20 [00:00<00:00, 38818.18it/s]
Processing Batch 4/10: 100%|██████████| 20/20 [00:39<00:00,  2.00s/it]
Processing Batch 5/10: 100%|██████████| 20/20 [00:46<00:00,  2.32s/it]
Processing Batch 6/10: 100%|██████████| 20/20 [00:45<00:00,  2.28s/it]
Processing Batch 7/10: 100%|██████████| 20/20 [00:46<00:00,  2.31s/it]
Processing Batch 8/10: 100%|██████████| 20/20 [00:47<00:00,  2.35s/it]
Processing Batch 9/10: 100%|██████████| 20/20 [00:45<00:00,  2.29s/it]
Processing Batch 10/10: 100%|██████████| 6/6 [00:13<00:00,  2.30s/it]

Processing complete!



