# Setting up Environment

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into os.environ.
load_dotenv()

# Both API keys must be present. Copying os.getenv(...) back into os.environ is a
# no-op when the key exists and raises an opaque "TypeError: str expected, not
# NoneType" when it does not, so check explicitly and fail with a clear message.
_missing_keys = [
    key for key in ("LANGCHAIN_API_KEY", "GROQ_API_KEY") if os.getenv(key) is None
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in your shell or in a .env file before running this notebook."
    )

# Process-wide settings read by the libraries used later in the notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["USER_AGENT"] = "myagent"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain_groq import ChatGroq

# Chat model (via langchain_groq) used by the chains defined further down.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
)

# Loading Documents

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader

file_paths = [
    "../dataset/docs/prometheusDocs.pdf",
    "../dataset/docs/promqlCheatSheet.pdf",
    "../dataset/docs/prometheusBetterstack.pdf",
]

# Load every PDF in order and flatten the per-file results into a single list.
pdf_documents = [
    page
    for pdf_path in file_paths
    for page in PyMuPDFLoader(pdf_path).load()
]

# Report how many documents (pages) were collected in total.
print(f"Loaded {len(pdf_documents)} pages across all PDFs")

Loaded 122 pages across all PDFs


In [4]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Load the metric-name CSV through CSVLoader.
metric_csv_path = "../dataset/metric_name.csv"
csv_loader = CSVLoader(file_path=metric_csv_path)
csv_documents = csv_loader.load()

# Chunking

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# One splitter configuration shared by both corpora.
splitter_options = dict(chunk_size=1000, chunk_overlap=200, add_start_index=True)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

pdf_chunks = text_splitter.split_documents(pdf_documents)
csv_chunks = text_splitter.split_documents(csv_documents)

# Chunk counts: PDFs first, then the CSV, one per line.
print(len(pdf_chunks), len(csv_chunks), sep="\n")
# print(pdf_chunks[4].page_content)

216
68


# Embedding and Storing

In [6]:
from langchain_milvus import Milvus
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-m3"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}

# HuggingFaceBgeEmbeddings prepends an English retrieval instruction to every
# query by default (it targets the older bge-*-en models). bge-m3 is meant to be
# used without a query instruction, so disable it; otherwise query embeddings are
# computed on "<instruction> + question" and drift away from the document vectors.
embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="",
)

# Local Milvus database file shared by both collections.
URI = "./milvus_promql.db"


def _build_collection(chunks, collection_name):
    """Embed `chunks` and store them in the Milvus collection `collection_name`.

    Any existing collection with the same name is dropped first (drop_old=True),
    so re-running the cell rebuilds the index from scratch.

    Returns the resulting Milvus vector store.
    """
    return Milvus.from_documents(
        documents=chunks,
        embedding=embedding,
        collection_name=collection_name,
        connection_args={"uri": URI},
        drop_old=True,
    )


pdf_vectorstore = _build_collection(pdf_chunks, "prometheus_pdf_docs")

csv_vectorstore = _build_collection(csv_chunks, "prometheus_csv_docs")

# Context Retrieval

In [7]:
# Both retrievers return the same number of hits per query.
top_k = 4

pdf_retriever = pdf_vectorstore.as_retriever(search_kwargs=dict(k=top_k))

csv_retriever = csv_vectorstore.as_retriever(search_kwargs=dict(k=top_k))

# retrieved_docs = csv_retriever.invoke("Give me the total number of active connections for pods where the memory usage has been above 90% for more than 15 minutes.")
# for doc in retrieved_docs:
#     print(doc.page_content)

# Prompt Template

In [8]:
from langchain.prompts import PromptTemplate

# Prompt text for PromQL generation. It has three placeholders:
#   {csv_context} - text inserted under "Metric Descriptions"
#   {pdf_context} - text inserted under "Relevant Documentation"
#   {query}       - the natural-language request inserted under "### Input"
# The string is sent to the model as-is, so whitespace and wording are significant.
template = """
You are an AI assistant that generates PromQL queries from natural language descriptions.
You have access to the following context, which includes:
- Metric Descriptions: A list of available Prometheus metrics with their names and descriptions to help you identify the correct metrics for the user's query.
- Relevant Documentation: Excerpts from Prometheus documentation providing detailed information about labels, PromQL functions, and syntax to support query formulation.

### Context:
Metric Descriptions:
{csv_context}

Relevant Documentation:  
{pdf_context}

### Instructions:
1. Analyze the natural language input to identify key metrics, time ranges, and functions.
2. Use Prometheus functions and operators where appropriate.
3. Ensure that the syntax is correct for PromQL and provides the required information.
4. Keep the answer concise, focusing only on the final PromQL query.

### Input:
{query}

### Example:
**Input**: "What is the average CPU usage over the past 5 minutes?"

**Output**: `avg(rate(cpu_usage[5m]))`

### Output:
"""

# Wrap the raw string in a PromptTemplate so it can be composed with other runnables.
custom_rag_prompt = PromptTemplate.from_template(template)

# RAG Chain

In [9]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the `page_content` of each document into one string.

    Documents are kept in their incoming order and separated by a blank line;
    an empty iterable yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# The incoming question is sent to both retrievers (their hits are flattened to
# text by format_docs) and is also passed through unchanged as {query}.
prompt_inputs = {
    "csv_context": csv_retriever | format_docs,
    "pdf_context": pdf_retriever | format_docs,
    "query": RunnablePassthrough(),
}

# inputs -> filled prompt -> LLM -> plain string
rag_chain = prompt_inputs | custom_rag_prompt | llm | StrOutputParser()

rag_chain.invoke("Total number of requests to the '/checkout' service in the last 10 minutes")

'`sum(rate(http_requests{path="/checkout"}[10m]))`'

# Output

In [10]:
from langchain.prompts import PromptTemplate

# Second-pass prompt: asks the model to return only the PromQL query from a raw answer.
promql_extraction_template = """
From the input provided, extract and return only the final PromQL query. Do not include any additional text or formatting.

input: {query}
"""

promql_extraction_prompt = PromptTemplate.from_template(promql_extraction_template)

# The value given to invoke() is forwarded unchanged as {query}.
extraction_inputs = {"query": RunnablePassthrough()}
promql_extraction_chain = extraction_inputs | promql_extraction_prompt | llm | StrOutputParser()

In [11]:
import pandas as pd
from tqdm import tqdm

TEST_QUERIES_PATH = "../evaluation/test_queries.csv"
OUTPUT_COLUMN = 'rag_v8_output'
ERROR_MARKER = "ERROR"


def _is_pending(value):
    """Return True when a stored output still needs to be generated.

    A cell is pending when it is missing (NaN/None), blank, or holds the error
    marker written by a previous failed attempt. Treating the marker as pending
    lets a re-run retry rows that failed (e.g. on a transient API error) instead
    of keeping "ERROR" as their final answer.
    """
    if pd.isna(value):
        return True
    text = str(value).strip()  # str() guards against non-string cells read back from CSV
    return not text or text == ERROR_MARKER


df = pd.read_csv(TEST_QUERIES_PATH)

if OUTPUT_COLUMN not in df.columns:
    df[OUTPUT_COLUMN] = ''
# An entirely empty column is read back as float NaN; use object dtype so string
# results can be stored without incompatible-dtype warnings.
df[OUTPUT_COLUMN] = df[OUTPUT_COLUMN].astype(object)

batch_size = 20
num_batches = (len(df) + batch_size - 1) // batch_size  # ceiling division

for batch_num in range(num_batches):
    start_idx = batch_num * batch_size
    end_idx = min(start_idx + batch_size, len(df))

    for idx in tqdm(range(start_idx, end_idx), desc=f"Processing Batch {batch_num + 1}/{num_batches}"):
        # Resume support: keep answers produced by an earlier run.
        if not _is_pending(df.loc[idx, OUTPUT_COLUMN]):
            continue

        query = df.loc[idx, 'nl_query']

        try:
            rag_v8_output = rag_chain.invoke(query)
            rag_v8_final_output = promql_extraction_chain.invoke(rag_v8_output)
        except Exception as e:
            # Record the failure but keep going; report the cause instead of hiding it.
            tqdm.write(f"Row {idx} failed: {type(e).__name__}: {e}")
            rag_v8_final_output = ERROR_MARKER

        df.loc[idx, OUTPUT_COLUMN] = rag_v8_final_output

    # Save progress after each batch so an interrupted run can be resumed.
    df.to_csv(TEST_QUERIES_PATH, index=False)

print("Processing complete!")

Processing Batch 1/10: 100%|██████████| 20/20 [00:00<00:00, 10361.42it/s]
Processing Batch 2/10: 100%|██████████| 20/20 [00:00<00:00, 68478.43it/s]
Processing Batch 3/10: 100%|██████████| 20/20 [00:00<00:00, 31057.42it/s]
Processing Batch 4/10: 100%|██████████| 20/20 [00:00<00:00, 27068.76it/s]
Processing Batch 5/10: 100%|██████████| 20/20 [00:21<00:00,  1.06s/it]
Processing Batch 6/10: 100%|██████████| 20/20 [03:53<00:00, 11.68s/it]
Processing Batch 7/10: 100%|██████████| 20/20 [04:23<00:00, 13.16s/it]
Processing Batch 8/10: 100%|██████████| 20/20 [04:00<00:00, 12.01s/it]
Processing Batch 9/10: 100%|██████████| 20/20 [02:58<00:00,  8.95s/it]
Processing Batch 10/10: 100%|██████████| 6/6 [00:01<00:00,  3.29it/s]

Processing complete!



