# Setting up Environment

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Both API keys must be present before anything else runs. Writing a missing
# key back with `os.environ[k] = os.getenv(k)` fails with an opaque
# "TypeError: str expected, not NoneType", so check explicitly and report
# exactly which variables are absent or empty.
REQUIRED_ENV_VARS = ("LANGCHAIN_API_KEY", "GROQ_API_KEY")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in your shell or in a .env file before running this notebook."
    )

# Settings this notebook always forces, whatever the environment already holds.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["USER_AGENT"] = "myagent"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain_groq import ChatGroq

# Chat model that the RAG chain further down uses to produce the answer.
llm = ChatGroq(model="llama-3.1-8b-instant")

# Loading Documents

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

# PDF sources that make up the documentation corpus.
file_paths = [
    "../dataset/docs/prometheusDocs.pdf",
    "../dataset/docs/promqlCheatSheet.pdf",
    "../dataset/docs/prometheusBetterstack.pdf",
]

# Load each PDF in the order listed and flatten everything the loaders return
# into a single list.
pdf_documents = [
    document
    for file_path in file_paths
    for document in PyMuPDFLoader(file_path).load()
]

# Report how many documents (pages) were collected in total.
print(f"Loaded {len(pdf_documents)} pages across all PDFs")

Loaded 122 pages across all PDFs


In [3]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# CSV of sample PromQL queries; it backs the "Similar Queries" prompt section.
sample_queries_csv = "../dataset/sample_promql_queries.csv"

csv_loader = CSVLoader(file_path=sample_queries_csv)
csv_documents = csv_loader.load()

# Chunking

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# One splitter is shared by both corpora: chunk size 1000 with an overlap of
# 200, and add_start_index switched on.
splitter_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

pdf_chunks = text_splitter.split_documents(pdf_documents)
csv_chunks = text_splitter.split_documents(csv_documents)

# Chunk counts, PDF first and CSV second, one per line.
print(len(pdf_chunks), len(csv_chunks), sep="\n")
# To eyeball a single chunk: print(pdf_chunks[4].page_content)

216
907


# Embedding and Storing

In [5]:
from langchain_milvus import Milvus
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding model configuration: BGE-M3 on CPU with normalized output vectors.
model_name = "BAAI/bge-m3"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}

embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    # By default this wrapper prepends an English retrieval instruction to
    # every query before embedding it. BGE-M3 is published without a query
    # instruction, so the prefix is disabled to embed queries and stored
    # chunks the same way.
    query_instruction="",
)

# Local database file that holds both collections.
URI = "./milvus_promql.db"


def _build_collection(chunks, collection_name):
    """Embed ``chunks`` and store them in the Milvus collection ``collection_name``.

    Args:
        chunks: Split documents to embed and index.
        collection_name: Name of the collection to (re)create at ``URI``.

    Returns:
        The vector store returned by ``Milvus.from_documents``.

    Note:
        ``drop_old=True`` is passed on every call, so re-running this cell
        rebuilds the collection rather than appending to an existing one.
    """
    return Milvus.from_documents(
        documents=chunks,
        embedding=embedding,
        collection_name=collection_name,
        connection_args={"uri": URI},
        drop_old=True,
    )


pdf_vectorstore = _build_collection(pdf_chunks, "prometheus_pdf_docs")
csv_vectorstore = _build_collection(csv_chunks, "prometheus_csv_docs")

# Context Retrieval

In [6]:
# Per-retriever search settings: k=4 for the PDF documentation store, k=2 for
# the CSV example-query store.
pdf_search_kwargs = {"k": 4}
csv_search_kwargs = {"k": 2}

pdf_retriever = pdf_vectorstore.as_retriever(search_kwargs=pdf_search_kwargs)
csv_retriever = csv_vectorstore.as_retriever(search_kwargs=csv_search_kwargs)

# Manual spot check (uncomment to run):
# retrieved_docs = csv_retriever.invoke("Get the average CPU usage over the last 30 minutes.")
# print(retrieved_docs[0])

# Prompt Template

In [7]:
from langchain.prompts import PromptTemplate

# Prompt for the PromQL-generation chain. It has three placeholders:
#   {csv_context} - text for the "Similar Queries" section
#   {pdf_context} - text for the "Relevant Documentation" section
#   {query}       - the user's natural-language request
# The worked example must itself be valid PromQL because the model imitates it:
# averaging one metric over a time window is done with avg_over_time(), since
# the avg() aggregation operator does not accept a range vector such as
# `metric[1h]`.
template = """
You are an AI assistant that generates PromQL queries from natural language descriptions.
You have access to the following context, which includes:
- **Similar Queries**: Examples of natural language queries paired with their corresponding PromQL queries to help you understand how to translate user requests into PromQL syntax.
- **Relevant Documentation**: Excerpts from Prometheus documentation providing detailed information about available metrics, labels, and PromQL functions to support query formulation.

### Context:
**Similar Queries:**  
{csv_context}

**Relevant Documentation:**  
{pdf_context}

### Instructions:
1. **Understand the input**: Break down the user's natural language query and identify the key metrics, time ranges, and operations required (e.g., avg, sum, rate, histogram_quantile).
2. **Map to Prometheus metrics**: Identify the relevant Prometheus metrics and labels that will provide the required data (e.g., cpu_usage, http_requests_total, etc.).
3. **Formulate the query**: Using the identified metrics, generate the correct PromQL query by applying appropriate functions and operators.
4. **Explain the logic**: Briefly describe the reasoning for the chosen metrics and PromQL functions.
5. **Output the final query**: Provide the final PromQL query as output.

### Example:
**Input:**  
User query: "What is the average CPU usage over the last hour?"

**Output:**
1. **Identify key elements**: The user is asking for the "average CPU usage" over "the last hour".
2. **Metrics**: The metric `cpu_usage_percent` or a similar metric will be used to represent CPU usage.
3. **Time range**: The user specified "last hour", so we'll apply a range selector of `[1h]`.
4. **PromQL Function**: A range vector cannot be passed to the `avg()` aggregation operator, so we will use `avg_over_time()` to average each series over the window.
5. **Final Query**:  
`avg_over_time(cpu_usage_percent[1h])`

### Input:
{query}

### Output:
"""

custom_rag_prompt = PromptTemplate.from_template(template)

# RAG Chain

In [8]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the ``page_content`` of every document in ``docs`` into one string.

    Consecutive documents are separated by a blank line; an empty input
    yields an empty string.
    """
    contents = []
    for document in docs:
        contents.append(document.page_content)
    return "\n\n".join(contents)

# Inputs for the prompt: the incoming question goes to both retrievers, whose
# results are flattened to text by format_docs, and is also passed through
# unchanged as "query".
chain_inputs = {
    "csv_context": csv_retriever | format_docs,
    "pdf_context": pdf_retriever | format_docs,
    "query": RunnablePassthrough(),
}

# inputs -> filled prompt -> chat model -> plain string
rag_chain = chain_inputs | custom_rag_prompt | llm | StrOutputParser()

rag_chain.invoke("Total number of requests to the '/checkout' service in the last 10 minutes")

'### 1. Understand the input\n\n* **Key elements**: The user is asking for the "total number of requests" to the \'/checkout\' service over "the last 10 minutes".\n* **Time range**: The user specified "last 10 minutes", so we\'ll apply a time range filter of `[10m]`.\n* **Service**: The user specified the \'/checkout\' service, which we\'ll use to filter the requests.\n\n### 2. Map to Prometheus metrics\n\n* **Metric**: The `http_requests_total` metric will be used to represent the total number of requests.\n* **Labels**: We\'ll use the `service` label to filter the requests to the \'/checkout\' service.\n\n### 3. Formulate the query\n\n* **Time range**: Apply a time range filter of `[10m]` to the `http_requests_total` metric.\n* **Filter by service**: Use the `service` label to filter the requests to the \'/checkout\' service.\n* **Aggregate**: Use the `sum()` function to calculate the total number of requests.\n\n### 4. Explain the logic\n\n* We\'re using the `sum()` function because