# Setting up Environment

In [1]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Both API keys are required. Check them up front so that a missing key gives
# a readable error instead of the "str expected, not NoneType" TypeError that
# `os.environ[name] = os.getenv(name)` raises when the variable is unset.
# (Re-assigning a variable to its own value is otherwise a no-op.)
_missing_keys = [
    name
    for name in ("LANGCHAIN_API_KEY", "GROQ_API_KEY")
    if not os.environ.get(name)
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the shell or in a .env file before running this notebook."
    )

# Fixed settings for this notebook run.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["USER_AGENT"] = "myagent"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain_groq import ChatGroq

# Groq-hosted chat model; shared by the RAG chain and the extraction chain below.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
)

# Loading Documents

In [2]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Load the sample PromQL queries from the CSV file as LangChain documents.
csv_loader = CSVLoader(
    file_path="../dataset/sample_promql_queries.csv",
)
csv_documents = csv_loader.load()

In [3]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Pages scraped as PromQL reference material.
urls = [
    "https://prometheus.io/docs/prometheus/latest/querying/basics/",
    "https://prometheus.io/docs/prometheus/latest/querying/operators/",
    "https://prometheus.io/docs/prometheus/latest/querying/functions/",
    "https://prometheus.io/docs/prometheus/latest/querying/examples/",
    "https://prometheus.io/docs/prometheus/latest/querying/api/",
    "https://prometheus.io/docs/prometheus/latest/http_sd/",
    "https://promlabs.com/promql-cheat-sheet/",
]

# Restrict HTML parsing to sub-headings, paragraphs, lists and code snippets.
bs4_strainer = bs4.SoupStrainer(["h2", "h3", "p", "ul", "code"])

# The strainer is handed to BeautifulSoup through `bs_kwargs`.
web_loader = WebBaseLoader(web_paths=urls, bs_kwargs={"parse_only": bs4_strainer})
web_documents = web_loader.load()

In [4]:
# Normalize metadata: tag every web page with row = -1.
# NOTE(review): presumably this mirrors a "row" key that the CSV loader puts on
# its documents, so both sources expose the same metadata field - confirm.
for doc in web_documents:
    doc.metadata.update({"row": -1})

# Combine documents into one corpus: CSV rows first, then web pages.
documents = [*csv_documents, *web_documents]

# Chunking

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured for 1000-character chunks with a 200-character overlap;
# `add_start_index=True` asks the splitter to record each chunk's start offset.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)

# Chunk the combined corpus (CSV rows + web pages).
docs = text_splitter.split_documents(documents)

# print(len(docs))
# print(docs[1020].page_content)

# Embedding and Storing

In [6]:
from langchain_milvus import Milvus
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding model configuration: BGE-M3 on CPU with normalized output vectors.
model_name = "BAAI/bge-m3"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}

# HuggingFaceBgeEmbeddings prepends an English retrieval instruction to every
# query by default. That instruction belongs to the older BGE v1.5 models;
# BGE-M3 is documented as needing no query instruction, so it is disabled here
# to embed queries the same way as the stored passages.
embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="",
)

# Local database file used as the Milvus connection URI.
URI = "./milvus_promql.db"

# Embed all chunks and store them in the "prometheus_docs" collection.
# `drop_old=True` is passed so an existing collection of that name is replaced,
# which means every run re-embeds the whole corpus.
vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embedding,
    collection_name="prometheus_docs",
    connection_args={"uri": URI},
    drop_old=True,
)

model.safetensors:  32%|###1      | 724M/2.27G [00:00<?, ?B/s]

# Context Retrieval

In [7]:
# Expose the vector store as a retriever; k=4 is forwarded as a search kwarg.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 4},
)

# retrieved_docs = retriever.invoke("Get the average CPU usage over the last 30 minutes.")
# print(retrieved_docs[0])

# Prompt Template

In [8]:
from langchain.prompts import PromptTemplate

# Prompt for the RAG chain. `{context}` receives the retrieved chunks and
# `{query}` the user's natural-language request.
#
# The worked example must itself be valid PromQL, because the model imitates
# it: `avg()` is an aggregation operator over instant vectors and cannot take
# a range selector such as `[1h]`, so the example uses `avg_over_time()`.
template = """
You are an AI assistant that generates PromQL queries from natural language descriptions.
You have access to the following context, which contains relevant information about metrics, labels, and PromQL syntax.

### Context:
{context}

### Instructions:
1. **Understand the input**: Break down the user's natural language query and identify the key metrics, time ranges, and operations required (e.g., avg, sum, rate, histogram_quantile).
2. **Map to Prometheus metrics**: Identify the relevant Prometheus metrics and labels that will provide the required data (e.g., cpu_usage, http_requests_total, etc.).
3. **Formulate the query**: Using the identified metrics, generate the correct PromQL query by applying appropriate functions and operators.
4. **Explain the logic**: Briefly describe the reasoning for the chosen metrics and PromQL functions.
5. **Output the final query**: Provide the final PromQL query as output.

**Important**: Follow this structure exactly. Do not skip or alter any of the steps. The output must strictly adhere to the template, with sections clearly labeled and the PromQL query correctly formulated.

### Example:
**Input:**  
User query: "What is the average CPU usage over the last hour?"

**Output:**
1. **Identify key elements**: The user is asking for the "average CPU usage" over "the last hour".
2. **Metrics**: The metric `cpu_usage_percent` or a similar metric will be used to represent CPU usage.
3. **Time range**: The user specified "last hour", so we'll apply a time range filter of `[1h]`.
4. **PromQL Function**: We will use `avg_over_time()` to calculate the average over the time range.
5. **Final Query**:  
`avg_over_time(cpu_usage_percent[1h])`

### Input:
{query}

### Output:
"""

custom_rag_prompt = PromptTemplate.from_template(template)

# RAG Chain

In [9]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        One string with the contents joined by "\\n\\n" (empty if `docs` is empty).
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# Inputs for the prompt: "context" is the retriever output run through
# format_docs, "query" is the incoming question passed through unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "query": RunnablePassthrough(),
}

# Prompt -> LLM -> plain string.
rag_chain = rag_inputs | custom_rag_prompt | llm | StrOutputParser()

# Sample request; as the cell's last expression its result is displayed.
rag_chain.invoke("Total number of requests to the '/checkout' service in the last 10 minutes")

'1. **Identify key elements**: The user is asking for the "total number of requests" to the \'/checkout\' service over "the last 10 minutes".\n2. **Metrics**: The metric `http_requests_total` will be used to represent the total number of HTTP requests.\n3. **Labels**: We need to filter for the \'/checkout\' service, so we\'ll use the `service` label.\n4. **Time range**: The user specified "last 10 minutes", so we\'ll apply a time range filter of `[10m]`.\n5. **PromQL Function**: We will use `sum()` to calculate the total and `rate()` to count the number of requests per second.\n6. **Final Query**:  \n`sum(rate(http_requests_total{service="/checkout"}[10m]))`'

# Output generation

In [10]:
from langchain.prompts import PromptTemplate

# Second-stage prompt: `{query}` receives the full answer produced by the RAG
# chain, and the model is asked to return only the PromQL query from it.
promql_extraction_template = """
From the input provided, extract and return only the PromQL query. Do not include any additional text or formatting.

input: {query}
"""

promql_extraction_prompt = PromptTemplate.from_template(promql_extraction_template)

# Pass the incoming text through as "query", fill the prompt, call the LLM and
# return plain text.
extraction_inputs = {"query": RunnablePassthrough()}
promql_extraction_chain = (
    extraction_inputs | promql_extraction_prompt | llm | StrOutputParser()
)

import pandas as pd

# Evaluation set; the 'nl_query' column supplies the natural-language requests.
df = pd.read_csv("../evaluation/test_queries.csv")

# For each request, generate the full RAG answer and then reduce it to the
# bare PromQL query with the extraction chain. Order follows the rows of df.
rag_v2_outputs = [
    promql_extraction_chain.invoke(rag_chain.invoke(nl_query))
    for nl_query in df['nl_query']
]

df['rag_v2_output'] = rag_v2_outputs

# Write the results back to the same CSV, without the index column.
df.to_csv('../evaluation/test_queries.csv', index=False)