# Setting up Environment

In [None]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Both API keys must already be available at this point. Fail fast with a
# readable message listing every missing key, instead of the opaque TypeError
# that assigning os.getenv()'s None back into os.environ would raise.
missing_api_keys = [
    key_name
    for key_name in ("LANGCHAIN_API_KEY", "GROQ_API_KEY")
    if os.getenv(key_name) is None
]
if missing_api_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_api_keys)
        + ". Define them in the environment or in a .env file."
    )

# Fixed runtime settings for this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["USER_AGENT"] = "myagent"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain_groq import ChatGroq

# Chat model used by the chains defined later in this notebook.
GROQ_MODEL_NAME = "llama-3.1-8b-instant"
llm = ChatGroq(model=GROQ_MODEL_NAME)

# Loading Documents

In [2]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Sample PromQL queries shipped with the project, loaded as LangChain documents.
SAMPLE_QUERIES_CSV = "../dataset/sample_promql_queries.csv"
csv_loader = CSVLoader(file_path=SAMPLE_QUERIES_CSV)
csv_documents = csv_loader.load()

In [3]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Only these HTML tags are handed to BeautifulSoup when parsing each page.
kept_html_tags = ["h2", "h3", "p", "ul", "code"]
bs4_strainer = bs4.SoupStrainer(kept_html_tags)

# Pages to scrape: Prometheus documentation plus the PromLabs cheat sheet.
urls = [
    "https://prometheus.io/docs/prometheus/latest/querying/basics/",
    "https://prometheus.io/docs/prometheus/latest/querying/operators/",
    "https://prometheus.io/docs/prometheus/latest/querying/functions/",
    "https://prometheus.io/docs/prometheus/latest/querying/examples/",
    "https://prometheus.io/docs/prometheus/latest/querying/api/",
    "https://prometheus.io/docs/prometheus/latest/http_sd/",
    "https://promlabs.com/promql-cheat-sheet/",
]

web_loader = WebBaseLoader(web_paths=urls, bs_kwargs=dict(parse_only=bs4_strainer))
web_documents = web_loader.load()

In [4]:
# Stamp every web document with a placeholder "row" value of -1
# (presumably so its metadata keys line up with the CSV documents — confirm).
for web_doc in web_documents:
    web_doc.metadata.update(row=-1)

# Single corpus: CSV documents first, then the scraped web pages.
documents = [*csv_documents, *web_documents]

# Chunking

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,
)

# Split the combined corpus into the chunks that get embedded below.
docs = text_splitter.split_documents(documents)

# print(len(docs))
# print(docs[1020].page_content)

# Embedding and Storing

In [6]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformer model used to embed every chunk.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# NOTE(review): no collection name or persist directory is passed here —
# confirm whether re-running this cell in the same kernel appends duplicate chunks.
vectorstore = Chroma.from_documents(documents=docs, embedding=hf_embeddings)

# Context Retrieval

In [7]:
# Number of chunks requested from the vector store per query.
TOP_K = 4
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

# retrieved_docs = retriever.invoke("Get the average CPU usage over the last 30 minutes.")
# print(retrieved_docs[0])

# Prompt Template

In [8]:
from langchain.prompts import PromptTemplate

# Prompt text for the PromQL generation step. It has two placeholders:
# `{context}` and `{query}`; the RAG chain below supplies values under
# exactly those two keys.
# NOTE(review): any literal curly braces added to this text later (e.g. PromQL
# label selectors in an example) would presumably need escaping as `{{` / `}}`
# — confirm against PromptTemplate's template format.
template = """
You are an AI assistant that generates PromQL queries from natural language descriptions.
You have access to the following context, which contains relevant information about metrics, labels, and PromQL syntax.

### Context:
{context}

### Instructions:
1. Analyze the natural language input to identify key metrics, time ranges, and functions.
2. Use Prometheus functions and operators where appropriate.
3. Ensure that the syntax is correct for PromQL and provides the required information.
4. Keep the answer concise, focusing only on the final PromQL query.

### Input:
{query}

### Example:
**Input**: "What is the average CPU usage over the past 5 minutes?"

**Output**: `avg(rate(cpu_usage[5m]))`

### Output:
"""

# Build the prompt object that the RAG chain pipes its inputs into.
custom_rag_prompt = PromptTemplate.from_template(template)

# RAG Chain

In [9]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Merge the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in their original order, separated by one
        blank line. An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    blank_line = "\n\n"
    return blank_line.join(page_texts)

# The incoming question is used twice: it is sent through the retriever (whose
# hits are flattened by format_docs) to build "context", and it is passed
# through unchanged as "query".
rag_inputs = {
    "context": retriever | format_docs,
    "query": RunnablePassthrough(),
}

# Prompt -> LLM -> plain string.
rag_chain = rag_inputs | custom_rag_prompt | llm | StrOutputParser()

# Try the chain on a sample question; the result is the cell's displayed output.
rag_chain.invoke("Show the error rate for HTTP responses in the last 15 minutes.")

'rate(http_responses_error_total[15m])'

# Output Generation

In [10]:
from langchain.prompts import PromptTemplate

# Second-pass prompt: asks the model to return only the PromQL query found in
# the text it is given.
promql_extraction_template = """
From the input provided, extract and return only the PromQL query. Do not include any additional text or formatting.

input: {query}
"""

promql_extraction_prompt = PromptTemplate.from_template(promql_extraction_template)

# The chain's input is forwarded unchanged as the prompt's "query" variable.
extraction_inputs = {"query": RunnablePassthrough()}
promql_extraction_chain = (
    extraction_inputs | promql_extraction_prompt | llm | StrOutputParser()
)

import pandas as pd

# The evaluation CSV is read and then overwritten in place with an added
# (or replaced) 'rag_v1_output' column, so the path is defined exactly once.
EVAL_QUERIES_CSV = "../evaluation/test_queries.csv"

df = pd.read_csv(EVAL_QUERIES_CSV)

# Surface a schema problem before any LLM call is made.
if 'nl_query' not in df.columns:
    raise KeyError(f"'nl_query' column not found in {EVAL_QUERIES_CSV}")

rag_v1_outputs = []

# Two chain invocations per row: generate an answer with the RAG chain, then
# pass that answer through the extraction chain to get the bare query.
for query in df['nl_query']:
    rag_v1_output = rag_chain.invoke(query)
    final_query = promql_extraction_chain.invoke(rag_v1_output)
    rag_v1_outputs.append(final_query)

# Assigned once, after the loop; the column needs no placeholder beforehand.
df['rag_v1_output'] = rag_v1_outputs

df.to_csv(EVAL_QUERIES_CSV, index=False)