In [None]:
# Summary
# Dense retrieval (FAISS) → Finds similar meanings using embeddings.
# Sparse retrieval (BM25) → Finds keyword-based matches using term frequencies.
# Hybrid retrieval → Combines both methods for better accuracy.

In [None]:
!pip install --quiet --upgrade langchain-text-splitters langchain-community whoosh sentence-transformers pandas
!pip install langchain langchain-milvus pymilvus requests beautifulsoup4 nltk
!pip install langchain-huggingface
!pip install rouge-score
!pip install langchain_groq
!pip install rank-bm25

In [5]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [6]:
from langchain_groq import ChatGroq
import os, requests, nltk, pandas as pd
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
import faiss
import numpy as np
from difflib import SequenceMatcher
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [7]:
# Download NLTK's 'punkt' tokenizer data (presumably needed by nltk.word_tokenize,
# which preprocess() calls — confirm against the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
# Chat model used by generate_query(). No API key is passed here, so credentials
# presumably come from the environment (e.g. GROQ_API_KEY) — confirm before running.
llm = ChatGroq(model="llama-3.1-8b-instant")


In [11]:
def fetch_website_content(url, timeout=30):
    """Download a web page and return the text of its paragraph elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The text of every ``<p>`` element on the page, joined with single spaces.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx so an error page is never indexed as documentation.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))

In [12]:
# Pages whose paragraph text is fetched with fetch_website_content() and used
# as the retrieval corpus.
resources = [
    "https://prometheus.io/docs/prometheus/latest/querying/examples/",
    "https://promlabs.com/promql-cheat-sheet/",
    "https://prometheus.io/docs/prometheus/latest/querying/basics/",
    "https://prometheus.io/docs/prometheus/latest/querying/operators/",
    "https://prometheus.io/docs/prometheus/latest/querying/functions/",
    "https://prometheus.io/docs/prometheus/latest/querying/api/",
]


In [13]:
# Raw corpus: one string per fetched documentation page.
raw_documents = [fetch_website_content(url) for url in resources]

# Load CSV Dataset (ensure correct path)
csv_path = "/content/Dataset.csv - Sheet1.csv"  # Adjust path if needed
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    # Add every instruction/PromQL pair from the dataset as an extra document.
    raw_documents.extend(
        f"Query: {instruction}\nPromQL: {promql}"
        for instruction, promql in zip(df['instruction'], df['output'])
    )
else:
    # The dataset is optional, but say so instead of skipping it silently.
    print(f"Dataset not found at {csv_path}; indexing the web pages only.")

# Split the raw texts into overlapping chunks. `documents` ends up as the list
# of chunk strings that the dense and sparse indexes are built from.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.create_documents(raw_documents)
documents = [doc.page_content for doc in splits]

In [15]:
import nltk
# Also fetch the 'punkt_tab' data; presumably required by nltk.word_tokenize on
# newer NLTK releases — confirm against the installed version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [16]:
#####################################
# Dense Retrieval via Sentence Transformers and FAISS
#####################################
# Dense retrieval uses a neural network (Sentence Transformer) to generate numerical
# embeddings for queries and documents. The document embeddings are stored in a FAISS
# index, which allows fast similarity search.

dense_model = SentenceTransformer('all-MiniLM-L6-v2')
# Embed every chunk once up front; dense_retrieve() maps result positions back to `documents`.
doc_embeddings = dense_model.encode(documents, convert_to_numpy=True)
embedding_dim = doc_embeddings.shape[1]
# Build a FAISS index for dense retrieval.
# The embeddings are stored in a FAISS L2 (Euclidean distance) index for fast retrieval.

faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(doc_embeddings)

def dense_retrieve(query, top_k=3):
    """Return the chunks whose embeddings are closest to the query embedding.

    Args:
        query: Natural-language question to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` strings from ``documents``, nearest first. Fewer are
        returned when the index holds fewer than ``top_k`` vectors.
    """
    # FAISS pads the result with index -1 when it cannot find k neighbours;
    # documents[-1] would then silently return the last chunk. Cap k at the
    # index size and drop any padding that is left.
    k = min(top_k, faiss_index.ntotal)
    if k <= 0:
        return []
    query_embedding = dense_model.encode([query], convert_to_numpy=True)
    _, indices = faiss_index.search(query_embedding, k)
    return [documents[idx] for idx in indices[0] if idx >= 0]

#####################################
# Sparse Retrieval via BM25 (rank-bm25)
#####################################

# Sparse retrieval ranks documents with term-based matching (e.g., keyword frequency) instead of dense vectors.
# It uses the BM25 algorithm, a variant of TF-IDF.
def preprocess(text):
    """Lower-case ``text`` and split it into tokens with NLTK's word tokenizer."""
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    return tokens

# Tokenize each chunk once, then build the BM25 index over the token lists.
tokenized_docs = list(map(preprocess, documents))
bm25 = BM25Okapi(tokenized_docs)

def sparse_retrieve(query, top_k=3):
    """Return the ``top_k`` chunks with the highest BM25 score for ``query``."""
    query_tokens = preprocess(query)
    bm25_scores = bm25.get_scores(query_tokens)
    # argsort is ascending, so reverse it to put the best score first.
    best_first = np.argsort(bm25_scores)[::-1]
    return [documents[position] for position in best_first[:top_k]]


# sparse_retrieve works in three steps:
# 1. Tokenizes the query.
# 2. Computes similarity scores with BM25.
# 3. Sorts documents by score and returns the top-k.


#####################################
# Hybrid Retrieval: Fusion of Dense and Sparse
#####################################

# Hybrid retrieval combines dense (semantic) and sparse (keyword) retrieval methods to get the best of both worlds.

def hybrid_retrieve(query, top_k=3):
    """Merge the dense and sparse results for ``query`` without duplicates.

    Each retriever is asked for ``top_k`` chunks, so the merged list can hold
    up to ``2 * top_k`` entries: dense hits first, followed by the sparse hits
    that were not already present.
    """
    merged = []
    already_added = set()
    for doc in dense_retrieve(query, top_k) + sparse_retrieve(query, top_k):
        if doc not in already_added:
            already_added.add(doc)
            merged.append(doc)
    return merged

In [17]:
# Prompt sent to the LLM; generate_query() fills in {context} and {question}.
prompt_template = """
You are an expert in Prometheus and PromQL.
Given the following documentation:
{context}
Generate the most accurate PromQL query for:
{question}
Provide only the query, no text other than the query.
"""

In [18]:
def format_docs(docs):
    """Concatenate the given text chunks into one string, separated by single spaces."""
    separator = " "
    return separator.join(docs)

def generate_query(context, question):
    """Fill the prompt template, send it to the LLM, and return the trimmed reply text."""
    filled_prompt = prompt_template.format(question=question, context=context)
    reply = llm.invoke(filled_prompt)
    answer = reply.content
    return answer.strip()

In [None]:
#####################################
# Rerank retrieved docs using a Cross-Encoder
#####################################
# Loaded once at module level so rerank_docs() reuses the same model on every call.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")

def rerank_docs(docs, query):
    """Order ``docs`` from most to least relevant to ``query``.

    Args:
        docs: Candidate documents to score.
        query: The question each document is scored against.

    Returns:
        A new list with the same documents, highest cross-encoder score first.
        Documents with equal scores keep their incoming order.
    """
    docs = list(docs)
    if not docs:
        # Nothing to score; skip the model call entirely.
        return []
    scores = cross_encoder.predict([(query, doc) for doc in docs])
    # Sort on the score only. Sorting (score, doc) tuples falls back to
    # comparing the documents themselves whenever two scores tie.
    order = sorted(range(len(docs)), key=lambda i: scores[i], reverse=True)
    return [docs[i] for i in order]


In [20]:
#####################################
# Ask Question: Retrieve, Rank, Format, and Generate Query
#####################################
def ask_question(query):
    """Generate a PromQL answer for ``query``.

    Retrieves candidate chunks, reranks them, keeps the three best as context,
    and asks the LLM for the query.
    """
    candidates = hybrid_retrieve(query)
    best_three = rerank_docs(candidates, query)[:3]
    context = format_docs(best_three)
    return generate_query(context, query)

In [21]:
#####################################
# Evaluation Metrics (Optional)
#####################################
def levenshtein_similarity(a, b):
    """Return a similarity ratio between 0 and 1 for the sequences ``a`` and ``b``.

    Despite the name, the value is ``difflib.SequenceMatcher.ratio()``, not a
    Levenshtein edit distance.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def evaluate(generated, expected):
    """Score a generated query against the expected one.

    Returns a dict with three entries: "BLEU" (smoothed sentence BLEU over
    whitespace-split tokens), "ROUGE-L" (F-measure, stemming enabled) and
    "Levenshtein" (the value of levenshtein_similarity).
    """
    reference_tokens = expected.split()
    candidate_tokens = generated.split()
    smoothing = SmoothingFunction().method1
    bleu_score = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    rouge_l = scorer.score(expected, generated)["rougeL"].fmeasure

    return {
        "BLEU": bleu_score,
        "ROUGE-L": rouge_l,
        "Levenshtein": levenshtein_similarity(generated, expected),
    }

In [25]:
# Smoke test: run the whole ask_question() pipeline on a single example question.
test_query = "Maximum response time for the '/api/users' endpoint in the last 5 minutes"
generated_output = ask_question(test_query)
print("Generated Output:", generated_output)

Generated Output: max(rate(http_request_duration_seconds_bucket{endpoint="/api/users", job="your_job"}[5m]))


In [None]:
#####################################
# Run Evaluation on Test Queries
#####################################
# Test set with one natural-language question ('nl_query') and its reference
# PromQL ('expected_output') per row.
test_csv_path = "/content/test_queries.csv"  # Adjust path if needed
df_test = pd.read_csv(test_csv_path)

# Generate a query for every test question; each call goes through ask_question().
results = []
for index, row in df_test.iterrows():
    question = row['nl_query']
    expected_query = row['expected_output']
    generated_query = ask_question(question)
    results.append([question, expected_query, generated_query])

# Save question / expected / generated side by side. No metric from evaluate()
# is computed or stored here.
results_df = pd.DataFrame(results, columns=['question', 'expected_query', 'generated_query'])
results_path = "/content/test_queries_results.csv"
results_df.to_csv(results_path, index=False)
print(f"Test evaluation results saved at {results_path}")