In [None]:
# Provides the langchain_huggingface package that HuggingFaceEndpoint is imported from in the setup cell.
%pip install langchain_huggingface

In [None]:
# NOTE(review): the setup cell imports Chroma from langchain.vectorstores, not from
# langchain_chroma — confirm this package is actually needed.
%pip install langchain-chroma

In [None]:
# NOTE(review): no visible cell imports InstructorEmbedding — confirm it is still required.
%pip install InstructorEmbedding

In [None]:

# Bulk install of the libraries used by the pipeline (HTTP fetching, HTML parsing, embeddings,
# vector store). No versions are pinned, so a fresh run may resolve different releases.
%pip install langchain requests beautifulsoup4 openai nltk faiss-cpu sentence-transformers chromadb huggingface_hub langchain_community


In [None]:
# Upgrades langchain in place after the bulk install above; unpinned, so the resolved
# version depends on when this cell is run.
%pip install --upgrade langchain

In [None]:
# PyMuPDF supplies the `fitz` module used by fetch_pdf_content.
%pip install pymupdf


In [None]:
import nltk
# Download the 'punkt' tokenizer package; the recorded output shows it unzipped under
# /root/nltk_data and the call returned True.
# NOTE(review): no visible cell uses nltk afterwards — confirm this download is needed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests
from bs4 import BeautifulSoup
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import fitz  # PyMuPDF

# Hosted instruct model that generates the PromQL answers.
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
# Cap the reply length through the endpoint's own `max_new_tokens` field. `max_length` is
# not one of HuggingFaceEndpoint's declared generation fields, so passing it via
# `model_kwargs` does not reliably limit (and may break) text generation.
llm = HuggingFaceEndpoint(repo_id=repo_id, max_new_tokens=128)
# Sentence-transformer model used to embed document chunks and queries.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def fetch_website_content(url, timeout=30):
    """Download a web page and wrap the text of its <p> elements in a Document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            host cannot hang the notebook indefinitely.

    Returns:
        Document whose ``page_content`` is the text of every ``<p>`` element joined
        by single spaces, with the URL stored under the ``"source"`` metadata key.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than indexing the body of an error page as documentation.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    text = ' '.join(p.get_text() for p in soup.find_all('p'))
    return Document(page_content=text, metadata={"source": url})

def fetch_pdf_content(file_path):
    """Extract the text of every page of a PDF and wrap it in a Document.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Document whose ``page_content`` is the concatenated text of all pages (in
        page order, no separator), with the path stored under the ``"source"``
        metadata key.
    """
    # The context manager closes the PDF handle even if text extraction fails.
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)
    return Document(page_content=text, metadata={"source": file_path})

# Reference material for the index: PromQL documentation pages plus one local file.
# NOTE(review): the last entry is a .csv path; the loading loop below only loads http URLs
# and .pdf files, so this entry never reaches the index — confirm whether it should.
resources = [
    "https://prometheus.io/docs/prometheus/latest/querying/examples/",
    "https://promlabs.com/promql-cheat-sheet/",
    "https://prometheus.io/docs/prometheus/latest/querying/basics/",
    "https://prometheus.io/docs/prometheus/latest/querying/operators/",
    "https://prometheus.io/docs/prometheus/latest/querying/functions/",
    "https://prometheus.io/docs/prometheus/latest/querying/api/",
    "https://prometheus.io/docs/prometheus/latest/http_sd/",
    "/content/drive/MyDrive/exp/Dataset.csv - Sheet1.csv"
]

# Load every resource into a Document, picking the loader from the resource's form.
documents = []
unsupported_resources = []
for resource in resources:
    if resource.startswith(("http://", "https://")):
        documents.append(fetch_website_content(resource))
    elif resource.lower().endswith(".pdf"):
        # Case-insensitive so paths ending in ".PDF" are loaded as well.
        documents.append(fetch_pdf_content(resource))
    else:
        unsupported_resources.append(resource)

# Report entries that no loader handles (such as the .csv path in `resources`) so they
# are not dropped from the index without any trace.
if unsupported_resources:
    print(f"WARNING: skipped {len(unsupported_resources)} resource(s) with no loader: {unsupported_resources}")

# Split each loaded document into overlapping chunks (chunk_size=1000, chunk_overlap=200)
# before embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

# Embed the chunks and build the Chroma store that the retriever searches.
# NOTE(review): no collection name or persist directory is given; re-running this cell in
# the same kernel may add the same chunks again — confirm.
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

class SimpleRetriever:
    """Minimal retriever that answers queries straight from a vector store."""

    def __init__(self, vectorstore):
        """Remember the store that will be searched.

        Args:
            vectorstore: Object exposing ``similarity_search(query)``.
        """
        self.vectorstore = vectorstore

    def get_relevant_documents(self, query):
        """Return whatever the store's similarity search yields for ``query``."""
        store = self.vectorstore
        matches = store.similarity_search(query)
        return matches

# Module-level retriever used by invoke_without_history to look up context documents.
retriever = SimpleRetriever(vectorstore)

# Prompt sent to the LLM; the {context} and {question} placeholders are filled with str.format.
prompt_template = """
Given the following context:
{context}

Answer the question:
{question}

Provide only the PromQL query enclosed in code blocks.
"""

def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by single spaces."""
    contents = []
    for document in docs:
        contents.append(document.page_content)
    return " ".join(contents)

class CustomStrOutputParser:
    """Callable that extracts the code from the first fenced block of an LLM reply."""

    # Markdown code-fence delimiter (three backticks).
    _FENCE = "`" * 3

    def __call__(self, output):
        """Return the contents of the first fenced code block in ``output``.

        A language tag on the opening fence line (for example ``promql``) is dropped
        so that only the query itself is returned. When the reply holds several
        fenced blocks, only the first one is used, so prose between blocks never
        leaks into the answer.

        Args:
            output: Raw text returned by the model.

        Returns:
            The stripped body of the first fenced block, or the whole reply
            stripped when it contains no complete (opened and closed) fence.
        """
        fence = self._FENCE
        opening = output.find(fence)
        if opening == -1:
            return output.strip()
        body_start = opening + len(fence)
        closing = output.find(fence, body_start)
        if closing == -1:
            # A lone fence (e.g. a truncated reply): fall back to the raw text.
            return output.strip()

        body = output[body_start:closing]
        first_line, newline, remainder = body.partition("\n")
        tag = first_line.strip()
        # Text on the fence line counts as a language tag only when it looks like a
        # bare identifier AND real content follows on later lines; otherwise a
        # one-line query written right after the fence would be thrown away.
        looks_like_tag = all(ch.isalnum() or ch in "_+-" for ch in tag)
        if newline and remainder.strip() and looks_like_tag:
            body = remainder
        return body.strip()

def rag_chain(context, question):
    """Answer ``question`` with the LLM, using the given documents as context.

    Args:
        context: Documents whose ``page_content`` is merged into the prompt.
        question: Natural-language question to answer.

    Returns:
        The model reply after it has been passed through CustomStrOutputParser.
    """
    filled_prompt = prompt_template.format(
        context=format_docs(context),
        question=question,
    )
    raw_reply = llm.invoke(filled_prompt)
    extract_answer = CustomStrOutputParser()
    return extract_answer(raw_reply)

def invoke_without_history(query):
    """Run one stateless retrieve-then-generate round for ``query``.

    Args:
        query: Natural-language question to answer.

    Returns:
        Dict with the parsed model reply under ``"answer"`` and the retrieved
        documents under ``"sources"``.
    """
    relevant_docs = retriever.get_relevant_documents(query)
    # rag_chain performs the same format -> prompt -> LLM -> parse sequence.
    return {
        "answer": rag_chain(relevant_docs, query),
        "sources": relevant_docs,
    }




In [None]:
def ask_question(query):
    """Answer ``query`` and print the query, the answer and the set of source identifiers.

    Args:
        query: Natural-language question passed to invoke_without_history.
    """
    outcome = invoke_without_history(query)

    # Accept either the {"answer", "sources"} dict or a bare answer value.
    has_expected_shape = (
        isinstance(outcome, dict) and "answer" in outcome and "sources" in outcome
    )
    if has_expected_shape:
        answer, cited_docs = outcome["answer"], outcome["sources"]
    else:
        answer, cited_docs = outcome, []

    # De-duplicate: several retrieved chunks can come from the same source.
    sources = {doc.metadata["source"] for doc in cited_docs}

    print(f"Query: {query}")
    print(f"Answer: {answer}")
    print(f"Sources: {sources}")
    print()

In [None]:
# NOTE(review): the generated PromQL is printed without any validation; check metric names
# and logic against the target Prometheus before relying on the answer.
ask_question("Give me the disk space usage percentage for the past 12 hours.")


Query: Give me the disk space usage percentage for the past 12 hours.
Answer: 100 - (1 - (sum(rate(node_filesystem_usage_bytes{mountpoint="/", fstype="ext4"}[12h]) / sum(node_filesystem_usage_bytes{mountpoint="/", fstype="ext4"}))) * 100)
Sources: {'https://promlabs.com/promql-cheat-sheet/'}



In [None]:
# NOTE(review): the answer recorded with this cell divides an expression by an identical
# expression, so it cannot measure response time — generated queries need human review.
ask_question("whats the average response time of our api in last 5 minutes?")

Query: whats the average response time of our api in last 5 minutes?
Answer: sum(rate(http_requests_total{job=~"job$\\[a-z]*server\\b"}[5m])) / sum(rate(http_requests_total{job=~"job$\\[a-z]*server\\b"}[5m]))
Sources: {'https://prometheus.io/docs/prometheus/latest/querying/examples/'}

