In [None]:
!pip install --quiet --upgrade langchain-text-splitters langchain-community

In [None]:
!pip install langchain

In [None]:
!pip install -qU langchain-milvus

In [None]:
!pip install langchain-milvus

In [None]:
!pip install -U pymilvus

In [None]:
!pip install langchain_huggingface

In [None]:
import getpass
import os

# Turn on LangChain tracing for the calls made later in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Only prompt when the key is not already present, so re-running this cell (or
# running with the variable pre-set) does not block on input. The prompt is
# labelled so the user knows which secret is being requested.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API key: ")

In [None]:
import getpass
import os

from huggingface_hub import login

# Never paste an access token into the notebook source: it would be saved with
# the file. Take it from the HF_TOKEN environment variable when set, otherwise
# ask for it interactively without echoing it.
login(token=os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: "))

In [None]:
from langchain_huggingface import HuggingFaceEndpoint

# Hosted model used to generate the PromQL answers.
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
# Cap the generated answer at 128 new tokens. `max_new_tokens` is a declared
# HuggingFaceEndpoint field; the previous `model_kwargs={"max_length": 128}`
# is forwarded verbatim to the inference client, which does not accept a
# `max_length` argument in current releases.
llm = HuggingFaceEndpoint(repo_id=repo_id, max_new_tokens=128)

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model ("BAAI/bge-m3") handed to the Milvus vector store when the
# document chunks are indexed.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")

In [None]:
!pip install requests beautifulsoup4

In [None]:
import nltk
# Download the NLTK "punkt" package.
# NOTE(review): no other cell visible in this notebook references nltk —
# confirm this download is still needed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests
from bs4 import BeautifulSoup
from langchain.schema import Document
# Function to fetch website content
def fetch_website_content(url, timeout=30):
    """Download a web page and return the text of its <p> elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would hang the notebook indefinitely.

    Returns:
        A Document whose page_content is the text of every <p> element joined
        by single spaces, and whose metadata records `url` under "source".

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never indexed as if it were documentation.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    text = ' '.join(p.get_text() for p in soup.find_all('p'))
    return Document(page_content=text, metadata={"source": url})

# Function to fetch content from a PDF
def fetch_pdf_content(file_path):
    """Extract the text of every page of a PDF into a single Document.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A Document whose page_content is the concatenated text of all pages
        (no separator added between pages) and whose metadata records
        `file_path` under "source".

    Raises:
        ImportError: If PyMuPDF (the package providing `fitz`) is not installed.
    """
    # `fitz` is not imported anywhere else in the notebook, so import it here;
    # otherwise the first call fails with NameError.
    import fitz

    # The context manager closes the file handle even if extraction fails.
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)
    return Document(page_content=text, metadata={"source": file_path})

# List of resources to load: web page URLs plus one local file path.
resources = [
    "https://prometheus.io/docs/prometheus/latest/querying/examples/",
    "https://promlabs.com/promql-cheat-sheet/",
    "https://prometheus.io/docs/prometheus/latest/querying/basics/",
    "https://prometheus.io/docs/prometheus/latest/querying/operators/",
    "https://prometheus.io/docs/prometheus/latest/querying/functions/",
    "https://prometheus.io/docs/prometheus/latest/querying/api/",
    "https://prometheus.io/docs/prometheus/latest/http_sd/",
    # NOTE(review): this path neither starts with "http" nor ends with ".pdf",
    # which are the only two cases the loading loop handles, so this CSV is
    # not ingested — confirm whether a CSV loader is missing.
    "/content/Dataset.csv - Sheet1.csv"
]

# Load documents from URLs and PDFs
documents = []
for resource in resources:
    if resource.startswith("http"):
        documents.append(fetch_website_content(resource))
    elif resource.lower().endswith(".pdf"):
        # Case-insensitive so "report.PDF" is handled as well.
        documents.append(fetch_pdf_content(resource))
    else:
        # Say so instead of dropping the entry silently, so a resource that
        # never reaches the vector store is visible in the cell output.
        print(f"Skipping unsupported resource (not a URL or PDF): {resource}")

# Split documents into manageable chunks
# The splitter is configured with chunk_size=1000 and chunk_overlap=200
# (presumably measured in characters — confirm against the splitter docs).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# `splits` is what gets embedded and stored in the vector store.
splits = text_splitter.split_documents(documents)

In [None]:
from langchain_milvus import Milvus

# Local file path used as the Milvus connection URI.
# NOTE(review): presumably this selects a file-backed Milvus instance stored
# next to the notebook — confirm against the langchain-milvus docs.
URI = "./milvus_promql.db"

# Embed every chunk in `splits` with `embeddings` and store the result in the
# "prometheus_docs" collection.
# NOTE(review): drop_old=True presumably discards an existing collection of
# the same name, making re-runs start from a clean index — confirm.
vectorstore = Milvus.from_documents(
    documents=splits,
    embedding=embeddings,
    collection_name="prometheus_docs",
    connection_args={"uri": URI},
    drop_old=True,
)

In [None]:
class SimpleRetriever:
    """Thin adapter exposing a vector store's similarity search as a retriever."""

    def __init__(self, vectorstore):
        """Remember the vector store that will answer the queries."""
        self.vectorstore = vectorstore

    def get_relevant_documents(self, query):
        """Return whatever the store's `similarity_search` yields for `query`."""
        matches = self.vectorstore.similarity_search(query)
        return matches

# Module-level retriever over the Milvus store; invoke_without_history() reads
# this global.
retriever = SimpleRetriever(vectorstore)

In [None]:
# Prompt sent to the LLM. It is filled with str.format(), which substitutes the
# {context} and {question} placeholders; the final instruction asks for the
# query inside a code block, which is what CustomStrOutputParser extracts.
prompt_template = """
Given the following context:
{context}

Answer the question:
{question}

Provide only the PromQL query enclosed in code blocks.
"""

In [None]:
def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by one space."""
    contents = []
    for document in docs:
        contents.append(document.page_content)
    return " ".join(contents)

class CustomStrOutputParser:
    """Callable that extracts the first ``` fenced code block from LLM text."""

    def __call__(self, output):
        """Return the contents of the first fenced code block in `output`.

        Args:
            output: Raw text returned by the LLM.

        Returns:
            The stripped text between the first pair of ``` fences, without a
            language tag on the opening fence (e.g. "promql"). When `output`
            has no complete pair of fences, the whole stripped `output`.
        """
        fence = "```"
        opening = output.find(fence)
        if opening == -1:
            return output.strip()

        body_start = opening + len(fence)
        # Pair the opening fence with the NEXT fence rather than the last one,
        # so a reply containing several code blocks yields only the first
        # block instead of everything in between (inner fences included).
        closing = output.find(fence, body_start)
        if closing == -1:
            return output.strip()

        block = output[body_start:closing]

        # Text on the opening-fence line is a language tag when it is a single
        # identifier-like word and real content follows on later lines.
        info, newline, rest = block.partition("\n")
        tag = info.strip()
        looks_like_tag = tag.replace("-", "").replace("_", "").replace("+", "").isalnum()
        if newline and rest.strip() and looks_like_tag:
            block = rest

        return block.strip()

def rag_chain(context, question):
    """Answer `question` with the LLM, using `context` documents as grounding.

    The documents are flattened with format_docs, substituted into
    prompt_template together with the question, sent to `llm`, and the raw
    reply is passed through CustomStrOutputParser.
    """
    filled_prompt = prompt_template.format(
        context=format_docs(context),
        question=question,
    )
    raw_reply = llm.invoke(filled_prompt)
    parse = CustomStrOutputParser()
    return parse(raw_reply)

def invoke_without_history(query):
    """Retrieve documents for `query` and generate an answer from them.

    Args:
        query: The user's question.

    Returns:
        A dict with two keys: "answer", the parsed LLM reply, and "sources",
        the documents returned by the module-level `retriever`.
    """
    relevant_docs = retriever.get_relevant_documents(query)
    # Delegate to rag_chain instead of repeating its format -> invoke -> parse
    # steps, so the generation pipeline is defined in exactly one place.
    answer = rag_chain(relevant_docs, query)
    return {"answer": answer, "sources": relevant_docs}

In [None]:
def ask_question(query):
    """Run `query` through invoke_without_history and print the outcome.

    Prints the query, the answer, and the set of "source" metadata values of
    the returned documents, followed by an empty line. Returns None.
    """
    result = invoke_without_history(query)

    well_formed = isinstance(result, dict) and "answer" in result and "sources" in result
    if not well_formed:
        # Fallback: treat the raw result as the answer, with no sources.
        answer, relevant_docs = result, []
    else:
        answer, relevant_docs = result["answer"], result["sources"]

    # A set, so each source appears once however many chunks came from it.
    sources = {doc.metadata["source"] for doc in relevant_docs}

    print(f"Query: {query}")
    print(f"Answer: {answer}")
    print(f"Sources: {sources}")
    print()

In [None]:
# Example run: retrieves context for the question, asks the LLM, and prints
# the query, the parsed answer and the sources.
ask_question("Give me the disk space usage percentage for the past 12 hours.")

Query: Give me the disk space usage percentage for the past 12 hours.
Answer: rate(node_filesystem_free{fstype="ext4"}[12h]/rate(node_filesystem_size{fstype="ext4"}[12h]))*100
Sources: {'https://promlabs.com/promql-cheat-sheet/', 'https://prometheus.io/docs/prometheus/latest/querying/examples/', 'https://prometheus.io/docs/prometheus/latest/querying/functions/'}

