# Setting up Environment

In [1]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if one exists) into the process
# environment before any of the keys below are read.
load_dotenv()

# The LangSmith key is optional here because tracing is switched off right
# below. The lookup name must not carry stray whitespace: a key such as
# "LANGCHAIN_API_KEY " (trailing space) is a different variable, so getenv
# returns None and assigning None into os.environ raises TypeError.
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
if langchain_api_key:
    os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
os.environ["LANGCHAIN_TRACING_V2"] = "false"
os.environ["USER_AGENT"] = "myagent"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The Groq key is required by the rest of the notebook, so fail early with an
# actionable message instead of an opaque TypeError from os.environ.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to your environment or .env file."
    )
os.environ["GROQ_API_KEY"] = groq_api_key

from langchain_groq import ChatGroq

# Chat model that serves as the generation step of the RAG chain.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
)

# Loading Documents

In [2]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Every page to ingest lives under the same Query DSL documentation root, so
# only the part of each address that follows the root is spelled out below.
_QUERY_DSL_ROOT = "https://docs.opensearch.org/docs/latest/query-dsl/"

_QUERY_DSL_PAGES = (
    # Overview pages
    "",
    "query-filter-context/",
    "term-vs-full-text/",
    # Term-level queries
    "term/index/",
    "term/exists/",
    "term/fuzzy/",
    "term/ids/",
    "term/prefix/",
    "term/range/",
    "term/regexp/",
    "term/term/",
    "term/terms/",
    "term/terms-set/",
    "term/wildcard/",
    # Full-text queries
    "full-text/index/",
    "full-text/match/",
    "full-text/match-bool-prefix/",
    "full-text/match-phrase/",
    "full-text/match-phrase-prefix/",
    "full-text/multi-match/",
    "full-text/query-string/",
    "full-text/simple-query-string/",
    "full-text/intervals/",
    # Compound queries
    "compound/index/",
    "compound/bool/",
    "compound/boosting/",
    "compound/constant-score/",
    "compound/disjunction-max/",
    "compound/function-score/",
    "compound/hybrid/",
    # Geographic and xy queries
    "geo-and-xy/index/",
    "geo-and-xy/geo-bounding-box/",
    "geo-and-xy/geodistance/",
    "geo-and-xy/geopolygon/",
    "geo-and-xy/geoshape/",
    "geo-and-xy/xy/",
    # Joining queries
    "joining/index/",
    "joining/has-child/",
    "joining/has-parent/",
    "joining/nested/",
    "joining/parent-id/",
    # Span queries
    "span/index/",
    "span/span-containing/",
    "span/span-field-masking/",
    "span/span-first/",
    "span/span-multi-term/",
    "span/span-near/",
    "span/span-not/",
    "span/span-or/",
    "span/span-term/",
    "span/span-within/",
    # Match all
    "match-all/",
    # Specialized queries
    "specialized/index/",
    "specialized/distance-feature/",
    "specialized/k-nn/",
    "specialized/neural/",
    "specialized/neural-sparse/",
    "specialized/script-score/",
    "specialized/template/",
    # Minimum should match
    "minimum-should-match/",
)

# Absolute addresses, in the same order as the relative paths above.
urls = [_QUERY_DSL_ROOT + page for page in _QUERY_DSL_PAGES]

# Restrict HTML parsing to <div class="main-content"> elements.
bs4_strainer = bs4.SoupStrainer("div", class_="main-content")

# Loader settings gathered in one place: the pages to fetch and the parser
# options that carry the strainer defined above.
loader_options = dict(
    web_paths=urls,
    bs_kwargs=dict(parse_only=bs4_strainer),
)
web_loader = WebBaseLoader(**loader_options)

# Load every listed page into a document object.
web_documents = web_loader.load()

In [3]:
import re

# Normalise each document in place: every run of whitespace (spaces, tabs,
# newlines) becomes one space, and leading/trailing whitespace is trimmed.
_whitespace_run = re.compile(r'\s+')
for doc in web_documents:
    collapsed = _whitespace_run.sub(' ', doc.page_content)
    doc.page_content = collapsed.strip()

# Chunking

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters: target chunk length, overlap between neighbouring
# chunks, and a flag asking the splitter to record each chunk's start index.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the cleaned pages into chunks and report how many were produced.
docs = text_splitter.split_documents(web_documents)
print(len(docs))
# print(docs[1].page_content)

393


# Embedding and Storing

In [5]:
from langchain_milvus import Milvus
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model settings: which model to use, the device it runs on, and
# the request for normalised embedding vectors.
model_name = "BAAI/bge-m3"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)

embedding = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Connection target for Milvus; a relative path, so presumably a local
# database file next to the notebook.
URI = "./os_qdsl.db"

# Embed the chunks and store them in the "opensearch_docs" collection.
# NOTE(review): drop_old=True looks like it replaces any existing collection
# of that name on every run; confirm that a full rebuild is intended.
milvus_connection = dict(uri=URI)
vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embedding,
    connection_args=milvus_connection,
    collection_name="opensearch_docs",
    drop_old=True,
)

# Context Retrieval

In [6]:
# Expose the vector store as a retriever, passing k=4 as the search setting.
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=4),
)

# retrieved_docs = retriever.invoke("Get logs where status is 404 and method is GET")
# print(retrieved_docs[0])

# Prompt Template

In [7]:
# Import PromptTemplate from langchain_core, the package this notebook already
# uses for output parsers and runnables; the `langchain.prompts` path is a
# legacy re-export that is not available in every langchain release.
from langchain_core.prompts import PromptTemplate

# Prompt text with two placeholders, {context} and {question}. Doubled braces
# ({{ and }}) render as literal braces once the template is formatted, which
# keeps the JSON example intact. The example's ```json fence is closed so the
# rendered prompt does not end inside an open code block.
template = """
You are an intelligent assistant that converts natural language questions into OpenSearch Query DSL.

You are provided with context that may include field names, data types, index patterns, and filter conditions relevant to the OpenSearch dataset.

### Context:
{context}

### Guidelines:
- Parse the input to identify fields, values, operators (equals, greater than, etc.), and logical relationships.
- Use appropriate Query DSL clauses such as `match`, `term`, `range`, or `bool` with `must`, `should`, and `filter`.
- Always return a valid OpenSearch query body (JSON only), suitable for use in `_search` endpoints.
- Do **not** include explanations or comments—output only the JSON inside a code block.

### Input:
{question}

### Example:
**Input**: "Get logs where status is 404 and method is GET"

**Output**:
```json
{{
  "query": {{
    "bool": {{
      "must": [
        {{ "term": {{ "status": 404 }} }},
        {{ "term": {{ "method": "GET" }} }}
      ]
    }}
  }}
}}
```
"""
# Turn the raw template string into a prompt object for use in the chain.
custom_rag_prompt = PromptTemplate.from_template(
    template,
)

# RAG Chain

In [8]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects that expose a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line ("\\n\\n"). An empty iterable gives an empty string.
    """
    page_texts = [document.page_content for document in docs]
    blank_line = "\n\n"
    return blank_line.join(page_texts)

# First stage: build the prompt inputs from the incoming question. "context"
# runs the retriever and joins the retrieved documents with format_docs;
# "question" passes the question through unchanged.
prompt_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: prompt inputs -> prompt -> chat model -> plain string.
rag_chain = prompt_inputs | custom_rag_prompt | llm | StrOutputParser()

# Sample run; left as the final expression so the notebook shows the result.
rag_chain.invoke("Get all logs where status is 404 and method is POST")

'```json\n{\n  "query": {\n    "bool": {\n      "must": [\n        { "term": { "status": 404 } },\n        { "term": { "method": "POST" } }\n      ]\n    }\n  }\n}\n```'