# Config-Assistant RAG Pipeline

**1. Initialising Tracers and LLM**

In [2]:
from langchain_core.documents import Document
from langchain_together import ChatTogether, TogetherEmbeddings
from langchain.chains import LLMChain
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv # type: ignore
from pathlib import Path
import json 
import os
from tqdm import tqdm

# Load variables from a local .env file (if one exists) into the process
# environment so the lookups below can see them.
load_dotenv()

# LangSmith / LangChain tracing settings.
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
langchain_endpoint = os.getenv("LANGCHAIN_ENDPOINT")
langsmith_project = os.getenv("LANGSMITH_PROJECT")
# NOTE(review): LangSmith documents LANGCHAIN_TRACING_V2 / LANGSMITH_TRACING as
# the tracing switch — confirm "LANGCHAIN_TRACKING" is the intended name.
langchain_tracking = os.getenv("LANGCHAIN_TRACKING")

# Together AI credential, shared by the chat model and the embedding models.
together_api_key = os.getenv("TOGETHER_API_KEY")

# Deterministic (temperature 0) chat model.
llm = ChatTogether(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    together_api_key=together_api_key,  # type: ignore
    temperature=0,
)

**2. Declaring Function to Vectorise Dotfiles**

In [3]:
def vectorise_configs(
    path: Path,
    persist_dir: str = "/Users/krishiv/Desktop/Projects/config-assistant/retrievers/configs",
):
    """Embed every config block found under ``path`` into a persistent Chroma store.

    Each ``*.json`` file below ``path`` is read and its ``structure`` mapping is
    walked; every entry becomes one document whose text is the block's
    ``purpose`` followed by its ``content``.

    Args:
        path: Directory searched recursively for ``*.json`` files.
        persist_dir: Directory in which Chroma persists the collection.
            Defaults to the location this notebook has always written to.

    Returns:
        The Chroma vector store containing the embedded documents.

    Raises:
        ValueError: If no documents could be built from the files under ``path``.
    """
    # Only scalar metadata values are kept; anything else (for example a list
    # of dependencies) is dropped instead of being handed to the vector store.
    allowed_types = (str, int, float, bool, type(None))

    documents = []
    ids = []

    for file in tqdm(list(path.rglob("*.json")), desc="Processing JSON files"):
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        relative_name = file.relative_to(path).as_posix()

        for key, block in data.get("structure", {}).items():
            chunk_text = f"{block.get('purpose', '')}\n\n{block.get('content', '')}"

            raw_metadata = {
                "repo_name": data.get("repo_name"),
                "path": block.get("path"),
                "language": block.get("language"),
                "dependencies": block.get("dependencies"),
            }
            safe_metadata = {
                name: value
                for name, value in raw_metadata.items()
                if isinstance(value, allowed_types)
            }

            documents.append(Document(page_content=chunk_text, metadata=safe_metadata))
            # Stable id (file + block key): the store is persistent, so random
            # ids would add a second copy of every block each time this runs.
            ids.append(f"{relative_name}::{key}")

    # Same guard as vectorise_docs: fail early with a clear message instead of
    # calling the embedding API with nothing to embed.
    if not documents:
        raise ValueError("No documents found to vectorise. Check your input data.")

    embeddings = TogetherEmbeddings(
        model="togethercomputer/m2-bert-80M-32k-retrieval",
        api_key=os.getenv("TOGETHER_API_KEY"),
    )

    vectorstore = Chroma.from_documents(
        documents,
        embeddings,
        ids=ids,
        persist_directory=persist_dir,
    )

    print(f"[+] Stored {len(documents)} documents in vectorstore at: {persist_dir}")

    return vectorstore

**3. Declaring Function to Vectorise Docs**

In [10]:
def _render_table(block):
    """Render a ``table`` section as a Markdown table string."""
    headers = block.get("headers", [])
    lines = [
        "| " + " | ".join(str(header) for header in headers) + " |",
        "| " + " | ".join(["---"] * len(headers)) + " |",
    ]
    for row in block.get("rows", []):
        if isinstance(row, dict):
            cells = [row.get(header, "") for header in headers]
        elif isinstance(row, (list, tuple)):
            cells = row
        else:
            cells = [row]
        # str() so numeric / null cells cannot break the join.
        lines.append("| " + " | ".join(str(cell) for cell in cells) + " |")
    return "\n".join(lines)


def _render_section(block):
    """Return the text for one section, or ``None`` for unrecognised types."""
    kind = block.get("type")
    if kind in ("header", "paragraph", "code"):
        return str(block.get("content", ""))
    if kind == "list":
        items = block.get("items", [])
        if isinstance(items, (list, tuple)):
            # One bullet per item rather than the repr of the Python list.
            return "\n".join(f"- {item}" for item in items)
        return str(items)
    if kind == "table":
        return _render_table(block)
    return None


def _split_into_chunks(sections):
    """Group sections into text chunks, starting a new chunk at every header."""
    chunks = []
    parts = []
    for block in sections:
        if block.get("type") == "header" and parts:
            chunks.append("\n\n".join(parts).strip())
            parts = []
        text = _render_section(block)
        if text is not None:
            parts.append(text)
    if parts:
        chunks.append("\n\n".join(parts).strip())
    # Drop chunks that ended up empty (e.g. a header without any content).
    return [chunk for chunk in chunks if chunk]


def vectorise_docs(
    path: Path,
    persist_dir: str = "/Users/krishiv/Desktop/Projects/config-assistant/retrievers/docs",
):
    """Embed the documentation pages found under ``path`` into a persistent Chroma store.

    Each ``*.json`` file below ``path`` is read and its ``sections`` list is
    split into chunks: a ``header`` section opens a new chunk and the
    ``paragraph``, ``list``, ``code`` and ``table`` sections that follow are
    appended to it. Every chunk becomes one document tagged with the file's
    ``page`` and ``url`` values.

    Args:
        path: Directory searched recursively for ``*.json`` files.
        persist_dir: Directory in which Chroma persists the collection.
            Defaults to the location this notebook has always written to.

    Returns:
        The Chroma vector store containing the embedded documents.

    Raises:
        ValueError: If no documents could be built from the files under ``path``.
    """
    documents = []
    ids = []

    for file in tqdm(list(path.rglob("*.json")), desc="Processing JSON files"):
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        relative_name = file.relative_to(path).as_posix()
        page_metadata = {"page": data.get("page", ""), "url": data.get("url", "")}

        for index, chunk in enumerate(_split_into_chunks(data.get("sections", []))):
            documents.append(Document(page_content=chunk, metadata=dict(page_metadata)))
            # Stable id (file + chunk position): the store is persistent, so
            # random ids would add a second copy of every chunk on each run.
            ids.append(f"{relative_name}::{index}")

    print(f"Number of documents to store: {len(documents)}")
    if not documents:
        raise ValueError("No documents found to vectorise. Check your input data.")

    embeddings = TogetherEmbeddings(
        model="togethercomputer/m2-bert-80M-32k-retrieval",
        api_key=os.getenv("TOGETHER_API_KEY"),
    )

    vectorstore = Chroma.from_documents(
        documents,
        embeddings,
        ids=ids,
        persist_directory=persist_dir,
    )

    print(f"[+] Stored {len(documents)} documents in vectorstore at: {persist_dir}")

    return vectorstore

**4. Setting Up Retriever for Docs**

In [13]:
# Embed the documentation JSON exports and expose the resulting store through
# a retriever with default search settings.
docs_path = Path("/Users/krishiv/Desktop/Projects/config-assistant/data/docs")
vectorstore_docs = vectorise_docs(path=docs_path)
retriever_docs = vectorstore_docs.as_retriever()

Processing JSON files: 100%|██████████| 12/12 [00:00<00:00, 2396.06it/s]


Number of documents to store: 83
[+] Stored 83 documents in vectorstore at: /Users/krishiv/Desktop/Projects/config-assistant/retrievers/docs


**5. Setting Up Retriever for Configs**

In [14]:
# Embed the community config JSON exports and expose the resulting store
# through a retriever with default search settings.
configs_path = Path("/Users/krishiv/Desktop/Projects/config-assistant/data/configs")
vectorstore_configs = vectorise_configs(path=configs_path)
retriever_configs = vectorstore_configs.as_retriever()

Processing JSON files: 100%|██████████| 16/16 [00:00<00:00, 475.44it/s]


[+] Stored 388 documents in vectorstore at: /Users/krishiv/Desktop/Projects/config-assistant/retrievers/configs


**6. Docs Retrieval Chain**

In [None]:
from langchain.prompts import ChatPromptTemplate
def retrieve_docs(query: str):
    """Rewrite ``query`` into a documentation-style search and run it against the docs retriever.

    The user's question is first rewritten by the chat model into a single-line
    search query, which is then passed to ``retriever_docs``. The rewritten
    query and a 300-character preview of every hit are printed.

    Args:
        query: The user's natural-language question about SketchyBar.

    Returns:
        The list of documents returned by ``retriever_docs``.
    """
    template_docs = """
    You are a helpful assistant tasked with helping users configure SketchyBar on macOS.

    You will be given a USER QUERY about some aspect of configuring SketchyBar.

    Your job is to rewrite this query into a *precise search query*, optimized for retrieving the most relevant DOCUMENTATION chunks from the SketchyBar documentation vector store.

    For reference, the following are some hihg-level fields of the SketchyBar documentation:
    - animations 
    - bar
    - components
    - events
    - features
    - items
    - popups
    - plugins 
    - scripting
    - setup
    - tricks
    - types
    - querying
    - reloading 

    Guidelines:
    - Focus on rewriting to match the style and phrasing of the SketchyBar docs.
    - If the user asks about a feature, turn it into a direct documentation topic.
    - Do not include user greetings, personal statements, or conversational fluff — only the technical search query.
    - Output a single line, no extra text.

    USER QUERY: {query}

    DOCUMENTATION SEARCH QUERY:
    """
    rewriting_template = ChatPromptTemplate.from_template(template_docs)

    # NOTE(review): this builds a fresh ChatTogether with library defaults
    # instead of reusing the module-level `llm` — confirm which model is intended.
    rewritten_query = (
        rewriting_template
        | ChatTogether(temperature=0)
        | (lambda x: x.content if hasattr(x, "content") else str(x))
    )

    # The last chain step already yields a plain string, so no further
    # `.content` unwrapping is needed here.
    rewritten_query_result = rewritten_query.invoke({"query": query})

    # Search with the same cleaned-up text that is reported below, and fall
    # back to the user's own wording if the model returned nothing usable.
    search_query = rewritten_query_result.strip() or query

    retrieved_docs = retriever_docs.invoke(search_query)

    print(f"Rewritten query: {search_query}")
    print(f"Retrieved {len(retrieved_docs)} docs:")
    for doc in retrieved_docs:
        print(doc.page_content[:300])

    # Hand the hits back to the caller; previously they were only printed.
    return retrieved_docs

In [None]:

from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ran
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results


def retrieve_configs(query: str):
    """Retrieve community config snippets for ``query`` using multi-query search and rank fusion.

    The chat model is asked for several alternative search queries, each one is
    run against ``retriever_configs``, and the per-query result lists are
    merged with ``reciprocal_rank_fusion``.

    Args:
        query: The user's natural-language request about SketchyBar.

    Returns:
        The output of ``reciprocal_rank_fusion``: ``(document, score)`` tuples
        ordered by descending fused score.
    """
    template_configs = """
You are a helpful assistant tasked with helping users configure SketchyBar on macOS.

You will be given a USER QUERY asking for some feature or behavior in SketchyBar.

Your task is to generate 5 diverse and precise search queries that will help retrieve the most relevant documents from a vector store of community-contributed SketchyBar configuration files (dotfiles).

Guidelines:
- Each search query should be clear, concise, and optimized for finding matching configuration code.
- Try different angles: vary the wording, include relevant SketchyBar terms, possible item names, script types, or config variables.
- Think about common patterns used in SketchyBar config files (such as `sketchybarrc`, `.sh` files, scripting hooks, item setup, animations, etc.).
- Do NOT include user greetings, conversational fluff, or explanations — only the 5 search queries as separate lines.

Example USER QUERY:
"How can I add a Spotify player with album art to my SketchyBar?"

OUTPUT:
spotify player sketchybar config
spotify now playing sketchybarrc
sketchybar media controls config
spotify album art sketchybar item
sketchybar integrate spotify script

Now for the actual query:

USER QUERY: {query}

OUTPUT (5 search queries):
"""

    def split_queries(text: str) -> list:
        """Turn the model's multi-line answer into a clean list of search queries."""
        stripped = (line.strip() for line in text.splitlines())
        # Blank lines would otherwise be sent to the retriever as empty
        # queries, and repeated lines would count the same hits twice in the
        # fusion step; dict.fromkeys de-duplicates while keeping order.
        unique = list(dict.fromkeys(line for line in stripped if line))
        # Fall back to the user's own wording if the model produced nothing.
        return unique or [query]

    multi_query_template = ChatPromptTemplate.from_template(template_configs)

    # NOTE(review): this builds a fresh ChatTogether with library defaults
    # instead of reusing the module-level `llm` — confirm which model is intended.
    generate_queries = (
        multi_query_template
        | ChatTogether(temperature=0)
        | (lambda x: x.content if hasattr(x, "content") else str(x))
        | split_queries
    )

    # .map() runs the retriever once per generated query; the resulting list of
    # ranked lists is then fused into a single ranking.
    configs_retrieval_chain = generate_queries | retriever_configs.map() | reciprocal_rank_fusion
    retrieved_configs = configs_retrieval_chain.invoke({"query": query})
    return retrieved_configs