In [70]:
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse
from qdrant_client import QdrantClient
from dotenv import load_dotenv
import os
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_community.tools import WikipediaQueryRun
from langchain_tavily import TavilySearch
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_tavily import TavilySearch
from langchain_core.tools import Tool
from langchain_core.vectorstores.base import VectorStoreRetriever
from langchain.agents import create_agent
from langchain_core.prompts import ChatPromptTemplate


from dataclasses import dataclass
from pydantic import BaseModel, Field
# from typing import List
from langchain_core.documents import Document

from langchain_groq import ChatGroq
from langgraph.graph import StateGraph, START, END
from langgraph_supervisor import create_supervisor
from IPython.display import display, Image
from typing import List, Annotated
from langgraph.graph.message import add_messages
from langchain_core.messages import AnyMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from pprint import pprint
load_dotenv()


True

In [71]:
class ProcessLoader:

    def __init__(self):
        self.client = QdrantClient(
                            url=os.getenv('QDRANT_API_URL'),
                            api_key=os.getenv('QDRANT_API_KEY')
                        )
        self.dense_embedding = HuggingFaceEmbeddings(model = 'sentence-transformers/all-MiniLM-L12-v2')
        self.sparse_embedding = FastEmbedSparse(model_name ='Qdrant/bm25')
        
        self.qdrantdb = QdrantVectorStore(
            client=self.client,
            collection_name= os.getenv('COLLECTIONNAME'),
            embedding=self.dense_embedding,
            sparse_embedding=self.sparse_embedding
        )
        self.wiki_tool = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper(top_k_results= 5, doc_content_chars_max= 2000))
        self.tavily_tool = TavilySearch(
            max_results=5,
            topic="general",
        )

    @property
    def getPBIRetriver(self):

        return self.qdrantdb.as_retriever(
            search_type = 'mmr',
                search_kwarg = {
                    "k":10
                }
        )
    
    @property
    def wikiTool(self):
        return self.wiki_tool
    
    @property
    def tavilyTool(self):
        return self.tavily_tool
    
    @property
    def getllm(self):
        return ChatGroq(model='openai/gpt-oss-120b')
    
    


In [72]:
class structuredstatus(BaseModel):
    relevence : str = Field(description=""" 
                                        The field is populated based on the validation outcome: "Yes" is assigned 
                                        if the document is deemed relevant, and "No" if it is subsequently discarded.
                                        """,
                                        default_factory= str)


class DocWithStatus(BaseModel):

    doc: Document
    relevence : str = Field(description=""" 
                                        The field is populated based on the validation outcome: "Yes" is assigned 
                                        if the document is deemed relevant, and "No" if it is subsequently discarded.
                                        """,
                                        default_factory= str)

class CorrectiveRAGMetaData(BaseModel):

    query : str
    doclist : List[DocWithStatus] = Field(default_factory=list,description= """
                                                                                Field is consolidating all the Documents received from Retriever. 
                                                                                create list of DocWithStatus type.
                                                                            """)
    threshold : float = Field(default_factory=float, description= """ 
                                                                    Determines the currectness of retrived Document. 
                                                                    It is decimal number, representing Percentage of Currectness.
                                                                    """)
    answer: str = Field(default_factory=str)

In [73]:
obj = ProcessLoader()
pbiRetriever = obj.getPBIRetriver

In [74]:
def docRetreiver(state: CorrectiveRAGMetaData) -> CorrectiveRAGMetaData:

    """ 
    Retrieves relevant documents from a vector store based on a query sent by LLM. 
    This node handles the critical step of fetching context based on question attribute of state.

    :param state: object of CorrectiveRAGMetaData include below members.
                    a. query (String)
                    b. doclist (List of Document)
                    c. threshold (string)
                    d. answer (String)

    :type : CorrectiveRAGMetaData

    :returs : Object of CorrectiveRAGMetaData type where doclist will be updated by retriver (getPBIRetriver)  

    """

    print('---- In generateRetriverDocs-------')
    docs = pbiRetriever.invoke(state.query)
    return state.model_copy(update={
        "doclist" : [DocWithStatus(doc= doc, relevence= "") for doc in docs]
    })
    



In [75]:
def validateDocs(state: CorrectiveRAGMetaData) -> CorrectiveRAGMetaData:
    """ 
    The node performs relevance validation on the retrieved document list. 
    It processes each document and filters out those identified as irrelevant with the help of LLM, thereby ensuring the quality of the final result set.

    :param state: object of CorrectiveRAGMetaData include below members.
                    a. query (String)
                    b. doclist (List of Document)
                    c. threshold (string)
                    d. answer (String)

    :type : CorrectiveRAGMetaData

    :returns : Update the Doc List by discarding docs which are irrelevent. Also Calculate Threshold Percentage.
    """
    
    system_instruction = """You are a grader assessing relevance of a retrieved document to a user question.
                If the document is relevant, output only the word 'yes'. If it is not relevant, output only the word 'no'.
                Your response MUST be a single word: 'yes' or 'no'. Do not output anything else."""

    llm_base = obj.getllm

    grade_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_instruction),
            # The human message takes the dynamic input variables
            ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
        ]
    )

    doclist = []
    for docobj in state.doclist:

        doc_grade_retriever = grade_prompt | llm_base
        result = doc_grade_retriever.invoke({"question" : state.query,
                                    "document" : docobj.doc.page_content})
        
        print(result)
        if result.content.strip().lower() == 'yes':
            update_docobj =  DocWithStatus(doc= docobj.doc, relevence= result.content.strip().lower())
            doclist.append(update_docobj)

    threshold = len(doclist) / len(state.doclist) if len(state.doclist) != 0 else -1

    return state.model_copy(update={
        "doclist" : doclist,
        "threshold" : threshold
    })
    

    
    

In [76]:
def decideThreshold(state: CorrectiveRAGMetaData):

    """ 
    This will work as consitional Edge and deciding if Web Search is required or Not. 
    If State Threshold Value is less then 70% then it will return Web Seach Function.
    If State Threshold Value is greater then Or Equal to 70% then it will generate Output.

    Also it Considers No of Document Retured. 
    If retriever returns 3 or more Valid document then it will generate Answer.
    If retriever returns less then 3 Valid document then it will go for Web Search.

    :param state: object of CorrectiveRAGMetaData include below members.
                    a. query (String)
                    b. doclist (List of Document)
                    c. threshold (string)
                    d. answer (String)

    :type : CorrectiveRAGMetaData

    :returns : Function Name need to call in next step.
    """
    print(f"no of docs {len(state.doclist)} and Threshold is {state.threshold}")
    if len(state.doclist) >= 3 and state.threshold >= 0.7:
        print("generate")
        return "generateAnswer"
    else:
        print("Web Search")
        return "webSearch"


In [77]:
def generateAnswer(state: CorrectiveRAGMetaData) -> CorrectiveRAGMetaData:

    """
        Synthesizes the final answer by consolidating retrieved documents.

        This terminal node in the LLM chain takes the list of relevant documents, 
        feeds them into the Language Model (LLM) as context, and prompts the LLM to generate a single, coherent, and
        comprehensive answer that addresses all parts of the original query.

        :param state: object of CorrectiveRAGMetaData include below members.
                    a. query (String)
                    b. doclist (List of Document)
                    c. threshold (string)
                    d. answer (String)

        :type : CorrectiveRAGMetaData

        :returs : Object of CorrectiveRAGMetaData type where answer will be populated by LLM based on Documents received from retriver (pbi_retriver)  
    """
    print('---- In generateAnswer-------')
    context_documents = "\n\n".join([objdoc.doc.page_content for objdoc in state.doclist])
    prompt = f""" 
             You are an expert Q&A system. Your task is to generate a single, comprehensive, and well-structured answer to the user's original query.

                    **Instructions:**
                    1.  Read the **Original Query** 
                    2.  Carefully analyze the content of the **Context Documents** provided below.
                    3.  Synthesize the information from the documents to construct a complete answer that addresses all parts of the Original Query.
                    4.  Do not introduce any information that is not explicitly present in the Context Documents.
                    5.  Structure your final answer clearly using markdown headings and bullet points where appropriate.
                    6.  Refer Docstring for more details

                    **Original Query:**
                    {state.query}

                    **Context Documents:**
                    {context_documents}

                    **Final Answer:**

            """
    
    result = obj.getllm.invoke(prompt).content
    return state.model_copy(
        update={
            "answer": result
        }
    )

In [78]:
def getPrompt() -> str:

    return f""" 
                ### üìù GENERIC RESEARCH MANDATE ###

                    **PRIMARY TOPIC:** Based on User Query Topic should be taken

                    **RESEARCH OBJECTIVE:**
                    You are an expert, objective research analyst. Your task is to produce a comprehensive, multi-faceted report on the **PRIMARY TOPIC** that achieves a deep understanding of its current state, challenges, and future trajectory.

                    ---

                    ### üîç RESEARCH SCOPE & CONSTRAINTS ###
                    * **Timeframe:** Focus primarily on developments and data from the last **5 years** (e.g., 2020‚Äìpresent), unless historical context is strictly necessary to understand the current state.
                    * **Depth:** Research must move beyond Wikipedia-level summaries and include synthesis, analysis, and critical evaluation of information.
                    * **Exclusions:** Avoid pure dictionary definitions or tangential subjects. Focus exclusively on the topic's direct implications.

                    ---

                    ### üîë MANDATORY REPORT SECTIONS ###

                    The final output MUST be organized into the following four distinct sections.

                    **SECTION 1: Overview & Current State**
                    * **Goal:** Define the topic and establish its importance.
                    * **Content:** Provide a concise, professional definition. Detail the topic's current scale, major players, or widely accepted status quo.

                    **SECTION 2: Key Challenges & Controversies**
                    * **Goal:** Identify and analyze critical obstacles.
                    * **Content:** Identify and analyze the **three most significant challenges** or **major points of controversy** surrounding the topic. For each challenge, explain the nature of the problem and its primary contributing factors.

                    **SECTION 3: Emerging Trends & Future Trajectory**
                    * **Goal:** Forecast and highlight new developments.
                    * **Content:** Identify and analyze **two major emerging trends** or **potential future applications/solutions** that are likely to shape the topic over the next 5‚Äì10 years.

                    **SECTION 4: Conclusion & Strategic Implications**
                    * **Goal:** Summarize findings and provide actionable takeaways.
                    * **Content:** Summarize the main analytical findings. Answer this critical question: **"What is the single most important strategic implication for an organization or individual engaging with this topic?"**

                    ---

                    ### ‚ú® OUTPUT FORMATTING RULES ###
                    1.  **Tone:** Maintain an **analytical, objective, and professional** tone throughout.
                    2.  **Citations (Fictional):** For each of the three mandatory challenges and two emerging trends, invent and list a realistic-sounding **source type** (e.g., "[Source: 2024 Market Analysis Report]", "[Source: Nature Communications Paper, 2023]"). This simulates a research process and improves analytical depth.
                    3.  **Formatting:** Use markdown headings and bolding to clearly delineate the four sections and their sub-points.

                   
                """

In [79]:


def webSearch(state: CorrectiveRAGMetaData) -> CorrectiveRAGMetaData:

    """ 
    Based on Threshold, If Threshold Value is Less then 70%, Question is taken by React Agent. 
    It Consolidates all the Valid Previous Output with Web Search and regenerate DocList.

    :param state: object of CorrectiveRAGMetaData include below members.
                    a. query (String)
                    b. doclist (List of Document)
                    c. threshold (string)
                    d. answer (String)

    :type : CorrectiveRAGMetaData

    :returs : Object of CorrectiveRAGMetaData type where new List is upended to original DocList
    """

    tools = [obj.wikiTool, obj.tavilyTool]

    web_agent=create_agent(
                obj.getllm,
                tools=tools,
                system_prompt=getPrompt()
    )

    web_result = web_agent.invoke({"messages" :state.query})
    # print(web_result['messages'][-1].content)

    doc = DocWithStatus(doc = Document(page_content=web_result['messages'][-1].content), relevence='yes')
    state.doclist.append(doc)

    return state.model_copy(update={
        "doclist" : state.doclist
    })
    


In [80]:
state = CorrectiveRAGMetaData(
    query="What is Power BI?"   ,
    documents=[], # Provide an empty list
    messages=[], # Provide an empty list
    answer="" # Provide an empty string (or whatever type 'answer' is) 
)


In [81]:
st_obj = docRetreiver(state=state)
st_obj = validateDocs(state=st_obj)
st_obj = generateAnswer(state=st_obj)
web_agent = webSearch(state=st_obj)

# result = st_obj.messages
# print(result)


---- In generateRetriverDocs-------
content='yes' additional_kwargs={'reasoning_content': 'We need to decide if the retrieved document is relevant to the user question "What is Power BI?" The document defines what Power BI is, includes description. So yes.'} response_metadata={'token_usage': {'completion_tokens': 45, 'prompt_tokens': 428, 'total_tokens': 473, 'completion_time': 0.100100264, 'completion_tokens_details': {'reasoning_tokens': 35}, 'prompt_time': 0.020124351, 'prompt_tokens_details': None, 'queue_time': 0.034495723, 'total_time': 0.120224615}, 'model_name': 'openai/gpt-oss-120b', 'system_fingerprint': 'fp_4867cb64c5', 'service_tier': 'on_demand', 'finish_reason': 'stop', 'logprobs': None, 'model_provider': 'groq'} id='lc_run--912efd13-18df-4e5d-93d9-be746b9f95e7-0' usage_metadata={'input_tokens': 428, 'output_tokens': 45, 'total_tokens': 473, 'output_token_details': {'reasoning': 35}}
content='no' additional_kwargs={'reasoning_content': 'We need to decide if the retrieved 

In [82]:
graph = StateGraph(CorrectiveRAGMetaData)

graph.add_node("docRetreiver", docRetreiver)
graph.add_node("validateDocs", validateDocs)
graph.add_node("generateAnswer", generateAnswer)
graph.add_node("webSearch", webSearch)

# 1. Start the workflow
graph.add_edge(START, "docRetreiver")

# 2. Sequential step
graph.add_edge("docRetreiver", "validateDocs")

# 3. Conditional branch based on validation
graph.add_conditional_edges("validateDocs",
                             decideThreshold,
                             {
                                 "generateAnswer": "generateAnswer",  # Valid docs -> Generate Answer
                                 "webSearch": "webSearch"     # Invalid docs -> Web Search
                             }
                           )

# 4. Rejoin the main path after Web Search
graph.add_edge("webSearch", "generateAnswer")

# 5. Connect the final step to the END state (This was the missing/commented part)
graph.add_edge("generateAnswer", END) 

# Compile the graph
graph_builder = graph.compile()


In [85]:
graph_builder.invoke(CorrectiveRAGMetaData(query="What Power BI Q&A feature?"   ,
    documents=[], # Provide an empty list
    messages=[], # Provide an empty list
    answer="" # Provide an empty string (or whatever type 'answer' is) 
))

---- In generateRetriverDocs-------
content='no' additional_kwargs={'reasoning_content': 'We need to decide if the retrieved document is relevant to the user question: "What Power BI Q&A feature?" The document says: "That\'s an extensive list covering Power BI performance, advanced features, and scenario-based questions! Here are the answers." That\'s generic, not specifically about Q&A feature. It doesn\'t mention Q&A. So not relevant. Output "no".'} response_metadata={'token_usage': {'completion_tokens': 85, 'prompt_tokens': 175, 'total_tokens': 260, 'completion_time': 0.205048326, 'completion_tokens_details': {'reasoning_tokens': 75}, 'prompt_time': 0.007742901, 'prompt_tokens_details': None, 'queue_time': 0.03439204, 'total_time': 0.212791227}, 'model_name': 'openai/gpt-oss-120b', 'system_fingerprint': 'fp_4867cb64c5', 'service_tier': 'on_demand', 'finish_reason': 'stop', 'logprobs': None, 'model_provider': 'groq'} id='lc_run--736ac3fa-ad51-40a9-b638-e8d4ffda22d5-0' usage_metadata=

{'query': 'What Power BI Q&A feature?',
 'doclist': [DocWithStatus(doc=Document(metadata={}, page_content='## **SECTION 1 ‚Äì Overview & Current State**\n\n**What is Power\u202fBI Q&A?**  \nPower\u202fBI\u202fQ&A is the natural‚Äëlanguage query engine embedded in Microsoft\u202fPower\u202fBI that lets users type questions (e.g., ‚Äútotal sales by region last quarter‚Äù) and instantly receive visual answers‚Äîcharts, tables, or maps‚Äîgenerated on the fly from the underlying data model. It leverages a semantic layer (the *model‚Äôs* tables, columns, relationships, and metadata) plus built‚Äëin natural‚Äëlanguage processing (NLP) to translate user intent into DAX (Data Analysis Expressions) queries.\n\n**Why it matters today**  \n- **Self‚Äëservice analytics:** Q&A removes the need for every stakeholder to learn DAX or build visuals, dramatically shortening the ‚Äúdata‚Äëto‚Äëinsight‚Äù cycle.  \n- **Adoption driver:** Since its GA in 2016, Microsoft reports that Q&A is now enabled in **