In [1]:
import os
import logging

processed_docs_path = "C:\BlueAI_bkp\data\processed\wikiextractor"


  processed_docs_path = "C:\BlueAI_bkp\data\processed\wikiextractor"


In [2]:
from phoenix.otel import register

# configure the Phoenix tracer
tracer_provider = register(
    project_name="Base-RAG",                 # your project
    endpoint="http://localhost:6006/v1/traces",  # Phoenix Docker HTTP collector
    protocol="grpc",               # force HTTP instead of gRPC
    auto_instrument=True,                   # auto-instrument LangChain + others
    batch=True, 
)

OpenTelemetry Tracing Details
|  Phoenix Project: Base-RAG
|  Span Processor: BatchSpanProcessor
|  Collector Endpoint: http://localhost:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [3]:
from pathlib import Path
from typing import Iterator, Dict, Any, List
import json
from dotenv import load_dotenv
import os

from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# NEW
from langchain_text_splitters import RecursiveCharacterTextSplitter
import tiktoken

load_dotenv('.env')


BASE_DIR = Path("data/processed/wikiextractor")
CHROMA_DIR = Path("chromadb")
EMBEDDING_MODEL = "text-embedding-3-small"


In [15]:
# os.getenv("OPENAI_API_KEY")

In [5]:
from typing import Any
from typing_extensions import Optional

from langchain.agents import create_agent, AgentState
from langchain.agents.middleware import before_model, after_model
from langchain.messages import HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnableConfig

from langgraph.checkpoint.memory import InMemorySaver

In [6]:

class ChatAgentState(AgentState):
    audit_state: Optional[bool]
    audit_remarks: Optional[str]
    initial_user_query: str
    retrieval_query: Optional[str]
    retrieved_docs: Optional[List[str]]

model = ChatOpenAI(
    model="gpt-5",   # or "gpt-4o", "gpt-4.1-mini", etc.
    temperature=0.0,
)

In [7]:
from typing import List, Tuple

from langchain_core.documents import Document
from langchain.tools import tool

@tool
def similarity_search_tool(state:ChatAgentState, query: str, k: int = 4, title:str | None = None) -> Tuple[str, List[Document]]:
    """
    Retrieve documents similar to the query using the vector store.

    Inputs: 
        1. Query(str) -> User query to perform vector search over vectorDB. 
        2. k:int (default 4) 
        3. title(str|None) -> title used for metadata filtering in the database.

    Returns:
        A (content, artifact) tuple:
        - content: serialized text for the chat model
        - artifact: raw List[Document] for downstream use / debugging
    """

    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
    print(F"Input to Similarity search tool: {query}")
    vectordb = Chroma(
        persist_directory="chromadb",
        embedding_function=embeddings,
        collection_name="wiki_short_150",
    )
    if title is None:
        doc_score_pairs = vectordb.similarity_search_with_relevance_scores(
            query=query,
            k=k
        )
    else:
        print(f"Title filter supplied: {title}")
        doc_score_pairs = vectordb.similarity_search_with_relevance_scores(
            query=query,
            k=k,
            filter={"title": title}
        )

    docs: List[Document] = [doc for doc, _ in doc_score_pairs]

    # 3) Serialize in an LLM-friendly, structured way
    parts = []
    for idx, (doc, score) in enumerate(doc_score_pairs, start=1):
        part = (
            f"### Document {idx}\n"
            f"relevance_score: {score:.4f}\n"
            f"metadata: {doc.metadata}\n"
            f"content:\n{doc.page_content}\n"
        )
        parts.append(part)

    content_for_llm = "\n\n".join(parts) if parts else "No documents found."

    # 4) Return string for LLM + raw docs for downstream use
    return {
        "retrieved_docs": f"{state['retrieved_docs']}\n\nRetrieval Query:{query}\n{content_for_llm}",
        "retrieval_query": query
    }


In [8]:
# similarity_search_tool.run("Who was Julius Caesar?")

In [9]:
from langchain.messages import AIMessage, ToolMessage
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

auditor_prompt = PromptTemplate.from_template(
"""
Your role is to audit a given retrieval question and determine whether the retrieval query (and its parameters) are sufficient to obtain meaningful results from the similarity_search_tool.
If not, you must suggest precise modifications that will improve retrieval.

You are NOT judging whether the retrieved content is perfectly factually correct in the real world.
You are ONLY judging whether the retrieved content is thematically relevant and provides some usable signal for the model to attempt an answer.

#### Description of similarity_search_tool:
- Inputs: retrieval_query (required), k (optional), title (optional).
- The tool returns top-k most similar chunks from the vector database.
- 'title' filters results by metadata, helping group related documents.
- Your job is to improve the retrieval query when needed.

#### Inputs:
retrieval_query used: {retrieval_query}
previous_tool_runs (tool outputs): {tool_messages}

#### Internal Reasoning Style (Chain-of-Thought):
- First, briefly reason about:
  - What the retrieval_query is trying to get.
  - Whether the retrieved chunks are on the same topic or closely related topics.
  - Whether there is at least some useful information that the model could use to form a partial or approximate answer.
- You should be tolerant of approximate or partially overlapping context.
  - Example: If the query is about “the last battle of Caesar” and the retrieved chunks talk about the “Battle of Munda” within the broader Caesar campaigns, treat this as relevant context, not a failure, even if there are nuances about campaigns like Pharsalus.
- Only consider it a failure if:
  - The chunks are clearly off-topic or unrelated, OR
  - There is essentially no substantive information that could help answer the query.

Keep your reasoning short (2–4 sentences) and do NOT include external world knowledge beyond what is implied by the query and tool outputs.

#### Rules:
1. You must ONLY analyze the retrieval_query and the tool outputs (previous_tool_runs).
2. DO NOT invent new knowledge about the world or query subject.
3. Determine whether the retrieved context (seen in previous_tool_runs) is reasonably sufficient to let the model attempt an answer:
   - If at least one or two chunks are on-topic and contain some relevant information, even if incomplete or not perfectly aligned, treat this as sufficient.
   - Only mark as insufficient when the retrieved chunks are mostly or entirely irrelevant, empty, or contain no usable signal.
4. If sufficient → audit_state = true and audit_remarks = "Successful".
5. If insufficient → audit_state = false and audit_remarks must contain:
     - The specific issue (e.g., missing title, query too broad, irrelevant results, too few retrieved chunks).
     - A concrete suggestion on how to modify the retrieval_query, title, or k (e.g., “increase k from 4 to 10”, “add a title filter related to ‘Roman Empire’”, “make the query more specific to XYZ”).

#### Output Format (JSON only):
{{
  "audit_state": true | false,
  "audit_remarks": "Successful" | "Failure: <explanation + suggested changes>"
}}

"""
)


@tool
def audit_node(state: ChatAgentState):
    """
    Tool is used to Audit the status of a response, specifically in scenarios where it returns I don't know or is unable to answer. 
    """
    chat_history = state['messages']
    latest_ai_message = ""
    for message in chat_history:
        if isinstance(message, AIMessage):
            latest_ai_message=message
    # tool_messages = [
    # msg for msg in chat_history
    # if isinstance(msg, ToolMessage) and msg.name == "similarity_search_tool"
    # ]
    tool_messages = state['retrieved_docs']

    chain = auditor_prompt | model | JsonOutputParser()

    output = chain.invoke({
        "initial_user_query": state['initial_user_query'],
        "retrieval_query": state['retrieval_query'],
        "tool_messages": tool_messages,
    })
    print("Auditor Ouptut: ",output)
    # print(f"{output['messages'][0].content}")
    return {
        "audit_state": output['audit_state'],
        "audit_remarks": output['audit_remarks']
    }

In [10]:
# audit_node("What river did he cross?","turn 1: Caesar was a king. turn 2: I dont know")

In [11]:


@before_model
def pre_model(state: ChatAgentState, runtime) -> dict[str, Any] | None:
    """
    Runs before the LLM is called.
    You can:
      - trim / summarize messages
      - inject extra context
      - modify state["remaining_steps"], etc.
    Return a partial state update or None for no-op.
    """
    # Example NO-OP (scaffolding only)
    # You could, for example, keep only the last N messages here.
    return None


@after_model
def post_model(state: ChatAgentState, runtime) -> dict[str, Any] | None:
    """
    Runs after the LLM responds (but before tools run again).
    You can:
      - validate / edit model output
      - add guardrails
      - log things
    Return a partial state update or None for no-op.
    """
    return None

checkpointer = InMemorySaver()

# --- 5. Build the agent -------------------------------------------------------
tools = [similarity_search_tool, audit_node]

# #### Rules: 
# 1. You cannot answer from beyond the retrieved documents. 
# 2. Use the similarity_search_tool when you need documents to answer a user's question. DO NOT embellish the question with your own knowledge. All knwoledge must come from chat history and retreived documents                 
# 3. When the retrieved documents do not contain the data to a question, use the auditor tool and audit remarks to improve the query being sent to the similarity_search_tool. 
# 4. Always provide the title and url as wel as the chunk_index of the passage that is being used. 
# 5. Provide the user's query directly to the similiarty_search_tool
# 6. Once audit status turns to True, you can end the process

# #### Situations where context is you are unable to answer a question
# 1. Call the RAG Auditor tool. Rephrase the question based on Audit and provide the answer 
# 2. Retry a maximum of 2 times, If data is not found, return with "I am sorry, I am unable to answer the question.

system_prompt = """
#### Role: 
- You are a chat assistant that is developed to answer the user's question. 

#### Rules: 
1. You cannot answer from beyond the retrieved documents.
2. Pass the user's question to the similarity_search_tool, **DO NOT** use your own data while rephrasing questions. The rephrasing should be driven through Auditor feedback Or Previously messages ONLY.
3. Get the output Auditted.
4. Once Audit Status is True, provide your output
5. Always provide the title and url as wel as the chunk_index of the passage that is being used. 

## Examples
#### Example 1: 


Inputs: 
Initial user query: {initial_user_query}
Audit Status: {audit_status}
Audit Remarks: {audit_remarks}
"""

agent = create_agent(
    model=model,
    tools=tools,
    system_prompt=system_prompt,
    state_schema=ChatAgentState,          # ensures state has messages + remaining_steps
    middleware=[pre_model, post_model], # hooks for future context management
    checkpointer=checkpointer,          # short-term memory (thread-scoped)
)

# @tool
# def audit_node(state: )

# --- 6. Simple helper for calling the agent -----------------------------------
def run_agent(user_query: str, thread_id: str = "default") -> str:
    """
    Thin wrapper to send a message into the agent and get the final reply content.
    `thread_id` controls the short-term memory thread.
    """
    config: RunnableConfig = {
        "configurable": {
            "thread_id": thread_id,
        }
    }

    state = agent.invoke(
        {
            "initial_user_query": user_query,
        },
        config=config,
    )

    final_msg = state["messages"][-1]
    return final_msg.content


In [12]:
# run_agent("What's so special about this letter?")

In [13]:

# --- State scaffold (adapt to your AgentState if you have a TypedDict) ---

from langchain.messages import AIMessage


def make_initial_state(max_steps: int = 8) -> Dict[str, Any]:
    return {
        "messages": [],          # type: list[BaseMessage]
        "remaining_steps": max_steps,
        'audit_status': None,
        "audit_remarks": None,
        "initial_user_query": ""
    }


# --- Single turn runner (with checkpointer config) ---

def run_one_turn(
    agent,
    state: Dict[str, Any],
    thread_id: str,
) -> Dict[str, Any]:
    """
    Runs a single 'turn' of your agent given the current state.
    Adds the required LangGraph config for the checkpointer.
    """
    config = {
        "configurable": {
            "thread_id": thread_id,
            # add "checkpoint_ns" or "checkpoint_id" here if your graph needs them
        }
    }

    new_state = agent.invoke(state, config=config)
    return new_state


# --- Turn-based conversation over a list of questions ---

def simulate_turn_based_conversation(
    agent,
    questions: List[str],
    max_steps: int = 8,
    thread_id: str | None = None,
) -> Dict[str, Any]:
    """
    1. Creates an initial state.
    2. For each question:
       - appends a HumanMessage
       - calls the agent
       - prints the latest AIMessage
    """
    if thread_id is None:
        thread_id = f"test-thread-{uuid.uuid4()}"

    state = make_initial_state(max_steps=max_steps)

    for turn_idx, question in enumerate(questions, start=1):
        print(f"\n========== TURN {turn_idx} ==========")
        print(f"User: {question}")

        # append HumanMessage instead of dict
        state["messages"].append(HumanMessage(content=question))
        state['initial_user_query'] = question

        # run the agent for this turn
        state = run_one_turn(agent, state, thread_id=thread_id)

        # find the last AIMessage and print it
        assistant_msgs = [m for m in state["messages"] if isinstance(m, AIMessage)]
        if assistant_msgs:
            last_assistant = assistant_msgs[-1]
            print(f"Assistant: {last_assistant.content}")
        else:
            print("Assistant: <no AIMessage found in state>")

        if "remaining_steps" in state:
            print(f"(remaining_steps: {state['remaining_steps']})")

    return state


In [14]:
questions = [
    # --- PHASE 1: History Base Context ---
    "Who was Julius Caesar?",
    "Which major battle marked the end of his civil war?",
    "Without naming Caesar, tell me the river he famously crossed.",
    
    # --- PHASE 2: Geography Shift ---
    "What is the capital of Argentina?",
    "Name one UNESCO World Heritage site in that country.",
    "Compare the population of Buenos Aires to the city where Caesar was assassinated.",
    
    # --- PHASE 3: Science Injection ---
    "What is the chemical formula of water?",
    "What property of water allows insects like water striders to walk on its surface?",
    "Is that property more related to cohesion or adhesion?",
    
    # --- PHASE 4: Literature Divergence ---
    "Who wrote 'Pride and Prejudice'?",
    "Without naming the author, describe the central theme.",
    "Does that theme relate in any way to the political alliances Caesar formed?",
    
    # --- PHASE 5: Return to Geography ---
    "Earlier we spoke about a capital city. Which city was it?",
    "Now tell me one major river running through that city.",
    
    # --- PHASE 6: Animals / Biology ---
    "What is the largest species of shark?",
    "Where in the world's oceans is it most commonly found?",
    "Compare the size of this shark to the height of the tallest mountain in the world.",
    
    # --- PHASE 7: Aviation ---
    "What is the Boeing 747 commonly nicknamed?",
    "Which airline was the first to operate it commercially?",
    "How does its typical cruising altitude compare to the elevation of Mount Everest?",
    
    # --- PHASE 8: Sports ---
    "Who holds the record for the most goals in World Cup history?",
    "Which national team did he represent?",
    "Does that team share a continent with the capital city we mentioned earlier?",
    
    # --- PHASE 9: Return to Early Context ---
    "Back to chemistry: what is the pH of pure water at room temperature?",
    "And how does that compare to the acidity of lemon juice?",
    
    # --- PHASE 10: Movies ---
    "Who directed the movie 'Inception'?",
    "Name one major theme of this film.",
    "Is that theme conceptually similar to the literary theme we discussed earlier?",
    
    # --- PHASE 11: Space ---
    "What is the largest planet in our solar system?",
    "What is the name of its most famous storm?",
    "Is that storm larger or smaller than Earth?",
    
    # --- PHASE 12: Politics / Return to Caesar ---
    "Which Roman leader succeeded Caesar as the first Emperor?",
    "What relationship did he have with Caesar?",
    "Does this familial relationship relate to any theme discussed in the novel earlier?",
    
    # --- PHASE 13: Mathematics ---
    "What is the value of Pi rounded to 5 decimal places?",
    "Is Pi a rational or irrational number?",
    "Compare this mathematical concept to the precision required in aviation altimeters.",
    
    # --- PHASE 14: Companies / Technology ---
    "Who founded Microsoft?",
    "Which operating system became its early mainstream success?",
    "Is that operating system older or younger than the movie 'Inception'?",
    
    # --- PHASE 15: FINAL CONTEXT STRESS ---
    "Earlier, we talked about an animal, a city, a storm, and a political alliance. List all four without explanation.",
    "Now, from those four, which one existed first historically?",
    "Finally, relate that oldest entity to the theme of 'power' we discussed in one of the earlier topics."
]


final_state = simulate_turn_based_conversation(
    agent,
    questions,
    max_steps=8,
    thread_id="dev-session-2",  # or let it auto-generate
)



User: Who was Julius Caesar?
Input to Similarity search tool: Who was Julius Caesar?


  vectordb = Chroma(


Auditor Ouptut:  {'audit_state': True, 'audit_remarks': 'Successful'}
Assistant: Gaius Julius Caesar (12 July 100 BC – 15 March 44 BC) was a Roman military commander, politician, and author at the end of the Roman Republic. He joined the First Triumvirate, later fought and won a civil war against Pompey the Great, and ruled as Roman dictator until his assassination on the Ides of March, 44 BC, at the Curia of Pompey in Rome. He is widely regarded as one of history’s greatest military commanders, and his name “Caesar” became a title for emperors, influencing words like Kaiser and Tsar.

Sources used:
- Title: Julius Caesar | URL: https://simple.wikipedia.org/wiki?curid=5940 | chunk_index: 0
- Title: Julius Caesar | URL: https://simple.wikipedia.org/wiki?curid=5940 | chunk_index: 1
- Title: Battle of Thapsus | URL: https://simple.wikipedia.org/wiki?curid=1006782 | chunk_index: 1

User: Which major battle marked the end of his civil war?
Input to Similarity search tool: Which major battle

TypeError: Expected either a dictionary with a 'type' key or an object with a 'type' attribute. Instead got type <class 'dict'>.

In [None]:
agent.get_state({"configurable": {"thread_id":"dev-session-2"}})

In [None]:
agent.checkpointer.get({"configurable": {"thread_id": "debug-thread-1"}})