In [1]:
import os
import logging

processed_docs_path = "C:\BlueAI_bkp\data\processed\wikiextractor"


  processed_docs_path = "C:\BlueAI_bkp\data\processed\wikiextractor"


In [2]:
from phoenix.otel import register

# configure the Phoenix tracer
tracer_provider = register(
    project_name="Base-RAG",                 # your project
    endpoint="http://localhost:6006/v1/traces",  # Phoenix Docker HTTP collector
    protocol="grpc",               # force HTTP instead of gRPC
    auto_instrument=True,                   # auto-instrument LangChain + others
    batch=True, 
)

OpenTelemetry Tracing Details
|  Phoenix Project: Base-RAG
|  Span Processor: BatchSpanProcessor
|  Collector Endpoint: http://localhost:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [3]:
for folder in os.listdir(processed_docs_path):
    for file in (os.listdir(f"{processed_docs_path}/{folder}"))[0]:
        print(f"{folder}: {file}")

AA: w
AA: i
AA: k
AA: i
AA: _
AA: 0
AA: 0
AB: w
AB: i
AB: k
AB: i
AB: _
AB: 0
AB: 0
AC: w
AC: i
AC: k
AC: i
AC: _
AC: 0
AC: 0


In [4]:
example_path = "C:\BlueAI_bkp\data\processed\wikiextractor\AA\wiki_00"

with open(example_path, "r") as f:
    lines = f.readlines()


  example_path = "C:\BlueAI_bkp\data\processed\wikiextractor\AA\wiki_00"


In [5]:
import json 

for line in lines:
    article_json = json.loads(line) 
    text = article_json['text']
    print(text)
    break

April (Apr.) is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.
April always begins on the same day of the week as July, and additionally, January in leap years. April always ends on the same day of the week as December.
The Month.
April comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.
April begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other's last days are exactly 35 weeks (245 days) apart.
In common years, April starts on the same day of the week as October of the previous year, and in leap years, May of the previous year. In common years, April finishes on the same day of the week as July of the previous yea

In [6]:
from pathlib import Path
from typing import Iterator, Dict, Any, List
import json
from dotenv import load_dotenv
import os

from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# NEW
from langchain_text_splitters import RecursiveCharacterTextSplitter
import tiktoken

load_dotenv('.env')


BASE_DIR = Path("data/processed/wikiextractor")
CHROMA_DIR = Path("chromadb")
EMBEDDING_MODEL = "text-embedding-3-small"


# ---------- JSONL READER ----------

def iter_jsonl_files(base_dir: Path = BASE_DIR) -> Iterator[Path]:
    for subdir in sorted(base_dir.iterdir()):
        if not subdir.is_dir():
            continue
        for fp in sorted(subdir.glob("wiki_*")):
            if fp.is_file():
                yield fp


def iter_wiki_objects(base_dir: Path = BASE_DIR) -> Iterator[Dict[str, Any]]:
    for fp in iter_jsonl_files(base_dir):
        with fp.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                obj = json.loads(line)

                title = obj.get("title", "")
                text = obj.get("text", "")

                yield {
                    "id": obj.get("id"),
                    "title": title,
                    "text": text,
                    "metadata": {
                        "title": title,
                        "source_file": str(fp),
                        "revid": obj.get("revid"),
                        "url": obj.get("url"),
                    },
                }


# ---------- EMBEDDING CLIENT ----------

class WikiEmbeddingClient:
    def __init__(self, model: str = EMBEDDING_MODEL):
        self._emb = OpenAIEmbeddings(model=model)

    def embed_text(self, text: str) -> List[float]:
        return self._emb.embed_query(text)

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        return self._emb.embed_documents(texts)


# ---------- CHROMA PIPELINE ----------

def build_chroma_from_wiki(
    base_dir: Path = BASE_DIR,
    chroma_dir: Path = CHROMA_DIR,
    model: str = EMBEDDING_MODEL,
) -> Chroma:
    chroma_dir.mkdir(parents=True, exist_ok=True)

    embeddings = OpenAIEmbeddings(model=model, api_key=os.getenv("OPENAI_API_KEY"))

    # NEW: token-based recursive splitter, 500 token chunks, 125 overlap
    encoding = tiktoken.get_encoding("o200k_base")

    def _token_len(text: str) -> int:
        return len(encoding.encode(text))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=150,
        chunk_overlap=30,
        length_function=_token_len,
    )

    docs: List[Document] = []
    for obj in iter_wiki_objects(base_dir):
        text = obj["text"]
        if not text:
            continue

        base_metadata = obj["metadata"]

        # NEW: split into chunks, keep metadata (incl. title, url) on each chunk
        chunks = text_splitter.split_text(text)
        for i, chunk in enumerate(chunks):
            docs.append(
                Document(
                    page_content=chunk,
                    metadata={**base_metadata, "chunk_index": i},
                )
            )

    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        collection_name="wiki_short_150",
        persist_directory=str(chroma_dir),
    )
    vectordb.persist()
    return vectordb


In [8]:
# db = build_chroma_from_wiki()

In [9]:
from typing import List, Tuple

from langchain_core.documents import Document
from langchain.tools import tool

@tool(response_format="content_and_artifact")
def similarity_search_tool(query: str, k: int = 4) -> Tuple[str, List[Document]]:
    """
    Retrieve documents similar to the query using the vector store.

    Returns:
        A (content, artifact) tuple:
        - content: serialized text for the chat model
        - artifact: raw List[Document] for downstream use / debugging
    """
    print(f"Input to similarity search tool: {query}")
    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

    vectordb = Chroma(
        persist_directory="chromadb",
        embedding_function=embeddings,
        collection_name="wiki_short_2",
    )

    docs: List[Document] = vectordb.similarity_search_with_relevance_scores(
        query, k=k
    )
    doc_score_pairs = vectordb.similarity_search_with_relevance_scores(
        query=query,
        k=k,
    )

    docs: List[Document] = [doc for doc, _ in doc_score_pairs]

    # 3) Serialize in an LLM-friendly, structured way
    parts = []
    for idx, (doc, score) in enumerate(doc_score_pairs, start=1):
        part = (
            f"### Document {idx}\n"
            f"relevance_score: {score:.4f}\n"
            f"metadata: {doc.metadata}\n"
            f"content:\n{doc.page_content}\n"
        )
        parts.append(part)

    content_for_llm = "\n\n".join(parts) if parts else "No documents found."

    # 4) Return string for LLM + raw docs for downstream use
    return content_for_llm, docs


In [10]:
# similarity_search_tool.run("Tell me about the letter a")

In [11]:
from typing import Any

from langchain.agents import create_agent, AgentState
from langchain.agents.middleware import before_model, after_model
from langchain.messages import HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnableConfig

from langgraph.checkpoint.memory import InMemorySaver

# --- 0. Your tool import ------------------------------------------------------
# Assuming you defined it with @tool(response_format="content_and_artifact")


# --- 1. Model -----------------------------------------------------------------
# Any OpenAI-compatible chat model; adjust name/params as needed.
model = ChatOpenAI(
    model="gpt-5-nano",   # or "gpt-4o", "gpt-4.1-mini", etc.
    temperature=0.0,
)


# --- 2. State schema (short-term memory) --------------------------------------
# AgentState already provides:
#   - messages: list[AnyMessage]
#   - remaining_steps: int
# You can extend this later with extra keys if needed.
class MyAgentState(AgentState):
    # Example placeholder for future context fields:
    # conversation_topic: str
    pass


# --- 3. In-memory short-term memory (checkpointer) ----------------------------
# This keeps per-thread conversation state in memory only (not persisted).
checkpointer = InMemorySaver()


# --- 4. Pre / post model hooks (middleware) -----------------------------------
# These are the “pre_model_hook / post_model_hook” equivalents in LangChain v1.
# They give you a clean place to do context management later (trimming, summarizing, etc.).

@before_model
def pre_model(state: MyAgentState, runtime) -> dict[str, Any] | None:
    """
    Runs before the LLM is called.
    You can:
      - trim / summarize messages
      - inject extra context
      - modify state["remaining_steps"], etc.
    Return a partial state update or None for no-op.
    """
    # Example NO-OP (scaffolding only)
    # You could, for example, keep only the last N messages here.
    return None


@after_model
def post_model(state: MyAgentState, runtime) -> dict[str, Any] | None:
    """
    Runs after the LLM responds (but before tools run again).
    You can:
      - validate / edit model output
      - add guardrails
      - log things
    Return a partial state update or None for no-op.
    """
    return None


# --- 5. Build the agent -------------------------------------------------------
tools = [similarity_search_tool]

system_prompt = """
#### Role: 
- You are a chat assistant that is developed to answer the user's question. 

#### Rules: 
1. You cannot answer from beyond the retrieved documents. 
2. Use the similarity_search_tool when you need documents to answer a question. 
3. When the retrieved documents do not contain the data to a question, you simply return saying "I don't know". 
4. Always provide the title and url as well as chunk_index from the metadata of the passage that is being used. 
"""

agent = create_agent(
    model=model,
    tools=tools,
    system_prompt=system_prompt,
    state_schema=MyAgentState,          # ensures state has messages + remaining_steps
    middleware=[pre_model, post_model], # hooks for future context management
    checkpointer=checkpointer,          # short-term memory (thread-scoped)
)


# --- 6. Simple helper for calling the agent -----------------------------------
def run_agent(user_query: str, thread_id: str = "default") -> str:
    """
    Thin wrapper to send a message into the agent and get the final reply content.
    `thread_id` controls the short-term memory thread.
    """
    config: RunnableConfig = {
        "configurable": {
            "thread_id": thread_id,
        }
    }

    state = agent.invoke(
        {
            "messages": [
                HumanMessage(content=user_query)
            ]
        },
        config=config,
    )

    final_msg = state["messages"][-1]
    return final_msg.content


In [12]:
# run_agent("What's so special about this letter?")

In [13]:
from typing import List, Dict, Any
import uuid

from langchain_core.messages import HumanMessage, AIMessage, BaseMessage


# --- State scaffold (adapt to your AgentState if you have a TypedDict) ---

def make_initial_state(max_steps: int = 8) -> Dict[str, Any]:
    return {
        "messages": [],          # type: list[BaseMessage]
        "remaining_steps": max_steps,
    }


# --- Single turn runner (with checkpointer config) ---

def run_one_turn(
    agent,
    state: Dict[str, Any],
    thread_id: str,
) -> Dict[str, Any]:
    """
    Runs a single 'turn' of your agent given the current state.
    Adds the required LangGraph config for the checkpointer.
    """
    config = {
        "configurable": {
            "thread_id": thread_id,
            # add "checkpoint_ns" or "checkpoint_id" here if your graph needs them
        }
    }

    new_state = agent.invoke(state, config=config)
    return new_state


# --- Turn-based conversation over a list of questions ---

def simulate_turn_based_conversation(
    agent,
    questions: List[str],
    max_steps: int = 8,
    thread_id: str | None = None,
) -> Dict[str, Any]:
    """
    1. Creates an initial state.
    2. For each question:
       - appends a HumanMessage
       - calls the agent
       - prints the latest AIMessage
    """
    if thread_id is None:
        thread_id = f"test-thread-{uuid.uuid4()}"

    state = make_initial_state(max_steps=max_steps)

    for turn_idx, question in enumerate(questions, start=1):
        print(f"\n========== TURN {turn_idx} ==========")
        print(f"User: {question}")

        # append HumanMessage instead of dict
        state["messages"].append(HumanMessage(content=question))

        # run the agent for this turn
        state = run_one_turn(agent, state, thread_id=thread_id)

        # find the last AIMessage and print it
        assistant_msgs = [m for m in state["messages"] if isinstance(m, AIMessage)]
        if assistant_msgs:
            last_assistant = assistant_msgs[-1]
            print(f"Assistant: {last_assistant.content}")
        else:
            print("Assistant: <no AIMessage found in state>")

        if "remaining_steps" in state:
            print(f"(remaining_steps: {state['remaining_steps']})")

    return state


In [14]:
questions = [
    # --- PHASE 1: History Base Context ---
    "Who was Julius Caesar?",
    "Which major battle marked the end of his civil war?",
    "Without naming Caesar, tell me the river he famously crossed.",
    
    # --- PHASE 2: Geography Shift ---
    "What is the capital of Argentina?",
    "Name one UNESCO World Heritage site in that country.",
    "Compare the population of Buenos Aires to the city where Caesar was assassinated.",
    
    # --- PHASE 3: Science Injection ---
    "What is the chemical formula of water?",
    "What property of water allows insects like water striders to walk on its surface?",
    "Is that property more related to cohesion or adhesion?",
    
    # --- PHASE 4: Literature Divergence ---
    "Who wrote 'Pride and Prejudice'?",
    "Without naming the author, describe the central theme.",
    "Does that theme relate in any way to the political alliances Caesar formed?",
    
    # --- PHASE 5: Return to Geography ---
    "Earlier we spoke about a capital city. Which city was it?",
    "Now tell me one major river running through that city.",
    
    # --- PHASE 6: Animals / Biology ---
    "What is the largest species of shark?",
    "Where in the world's oceans is it most commonly found?",
    "Compare the size of this shark to the height of the tallest mountain in the world.",
    
    # --- PHASE 7: Aviation ---
    "What is the Boeing 747 commonly nicknamed?",
    "Which airline was the first to operate it commercially?",
    "How does its typical cruising altitude compare to the elevation of Mount Everest?",
    
    # --- PHASE 8: Sports ---
    "Who holds the record for the most goals in World Cup history?",
    "Which national team did he represent?",
    "Does that team share a continent with the capital city we mentioned earlier?",
    
    # --- PHASE 9: Return to Early Context ---
    "Back to chemistry: what is the pH of pure water at room temperature?",
    "And how does that compare to the acidity of lemon juice?",
    
    # --- PHASE 10: Movies ---
    "Who directed the movie 'Inception'?",
    "Name one major theme of this film.",
    "Is that theme conceptually similar to the literary theme we discussed earlier?",
    
    # --- PHASE 11: Space ---
    "What is the largest planet in our solar system?",
    "What is the name of its most famous storm?",
    "Is that storm larger or smaller than Earth?",
    
    # --- PHASE 12: Politics / Return to Caesar ---
    "Which Roman leader succeeded Caesar as the first Emperor?",
    "What relationship did he have with Caesar?",
    "Does this familial relationship relate to any theme discussed in the novel earlier?",
    
    # --- PHASE 13: Mathematics ---
    "What is the value of Pi rounded to 5 decimal places?",
    "Is Pi a rational or irrational number?",
    "Compare this mathematical concept to the precision required in aviation altimeters.",
    
    # --- PHASE 14: Companies / Technology ---
    "Who founded Microsoft?",
    "Which operating system became its early mainstream success?",
    "Is that operating system older or younger than the movie 'Inception'?",
    
    # --- PHASE 15: FINAL CONTEXT STRESS ---
    "Earlier, we talked about an animal, a city, a storm, and a political alliance. List all four without explanation.",
    "Now, from those four, which one existed first historically?",
    "Finally, relate that oldest entity to the theme of 'power' we discussed in one of the earlier topics."
]


final_state = simulate_turn_based_conversation(
    agent,
    questions,
    max_steps=8,
    thread_id="dev-session-1",  # or let it auto-generate
)



User: Who was Julius Caesar?
Input to similarity search tool: Julius Caesar


  vectordb = Chroma(


Assistant: Julius Caesar was a central figure in the late Roman Republic—a Roman general (military commander), politician, and author who played a key role in the events that led to the end of the Republic. He became a member of the First Triumvirate, then fought a civil war against Pompey the Great, and eventually became Roman dictator until his assassination. He was born on July 12, 100 BC, and was killed on March 15, 44 BC (the Ides of March). His death was followed by famous stories such as the line “Et tu, Brute?” and he is the source of the surname Caesar, which became a title used for emperors (influencing words like Kaiser and Tsar).

Sources:
- Julius Caesar (title: Julius Caesar), https://simple.wikipedia.org/wiki?curid=5940, chunk_index 0
  "Gaius Julius Caesar (12 July 100 BC – 15 March 44 BC) was a military commander, politician and author at the end of the Roman Republic." 
- Julius Caesar (title: Julius Caesar), https://simple.wikipedia.org/wiki?curid=5940, chunk_index 9

In [15]:
agent.get_state({"configurable": {"thread_id":"dev-session-1"}})

StateSnapshot(values={'messages': [HumanMessage(content='Who was Julius Caesar?', additional_kwargs={}, response_metadata={}, id='303ff526-d221-4f52-b1dd-d5822d708e0d'), AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 353, 'prompt_tokens': 287, 'total_tokens': 640, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 320, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-Ckp0nJHaaV4KbH8wgDoxIyRskDwli', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019b029a-1768-7041-b158-82e8d5965d21-0', tool_calls=[{'name': 'similarity_search_tool', 'args': {'query': 'Julius Caesar', 'k': 4}, 'id': 'call_Jh3hEc9PDa5b3akBNbaIusK0', 'type': 'tool_call'}], usage_metadata={'input_tokens': 2