## Setup

In [1]:
!pip -q uninstall -y numpy scipy scikit-learn sentence-transformers

# Using Colab-stable numpy 2.x stack (Colab upgraded to 2.0.2 and friends) :contentReference[oaicite:1]{index=1}
!pip -q install --no-cache-dir --force-reinstall \
  "numpy==2.0.2" "scipy==1.13.1" "scikit-learn==1.5.2"

# Installing the rest (pin lightly to avoid downgrading numpy)
!pip -q install --no-cache-dir \
  chromadb sentence-transformers transformers evaluate sacrebleu

# Optional (for RAGAS comparable with baseline OpenAI judge)
!pip -q install --no-cache-dir ragas datasets langchain-community langchain-openai openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m297.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m328.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.2/38.2 MB[0m [31m233.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m315.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.1/309.1 kB[0m [31m382.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
hdbscan 0.8.41 requires scikit-learn>=1.6, but you have scikit-learn 1.5.2 which is incompa

### Mounting Drive + auto-detecting paths

In [3]:
from google.colab import drive
drive.mount("/content/drive")

import os, json
from pathlib import Path

def find_first(root, patterns):
    root = Path(root)
    for pat in patterns:
        hits = list(root.rglob(pat))
        if hits:
            # pick shortest path (usually the intended one)
            hits = sorted(hits, key=lambda p: len(str(p)))
            return str(hits[0])
    return None

# Try the expected location first (from your baseline notebook)
EXPECTED_BASE = "/content/drive/MyDrive/agentic-rag-telecom-thesis"
expected_trace = f"{EXPECTED_BASE}/results/traces/baseline_traces.jsonl"
expected_chroma = f"{EXPECTED_BASE}/results/chroma_baseline"

if os.path.exists(expected_trace):
    BASE_DIR  = EXPECTED_BASE
    TRACE_IN  = expected_trace
    CHROMA_DIR = expected_chroma
else:
    # Auto-find anywhere under MyDrive (handles if your folder name/path differs)
    TRACE_IN = find_first("/content/drive/MyDrive", [
        "baseline_traces.jsonl",
        "*baseline*traces*.jsonl",
    ])
    if TRACE_IN is None:
        raise FileNotFoundError(
            "Could not find baseline_traces.jsonl under /content/drive/MyDrive.\n"
            "Run: !ls -R /content/drive/MyDrive | grep baseline_traces.jsonl"
        )
    # infer BASE_DIR as the folder above '/results/...'
    if "/results/" in TRACE_IN:
        BASE_DIR = TRACE_IN.split("/results/")[0]
    else:
        BASE_DIR = str(Path(TRACE_IN).parents[3])

    # find chroma_baseline near BASE_DIR (or anywhere in BASE_DIR)
    CHROMA_DIR = find_first(BASE_DIR, ["chroma_baseline", "*chroma*baseline*"])
    if CHROMA_DIR is None:
        # fallback global search
        CHROMA_DIR = find_first("/content/drive/MyDrive", ["chroma_baseline", "*chroma*baseline*"])
    if CHROMA_DIR is None:
        raise FileNotFoundError("Could not find chroma_baseline directory under Drive.")

TRACE_OUT = f"{BASE_DIR}/results/traces/react_traces.jsonl"
OUT_CSV   = f"{BASE_DIR}/results/react_summary.csv"

print("BASE_DIR:", BASE_DIR)
print("TRACE_IN:", TRACE_IN)
print("CHROMA_DIR:", CHROMA_DIR)
print("TRACE_OUT:", TRACE_OUT)

Mounted at /content/drive
✅ BASE_DIR: /content/drive/MyDrive/agentic-rag-telecom-thesis
✅ TRACE_IN: /content/drive/MyDrive/agentic-rag-telecom-thesis/results/traces/baseline_traces.jsonl
✅ CHROMA_DIR: /content/drive/MyDrive/agentic-rag-telecom-thesis/results/chroma_baseline
✅ TRACE_OUT: /content/drive/MyDrive/agentic-rag-telecom-thesis/results/traces/react_traces.jsonl


## Setup (reuse SAME QA + SAME Chroma; FLAN-T5 generation via model.generate)


In [5]:
import time, re, csv, random
import numpy as np
import chromadb
from sentence_transformers import SentenceTransformer
import evaluate

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

N_QA = 50

rows = [json.loads(l) for l in open(TRACE_IN, "r", encoding="utf-8")]
rows = rows[:min(N_QA, len(rows))]
questions = [r["query"] for r in rows]
refs      = [r["reference"] for r in rows]

top_k = rows[0].get("top_k", 5)

# Chroma collection from baseline
client = chromadb.PersistentClient(path=CHROMA_DIR)
COLLECTION_NAME = "doc2dial_baseline"
col = client.get_collection(COLLECTION_NAME)

# Same embedder as baseline
emb = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def retrieve(query, k=5):
    q_emb = emb.encode([query], normalize_embeddings=True).tolist()[0]
    res = col.query(query_embeddings=[q_emb], n_results=k, include=["documents","metadatas"])
    return res["documents"][0], res["metadatas"][0]

# FLAN-T5 (no pipeline)
device = "cuda" if torch.cuda.is_available() else "cpu"
tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(device)
model.eval()

@torch.inference_mode()
def generate_text(prompt: str, max_new_tokens: int = 256) -> str:
    inputs = tok(prompt, return_tensors="pt", truncation=True).to(device)
    out_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,     # deterministic (fairer)
        num_beams=1
    )
    return tok.decode(out_ids[0], skip_special_tokens=True).strip()

rouge = evaluate.load("rouge")
bleu  = evaluate.load("sacrebleu")

print("Loaded QA:", len(questions), "| top_k:", top_k, "| device:", device)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Loaded QA: 50 | top_k: 5 | device: cuda


### ReAct loop + metrics + save traces + summary CSV

In [6]:
SYSTEM = """You are a telecom customer-support assistant.
You MUST use only information found in the SEARCH results.
If not found, say: I don't know.

Use this format exactly:
Thought: ...
Action: Search[query]
Observation: ...
(Repeat up to 3 times)
Final: ..."""

def react_answer(question, k=5, max_steps=3):
    t0 = time.time()
    scratch = ""
    used_contexts = []   # list[str]
    used_meta = []       # list[dict]
    action_queries = []

    forced_first_search = True

    for step in range(max_steps):
        prompt = SYSTEM + "\n\n" + scratch + f"\nQuestion: {question}\n"
        out = generate_text(prompt, max_new_tokens=256)

        if "Final:" in out:
            final = out.split("Final:", 1)[1].strip()
            return final, time.time()-t0, step+1, action_queries, used_contexts, used_meta

        m = re.search(r"Action:\s*Search\[(.*?)\]", out, re.S)
        q2 = m.group(1).strip() if m else None

        if (q2 is None) and forced_first_search:
            q2 = question
            forced_first_search = False

        if q2 is None:
            break

        action_queries.append(q2)
        docs, metas = retrieve(q2, k=k)

        obs_lines = [f"{i+1}. {d[:350]}" for i, d in enumerate(docs[:3])]
        observation = "\n".join(obs_lines)

        used_contexts.extend(docs)
        used_meta.extend(metas)

        scratch += f"\n{out}\nObservation:\n{observation}\n"

    # Fallback: single-shot RAG on original question
    docs, metas = retrieve(question, k=k)
    context = "\n\n".join([f"[{m.get('doc_id')}#{m.get('chunk_id')}] {d}" for d, m in zip(docs, metas)])
    prompt = (
        "Answer the QUESTION using only the CONTEXT. If not present, say: I don't know.\n\n"
        f"CONTEXT:\n{context}\n\nQUESTION: {question}\nANSWER:"
    )
    final = generate_text(prompt, max_new_tokens=256)
    return final, time.time()-t0, max_steps, action_queries, docs, metas

preds, lats = [], []

os.makedirs(os.path.dirname(TRACE_OUT), exist_ok=True)
os.makedirs(os.path.dirname(OUT_CSV), exist_ok=True)

with open(TRACE_OUT, "w", encoding="utf-8") as f:
    for i, (q, ref) in enumerate(zip(questions, refs), 1):
        ans, dt, steps, aq, ctxs, metas = react_answer(q, k=top_k, max_steps=3)
        preds.append(ans)
        lats.append(dt)

        f.write(json.dumps({
            "i": i,
            "query": q,
            "prediction": ans,
            "reference": ref,
            "latency_s": float(dt),
            "steps": int(steps),
            "top_k": int(top_k),
            "action_queries": aq,
            "contexts": (ctxs[:top_k*3] if isinstance(ctxs, list) else []),
            "retrieved": [{"doc_id": m.get("doc_id"), "chunk_id": m.get("chunk_id")} for m in (metas if isinstance(metas, list) else [])]
        }, ensure_ascii=False) + "\n")

        if i % 10 == 0:
            print(f"Processed {i}/{len(questions)}")

rou = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
ble = bleu.compute(predictions=preds, references=[[r] for r in refs])

lat_mean = float(np.mean(lats))
lat_med  = float(np.median(lats))
lat_p95  = float(np.percentile(lats, 95))

print("\n=== ReAct ===")
print("ROUGE-L:", rou["rougeL"])
print("BLEU:", ble["score"])
print("Latency mean/median/p95:", lat_mean, lat_med, lat_p95)
print("Saved traces:", TRACE_OUT)

with open(OUT_CSV, "w", newline="", encoding="utf-8") as fp:
    w = csv.DictWriter(fp, fieldnames=["system","rougeL","bleu","lat_mean_s","lat_median_s","lat_p95_s","n_pairs","top_k"])
    w.writeheader()
    w.writerow({
        "system":"ReAct (Doc2Dial)",
        "rougeL":rou["rougeL"],
        "bleu":ble["score"],
        "lat_mean_s":lat_mean,
        "lat_median_s":lat_med,
        "lat_p95_s":lat_p95,
        "n_pairs":len(questions),
        "top_k":top_k
    })

print("Saved summary:", OUT_CSV)

Processed 10/50
Processed 20/50
Processed 30/50
Processed 40/50
Processed 50/50

=== ReAct ===
ROUGE-L: 0.06534565677902753
BLEU: 0.9299832131513457
Latency mean/median/p95: 2.2375061178207396 0.8850269317626953 9.132748019695278
Saved traces: /content/drive/MyDrive/agentic-rag-telecom-thesis/results/traces/react_traces.jsonl
Saved summary: /content/drive/MyDrive/agentic-rag-telecom-thesis/results/react_summary.csv


### Comparable RAGAS (same as baseline: gpt-4o-mini + text-embedding-3-small)

In [8]:
import os, json
from datasets import Dataset
from ragas import evaluate as ragas_evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from ragas.llms import llm_factory
from ragas.embeddings import OpenAIEmbeddings
from openai import AsyncOpenAI

from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')


client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
llm_eval = llm_factory("gpt-4o-mini", client=client)
emb_eval = OpenAIEmbeddings(client=client, model="text-embedding-3-small")

N = 20
rows = [json.loads(l) for l in open(TRACE_OUT, "r", encoding="utf-8")][:N]

ds = Dataset.from_dict({
    "question": [r["query"] for r in rows],
    "answer": [r["prediction"] for r in rows],
    "contexts": [r.get("contexts", []) for r in rows],
    "ground_truth": [r["reference"] for r in rows],
})

res = ragas_evaluate(ds, metrics=[faithfulness, answer_relevancy, context_precision], llm=llm_eval, embeddings=emb_eval)
df = res.to_pandas()

means = df[["faithfulness","answer_relevancy","context_precision"]].mean().to_dict()
print("ReAct RAGAS means on N=", N, means)

out_csv = f"{BASE_DIR}/results/react_ragas.csv"
df.to_csv(out_csv, index=False)
print("Saved:", out_csv)

  from ragas.metrics import faithfulness, answer_relevancy, context_precision
  from ragas.metrics import faithfulness, answer_relevancy, context_precision
  from ragas.metrics import faithfulness, answer_relevancy, context_precision


Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[10]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
ERROR:ragas.executor:Exception raised in Job[13]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
ERROR:ragas.executor:Exception raised in Job[1]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
ERROR:ragas.executor:Exception raised in Job[7]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
ERROR:ragas.executor:Exception raised in Job[4]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
ERROR:ragas.executor:Exception raised in Job[19]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
ERROR:ragas.executor:Exception raised in Job[16]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
ERROR:ragas.executor:Exception raised in Job[28]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
ERROR:ragas.executo

ReAct RAGAS means on N= 20 {'faithfulness': 0.48611111111111105, 'answer_relevancy': nan, 'context_precision': 0.3854166666542743}
Saved: /content/drive/MyDrive/agentic-rag-telecom-thesis/results/react_ragas.csv


ReAct is much faster (mean 2.24s vs 11.32s baseline), but text overlap metrics dropped (ROUGE-L and BLEU lower).

On RAGAS (N=20): Context Precision improved (0.3854 vs 0.3235 baseline), but Faithfulness decreased (0.4861 vs 0.6354 baseline).
(Hold final judgement until Answer Relevancy is fixed and you also run Planner–Executor.)

In [9]:
import os, json
from datasets import Dataset
from ragas import evaluate as ragas_evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

assert os.environ.get("OPENAI_API_KEY"), "Set OPENAI_API_KEY first."

BASE_DIR = "/content/drive/MyDrive/agentic-rag-telecom-thesis"
TRACE_OUT = f"{BASE_DIR}/results/traces/react_traces.jsonl"

N = 20
rows = [json.loads(l) for l in open(TRACE_OUT, "r", encoding="utf-8")][:N]

ds = Dataset.from_dict({
    "question": [r["query"] for r in rows],
    "answer": [r["prediction"] for r in rows],
    "contexts": [r.get("contexts", []) for r in rows],
    "ground_truth": [r["reference"] for r in rows],
})

llm_eval = ChatOpenAI(model="gpt-4o-mini", temperature=0)
emb_eval = OpenAIEmbeddings(model="text-embedding-3-small")

res = ragas_evaluate(ds, metrics=[faithfulness, answer_relevancy, context_precision], llm=llm_eval, embeddings=emb_eval)
df = res.to_pandas()

means = df[["faithfulness","answer_relevancy","context_precision"]].mean().to_dict()
print("ReAct RAGAS means on N=", N, means)

out_csv = f"{BASE_DIR}/results/react_ragas_fixed.csv"
df.to_csv(out_csv, index=False)
print("Saved:", out_csv)

  from ragas.metrics import faithfulness, answer_relevancy, context_precision
  from ragas.metrics import faithfulness, answer_relevancy, context_precision
  from ragas.metrics import faithfulness, answer_relevancy, context_precision


Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]



ReAct RAGAS means on N= 20 {'faithfulness': 0.55, 'answer_relevancy': 0.15835642977343747, 'context_precision': 0.37326388887624706}
Saved: /content/drive/MyDrive/agentic-rag-telecom-thesis/results/react_ragas_fixed.csv


**Quick delta summary (useful for Chapter 5 “Results” sentence)**

ReAct is ~5× faster on mean latency (11.32 → 2.24s).

ReAct has higher Context Precision than baseline (0.3235 → 0.3733 on N=20).

ReAct has lower Faithfulness (0.6354 → 0.55) and lower Answer Relevance (0.2279 → 0.1584) on N=20.