In [7]:
!pip -q install -U evaluate sacrebleu rouge-score chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.0/52.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.5/21.5 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


### Setup (defines questions, refs, top_k, retrieve(), generate_text())

In [8]:
import os, json, random, numpy as np, torch, chromadb
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

BASE_DIR   = "/content/drive/MyDrive/agentic-rag-telecom-thesis"
TRACE_IN   = f"{BASE_DIR}/results/traces/baseline_traces.jsonl"
CHROMA_DIR = f"{BASE_DIR}/results/chroma_baseline"

assert os.path.exists(TRACE_IN), f"Missing: {TRACE_IN}"
assert os.path.exists(CHROMA_DIR), f"Missing: {CHROMA_DIR}"

# Load SAME questions used in baseline (first N_QA)
N_QA = 50
rows = [json.loads(l) for l in open(TRACE_IN, "r", encoding="utf-8")][:N_QA]
questions = [r["query"] for r in rows]
refs      = [r["reference"] for r in rows]
top_k     = rows[0].get("top_k", 5)

# Open SAME Chroma + embedder
client = chromadb.PersistentClient(path=CHROMA_DIR)
col = client.get_collection("doc2dial_baseline")
emb = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def retrieve(query, k=5):
    q_emb = emb.encode([query], normalize_embeddings=True).tolist()[0]
    res = col.query(query_embeddings=[q_emb], n_results=k, include=["documents","metadatas"])
    return res["documents"][0], res["metadatas"][0]

# FLAN-T5 generator (no transformers pipeline)
device = "cuda" if torch.cuda.is_available() else "cpu"
tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(device)
model.eval()

@torch.inference_mode()
def generate_text(prompt: str, max_new_tokens: int = 256) -> str:
    inputs = tok(prompt, return_tensors="pt", truncation=True).to(device)
    out_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, num_beams=1)
    return tok.decode(out_ids[0], skip_special_tokens=True).strip()

print("Loaded QA:", len(questions), "| top_k:", top_k, "| device:", device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]



tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loaded QA: 50 | top_k: 5 | device: cpu


### Installing + metrics objects (evaluate)

In [9]:
!pip -q install -U evaluate sacrebleu rouge-score

import evaluate
rouge = evaluate.load("rouge")
bleu  = evaluate.load("sacrebleu")

### Planner–Executor run (saves traces + summary)

In [10]:
import re, time, csv, os, json
import numpy as np

TRACE_OUT = f"{BASE_DIR}/results/traces/planner_traces.jsonl"
OUT_CSV   = f"{BASE_DIR}/results/planner_summary.csv"

PLANNER_SYS = """You are a planner for telecom support.
Given the QUESTION, write a short plan of 2-3 steps.
Return ONLY numbered steps, one per line (e.g., "1. ...")."""

EXECUTOR_SYS = """You are a telecom customer-support assistant.
Use ONLY the CONTEXT to answer. If not in context, say: I don't know."""

def make_plan(question: str) -> list[str]:
    plan_txt = generate_text(f"{PLANNER_SYS}\n\nQUESTION: {question}\nPLAN:")
    steps = []
    for line in plan_txt.splitlines():
        m = re.match(r"^\s*\d+\.\s*(.+)$", line.strip())
        if m:
            steps.append(m.group(1).strip())
    return steps[:3] if steps else [question]

def planner_executor_answer(question, k=5):
    t0 = time.time()
    steps = make_plan(question)
    action_queries = []

    all_docs, all_metas = [], []
    for s in steps:
        action_queries.append(s)
        docs, metas = retrieve(s, k=k)
        all_docs.extend(docs)
        all_metas.extend(metas)

    # de-dupe while keeping order
    seen, dedup_docs, dedup_metas = set(), [], []
    for d, m in zip(all_docs, all_metas):
        key = (m.get("doc_id"), m.get("chunk_id"))
        if key not in seen:
            seen.add(key)
            dedup_docs.append(d)
            dedup_metas.append(m)

    context = "\n\n".join([f"[{m.get('doc_id')}#{m.get('chunk_id')}] {d}" for d, m in zip(dedup_docs, dedup_metas)])
    prompt = f"{EXECUTOR_SYS}\n\nCONTEXT:\n{context}\n\nQUESTION: {question}\nANSWER:"
    ans = generate_text(prompt)
    return ans, time.time()-t0, steps, action_queries, dedup_docs, dedup_metas

preds, lats = [], []
os.makedirs(os.path.dirname(TRACE_OUT), exist_ok=True)

with open(TRACE_OUT, "w", encoding="utf-8") as f:
    for i, (q, ref) in enumerate(zip(questions, refs), 1):
        ans, dt, steps, aq, ctxs, metas = planner_executor_answer(q, k=top_k)
        preds.append(ans); lats.append(dt)
        f.write(json.dumps({
            "i": i, "query": q, "prediction": ans, "reference": ref,
            "latency_s": float(dt), "top_k": int(top_k),
            "plan_steps": steps, "action_queries": aq,
            "contexts": ctxs[:top_k*3],
            "retrieved": [{"doc_id": m.get("doc_id"), "chunk_id": m.get("chunk_id")} for m in metas]
        }, ensure_ascii=False) + "\n")

rou = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
ble = bleu.compute(predictions=preds, references=[[r] for r in refs])
lat_mean, lat_med, lat_p95 = float(np.mean(lats)), float(np.median(lats)), float(np.percentile(lats, 95))

print("=== Planner–Executor ===")
print("ROUGE-L:", rou["rougeL"])
print("BLEU:", ble["score"])
print("Latency mean/median/p95:", lat_mean, lat_med, lat_p95)
print("Saved traces:", TRACE_OUT)

with open(OUT_CSV, "w", newline="", encoding="utf-8") as fp:
    w = csv.DictWriter(fp, fieldnames=["system","rougeL","bleu","lat_mean_s","lat_median_s","lat_p95_s","n_pairs","top_k"])
    w.writeheader()
    w.writerow({
        "system":"Planner–Executor (Doc2Dial)",
        "rougeL":rou["rougeL"], "bleu":ble["score"],
        "lat_mean_s":lat_mean, "lat_median_s":lat_med, "lat_p95_s":lat_p95,
        "n_pairs":len(questions), "top_k":top_k
    })
print("Saved summary:", OUT_CSV)

=== Planner–Executor ===
ROUGE-L: 0.04487328628913493
BLEU: 0.35366353568338826
Latency mean/median/p95: 9.284378247261047 6.655436992645264 22.116019797325134
Saved traces: /content/drive/MyDrive/agentic-rag-telecom-thesis/results/traces/planner_traces.jsonl
Saved summary: /content/drive/MyDrive/agentic-rag-telecom-thesis/results/planner_summary.csv


In [1]:
!pip install ragas

Collecting ragas
  Downloading ragas-0.4.3-py3-none-any.whl.metadata (23 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting instructor (from ragas)
  Downloading instructor-1.14.5-py3-none-any.whl.metadata (12 kB)
Collecting scikit-network (from ragas)
  Downloading scikit_network-0.33.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting langchain-community (from ragas)
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain_openai (from ragas)
  Downloading langchain_openai-1.1.10-py3-none-any.whl.metadata (3.1 kB)
Collecting jiter<1,>=0.10.0 (from openai>=1.0.0->ragas)
  Downloading jiter-0.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community->rag

Tracking planner Trace

In [9]:
from google.colab import drive
drive.mount("/content/drive")

!ls -lah /content/drive/MyDrive/agentic-rag-telecom-thesis/results/traces | sed -n '1,200p'
!find /content/drive/MyDrive/agentic-rag-telecom-thesis -maxdepth 6 -type f -name "planner_traces.jsonl" -print

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
total 603K
-rw------- 1 root root  45K Feb 27 09:33 baseline_traces.jsonl
-rw------- 1 root root 289K Feb 27 11:50 planner_traces.jsonl
-rw------- 1 root root 270K Feb 27 11:13 react_traces.jsonl
/content/drive/MyDrive/agentic-rag-telecom-thesis/results/traces/planner_traces.jsonl


### RAGAS for Planner–Executor (N=20)

In [12]:
import os, json
from datasets import Dataset
from ragas import evaluate as ragas_evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision  # legacy metric OBJECTS (works)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

assert os.environ.get("OPENAI_API_KEY"), "Set OPENAI_API_KEY first."

TRACE_PLANNER = "/content/drive/MyDrive/agentic-rag-telecom-thesis/results/traces/planner_traces.jsonl"
BASE_DIR = "/content/drive/MyDrive/agentic-rag-telecom-thesis"

N = 20
rows = [json.loads(l) for l in open(TRACE_PLANNER, "r", encoding="utf-8")][:N]

ds = Dataset.from_dict({
    "question": [r["query"] for r in rows],
    "answer": [r["prediction"] for r in rows],
    "contexts": [r.get("contexts", []) for r in rows],
    "ground_truth": [r["reference"] for r in rows],
})

llm_eval = ChatOpenAI(model="gpt-4o-mini", temperature=0)
emb_eval = OpenAIEmbeddings(model="text-embedding-3-small")

res = ragas_evaluate(
    ds,
    metrics=[faithfulness, answer_relevancy, context_precision],
    llm=llm_eval,
    embeddings=emb_eval
)

df = res.to_pandas()
means = df[["faithfulness","answer_relevancy","context_precision"]].mean().to_dict()
print("Planner RAGAS means on N=", N, means)

out_csv = f"{BASE_DIR}/results/planner_ragas_fixed.csv"
df.to_csv(out_csv, index=False)
print("Saved:", out_csv)

  from ragas.metrics import faithfulness, answer_relevancy, context_precision  # legacy metric OBJECTS (works)
  from ragas.metrics import faithfulness, answer_relevancy, context_precision  # legacy metric OBJECTS (works)
  from ragas.metrics import faithfulness, answer_relevancy, context_precision  # legacy metric OBJECTS (works)


Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]



Planner RAGAS means on N= 20 {'faithfulness': 0.4956140350877193, 'answer_relevancy': 0.17485477705665056, 'context_precision': 0.35770833331618923}
Saved: /content/drive/MyDrive/agentic-rag-telecom-thesis/results/planner_ragas_fixed.csv


On RAGAS (N=20), Baseline RAG achieved the highest faithfulness (0.6354) and answer relevance (0.2279), while ReAct achieved the highest context precision (0.3733). In latency (N=50), ReAct was fastest (mean 2.24s) compared to Planner–Executor (9.28s) and Baseline (11.32s).