<a href="https://colab.research.google.com/github/keerthi1605/MultiTurnRAG/blob/main/ai_da2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
# Full Multi-Turn RAG Notebook for Demo
# -----------------------------

# 1️⃣ Clone IBM mt-rag-benchmark repo and list files
!rm -rf /content/mt-rag-benchmark
!git clone https://github.com/IBM/mt-rag-benchmark.git
!echo "Current directory:" && pwd && ls -la
!ls -R /content/mt-rag-benchmark | head -40

Cloning into 'mt-rag-benchmark'...
remote: Enumerating objects: 173, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 173 (delta 19), reused 13 (delta 13), pack-reused 142 (from 1)[K
Receiving objects: 100% (173/173), 216.24 MiB | 31.32 MiB/s, done.
Resolving deltas: 100% (57/57), done.
Current directory:
/content
total 20
drwxr-xr-x 1 root root 4096 Nov  8 15:00 .
drwxr-xr-x 1 root root 4096 Nov  8 14:20 ..
drwxr-xr-x 4 root root 4096 Nov  5 14:33 .config
drwxr-xr-x 7 root root 4096 Nov  8 15:00 mt-rag-benchmark
drwxr-xr-x 1 root root 4096 Nov  5 14:33 sample_data
/content/mt-rag-benchmark:
corpora
human
LICENSE
README.md
scripts
synthetic

/content/mt-rag-benchmark/corpora:
document_level
passage_level
README.md

/content/mt-rag-benchmark/corpora/document_level:
clapnq.jsonl.zip
cloud.jsonl.zip
fiqa.jsonl.zip
govt.jsonl.zip

/content/mt-rag-benchmark/corpora/passage_level:
clapnq.jsonl.zip
cloud.jsonl.zip
fiq

In [None]:
# 2️⃣ Check for ClapNQ dataset and unzip if needed
# -----------------------------
import os, zipfile, shutil

passage_zip = "/content/mt-rag-benchmark/corpora/passage_level/clapnq.jsonl.zip"
passage_dir = "/content/mt-rag-benchmark/corpora/passage_level/clapnq"
jsonl_file = os.path.join(passage_dir, "clapnq.jsonl")

os.makedirs(passage_dir, exist_ok=True)

if os.path.exists(passage_zip):
    print("Unzipping passage-level ClapNQ dataset...")
    with zipfile.ZipFile(passage_zip, 'r') as zip_ref:
        zip_ref.extractall(passage_dir)
elif not os.path.exists(jsonl_file):
    print("ClapNQ JSONL missing! Make sure it is in the path.")
else:
    print("Dataset already exists.")

print("Files in dataset dir:", os.listdir(passage_dir))

Unzipping passage-level ClapNQ dataset...
Files in dataset dir: ['clapnq.jsonl']


In [None]:
# -----------------------------
# 3️⃣ Load passages and build FAISS index
# -----------------------------
!pip install -q sentence-transformers faiss-cpu transformers accelerate

import json, pickle
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

index_dir = "/content/mt-rag-benchmark/indexes/clapnq"
os.makedirs(index_dir, exist_ok=True)

# Load passages from JSONL
texts, ids = [], []
with open(jsonl_file, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if not line.strip(): continue
        obj = json.loads(line)
        text = obj.get("text") or obj.get("passage") or obj.get("content") or obj.get("body") or ""
        texts.append(text)
        ids.append(obj.get("id", str(i)))

print("Loaded passages:", len(texts))

# Encode passages
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
emb = embed_model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Normalize and index with FAISS
faiss.normalize_L2(emb)
dim = emb.shape[1]
idx = faiss.IndexFlatIP(dim)
idx.add(emb)

# Save index and metadata
faiss.write_index(idx, os.path.join(index_dir, "index.faiss"))
with open(os.path.join(index_dir, "passages.pkl"), "wb") as f:
    pickle.dump({"ids": ids, "texts": texts}, f)
print("Index built and saved at:", index_dir)

Loaded passages: 183408


Batches:   0%|          | 0/5732 [00:00<?, ?it/s]

Index built and saved at: /content/mt-rag-benchmark/indexes/clapnq


In [None]:
# -----------------------------
# 4️⃣ Load FAISS index & passages for retrieval
# -----------------------------
with open(os.path.join(index_dir, "passages.pkl"), "rb") as f:
    meta = pickle.load(f)
idx = faiss.read_index(os.path.join(index_dir, "index.faiss"))

In [None]:
# 5️⃣ Define multi-turn retriever
# -----------------------------
def retrieve(conversation_history, latest_query, k=5):
    multi_turn_query = " ".join(conversation_history + [latest_query])
    qemb = embed_model.encode([multi_turn_query], convert_to_numpy=True)
    faiss.normalize_L2(qemb)
    D, I = idx.search(qemb, k)
    results = []
    for score, ii in zip(D[0], I[0]):
        results.append({
            "id": meta["ids"][ii],
            "score": float(score),
            "text": meta["texts"][ii]
        })
    return results

In [None]:
# 6️⃣ Test retrieval with conversations
# -----------------------------

# Use the first conversation from the loaded data
if conversations_data:
    first_conversation = conversations_data[0]
    conversation_history = [f"{msg['speaker']}: {msg['text']}" for msg in first_conversation['messages'][:-1]]
    latest_query = first_conversation['messages'][-1]['text']

    print("Conversation History:")
    for turn in conversation_history:
        print(turn)
    print(f"\nLatest Query: {latest_query}")

    # Retrieve top passages using the multi-turn query
    top_passages_multi_turn = retrieve(conversation_history, latest_query, k=5)

    print("\nTop passages retrieved using multi-turn query:")
    for i, p in enumerate(top_passages_multi_turn):
        print(f"{i+1}. Score: {p['score']:.4f} - {p['text'][:150]}...")

    # Compare with retrieval using only the latest query
    print("\nTop passages retrieved using only the latest query:")
    top_passages_latest_turn = retrieve([], latest_query, k=5) # Pass empty conversation history
    for i, p in enumerate(top_passages_latest_turn):
        print(f"{i+1}. Score: {p['score']:.4f} - {p['text'][:150]}...")

else:
    print("No conversation data loaded to test retrieval.")

Conversation History:
user: where do the arizona cardinals play this week
agent: I'm sorry, but I don't have the answer to your question.
user: Do the Arizona Cardinals play outside the US?
agent: The Arizona Cardinals do play outside the United States. They had a game in London, England, on October 22, 2017, against the Los Angeles Rams at Twickenham Stadium and in 2005 they played in Mexico.
user: Are the Arizona Cardinals and the Chicago Cardinals the same team?
agent: The Chicago Cardinals became the St. Louis Cardinals in 1960 and eventually moved and became the Arizona Cardinals. The Chicago Cardinals ( now the Arizona Cardinals ) were a founding member of the NFL.
user: How many teams are in the NFL?
agent: There are 32 teams in the National Football League (NFL).
user: How many teams are in the NFL playoffs?
agent: Six teams from each conference (AFC and NFC), for a total of 12 team playoff system.
user: Which team has won the most Super Bowls?
agent: The Pittsburgh Steelers ha

In [None]:
import json

qa_file = "/content/mt-rag-benchmark/human/generation_tasks/reference.jsonl"
qa_data = []
with open(qa_file, "r", encoding="utf-8") as f:
    for line in f:
        qa_data.append(json.loads(line))

print(f"Loaded {len(qa_data)} QA pairs.")
# Display the first few QA pairs to understand the structure
if qa_data:
    print("\nFirst 3 QA pairs:")
    for i in range(min(3, len(qa_data))):
        display(qa_data[i])

Loaded 842 QA pairs.

First 3 QA pairs:


{'conversation_id': 'dd6b6ffd177f2b311abe676261279d2f',
 'task_id': 'dd6b6ffd177f2b311abe676261279d2f<::>1',
 'task_type': 'rag',
 'turn': '1',
 'dataset': 'MT-RAG Authors (Internal)',
 'contexts': [],
 'input': [{'speaker': 'user',
   'text': 'where do the arizona cardinals play this week',
   'metadata': {'author_type': 'human',
    'author_id': 'c9727a43-b616-4467-98ce-03501b1f6953',
    'created_at': 1724708469}}],
 'targets': [{'speaker': 'agent',
   'text': "I'm sorry, but I don't have the answer to your question.",
   'metadata': {'author_type': 'model',
    'author_id': 'mixtral-8x7b-instruct-v01',
    'created_at': 1724708471}}],
 'Question Type': ['Factoid'],
 'Multi-Turn': ['N/A'],
 'Answerability': ['UNANSWERABLE'],
 'Collection': 'mt-rag-clapnq-elser-512-100-20240503'}

{'conversation_id': 'dd6b6ffd177f2b311abe676261279d2f',
 'task_id': 'dd6b6ffd177f2b311abe676261279d2f<::>2',
 'task_type': 'rag',
 'turn': '2',
 'dataset': 'MT-RAG Authors (Internal)',
 'contexts': [{'document_id': '822086267_7384-8758-0-1374',
   'text': "2017 Arizona Cardinals season\nWeek Date Opponent Result Record Game site NFL.com recap September 10 at Detroit Lions L 23 -- 35 0 -- 1 Ford Field Recap September 17 at Indianapolis Colts W 16 -- 13 ( OT ) 1 -- 1 Lucas Oil Stadium Recap September 25 Dallas Cowboys L 17 -- 28 1 -- 2 University of Phoenix Stadium Recap October 1 San Francisco 49ers W 18 -- 15 ( OT ) 2 -- 2 University of Phoenix Stadium Recap 5 October 8 at Philadelphia Eagles L 7 -- 34 2 -- 3 Lincoln Financial Field Recap 6 October 15 Tampa Bay Buccaneers W 38 -- 33 3 -- 3 University of Phoenix Stadium Recap 7 October 22 at Los Angeles Rams L 0 -- 33 3 -- 4 Twickenham Stadium ( London , England ) Recap 8 Bye 9 November 5 at San Francisco 49ers W 20 -- 10 4 -- 4 Levi 's

{'conversation_id': 'dd6b6ffd177f2b311abe676261279d2f',
 'task_id': 'dd6b6ffd177f2b311abe676261279d2f<::>3',
 'task_type': 'rag',
 'turn': '3',
 'dataset': 'MT-RAG Authors (Internal)',
 'contexts': [{'document_id': '866343245_65754-66191-0-437',
   'text': "Illinois\nThe NFL 's Arizona Cardinals , who currently play in the Phoenix suburb of Glendale , Arizona , played in Chicago as the Chicago Cardinals , until moving to St. Louis , Missouri after the 1959 season . An NBA expansion team known as the Chicago Packers in 1961 -- 1962 , and as the Chicago Zephyrs the following year , moved to Baltimore after the 1962 -- 1963 season . The franchise is now known as the Washington Wizards . Professional",
   'title': 'Illinois',
   'score': 17.550575,
   'feedback': {'relevant': {'c9727a43-b616-4467-98ce-03501b1f6953': {'value': 'yes',
      'timestamp': 1724714175}}},
   'query': {'query': {'bool': {'must': {'text_expansion': {'ml.tokens': {'model_id': '.elser_model_1',
         'model_text'

In [None]:
# -----------------------------
# Subtask B: Generation from Reference Passages (ClapNQ Sample)
# -----------------------------

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 1️⃣ Load the FLAN-T5 model
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 2️⃣ Use the loaded qa_data
qa_samples = qa_data[:6] # Using the first 6 samples from loaded data

# 3️⃣ Helper function to build prompt
def build_prompt(question, context):
    return f"""
Read the following passage carefully and answer the question in a complete sentence, using your own words if necessary.
Passage: {context}
Question: {question}
Answer (complete sentence):
"""


# 4️⃣ Generate answers
generated_answers = []
for qa in qa_samples:
    # For generation, we will use the provided reference contexts
    context_text = " ".join([c['text'] for c in qa.get("contexts", [])]) # Use 'contexts' field if available, otherwise use an empty list
    if not context_text:
      # If no contexts are available in the QA data, skip this sample for generation
      print(f"Skipping QA sample {qa.get('id', 'N/A')} due to missing contexts for generation.")
      continue

    prompt = build_prompt(qa["input"][-1]["text"], context_text) # Use the last turn of input as the question
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = model.generate(
    **inputs,
    max_new_tokens=150,       # allow longer answers
    do_sample=True,           # allow creativity
    top_p=0.9,               # nucleus sampling
    temperature=0.7           # control randomness
)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_answers.append({
        "id": qa.get("id", "N/A"), # Use .get() with a default value
        "question": qa["input"][-1]["text"], # Use the last turn of input as the question
        "answer": answer,
        "reference_answer": qa["targets"][0]["text"] # Include the reference answer for comparison
    })

# 5️⃣ Display results
for ga in generated_answers:
    print(f"Q: {ga['question']}")
    print(f"Reference A: {ga['reference_answer']}")
    print(f"Generated A: {ga['answer']}")
    print("-"*50)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Skipping QA sample N/A due to missing contexts for generation.
Q: Do the Arizona Cardinals play outside the US?
Reference A: The Arizona Cardinals do play outside the United States. They had a game in London, England, on October 22, 2017, against the Los Angeles Rams at Twickenham Stadium and in 2005 they played in Mexico.
Generated A: The Cardinals will play the Los Angeles Rams as one of the NFL International Series at Twickenham Stadium in London , England , with the Rams serving as the home team .
--------------------------------------------------
Q: Are the Arizona Cardinals and the Chicago Cardinals the same team?
Reference A: The Chicago Cardinals became the St. Louis Cardinals in 1960 and eventually moved and became the Arizona Cardinals. The Chicago Cardinals ( now the Arizona Cardinals ) were a founding member of the NFL.
Generated A: St. Louis Cardinals moved from Chicago in 1960 ; moved to Tempe , Arizona in 1988 and are now the Arizona Cardinals .
-------------------------

In [None]:
import json

conversations_file = "/content/mt-rag-benchmark/human/conversations/conversations.json"

with open(conversations_file, "r", encoding="utf-8") as f:
    conversations_data = json.load(f)

print(f"Loaded {len(conversations_data)} conversations.")
# Display the first conversation to understand its structure
if conversations_data:
    print("\nFirst conversation:")
    display(conversations_data[0])

Loaded 110 conversations.

First conversation:


{'author': 'd2b56e91-6d89-4df4-b89a-eef8d969850b',
 'retriever': {'collection': {'name': 'mt-rag-clapnq-elser-512-100-20240503',
   'size': '183408'},
  'parameters': {'max_count': 3,
   'max_utterances': -1,
   'query_syntax': '{\n  "query": {\n    "bool": {\n      "must": {\n        "text_expansion": {\n          "ml.tokens": {\n            "model_id": ".elser_model_1",\n            "model_text": "${QUERY}"\n          }\n        }\n      }\n    }\n  }\n}',
   'project': '{\n  "text": "text",\n  "title": "title",\n  "url": "url"\n}'}},
 'generator': {'id': 'mistralai/mixtral-8x7b-instruct-v01',
  'name': 'mixtral-8x7b-instruct-v01',
  'prompt': {'template': '[INST]\n${CONTEXT}\n${SYSTEM_INST}\n${INPUT}\n[/INST]\nanswer:',
   'input': '${SPEAKER}: ${TEXT}\n',
   'context': '[DOCUMENT]\n${TEXT}\n[END]\n',
   'system_instruction': 'You are an AI Assistant, tasked with providing responses that are well-grounded in the provided documents. Given one or more documents and a user query, gener

In [None]:
# @title
# Inspect the downloaded files
print("Listing files in mt-rag-benchmark directory:")
!ls -R /content/mt-rag-benchmark

Listing files in mt-rag-benchmark directory:
/content/mt-rag-benchmark:
corpora  human	indexes  LICENSE  README.md  scripts  synthetic

/content/mt-rag-benchmark/corpora:
document_level	passage_level  README.md

/content/mt-rag-benchmark/corpora/document_level:
clapnq.jsonl.zip  cloud.jsonl.zip  fiqa.jsonl.zip  govt.jsonl.zip

/content/mt-rag-benchmark/corpora/passage_level:
clapnq	clapnq.jsonl.zip  cloud.jsonl.zip  fiqa.jsonl.zip  govt.jsonl.zip

/content/mt-rag-benchmark/corpora/passage_level/clapnq:
clapnq.jsonl

/content/mt-rag-benchmark/human:
conversations  evaluations  generation_tasks  retrieval_tasks

/content/mt-rag-benchmark/human/conversations:
conversations.json

/content/mt-rag-benchmark/human/evaluations:
RAG.json	reference+RAG.json
reference.json	reference_subset_with_human_evaluations.json

/content/mt-rag-benchmark/human/generation_tasks:
RAG.jsonl  README.md  reference.jsonl  reference+RAG.jsonl

/content/mt-rag-benchmark/human/retrieval_tasks:
clapnq	cloud  fiqa  go

In [None]:
# -----------------------------
# 🔹 Subtask C: Multi-Turn Retrieval + Generation with Gemini 2.5 Flash
# -----------------------------
!pip install -q -U google-generativeai

import google.generativeai as genai
from google.colab import userdata

# 1️⃣ Configure Gemini API
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError("⚠️ Please add your Google API key in Colab Secrets as 'GOOGLE_API_KEY'.")
genai.configure(api_key=GOOGLE_API_KEY)

# 2️⃣ Initialize the Gemini model
gemini_model = genai.GenerativeModel("gemini-2.5-flash")

# 3️⃣ Helper function for generating answers
def generate_with_gemini(question, retrieved_passages):
    # Combine top-k passages into a single context
    context = "\n\n".join([f"{i+1}. {p['text']}" for i, p in enumerate(retrieved_passages)])
    prompt = f"""
You are an intelligent assistant answering questions using factual context.
Read the passages below and answer the question clearly and in simple language
so that anyone can understand.

Passages:
{context}

Question: {question}

Answer:
"""
    response = gemini_model.generate_content(prompt)
    return response.text.strip()

# 4️⃣ Example Multi-Turn Query from Conversation Data
if conversations_data:
    # Use the first conversation for demo
    conv = conversations_data[0]
    conversation_history = [f"{msg['speaker']}: {msg['text']}" for msg in conv['messages'][:-1]]
    latest_query = conv['messages'][-1]['text']

    print("🗣️ Conversation History:")
    for turn in conversation_history:
        print(turn)
    print("\n❓ Latest Query:", latest_query)

    # Retrieve top 5 passages based on the multi-turn query
    top_passages = retrieve(conversation_history, latest_query, k=5)

    print("\n📚 Retrieved Passages:")
    for i, p in enumerate(top_passages):
        print(f"{i+1}. (score={p['score']:.4f}) {p['text'][:200]}...")

    # Generate an answer with Gemini
    answer = generate_with_gemini(latest_query, top_passages)

    print("\n💬 Gemini 2.5 Flash Generated Answer:")
    print(answer)

else:
    print("⚠️ No conversation data found. Please ensure 'conversations_data' is loaded.")

print("\n✅ Subtask C complete — Gemini integrated for multi-turn RAG generation.")


🗣️ Conversation History:
user: where do the arizona cardinals play this week
agent: I'm sorry, but I don't have the answer to your question.
user: Do the Arizona Cardinals play outside the US?
agent: The Arizona Cardinals do play outside the United States. They had a game in London, England, on October 22, 2017, against the Los Angeles Rams at Twickenham Stadium and in 2005 they played in Mexico.
user: Are the Arizona Cardinals and the Chicago Cardinals the same team?
agent: The Chicago Cardinals became the St. Louis Cardinals in 1960 and eventually moved and became the Arizona Cardinals. The Chicago Cardinals ( now the Arizona Cardinals ) were a founding member of the NFL.
user: How many teams are in the NFL?
agent: There are 32 teams in the National Football League (NFL).
user: How many teams are in the NFL playoffs?
agent: Six teams from each conference (AFC and NFC), for a total of 12 team playoff system.
user: Which team has won the most Super Bowls?
agent: The Pittsburgh Steelers