<a href="https://colab.research.google.com/github/kanawanttotimetravel/MultiHop-RAG/blob/main/MultiHopRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!rm -rf /content/sample_data

In [None]:
from pathlib import Path

# Read and split the document into passages
def load_corpus(path):
    """Read a corpus file and return its non-empty passages.

    The file is read as UTF-8 and cut at every ``<endofpassage>`` marker.
    Each piece is stripped of surrounding whitespace, and pieces that end up
    empty are discarded.

    Args:
        path: Location of the corpus text file.

    Returns:
        A list of passage strings in file order.
    """
    with open(path, 'r', encoding='utf-8') as corpus_file:
        raw_chunks = corpus_file.read().split('<endofpassage>')

    kept = []
    for chunk in raw_chunks:
        cleaned = chunk.strip()
        # Whitespace-only pieces (e.g. after a trailing marker) carry no text.
        if cleaned:
            kept.append(cleaned)
    return kept

# Load the retrieval corpus as a list of passage strings.
# NOTE: the /content path assumes the file was uploaded to the Colab runtime.
passages = load_corpus("/content/multihoprag_corpus.txt")

from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used for both the corpus and incoming queries.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Embed every passage once up front; retrieve_topk compares each query
# embedding against this precomputed result.
corpus_embeddings = embedder.encode(passages, convert_to_tensor=True)

import torch
import heapq

def retrieve_topk(query, k=2):
    """Return the ``k`` corpus passages most similar to ``query``.

    The query is embedded with the module-level ``embedder`` and scored
    against every entry of ``corpus_embeddings`` by cosine similarity.

    Args:
        query: Text to search the corpus for.
        k: Maximum number of passages to return. Values larger than the
            corpus are clamped to the corpus size; values below 1 yield an
            empty list.

    Returns:
        A list of strings taken from ``passages``, best match first.
    """
    # torch.topk raises when k exceeds the number of scores, so clamp first.
    k = min(k, len(passages))
    if k <= 0:
        return []

    query_embedding = embedder.encode(query, convert_to_tensor=True)
    cosine_scores = torch.nn.functional.cosine_similarity(query_embedding, corpus_embeddings)
    top_k_indices = torch.topk(cosine_scores, k).indices
    # Convert to plain ints so list indexing does not depend on tensor scalars.
    return [passages[i] for i in top_k_indices.tolist()]

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face checkpoint used as the answer generator.
model_name = "Qwen/Qwen3-1.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the weights in half precision and let device_map="auto" choose placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Inference only: switch the module to evaluation mode.
model = model.eval()


In [None]:
def generate_answer(query):
    """Answer ``query`` with the language model using retrieved passages.

    Passages returned by ``retrieve_topk`` are placed in a plain-text prompt,
    the module-level ``model`` generates up to 100 new tokens, and the first
    line of the generated continuation is returned.

    Args:
        query: The question to answer.

    Returns:
        The first line of the model's continuation, stripped of surrounding
        whitespace (an empty string if nothing was generated).
    """
    contexts = retrieve_topk(query)
    system = "Please answer the question based on the contexts. Only generate the answer and nothing else."
    # Join whatever was retrieved rather than indexing [0] and [1], so a
    # corpus with fewer than two passages does not raise IndexError.
    context_block = "\n".join(contexts)
    prompt = f"System:\n{system}\nContext:\n{context_block}\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=100)
    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them, so "Answer:" inside a passage or repeated by the model
    # cannot shift which text is treated as the answer.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    # Keep just the first line; later lines tend to be follow-on text.
    answer = completion.strip().split('\n')[0]
    return answer

In [None]:
from collections import Counter

def f1_score(prediction, ground_truth):
    """Compute the token-level F1 overlap between two strings.

    Both strings are lowercased and split on whitespace. Tokens are matched
    as multisets, so a repeated word only counts as often as it appears in
    both strings.

    Args:
        prediction: The generated answer text.
        ground_truth: The reference answer text.

    Returns:
        The harmonic mean of token precision and recall, or ``0.0`` when the
        two strings share no tokens.
    """
    predicted_words = prediction.lower().strip().split()
    reference_words = ground_truth.lower().strip().split()

    # Counter intersection keeps the minimum count of each shared token.
    overlap = Counter(predicted_words) & Counter(reference_words)
    shared = sum(overlap.values())
    if not shared:
        return 0.0

    precision = shared / len(predicted_words)
    recall = shared / len(reference_words)
    return 2 * precision * recall / (precision + recall)

In [None]:
import json
import random
from pathlib import Path
from sklearn.metrics import accuracy_score
import re

# --- Load Evaluation Queries ---
def load_eval_data(path, limit=None):
    """Load evaluation examples from a JSON file.

    Args:
        path: Location of a UTF-8 encoded JSON file. The parsed top-level
            value must support slicing when ``limit`` is given (e.g. a list).
        limit: Maximum number of leading examples to keep. ``None`` (the
            default) keeps everything; ``0`` yields an empty selection.

    Returns:
        The parsed data, truncated to its first ``limit`` items when a limit
        is given.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Compare against None explicitly: a plain truthiness test would treat
    # limit=0 as "no limit" and hand back the whole dataset.
    if limit is None:
        return data
    return data[:limit]

# --- Evaluate MultiHop RAG ---
def evaluate_rag(model_fn, eval_data):
    """Score ``model_fn`` on ``eval_data`` with exact match and token F1.

    Each example's reference and generated answers are lowercased, stripped
    of punctuation, and whitespace-normalized before comparison. A per-example
    report is printed, followed by the overall accuracy and average F1.

    Args:
        model_fn: Callable taking a query string and returning an answer
            string.
        eval_data: Iterable of mappings with ``"query"`` and ``"answer"``
            keys.

    Returns:
        A ``(accuracy, avg_f1)`` tuple of floats in ``[0, 1]``. Both are
        ``0.0`` when ``eval_data`` is empty.
    """
    def normalize(text):
        # Remove punctuation first, then collapse whitespace. Stripping only
        # beforehand would leave stray spaces (e.g. from "- yes") that break
        # exact match.
        without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
        return " ".join(without_punctuation.split())

    predictions = []
    targets = []
    f1_scores = []

    for example in eval_data:
        query = example["query"]
        expected = normalize(example["answer"])
        generated = normalize(model_fn(query))

        predictions.append(generated)
        targets.append(expected)

        f1 = f1_score(generated, expected)
        f1_scores.append(f1)

        print(f"\nQ: {query}\nExpected: {expected}\nPredicted: {generated}\nF1: {f1:.2f}\n{'-'*50}")

    # Count what was actually evaluated so an empty dataset reports zeros
    # instead of raising ZeroDivisionError.
    total = len(predictions)

    # Simple accuracy (exact match); both sides are already normalized.
    correct = sum(p == t for p, t in zip(predictions, targets))
    accuracy = correct / total if total else 0.0
    print(f"\n Evaluation Accuracy: {accuracy*100:.2f}% ({correct}/{total})")

    avg_f1 = sum(f1_scores) / total if total else 0.0
    print(f"\n Average F1 Score: {avg_f1:.3f}")

    return accuracy, avg_f1

In [None]:
# Evaluate on the first 200 examples of the query file.
eval_data = load_eval_data("/content/MultiHopRAG.json", limit=200)  # change limit as desired
# Prints a per-example report and returns the (accuracy, avg_f1) tuple.
evaluate_rag(generate_answer, eval_data)