# Step 0: Prepare Environment

In [None]:
%pip install -q sentence-transformers qdrant-client transformers accelerate bitsandbytes evaluate rouge_score nltk sacrebleu

In [None]:
import evaluate
import json
import numpy as np
import os
import pandas as pd
import psutil
import re
import requests
import shutil
import torch
import time
import uuid

from datasets import load_dataset
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, ScoredPoint, VectorParams
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Step 1: Enter your Hugging Face access token to be able to use Gemma

In [None]:
from huggingface_hub import login

# Start the Hugging Face login flow. Per the step heading above, this is where
# the access token needed to use Gemma is entered. It is called with no
# arguments, so nothing is hard-coded in this file.
login()

# Step 2: Data Selection & Preparation

In [None]:
# Book under study: its display title, the document id used further down to
# filter the QA dataset, and the URL the raw text is downloaded from.
title, document_id, story_url = (
  "The Tale of Two Bad Mice",
  "08c3429e9717bc663bc5f41cd3a2c701c222ed2f",
  "http://www.gutenberg.org/ebooks/45264.txt.utf-8",
)

# Echo the selection. Labels and values are separate print arguments, so print
# inserts one extra space after each label.
print("title:", title)
print("document_id: ", document_id, sep=" ")
print("story_url: ", story_url, sep=" ")

In [None]:
# Download the text
# A timeout keeps the cell from hanging indefinitely when the server never
# answers; without one, requests waits with no limit.
response = requests.get(story_url, timeout=30)
response.raise_for_status()
# The URL asks for the UTF-8 edition (".txt.utf-8"), so decode as UTF-8
# explicitly instead of depending on the charset the server advertises.
response.encoding = "utf-8"
raw_text = response.text

print("Downloaded characters:", len(raw_text))

# Clean the text
def clean_gutenberg(text):
  """Strip Project Gutenberg boilerplate from the downloaded book text.

  Steps, in order:
    1. Normalize CRLF / CR line endings to LF.
    2. Drop everything up to and including the "*** START OF ..." marker
       (prints a notice if the marker is missing).
    3. Drop everything from the "*** END OF ..." marker on
       (prints a notice if the marker is missing).
    4. Trim front matter: cut to the book title, then to the first
       stand-alone word "ONCE" (case-insensitive).
    5. Trim back matter starting at "PRINTED BY".
    6. Remove "[Illustration]" placeholders, strip the ends, and collapse
       runs of blank lines to a single blank line.

  Args:
    text: Raw text of the e-book as downloaded.

  Returns:
    The cleaned story text, using "\n" line endings only.
  """
  # Normalize line endings first. The blank-line regex below only rewrites
  # "\n" runs, so CRLF input would otherwise leave stray "\r" characters
  # (e.g. "\r\n\r\n" would become "\r\n\n").
  text = text.replace("\r\n", "\n").replace("\r", "\n")

  # Remove header
  start_pattern = r"\*\*\* START OF THE PROJECT GUTENBERG EBOOK THE TALE OF TWO BAD MICE \*\*\*"
  start_match = re.search(start_pattern, text, re.IGNORECASE)
  if start_match:
    text = text[start_match.end():]
  else:
    print("START marker not found")

  # Remove footer
  end_pattern = r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK"
  end_match = re.search(end_pattern, text, re.IGNORECASE)
  if end_match:
    text = text[:end_match.start()]
  else:
    print("END marker not found")

  # Additional cleaning from the start
  story_start = re.search(r"THE TALE OF TWO BAD MICE", text, re.IGNORECASE)
  if story_start:
    text = text[story_start.start():]

  # Word boundaries keep this from matching inside another word (e.g.
  # "sconce"), which would cut the text at the wrong place.
  story_start = re.search(r"\bONCE\b", text, re.IGNORECASE)
  if story_start:
    text = text[story_start.start():]

  # Additional cleaning from the end
  story_end = re.search(r"PRINTED BY", text, re.IGNORECASE)
  if story_end:
    text = text[:story_end.start()]

  # Clean Illustration
  text = text.replace("[Illustration]", "")

  # Clean whitespace
  text = text.strip()
  text = re.sub(r'\n\s*\n+', '\n\n', text)

  return text

# Run the cleaner on the downloaded text, then write the result to disk so the
# later step can reload it from the file.
clean_text = clean_gutenberg(raw_text)

with open(
  "The_Tale_of_Two_Bad_Mice_clean.txt", mode="w", encoding="utf-8"
) as out_file:
  out_file.write(clean_text)

print("Cleaned text saved to The_Tale_of_Two_Bad_Mice_clean.txt")


In [None]:
# Load NarrativeQA dataset from Hugging Face
# The returned object is indexed by split name further down
# (narrativeqa['test']).
narrativeqa = load_dataset("narrativeqa")

In [None]:
# Keep only the test-split rows whose document id matches the selected book.
def _is_selected_book(row):
  return row['document']['id'] == document_id

test_filtered = narrativeqa['test'].filter(_is_selected_book)

# Report how many QA pairs survived the filter.
print(f"\nNumber of QA pairs in TEST set for '{title}': {len(test_filtered)}")

# Show every question with its first answer, plus the second answer if present.
print("\n--- Sample QA pairs from TEST set ---")
for number, row in enumerate(test_filtered, start=1):
  print(f"\nQ{number}: {row['question']['text']}")
  answers = row['answers']
  print(f"A{number}.1: {answers[0]['text']}")
  if len(answers) > 1:
    print(f"A{number}.2: {answers[1]['text']}")

In [None]:
# Reload the cleaned book from disk and report its size in characters and in
# whitespace-separated words.
with open("The_Tale_of_Two_Bad_Mice_clean.txt", "r", encoding="utf-8") as book_file:
  clean_text = book_file.read()

char_count = len(clean_text)
word_count = len(clean_text.split())
print(f"Total characters: {char_count}")
print(f"Total words: {word_count}")

# Step 3: Indexing Strategy: Hierarchical Chunking

In [None]:
# Chunking
def fast_chunk(text, parent_size=1000, child_size=200, overlap=20):
  """Split text into fixed-size parent chunks and overlapping child chunks.

  The text is cut into consecutive windows of `parent_size` characters. Each
  window is stripped and, if non-empty, becomes a parent with id
  "p<window index>". Every parent is then cut into children of up to
  `child_size` characters, each starting `child_size - overlap` characters
  after the previous one. Children get ids "p<i>_c<j>" and carry the id of
  their parent.

  Args:
    text: The text to chunk.
    parent_size: Parent window length in characters; must be positive.
    child_size: Child window length in characters; must be positive.
    overlap: Characters shared by consecutive children; must satisfy
      0 <= overlap < child_size.

  Returns:
    (parents, children): parents is a list of {'id', 'text'} dicts and
    children is a list of {'id', 'parent_id', 'text'} dicts.

  Raises:
    ValueError: If the sizes are not positive or overlap is out of range.
  """
  if parent_size <= 0 or child_size <= 0:
    raise ValueError("parent_size and child_size must be positive")
  # overlap > child_size used to produce no children at all (negative range
  # step) and a negative overlap left gaps between children, so reject both.
  if not 0 <= overlap < child_size:
    raise ValueError("overlap must satisfy 0 <= overlap < child_size")

  step = child_size - overlap
  parents, children = [], []
  for p_idx, p_start in enumerate(range(0, len(text), parent_size)):
    p_text = text[p_start:p_start + parent_size].strip()
    if not p_text:
      continue
    parent_id = f'p{p_idx}'
    parents.append({'id': parent_id, 'text': p_text})
    for c_idx, c_start in enumerate(range(0, len(p_text), step)):
      # The previous child ended at c_start + overlap. If that already reached
      # the end of the parent, this child would only repeat its tail.
      if c_start and c_start + overlap >= len(p_text):
        break
      c_text = p_text[c_start:c_start + child_size].strip()
      if c_text:
        children.append({'id': f'{parent_id}_c{c_idx}', 'parent_id': parent_id, 'text': c_text})
  return parents, children

# Chunk the cleaned book with the default sizes and report the counts.
parent_chunks, child_chunks = fast_chunk(clean_text)
n_parents, n_children = len(parent_chunks), len(child_chunks)
print(f"Parents: {n_parents}, Children: {n_children}")

# Save an id -> text mapping of the parent chunks as JSON; it is read back
# later to build the parent lookup used during retrieval.
parent_text_by_id = {}
for parent in parent_chunks:
  parent_text_by_id[parent['id']] = parent['text']

with open("parent_chunks.json", "w") as json_file:
  json.dump(parent_text_by_id, json_file)

In [None]:
# Embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode each child chunk in its own call (kept one at a time, as originally
# intended, to avoid a RAM spike) and collect the NumPy vectors in chunk order.
child_embeddings = [
  model.encode(chunk['text'], convert_to_numpy=True)
  for chunk in child_chunks
]

print(f"Done: {len(child_embeddings)} embeddings")

# Step 4: Retrieval

In [None]:
# DB
# Fail early, before touching the on-disk database, if the chunks and
# embeddings are out of step (e.g. chunking was re-run without re-embedding).
# Indexing would otherwise attach vectors to the wrong text.
if not child_embeddings:
  raise ValueError("No child embeddings to index; run the chunking and embedding cells first")
if len(child_embeddings) != len(child_chunks):
  raise ValueError(
    f"Got {len(child_embeddings)} embeddings for {len(child_chunks)} child chunks; re-run the embedding cell"
  )

# Remove old database folder to release lock
if os.path.exists("./qdrant_db"):
  shutil.rmtree("./qdrant_db")

client = QdrantClient(path="./qdrant_db")

# Take the vector size from the embeddings themselves so the collection always
# matches the embedding model in use, rather than a hard-coded width.
vector_size = len(child_embeddings[0])
client.create_collection("bad_mice", vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE))

# One point per child chunk: integer position as point id, embedding as the
# vector, and the chunk's id / parent id / text as payload.
points = [
  PointStruct(
    id=idx,
    vector=embedding.tolist(),
    payload={'id': chunk['id'], 'parent_id': chunk['parent_id'], 'text': chunk['text']},
  )
  for idx, (chunk, embedding) in enumerate(zip(child_chunks, child_embeddings))
]

client.upsert(collection_name="bad_mice", points=points)
print(f"Indexed {len(points)} chunks")

In [None]:
document_id = "08c3429e9717bc663bc5f41cd3a2c701c222ed2f"

# Reduce each filtered test example to its question text plus the list of its
# reference answer texts.
qa_pairs = [
  {
    'question': ex['question']['text'],
    'answers': [ans['text'] for ans in ex['answers']],
  }
  for ex in test_filtered
]

# Show the first pair as a quick sanity check.
first_pair = qa_pairs[0]
print(f"Sample Q: {first_pair['question']}")
print(f"Sample A: {first_pair['answers']}")

In [None]:
with open("parent_chunks.json", "r") as f:
  parent_lookup = json.load(f)


def _parent_order_key(parent_id):
  """Sort key that orders ids of the form 'p<number>' numerically ('p2' before 'p10')."""
  match = re.fullmatch(r"p(\d+)", parent_id)
  if match:
    return (0, int(match.group(1)), "")
  # Ids in any other format sort after the numeric ones, alphabetically.
  return (1, 0, parent_id)


# Retrieval: Retrieve top-k child chunks and optionally expand to parent context
def retrieve_with_context(question, top_k=3, use_parent=True):
  """Retrieve context for a question from the "bad_mice" collection.

  The question is embedded with `model`, the `top_k` nearest child chunks are
  fetched through `client.query_points`, and the context is built either from
  the parents of those children or from the children themselves.

  Args:
    question: Question text to embed and search with.
    top_k: Maximum number of child chunks to retrieve.
    use_parent: If True, the context is the text of each distinct parent
      chunk (looked up in `parent_lookup`), ordered by parent index. If
      False, the context is the retrieved child texts in the order returned.

  Returns:
    (context, child_texts, parent_ids): the context string (pieces joined by
    blank lines), the retrieved child texts, and the distinct parent ids in
    the same order used to build the parent context.
  """
  query_vec = model.encode(question).tolist()

  results = client.query_points(
    collection_name="bad_mice",
    query=query_vec,
    limit=top_k
  )

  child_texts = [r.payload['text'] for r in results.points]

  # Plain sorted() on ids like 'p2' / 'p10' is lexicographic and would put
  # 'p10' first. Sort on the numeric part so parents appear in document order.
  # The same deterministic order is returned to the caller.
  parent_ids = sorted(
    {r.payload['parent_id'] for r in results.points},
    key=_parent_order_key,
  )

  if use_parent:
    # Use parent chunks for expanded context
    context = "\n\n".join(parent_lookup[pid] for pid in parent_ids)
  else:
    # Use only child chunks
    context = "\n\n".join(child_texts)

  return context, child_texts, parent_ids

# Smoke-test the retriever with a sample question and report what came back.
sample_question = "What did the mice do?"
ctx, children, parents = retrieve_with_context(sample_question)

child_count = len(children)
parent_count = len(parents)
print(f"Retrieved {child_count} children from {parent_count} parents")
print(f"Context length: {len(ctx)} chars")

In [None]:
# Set LLM model
model_name = "google/gemma-3-1b-it"

# Tokenizer and model are loaded from the same checkpoint name so they match.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Weights are requested in half precision (torch.float16); device placement is
# delegated to from_pretrained via device_map="auto".
llm = AutoModelForCausalLM.from_pretrained(
  model_name,
  torch_dtype=torch.float16,
  device_map="auto"
)

print("Gemma loaded")

In [None]:
# Create RAG prompt
def create_prompt(context, question):
  """Build the RAG prompt: one user turn holding the instructions, the
  retrieved context and the question, followed by an open model turn.

  Args:
    context: Retrieved text to ground the answer in.
    question: The question to answer.

  Returns:
    The prompt string, ending with a newline after the model-turn marker.
  """
  lines = [
    "<start_of_turn>user",
    "Read the context and answer the question in 1-3 words only.",
    "",
    "Context:",
    f"{context}",
    "",
    f"Question: {question}",
    "",
    "Answer in 1-3 words only:",
    "<end_of_turn>",
    "<start_of_turn>model",
    "",  # trailing empty item yields the final newline
  ]
  return "\n".join(lines)


# Create baseline prompt
def create_baseline_prompt(question):
  """Build the no-context prompt: a user turn with the question, followed by
  an open model turn.

  Args:
    question: The question to answer.

  Returns:
    The prompt string, ending with a newline after the model-turn marker.
  """
  user_turn = (
    "<start_of_turn>user\n"
    f"Answer the question in 1-3 words only: {question}\n"
    "<end_of_turn>\n"
  )
  model_turn = "<start_of_turn>model\n"
  return user_turn + model_turn


# Generate answer
def generate_answer(prompt, max_new_tokens=100):
  """Generate a greedy (non-sampled) completion for `prompt` and return it.

  Args:
    prompt: Full prompt text, already formatted with turn markers.
    max_new_tokens: Upper bound on the number of tokens to generate.

  Returns:
    The generated continuation only, decoded without special tokens and
    stripped of surrounding whitespace.
  """
  inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
  prompt_length = inputs["input_ids"].shape[1]

  with torch.no_grad():
    outputs = llm.generate(
      **inputs,
      max_new_tokens=max_new_tokens,
      do_sample=False,
      pad_token_id=tokenizer.eos_token_id
    )

  # The output sequence begins with the prompt tokens. Decode only what was
  # generated after them, rather than decoding everything and splitting on
  # the word "model", which truncated any answer containing that word.
  generated_tokens = outputs[0][prompt_length:]
  answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
  return answer

# Smoke-test the full RAG path (retrieve -> prompt -> generate) on the first
# QA pair and show the result next to the reference answers.
first_qa = qa_pairs[0]
test_ctx, _, _ = retrieve_with_context(first_qa['question'])
test_prompt = create_prompt(test_ctx, first_qa['question'])
test_answer = generate_answer(test_prompt)

print(f"Question: {first_qa['question']}")
print(f"Generated: {test_answer}")
print(f"Reference: {first_qa['answers']}")

In [None]:
# Load metrics
# Both objects are used later through their .compute(...) method:
# `bleu` is the "sacrebleu" metric and `rouge` is the "rouge" metric.
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

In [None]:
# Get current RAM usage in MB
def get_memory_usage():
  """Return the resident set size (RSS) of the current process, in bytes
  divided by 1024 twice (i.e. MB)."""
  rss_bytes = psutil.Process(os.getpid()).memory_info().rss
  return rss_bytes / 1024 / 1024


# Get disk usage of a folder in MB
def get_disk_usage(path="./qdrant_db"):
  """Return the combined size of all files under `path`, in MB.

  Args:
    path: Folder to measure; it is walked recursively.

  Returns:
    Total file size in bytes divided by 1024 twice, or 0.0 when `path` does
    not exist.
  """
  if not os.path.exists(path):
    return 0.0

  size_bytes = sum(
    os.path.getsize(os.path.join(folder, name))
    for folder, _subfolders, names in os.walk(path)
    for name in names
  )
  return size_bytes / 1024 / 1024


# Print current resource usage
def monitor_resources():
  """Print the current RAM usage and Qdrant DB folder size, then return both.

  Returns:
    (ram, disk): the values from get_memory_usage() and get_disk_usage().
  """
  usage = (get_memory_usage(), get_disk_usage())
  print(f"RAM: {usage[0]:.1f} MB | Qdrant DB: {usage[1]:.2f} MB")
  return usage

# Report the starting point before the evaluation runs.
print("Initial state:")
monitor_resources()

In [None]:
# Predictions of both systems and the reference answers, in question order.
baseline_preds = []
rag_preds = []
all_refs = []

# Resource tracking
peak_ram = 0
inference_times = {'baseline': [], 'rag': []}

print("Running evaluation...")

for qa in tqdm(qa_pairs):
  question = qa['question']
  references = qa['answers']

  # Store references
  all_refs.append(references)

  # Baseline (no RAG)
  # perf_counter is a monotonic, high-resolution clock meant for measuring
  # intervals. time.time() follows the wall clock and can jump if adjusted.
  start = time.perf_counter()
  baseline_prompt = create_baseline_prompt(question)
  baseline_answer = generate_answer(baseline_prompt)
  inference_times['baseline'].append(time.perf_counter() - start)
  baseline_preds.append(baseline_answer)

  # Sample RAM after each generation so a peak during the baseline run is
  # not missed.
  peak_ram = max(peak_ram, get_memory_usage())

  # RAG (timing covers retrieval, prompt building and generation)
  start = time.perf_counter()
  context, _, _ = retrieve_with_context(question, top_k=3, use_parent=True)
  rag_prompt = create_prompt(context, question)
  rag_answer = generate_answer(rag_prompt)
  inference_times['rag'].append(time.perf_counter() - start)
  rag_preds.append(rag_answer)

  # Track peak RAM
  peak_ram = max(peak_ram, get_memory_usage())

print("Evaluation complete!")

In [None]:
def _score_and_report(heading, predictions):
  """Print `heading`, score `predictions` against `all_refs` with the loaded
  BLEU and ROUGE metrics, print both scores and the predictions, and return
  the two metric result dicts."""
  print(heading)
  bleu_result = bleu.compute(predictions=predictions, references=all_refs, lowercase=True)
  rouge_result = rouge.compute(predictions=predictions, references=all_refs)
  print(f"BLEU-4:  {bleu_result['score']:.4f}")
  print(f"ROUGE-L: {rouge_result['rougeL']:.4f}")
  print(predictions)
  return bleu_result, rouge_result


banner = "=" * 50
print(banner)
print("EVALUATION RESULTS")
print(banner)
print(f"Number of test QA pairs: {len(qa_pairs)}")

# Baseline metrics
baseline_bleu, baseline_rouge = _score_and_report("\n--- BASELINE (No RAG) ---", baseline_preds)

# RAG metrics
rag_bleu, rag_rouge = _score_and_report("\n--- RAG SYSTEM ---", rag_preds)

# Improvement: signed difference RAG minus baseline for each metric
bleu_delta = rag_bleu['score'] - baseline_bleu['score']
rouge_delta = rag_rouge['rougeL'] - baseline_rouge['rougeL']
print("\n--- IMPROVEMENT ---")
print(f"BLEU-4:  {bleu_delta:+.4f}")
print(f"ROUGE-L: {rouge_delta:+.4f}")

In [None]:
banner = "=" * 50
print(banner)
print("SAMPLE COMPARISONS")
print(banner)

# For every question, show the references next to both systems' answers.
for idx, qa in enumerate(qa_pairs):
  print(f"\n--- Example {idx + 1} ---")
  print(f"Question: {qa['question']}")
  print(f"Reference: {all_refs[idx]}")
  print(f"Baseline: {baseline_preds[idx]}")
  print(f"RAG: {rag_preds[idx]}")

In [None]:
banner = "=" * 50
print(banner)
print("RESOURCE MONITORING REPORT")
print(banner)

# Memory: peak seen during the evaluation loop versus usage right now
print("\n--- MEMORY (RAM) ---")
print(f"Peak RAM usage: {peak_ram:.1f} MB")
print(f"Final RAM usage: {get_memory_usage():.1f} MB")
print("Colab free tier limit: ~12,000 MB")
print(f"Usage: {peak_ram/12000*100:.1f}% of limit")

# Disk: size of the vector DB folder and of the parent-chunk JSON file
print("\n--- DISK (Vector DB) ---")
db_size = get_disk_usage("./qdrant_db")
print(f"Qdrant DB size: {db_size:.2f} MB")
parent_json_kb = os.path.getsize('parent_chunks.json') / 1024
print(f"Parent chunks JSON: {parent_json_kb:.2f} KB")

# Inference time: per-question averages, their difference, and totals
baseline_times = inference_times['baseline']
rag_times = inference_times['rag']
baseline_avg = np.mean(baseline_times)
rag_avg = np.mean(rag_times)
print("\n--- INFERENCE TIME ---")
print(f"Baseline avg: {baseline_avg:.2f}s per question")
print(f"RAG avg: {rag_avg:.2f}s per question")
print(f"RAG overhead: {rag_avg - baseline_avg:.2f}s")
print(f"Total baseline time: {sum(baseline_times):.1f}s")
print(f"Total RAG time: {sum(rag_times):.1f}s")