<a href="https://colab.research.google.com/github/harald-gen01/My_AI_learning_path/blob/main/Why_naive_RAG_fails.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install > /dev/null 2>&1 \
  google-genai>=1.0.0 \
  chromadb

In [2]:
import textwrap
import chromadb
import numpy as np
import pandas as pd

from IPython.display import Markdown
from chromadb import Documents, EmbeddingFunction, Embeddings
from google import genai

In [3]:
import os
from google.colab import userdata
GEMINI_API_KEY = userdata.get('gemini_api_key')
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [4]:
EMBEDDDING_MODEL_ID = "gemini-embedding-001"
client = genai.Client()

# Documents to embed

In [5]:
DOCUMENT1 = """
  Operating the Climate Control System  Your Googlecar has a climate control
  system that allows you to adjust the temperature and airflow in the car.
  To operate the climate control system, use the buttons and knobs located on
  the center console.  Temperature: The temperature knob controls the
  temperature inside the car. Turn the knob clockwise to increase the
  temperature or counterclockwise to decrease the temperature.
  Airflow: The airflow knob controls the amount of airflow inside the car.
  Turn the knob clockwise to increase the airflow or counterclockwise to
  decrease the airflow. Fan speed: The fan speed knob controls the speed
  of the fan. Turn the knob clockwise to increase the fan speed or
  counterclockwise to decrease the fan speed.
  Mode: The mode button allows you to select the desired mode. The available
  modes are: Auto: The car will automatically adjust the temperature and
  airflow to maintain a comfortable level.
  Cool: The car will blow cool air into the car.
  Heat: The car will blow warm air into the car.
  Defrost: The car will blow warm air onto the windshield to defrost it.
"""
DOCUMENT2 = """
  Your Googlecar has a large touchscreen display that provides access to a
  variety of features, including navigation, entertainment, and climate
  control. To use the touchscreen display, simply touch the desired icon.
  For example, you can touch the \"Navigation\" icon to get directions to
  your destination or touch the \"Music\" icon to play your favorite songs.
"""
DOCUMENT3 = """
  Shifting Gears Your Googlecar has an automatic transmission. To
  shift gears, simply move the shift lever to the desired position.
  Park: This position is used when you are parked. The wheels are locked
  and the car cannot move.
  Reverse: This position is used to back up.
  Neutral: This position is used when you are stopped at a light or in traffic.
  The car is not in gear and will not move unless you press the gas pedal.
  Drive: This position is used to drive forward.
  Low: This position is used for driving in snow or other slippery conditions.
"""

documents = [DOCUMENT1, DOCUMENT2, DOCUMENT3]

In [6]:
from google.genai import types

class GeminiEmbeddingFunction(EmbeddingFunction):
  def __call__(self, input: Documents) -> Embeddings:
    EMBEDDING_MODEL_ID = EMBEDDDING_MODEL_ID
    title = "Custom query"
    response = client.models.embed_content(
        model=EMBEDDING_MODEL_ID,
        contents=input,
        config=types.EmbedContentConfig(
          task_type="retrieval_document",
          title=title
        )
    )

    return response.embeddings[0].values

In [7]:
def create_chroma_db(documents, name):
  chroma_client = chromadb.Client()
  db = chroma_client.create_collection(
      name=name,
      embedding_function=GeminiEmbeddingFunction()
  )

  for i, d in enumerate(documents):
    db.add(
      documents=d,
      ids=str(i)
    )
  return db

In [8]:
# Set up the DB
db = create_chroma_db(documents, "google-car-db")

  embedding_function=GeminiEmbeddingFunction()


In [9]:
sample_data = db.get(include=['documents', 'embeddings'])

df = pd.DataFrame({
    "IDs": sample_data['ids'][:3],
    "Documents": sample_data['documents'][:3],
    "Embeddings": [str(emb)[:50] + "..." for emb in sample_data['embeddings'][:3]]  # Truncate embeddings
})

df

Unnamed: 0,IDs,Documents,Embeddings
0,0,\n Operating the Climate Control System Your...,[ 0.00971627 -0.00177013 0.00590323 ... 0.00...
1,1,\n Your Googlecar has a large touchscreen dis...,[ 0.00388563 -0.00036349 -0.00230268 ... 0.01...
2,2,\n Shifting Gears Your Googlecar has an autom...,[-0.00264773 0.010808 -0.00854844 ... 0.00...


In [10]:
def get_relevant_passage(query, db):
  passage = db.query(query_texts=[query], n_results=1)['documents'][0][0]
  return passage

In [11]:
# Perform embedding search
passage = get_relevant_passage("touch screen features", db)
Markdown(passage)


  Your Googlecar has a large touchscreen display that provides access to a
  variety of features, including navigation, entertainment, and climate
  control. To use the touchscreen display, simply touch the desired icon.
  For example, you can touch the "Navigation" icon to get directions to
  your destination or touch the "Music" icon to play your favorite songs.


In [12]:
def make_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = ("""
    You are a helpful and informative bot that answers questions using
    text from the reference passage included below.
    Be sure to respond in a complete sentence, being comprehensive,
    including all relevant background information.
    However, you are talking to a non-technical audience, so be sure to
    break down complicated concepts and strike a friendly
    and converstional tone. If the passage is irrelevant to the answer,
    you may ignore it. If the answer isn’t in the passage, say you don’t know.
    QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'

    ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt


In [32]:
query = "How do you use the touchscreen display in the Google car?"
prompt = make_prompt(query, passage)
Markdown(prompt)



    You are a helpful and informative bot that answers questions using
    text from the reference passage included below.
    Be sure to respond in a complete sentence, being comprehensive,
    including all relevant background information.
    However, you are talking to a non-technical audience, so be sure to
    break down complicated concepts and strike a friendly
    and converstional tone. If the passage is irrelevant to the answer,
    you may ignore it. If the answer isn’t in the passage, say you don’t know.
    QUESTION: 'How do you use the touchscreen display in the Google car?'
    PASSAGE: '   Your Googlecar has a large touchscreen display that provides access to a   variety of features, including navigation, entertainment, and climate   control. To use the touchscreen display, simply touch the desired icon.   For example, you can touch the Navigation icon to get directions to   your destination or touch the Music icon to play your favorite songs. '

    ANSWER:
  

In [33]:
MODEL_ID = "gemini-3-flash-preview" # @param ["gemini-2.5-flash-lite", "gemini-2.5-flash", "gemini-2.5-pro", "gemini-2.5-flash-preview", "gemini-3-pro-preview"] {"allow-input":true, isTemplate: true}
answer = client.models.generate_content(
    model = MODEL_ID,
    contents = prompt
)
Markdown(answer.text)

Using the touchscreen display in your Google car is designed to be very simple and user-friendly! This large screen acts as the main hub for your vehicle, allowing you to easily manage features like your navigation system, entertainment options, and the climate control to keep the cabin comfortable. To use it, you simply need to touch the icon on the screen for the feature you want to access—much like using a smartphone or a tablet. For instance, if you are ready to head to a new destination, you can just tap the Navigation icon to get directions, or if you're in the mood for some tunes, you can touch the Music icon to start playing your favorite songs.

Re-Ranker (LLM)

In [34]:
NUM_CANDIDATES = 10
results = db.query(
    query_texts=[query],
    n_results=NUM_CANDIDATES,
    include=["documents", "metadatas", "distances"]
)
candidates = results["documents"][0]
candidate_metas = results.get("metadatas", [[]])[0]
candidate_dists = results.get("distances", [[]])[0]


In [35]:
import json
import re

def llm_rerank(query: str, passages: list[str], top_n: int = 3):
    # Build one prompt that asks for JSON scores
    numbered = "\n\n".join([f"[{i}] {p}" for i, p in enumerate(passages)])
    rerank_prompt = f"""
You are a strict relevance re-ranker for RAG.
Given a QUESTION and PASSAGES, score each passage for how well it helps answer the question.

Return ONLY valid JSON of the form:
{{"scores":[{{"id":0,"score":0.0}}, ... ]}}

Scoring rules:
- score is between 0.0 and 1.0
- 1.0 means the passage directly contains the answer
- 0.0 means unrelated
- prefer passages with concrete facts, definitions, or direct evidence

QUESTION:
{query}

PASSAGES:
{numbered}
""".strip()

    resp = client.models.generate_content(model=MODEL_ID, contents=rerank_prompt)
    text = resp.text.strip()

    # Extract JSON even if model adds extra text
    m = re.search(r"\{.*\}", text, flags=re.S)
    data = json.loads(m.group(0) if m else text)

    scores = {item["id"]: float(item["score"]) for item in data["scores"]}
    ranked = sorted(range(len(passages)), key=lambda i: scores.get(i, 0.0), reverse=True)

    top_ids = ranked[:top_n]
    return top_ids, scores


In [36]:
top_ids, scores = llm_rerank(query, candidates, top_n=3)

top_passages = [candidates[i] for i in top_ids]
reranked_context = "\n\n---\n\n".join(top_passages)


In [37]:
for i in top_ids:
    print(i, "score=", scores.get(i), "dist=", candidate_dists[i] if candidate_dists else None)


0 score= 1.0 dist= 0.17842835187911987
1 score= 0.0 dist= 0.35738229751586914
2 score= 0.0 dist= 0.41413694620132446


In [38]:
prompt = make_prompt(query, reranked_context)
answer = client.models.generate_content(model=MODEL_ID, contents=prompt)
Markdown(answer.text)


Using the touchscreen display in your Google car is very straightforward; to use it, you simply need to touch the icon for the specific feature you want to access. This large screen serves as a central hub where you can manage various parts of your driving experience, including your navigation for directions, your entertainment like music, and even the car's climate control settings. For example, if you need help finding your way to a destination, you can just tap the Navigation icon, or if you want to listen to some music, you can simply touch the Music icon to start playing your favorite songs.

BM25 Excersise

In [None]:
!pip install rank_bm25

In [43]:

from rank_bm25 import BM25Okapi
import re

def tokenize(text: str) -> list[str]:
    # simple tokenizer: lowercase words only
    return re.findall(r"[a-z0-9]+", text.lower())

tokenized_docs = [tokenize(d) for d in documents]
bm25 = BM25Okapi(tokenized_docs)

query = "roller window"
tokenized_query = tokenize(query)

scores = bm25.get_scores(tokenized_query)
ranked = sorted(range(len(documents)), key=lambda i: scores[i], reverse=True)

for i in ranked:
    print(f"DOC{i+1}  score={scores[i]:.4f}")


DOC1  score=0.0000
DOC2  score=0.0000
DOC3  score=0.0000


In [51]:
tokenized_docs = [tokenize(d) for d in documents]
bm25 = BM25Okapi(tokenized_docs)

# you can play changin the query and observe the score results
query = "touchscreen features"
tokenized_query = tokenize(query)

scores = bm25.get_scores(tokenized_query)
ranked = sorted(range(len(documents)), key=lambda i: scores[i], reverse=True)

for i in ranked:
    print(f"DOC{i+1}  score={scores[i]:.4f}")

DOC2  score=1.5329
DOC1  score=0.0000
DOC3  score=0.0000


BM25 index (lexical) + Chroma db (vector)

In [52]:
from rank_bm25 import BM25Okapi
import re

def tokenize(text: str) -> list[str]:
    return re.findall(r"[a-z0-9]+", text.lower())

tokenized_docs = [tokenize(d) for d in documents]
bm25 = BM25Okapi(tokenized_docs)


# Merge BM25 + vector candidates

Get BM25 top-N candidates

In [53]:
def bm25_retrieve(query: str, bm25, documents: list[str], top_n: int = 2):
    q_tokens = tokenize(query)
    scores = bm25.get_scores(q_tokens)
    ranked = sorted(range(len(documents)), key=lambda i: scores[i], reverse=True)
    top = ranked[:top_n]
    return {
        "indices": top,
        "scores": scores,
        "docs": [documents[i] for i in top]
    }


Step 1.2 — Get vector top-N candidates from Chroma

Chroma returns ids, documents, and distances (distance = “how far”; smaller is usually better).

In [63]:
def vector_retrieve(query: str, db, top_n: int = 2):
    res = db.query(
        query_texts=[query],
        n_results=top_n,
        include=["documents", "distances"]  # <-- remove "ids"
    )
    return {
        "ids": res["ids"][0],  # <-- still available
        "distances": res.get("distances", [[]])[0],
        "docs": res["documents"][0],
    }


Merge the candidates

We’ll merge by document text (simple for this 3-doc). In production you could merge by stable IDs.

In [64]:
def merge_candidates(bm25_out, vector_out):
    merged = []
    seen = set()

    # Add BM25 candidates first
    for idx in bm25_out["indices"]:
        doc = documents[idx]
        if doc not in seen:
            merged.append({
                "source": "bm25",
                "doc": doc,
                "bm25_score": float(bm25_out["scores"][idx]),
                "vector_distance": None
            })
            seen.add(doc)

    # Add vector candidates
    for doc, dist in zip(vector_out["docs"], vector_out.get("distances", [])):
        if doc not in seen:
            merged.append({
                "source": "vector",
                "doc": doc,
                "bm25_score": None,
                "vector_distance": float(dist) if dist is not None else None
            })
            seen.add(doc)

    return merged


# Show cases where BM25 fails but vectors succeed

BM25 will return all zeros because none of your docs contain the token “infotainment”.

Vector search may still retrieve DOCUMENT2 because it’s about touchscreen/navigation/music (infotainment-like).

Run hybrid retrieval & print

In [65]:
def pretty_preview(doc: str, n=140) -> str:
    one_line = " ".join(doc.split())
    return one_line[:n] + ("..." if len(one_line) > n else "")

def hybrid_retrieve(query: str, bm25, documents: list[str], db,
                    bm25_top_n: int = 2, vector_top_n: int = 2):
    bm25_out = bm25_retrieve(query, bm25, documents, top_n=bm25_top_n)
    vector_out = vector_retrieve(query, db, top_n=vector_top_n)
    merged = merge_candidates(bm25_out, vector_out)
    return bm25_out, vector_out, merged

query = "infotainment"

bm25_out, vector_out, merged = hybrid_retrieve(query, bm25, documents, db)

print("BM25 top:")
for i in bm25_out["indices"]:
    print(f"  DOC{i+1} bm25={bm25_out['scores'][i]:.4f}  -> {pretty_preview(documents[i])}")

print("\nVector top:")
for doc, dist in zip(vector_out["docs"], vector_out.get("distances", [])):
    which = "DOC2" if doc == DOCUMENT2 else ("DOC1" if doc == DOCUMENT1 else "DOC3")
    print(f"  {which} dist={dist:.4f} -> {pretty_preview(doc)}")

print("\nMerged candidates:")
for j, item in enumerate(merged):
    print(f"  [{j}] source={item['source']}, bm25={item['bm25_score']}, dist={item['vector_distance']} -> {pretty_preview(item['doc'])}")


BM25 top:
  DOC1 bm25=0.0000  -> Operating the Climate Control System Your Googlecar has a climate control system that allows you to adjust the temperature and airflow in th...
  DOC2 bm25=0.0000  -> Your Googlecar has a large touchscreen display that provides access to a variety of features, including navigation, entertainment, and clima...

Vector top:
  DOC2 dist=0.4303 -> Your Googlecar has a large touchscreen display that provides access to a variety of features, including navigation, entertainment, and clima...
  DOC1 dist=0.5617 -> Operating the Climate Control System Your Googlecar has a climate control system that allows you to adjust the temperature and airflow in th...

Merged candidates:
  [0] source=bm25, bm25=0.0, dist=None -> Operating the Climate Control System Your Googlecar has a climate control system that allows you to adjust the temperature and airflow in th...
  [1] source=bm25, bm25=0.0, dist=None -> Your Googlecar has a large touchscreen display that provides ac

# Show how the LLM re-ranker fixes both

Now we take the merged shortlist (usually 2–6 passages, but for this excersice we will use 3), and ask Gemini to score them for “answerability”.

LLM-as-re-ranker (Gemini)

In [66]:
import json, re

def llm_rerank(query: str, candidates: list[dict], top_n: int = 1):
    passages = [c["doc"] for c in candidates]
    numbered = "\n\n".join([f"[{i}] {pretty_preview(p, n=500)}" for i, p in enumerate(passages)])

    rerank_prompt = f"""
You are a strict relevance re-ranker for RAG.

Goal:
Score each PASSAGE by how well it helps answer the QUESTION.

Return ONLY valid JSON:
{{"scores":[{{"id":0,"score":0.0}}, ... ]}}

Scoring:
- score is between 0.0 and 1.0
- 1.0 = passage directly contains the answer
- 0.0 = unrelated

QUESTION:
{query}

PASSAGES:
{numbered}
""".strip()

    resp = client.models.generate_content(model=MODEL_ID, contents=rerank_prompt)
    text = resp.text.strip()

    m = re.search(r"\{.*\}", text, flags=re.S)
    data = json.loads(m.group(0) if m else text)

    scores = {int(x["id"]): float(x["score"]) for x in data["scores"]}
    ranked_ids = sorted(scores.keys(), key=lambda i: scores[i], reverse=True)
    top_ids = ranked_ids[:top_n]

    return top_ids, scores


Use the top reranked passage to answer (or reject)

This is the “hard rejection threshold” logic (super important in real RAG).

In [67]:
def answer_with_rag(query: str, bm25, documents, db, threshold: float = 0.6):
    bm25_out, vector_out, merged = hybrid_retrieve(query, bm25, documents, db, bm25_top_n=2, vector_top_n=2)

    top_ids, scores = llm_rerank(query, merged, top_n=1)
    best_id = top_ids[0]
    best_score = scores[best_id]
    best_passage = merged[best_id]["doc"]

    print(f"Best passage score = {best_score:.2f}")

    if best_score < threshold:
        return Markdown("I don’t know based on the provided text.")

    prompt = make_prompt(query, best_passage)
    resp = client.models.generate_content(model=MODEL_ID, contents=prompt)
    return Markdown(resp.text)


# Run 3 demos (this is the best way to “see” it)

Demo 1 — BM25 fails, vectors succeed, reranker picks DOC2

In [68]:
display(answer_with_rag("infotainment", bm25, documents, db))

Best passage score = 1.00


While the provided text doesn’t specifically use the word "infotainment," it describes a system that fits this definition perfectly: it is the combination of **information** and **entertainment** features found on your Googlecar's large touchscreen display. This system acts as a central hub where you can access helpful information, like using the Navigation icon to get directions, while also enjoying entertainment, such as tapping the Music icon to play your favorite songs or managing the climate control to keep the cabin comfortable.

Demo 2 — Lexical query, BM25 succeeds, reranker still picks best


In [69]:
display(answer_with_rag("How do I defrost the windshield?", bm25, documents, db))


Best passage score = 0.20


I don’t know based on the provided text.

Demo 3 — Both fail (no windows anywhere) → reranker triggers rejection

In [70]:
display(answer_with_rag("roller window", bm25, documents, db))


Best passage score = 0.00


I don’t know based on the provided text.