<a href="https://colab.research.google.com/github/hardik-kumar-10/GenAI-learnings/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
import numpy as np
from collections import Counter
import re

corpus->

In [56]:
docs = [
    "RAG stands for Retrieval-Augmented Generation. It retrieves relevant context before generating answers.",
    "Vector embeddings map text to vectors so similar meanings are close in space.",
    "Cosine similarity measures the angle between two vectors to gauge similarity.",
    "Indexing documents enables fast retrieval of relevant passages for a user query."
]

cleaning the text->

In [57]:
def tokenize(text):
    if not isinstance(text, str):
        return []
    text = text.strip().lower()
    if not text:
        return []
    tokens = re.findall(r"[a-zA-Z0-9']+", text)
    if not tokens:
        tokens = text.split()
    return tokens

building a vocab-BoW->

In [58]:
vocab = sorted(set(token for d in docs for token in tokenize(d)))
idx = {t: i for i, t in enumerate(vocab)}

def bow_embed(text):
    vec = np.zeros(len(vocab), dtype=np.float32)
    counts = Counter(tokenize(text))
    for t, c in counts.items():
        if t in idx:
            vec[idx[t]] = c
    # L2 normalize to help cosine similarity
    norm = np.linalg.norm(vec) or 1.0
    return vec / norm

Building the index->

In [59]:
doc_vectors = np.stack([bow_embed(d) for d in docs], axis=0)

def cosine_sim(a, b):
    # a: (n, d), b: (d,)
    return a @ b

def retrieve(query, k=2):
    qv = bow_embed(query)
    sims = cosine_sim(doc_vectors, qv)  # (n,)
    topk = np.argsort(-sims)[:k]
    return [(docs[i], float(sims[i])) for i in topk]

In [60]:
print("Retrieved raw:", retrieved)
print("First retrieved item types:", type(retrieved), type(retrieved))
print("First retrieved text preview:", retrieved[:80] if isinstance(retrieved, str) else retrieved)


Retrieved raw: [('RAG stands for Retrieval-Augmented Generation. It retrieves relevant context before generating answers.', 0.3922322690486908), ('Vector embeddings map text to vectors so similar meanings are close in space.', 0.0)]
First retrieved item types: <class 'list'> <class 'list'>
First retrieved text preview: [('RAG stands for Retrieval-Augmented Generation. It retrieves relevant context before generating answers.', 0.3922322690486908), ('Vector embeddings map text to vectors so similar meanings are close in space.', 0.0)]


In [61]:
def generate_answer(query, retrieved):
    if not retrieved:
        return "(Demo) No relevant context found."

    # Take only strings from each (passage, score)
    passages = [ (it if isinstance(it, tuple) else it) for it in retrieved ]
    # If any passage is a list, join it into a single string
    passages = [ (" ".join(p) if isinstance(p, list) else p) for p in passages ]
    passages = [ p for p in passages if isinstance(p, str) and p.strip() ]

    if not passages:
        return "(Demo) No relevant context found."

    first = passages

    import re
    def tokenize(text):
        if not isinstance(text, str):
            return []
        text = text.strip().lower()
        if not text:
            return []
        toks = re.findall(r"[a-zA-Z0-9']+", text)
        return toks if toks else text.split()

    toks = tokenize(first)
    snippet = ", ".join(toks[:25]) if toks else (first.strip()[:100] if first.strip() else "(no content)")
    return f"(Demo) Using retrieved context, the answer likely involves: {snippet}"


In [62]:
query = "What is RAG and how does it help answering questions?"
retrieved = retrieve(query, k=2)

print("Retrieved:")
for passage, score in retrieved:
    print(f"- {repr(passage)}  (score={score:.3f})")

print("\nAnswer:")
print(generate_answer(query, retrieved))


Retrieved:
- 'RAG stands for Retrieval-Augmented Generation. It retrieves relevant context before generating answers.'  (score=0.392)
- 'Vector embeddings map text to vectors so similar meanings are close in space.'  (score=0.000)

Answer:
(Demo) No relevant context found.
