In [7]:
!pip -q install -U google-genai


In [8]:
import os, re
import fitz

# Collect every PDF in the current working directory (extension match is
# case-insensitive). os.listdir() returns entries in arbitrary order, so the list
# is sorted to keep document order -- and therefore chunk ids -- stable across runs.
PDFS = sorted(name for name in os.listdir() if name.lower().endswith(".pdf"))
print("PDFs found:", PDFS)

def pdf_to_text(path):
    """Extract the text of a PDF as one string, one line per non-empty page.

    Each page's text is whitespace-normalised (every run of whitespace becomes a
    single space, ends stripped) and prefixed with a ``[<file name> | page <n>]``
    tag, where ``n`` is the 1-based page number. Pages that are empty after
    normalisation are skipped.

    Args:
        path: Path of the PDF file to open.

    Returns:
        The tagged page texts joined with newlines ("" if no page has text).
    """
    name = os.path.basename(path)
    pages = []
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(path) as doc:
        for page_no, page in enumerate(doc, start=1):
            text = re.sub(r"\s+", " ", page.get_text("text")).strip()
            if text:
                pages.append(f"[{name} | page {page_no}] {text}")
    return "\n".join(pages)

# Extract the text of every discovered PDF; docs[i] corresponds to PDFS[i].
docs = [pdf_to_text(p) for p in PDFS]
print("Loaded docs:", len(docs))
# Preview the first 400 characters of the first document as a sanity check.
print("Sample:\n", docs[0][:400] if docs else "No docs loaded")


PDFs found: ['L05_WordSimilarity.pdf', 'L06_Embeddings_QA_Recommendations_visual_audio.pdf', 'L07_classic_OCR_image_procc_IR (1).pdf', 'L08_classic_Speech_Recognition_IR.pdf', 'L09_Web Search Deep Dive.pdf', 'L10_LanguageModels_LMMs.pdf', 'L11_IR_w_LLMs_N_RAGs.pdf', 'מטלת בית מספר 1.pdf', 'תרגול SPSS.pdf']
Loaded docs: 9
Sample:
 [L05_WordSimilarity.pdf | page 1] Information Retrieval - Word Similarity - WordNet - Word Vectors Development: Moshe Friedman Credits: Yoav Goldberg, Ido Dagan, Reut Tsarfaty , Moshe Koppel, Wei Song, David Bamman, Ed Grefenstette, Chris Manning, Tsvi Kuflik, Hinrich Schütze, Christina Lioma and more
[L05_WordSimilarity.pdf | page 2] Information Retrieval - administration Moshe Friedman Email: mo


In [9]:
def chunk_no_overlap(txt, size=1200):
    """Cut ``txt`` into back-to-back windows of ``size`` characters.

    Every window is stripped of surrounding whitespace; windows that are empty
    after stripping are dropped.

    Args:
        txt: Text to split.
        size: Window length in characters.

    Returns:
        List of non-empty, stripped chunks in document order.
    """
    pieces = []
    for start in range(0, len(txt), size):
        piece = txt[start:start + size].strip()
        if piece:
            pieces.append(piece)
    return pieces

def chunk_with_overlap(txt, size=1200, overlap=200):
    """Cut ``txt`` into windows of ``size`` characters that share ``overlap`` characters.

    Consecutive windows start ``size - overlap`` characters apart. Every window
    is stripped of surrounding whitespace; windows that are empty after
    stripping are dropped.

    Args:
        txt: Text to split.
        size: Window length in characters.
        overlap: Number of characters each window shares with the previous one.

    Returns:
        List of non-empty, stripped chunks in document order.

    Raises:
        ValueError: If ``overlap >= size``. The window start would then never
            advance, which previously made the function loop forever.
    """
    step = size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )

    chunks = []
    for start in range(0, len(txt), step):
        block = txt[start:start + size].strip()
        if block:
            chunks.append(block)
    return chunks

def build_chunks(docs, method="overlap"):
    """Chunk every document and return all chunks as one flat list.

    Args:
        docs: Iterable of document texts.
        method: ``"overlap"`` selects ``chunk_with_overlap``; any other value
            selects ``chunk_no_overlap``. Both are called with their defaults.

    Returns:
        Flat list of chunks, in document order.
    """
    splitter = chunk_with_overlap if method == "overlap" else chunk_no_overlap
    return [piece for doc in docs for piece in splitter(doc)]

# Build both chunk sets from the same documents so the two chunking strategies
# can be compared downstream.
chunks_overlap = build_chunks(docs, method="overlap")
chunks_no_ov   = build_chunks(docs, method="no_overlap")

print("Chunks (overlap):", len(chunks_overlap))
print("Chunks (no_overlap):", len(chunks_no_ov))
# Preview the first 300 characters of the first overlapping chunk.
print("Example overlap chunk:\n", chunks_overlap[0][:300] if chunks_overlap else "None")


Chunks (overlap): 223
Chunks (no_overlap): 187
Example overlap chunk:
 [L05_WordSimilarity.pdf | page 1] Information Retrieval - Word Similarity - WordNet - Word Vectors Development: Moshe Friedman Credits: Yoav Goldberg, Ido Dagan, Reut Tsarfaty , Moshe Koppel, Wei Song, David Bamman, Ed Grefenstette, Chris Manning, Tsvi Kuflik, Hinrich Schütze, Christina Lioma and mo


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def build_tfidf_index(chunks):
    """Fit a TF-IDF vectorizer on ``chunks`` and vectorise them.

    The vectorizer uses scikit-learn's English stop-word list and both
    unigrams and bigrams.

    Args:
        chunks: List of chunk strings to index.

    Returns:
        ``(vectorizer, matrix)``: the fitted ``TfidfVectorizer`` and the result
        of ``fit_transform`` on ``chunks``.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
    matrix = vectorizer.fit_transform(chunks)
    return vectorizer, matrix

def retrieve_tfidf(tfidf, X, chunks, query, k=5):
    """Return the ``k`` chunks most similar to ``query`` by TF-IDF cosine similarity.

    Args:
        tfidf: Fitted vectorizer used to transform the query.
        X: Matrix of chunk vectors the query is compared against.
        chunks: Chunk strings, indexed consistently with ``X``.
        query: Query text.
        k: Maximum number of hits to return.

    Returns:
        List of ``(chunk_index, similarity, chunk_text)`` tuples, best first.
    """
    query_vec = tfidf.transform([query])
    scores = cosine_similarity(query_vec, X).flatten()
    # argsort is ascending, so reverse it before taking the first k positions.
    best = np.argsort(scores)[::-1][:k]
    results = []
    for pos in best:
        results.append((int(pos), float(scores[pos]), chunks[pos]))
    return results


In [11]:
from sentence_transformers import SentenceTransformer

# Shared sentence-embedding model; build_emb_index and retrieve_emb both read this global.
embed = SentenceTransformer("all-MiniLM-L6-v2")

def build_emb_index(chunks):
    """Encode ``chunks`` with the global ``embed`` model.

    Embeddings are requested normalised (``normalize_embeddings=True``) and a
    progress bar is shown while encoding.

    Args:
        chunks: List of chunk strings to embed.

    Returns:
        Whatever ``embed.encode`` returns for ``chunks``.
    """
    return embed.encode(
        chunks,
        show_progress_bar=True,
        normalize_embeddings=True,
    )

def retrieve_emb(X_emb, chunks, query, k=5):
    """Return the ``k`` chunks whose embeddings score highest against ``query``.

    The query is encoded with the global ``embed`` model (normalised) and scored
    against ``X_emb`` with a matrix-vector product.

    Args:
        X_emb: Chunk embeddings, indexed consistently with ``chunks``.
        chunks: Chunk strings.
        query: Query text.
        k: Maximum number of hits to return.

    Returns:
        List of ``(chunk_index, score, chunk_text)`` tuples, best first.
    """
    query_vec = embed.encode([query], normalize_embeddings=True)[0]
    scores = X_emb @ query_vec
    # argsort is ascending, so reverse it before taking the first k positions.
    best = np.argsort(scores)[::-1][:k]
    results = []
    for pos in best:
        results.append((int(pos), float(scores[pos]), chunks[pos]))
    return results


In [12]:
import requests

def ask_ollama(prompt, model="llama3.2:1b"):
    """Send ``prompt`` to the local Ollama server and return the generated text.

    Posts a non-streaming request to ``http://localhost:11434/api/generate``
    with a 120-second timeout.

    Args:
        prompt: Prompt text.
        model: Name of the Ollama model to use.

    Returns:
        The ``"response"`` field of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
    """
    payload = {"model": model, "prompt": prompt, "stream": False}
    reply = requests.post(
        "http://localhost:11434/api/generate", json=payload, timeout=120
    )
    reply.raise_for_status()
    body = reply.json()
    return body["response"]


In [13]:
import requests
# Smoke test: send a trivial prompt straight to the local Ollama endpoint
# (same URL and payload shape that ask_ollama uses) to confirm the server and
# the model respond before running the experiments.
r = requests.post(
    "http://localhost:11434/api/generate",
    json={"model":"llama3.2:1b","prompt":"Say hello","stream":False},
    timeout=60
)
print(r.status_code)
# Only the first 300 characters of the raw body are shown to keep the output short.
print(r.text[:300])


200
{"model":"llama3.2:1b","created_at":"2026-01-31T13:28:07.1259127Z","response":"Hello.","done":true,"done_reason":"stop","context":[128006,9125,128007,271,38766,1303,33025,2696,25,6790,220,2366,18,271,128009,128006,882,128007,271,46864,24748,128009,128006,78191,128007,271,9906,13],"total_duration":99


In [14]:
import requests
# Health check against the local Ollama server. requests has no default timeout,
# so one is set explicitly to fail fast instead of hanging when the server is down.
print(requests.get("http://localhost:11434/api/tags", timeout=10).status_code)


200


In [15]:
import os

# SECURITY: never hard-code an API key in a notebook -- it ends up in version
# control and in every shared copy. Any key that was ever committed in this
# notebook must be treated as leaked and revoked/rotated.
# Preferred: export GEMINI_API_KEY before starting Jupyter. Otherwise prompt for
# it here without echoing, so the value never appears in the saved file.
if not os.environ.get("GEMINI_API_KEY"):
    from getpass import getpass

    os.environ["GEMINI_API_KEY"] = getpass("GEMINI_API_KEY: ")


In [16]:
from google import genai
import os

# Shared Gemini client; the key is read from the GEMINI_API_KEY environment
# variable (raises KeyError if it is not set). ask_gemini reads this global.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

def ask_gemini(prompt, model="gemini-flash-latest"):
    """Send ``prompt`` to Gemini through the global ``client`` and return the reply text.

    Args:
        prompt: Prompt text, passed as ``contents``.
        model: Name of the Gemini model to use.

    Returns:
        The ``text`` attribute of the ``generate_content`` response.
    """
    reply = client.models.generate_content(contents=prompt, model=model)
    return reply.text

# Smoke test: one trivial request to confirm the API key and client work.
print(ask_gemini("Say hello"))


Hello!


In [18]:
import requests

# List the models installed in the local Ollama server. requests has no default
# timeout, so one is set explicitly to avoid hanging when the server is down.
tags = requests.get("http://localhost:11434/api/tags", timeout=10).json()
print([m["name"] for m in tags.get("models", [])])


['llama3.2:1b', 'mistral:latest']


In [19]:
def build_context(hits, max_chars=2500):
    """Concatenate retrieved chunks into one context string of at most ``max_chars``.

    Each hit is rendered as ``[chunk_id=<i> score=<score>]`` followed by the
    chunk text, and the rendered blocks are joined with a ``---`` separator
    line. Hits are consumed in order and the loop stops at the first block that
    no longer fits, so rank order is preserved.

    The budget includes the separators, so the returned string never exceeds
    ``max_chars``. If even the first hit is over budget, a truncated prefix of
    it is returned instead of an empty context.

    Args:
        hits: Iterable of ``(chunk_id, score, text)`` tuples, best first.
        max_chars: Maximum length of the returned string.

    Returns:
        The assembled context ("" when ``hits`` is empty or the budget is <= 0).
    """
    separator = "\n---\n"
    parts, total = [], 0
    for i, score, txt in hits:
        block = f"[chunk_id={i} score={score:.4f}]\n{txt}\n"
        # Every block after the first also costs one separator once joined.
        cost = len(block) + (len(separator) if parts else 0)
        if total + cost > max_chars:
            if not parts:
                # Top hit alone is over budget: keep what fits rather than
                # sending the LLM an empty context. max() guards against a
                # negative budget turning into a from-the-end slice.
                parts.append(block[:max(max_chars, 0)])
            break
        parts.append(block)
        total += cost
    return separator.join(parts)

def build_prompt(context, question):
    """Render the QA prompt sent to the LLM.

    The prompt consists of fixed instructions, the question, the retrieved
    context, and the requested answer format (short answer plus the chunk ids
    used as evidence).

    Args:
        context: Retrieved context text.
        question: User question.

    Returns:
        The complete prompt string, ending with a newline.
    """
    instructions = (
        "You are a QA assistant.\n"
        "Answer using ONLY the provided context.\n"
        "If the context is insufficient, answer briefly and say what is missing from the context.\n"
    )
    answer_format = (
        "Return:\n"
        "1) Answer (short and clear)\n"
        "2) Evidence: list the chunk_id(s) you used\n"
    )
    sections = [
        instructions,
        f"QUESTION:\n{question}\n",
        f"CONTEXT:\n{context}\n",
        answer_format,
    ]
    # Each section already ends with a newline; joining on "\n" adds the blank
    # line between sections.
    return "\n".join(sections)

def rag_answer(question, chunks, tfidf=None, X_tfidf=None, X_emb=None,
               retriever="tfidf", llm="local", top_k=5, local_model="llama3.2:1b"):
    """Run one retrieve-then-generate round for ``question``.

    Args:
        question: User question.
        chunks: Chunk strings that the indexes were built from.
        tfidf: Fitted TF-IDF vectorizer (used when ``retriever == "tfidf"``).
        X_tfidf: TF-IDF matrix of ``chunks`` (used when ``retriever == "tfidf"``).
        X_emb: Embeddings of ``chunks`` (used for any other ``retriever`` value).
        retriever: ``"tfidf"`` selects ``retrieve_tfidf``; anything else selects
            ``retrieve_emb``.
        llm: ``"local"`` selects ``ask_ollama``; anything else selects ``ask_gemini``.
        top_k: Number of hits requested from the retriever.
        local_model: Ollama model name, only used when ``llm == "local"``.

    Returns:
        ``(hits, answer)``: the retriever's hits and the LLM's reply.
    """
    hits = (
        retrieve_tfidf(tfidf, X_tfidf, chunks, question, k=top_k)
        if retriever == "tfidf"
        else retrieve_emb(X_emb, chunks, question, k=top_k)
    )

    prompt = build_prompt(build_context(hits), question)

    if llm == "local":
        return hits, ask_ollama(prompt, model=local_model)
    return hits, ask_gemini(prompt)


In [20]:
# Single question used for every experiment below.
question = "Why is chunking important in RAG systems?"

# Build the indexes for the overlapping chunks
tfidf_o, X_tfidf_o = build_tfidf_index(chunks_overlap)
X_emb_o = build_emb_index(chunks_overlap)

# Build the indexes for the non-overlapping chunks
tfidf_n, X_tfidf_n = build_tfidf_index(chunks_no_ov)
X_emb_n = build_emb_index(chunks_no_ov)

# Each entry is (chunking, retriever, llm). In rag_answer any llm value other
# than "local" is routed to Gemini. The no_overlap chunking is only paired
# with the local LLM here.
experiments = [
    ("overlap", "tfidf", "local"),
    ("overlap", "tfidf", "external"),
    ("overlap", "emb",   "local"),
    ("overlap", "emb",   "external"),
    ("no_overlap", "tfidf", "local"),
    ("no_overlap", "emb",   "local"),
]

# Run every configured experiment and print the answer plus the top-3 hits.
for chunking, retr, llm in experiments:
    print("\n" + "=" * 90)
    print(f"CHUNKING={chunking} | RETRIEVER={retr} | LLM={llm}")

    # Pick the chunk list and matching indexes for this chunking strategy;
    # anything other than "overlap" uses the non-overlapping set.
    chunks, tfidf, X_tfidf, X_emb = (
        (chunks_overlap, tfidf_o, X_tfidf_o, X_emb_o)
        if chunking == "overlap"
        else (chunks_no_ov, tfidf_n, X_tfidf_n, X_emb_n)
    )

    hits, ans = rag_answer(
        question,
        chunks=chunks,
        tfidf=tfidf,
        X_tfidf=X_tfidf,
        X_emb=X_emb,
        retriever=retr,
        llm=llm,
        top_k=5,
        local_model="llama3.2:1b",
    )

    print(ans)
    # Show only (chunk index, rounded score) for the three best hits.
    print("\nTop evidence chunks:", [(cid, round(sim, 4)) for cid, sim, _ in hits[:3]])


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]


CHUNKING=overlap | RETRIEVER=tfidf | LLM=local
Unfortunately, there is insufficient context to answer this question accurately. The provided text lacks information about why chunking is important in RAG systems or how it applies to specific tasks or scenarios.

Missing from the context:

* A clear explanation of what RAG system is and its purpose.
* More details on the benefits or goals of using chunking in RAGs.
* Information on specific task types (e.g., text summarization, question answering) where chunking would be useful.
* More context about the different chunking techniques mentioned (e.g., document structure-based, semantic chunking).

Therefore, I can only provide a brief answer:

Top evidence chunks: [(199, 0.2157), (198, 0.2061), (196, 0.1736)]

CHUNKING=overlap | RETRIEVER=tfidf | LLM=external
1) Answer (short and clear)
The provided context is insufficient to explain the fundamental importance of chunking in RAG systems. The context describes different methods of chunking