<a href="https://colab.research.google.com/github/krotov79/05_rag_document_agent/blob/main/notebooks/01_build_faiss_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install sentence-transformers faiss-cpu numpy pandas

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m118.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import faiss
from sentence_transformers import SentenceTransformer

print("torch:", torch.__version__)
print("faiss:", faiss.__version__)
print("cuda available:", torch.cuda.is_available())

torch: 2.9.0+cu126
faiss: 1.13.2
cuda available: True


In [3]:
from pathlib import Path

RAW_DIR = Path("data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

print("Raw dir:", RAW_DIR.resolve())
print("Files:", list(RAW_DIR.glob("*")))

Raw dir: /content/data/raw
Files: []


In [4]:
import requests

url = "https://www.rfc-editor.org/rfc/rfc2616.txt"
out = RAW_DIR / "rfc2616.txt"

resp = requests.get(url)
resp.raise_for_status()
out.write_text(resp.text, encoding="utf-8")

print("Saved:", out, "chars:", len(resp.text))


Saved: data/raw/rfc2616.txt chars: 422279


In [5]:
url = "https://www.gutenberg.org/cache/epub/35/pg35.txt"
out = RAW_DIR / "time_machine.txt"

resp = requests.get(url)
resp.raise_for_status()
out.write_text(resp.text, encoding="utf-8")

print("Saved:", out, "chars:", len(resp.text))

Saved: data/raw/time_machine.txt chars: 202461


In [6]:
url = "https://www.apache.org/licenses/LICENSE-2.0.txt"
out = RAW_DIR / "apache_license_2.0.txt"

resp = requests.get(url)
resp.raise_for_status()
out.write_text(resp.text, encoding="utf-8")

print("Saved:", out, "chars:", len(resp.text))

Saved: data/raw/apache_license_2.0.txt chars: 11358


In [7]:
def load_text_files(folder: Path):
    docs = []
    for p in sorted(folder.glob("*.txt")):
        text = p.read_text(encoding="utf-8", errors="ignore")
        docs.append({
            "id": p.stem,
            "path": str(p),
            "chars": len(text),
            "preview": text[:200].replace("\n", " ")
        })
    return docs

docs = load_text_files(RAW_DIR)
docs

[{'id': 'apache_license_2.0',
  'path': 'data/raw/apache_license_2.0.txt',
  'chars': 11358,
  'preview': '                                  Apache License                            Version 2.0, January 2004                         http://www.apache.org/licenses/     TERMS AND CONDITIONS FOR USE, REPRODUC'},
 {'id': 'rfc2616',
  'path': 'data/raw/rfc2616.txt',
  'chars': 422279,
  'preview': '      Network Working Group                                      R. Fielding Request for Comments: 2616                                   UC Irvine Obsoletes: 2068                                     '},
 {'id': 'time_machine',
  'path': 'data/raw/time_machine.txt',
  'chars': 198906,
  'preview': '\ufeffThe Project Gutenberg eBook of The Time Machine      This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions wha'}]

In [9]:
!pip -q install tiktoken

In [10]:
from pathlib import Path

RAW_DIR = Path("data/raw")

def load_docs(raw_dir: Path):
    docs = []
    for p in sorted(raw_dir.glob("*.txt")):
        text = p.read_text(encoding="utf-8", errors="ignore")
        docs.append({"doc_id": p.stem, "source": p.name, "text": text})
    return docs

docs = load_docs(RAW_DIR)
[(d["doc_id"], len(d["text"])) for d in docs]

[('apache_license_2.0', 11358), ('rfc2616', 422279), ('time_machine', 198906)]

In [11]:
#Step 3.3 — Basic cleaning
import re

def clean_text(s: str) -> str:
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"[ \t]+", " ", s)          # collapse spaces/tabs
    s = re.sub(r"\n{3,}", "\n\n", s)       # collapse huge blank areas
    return s.strip()

for d in docs:
    d["text"] = clean_text(d["text"])

[(d["doc_id"], len(d["text"])) for d in docs]

[('apache_license_2.0', 10421), ('rfc2616', 383127), ('time_machine', 198610)]

In [12]:
#Step 3.4 — Token counter
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    return len(enc.encode(text))

In [13]:
# Step 3.5 — Paragraph-first chunker
from typing import List, Dict

CHUNK_MIN = 500
CHUNK_MAX = 700
OVERLAP = 100

def split_paragraphs(text: str) -> List[str]:
    # paragraph split on blank lines
    paras = [p.strip() for p in text.split("\n\n") if p.strip()]
    return paras

def chunk_doc(doc_id: str, source: str, text: str) -> List[Dict]:
    paras = split_paragraphs(text)
    chunks = []
    buf = ""
    buf_tokens = 0

    def flush_buffer():
        nonlocal buf, buf_tokens
        if buf.strip():
            chunks.append(buf.strip())
        buf = ""
        buf_tokens = 0

    for p in paras:
        p_tokens = count_tokens(p)
        # If a single paragraph is huge, we will hard-split it later (rare but possible)
        if p_tokens > CHUNK_MAX:
            # flush current buffer first
            flush_buffer()
            # hard split by sentences-ish (simple fallback)
            sentences = re.split(r"(?<=[.!?])\s+", p)
            s_buf = ""
            for s in sentences:
                if not s.strip():
                    continue
                candidate = (s_buf + " " + s).strip()
                if count_tokens(candidate) <= CHUNK_MAX:
                    s_buf = candidate
                else:
                    if s_buf:
                        chunks.append(s_buf)
                    s_buf = s
            if s_buf:
                chunks.append(s_buf)
            continue

        # normal paragraph accumulation
        candidate = (buf + "\n\n" + p).strip() if buf else p
        candidate_tokens = count_tokens(candidate)

        if candidate_tokens <= CHUNK_MAX:
            buf = candidate
            buf_tokens = candidate_tokens
        else:
            # buffer is full enough -> flush and start new
            flush_buffer()
            buf = p
            buf_tokens = p_tokens

    flush_buffer()

    # Now add overlap by token window (lightweight overlap between consecutive chunks)
    final = []
    for i, ch in enumerate(chunks):
        if i == 0:
            final_text = ch
        else:
            prev = chunks[i-1]
            prev_tokens = enc.encode(prev)
            overlap_text = enc.decode(prev_tokens[-OVERLAP:]) if len(prev_tokens) > OVERLAP else prev
            final_text = (overlap_text + "\n\n" + ch).strip()

        final.append({
            "doc_id": doc_id,
            "chunk_id": i,
            "source": source,
            "text": final_text,
            "tokens": count_tokens(final_text),
        })

    return final

all_chunks = []
for d in docs:
    all_chunks.extend(chunk_doc(d["doc_id"], d["source"], d["text"]))

len(all_chunks), all_chunks[0]["tokens"], all_chunks[-1]["tokens"]


(214, 648, 281)

In [14]:
# Step 3.6 — Quality checks
import pandas as pd

df = pd.DataFrame(all_chunks)
df[["doc_id","chunk_id","tokens"]].describe()

Unnamed: 0,chunk_id,tokens
count,214.0,214.0
mean,53.976636,726.724299
std,35.465707,95.749397
min,0.0,114.0
25%,24.25,703.75
50%,51.0,759.0
75%,77.75,782.75
max,129.0,801.0


In [15]:
df.sort_values("tokens").head(3)[["doc_id","chunk_id","tokens","text"]]
df.sort_values("tokens", ascending=False).head(3)[["doc_id","chunk_id","tokens","text"]]

Unnamed: 0,doc_id,chunk_id,tokens,text
66,rfc2616,62,801,". However, the converse is not true, since\n t..."
27,rfc2616,23,801,more parts: A primary language tag and a possi...
33,rfc2616,29,800,a non-reserved URI character for a reserved pu...


In [16]:
# Step 3.7 — Save processed chunks
PROCESSED_DIR = Path("data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

out_path = PROCESSED_DIR / "chunks_v0.parquet"
df.to_parquet(out_path, index=False)

print("Saved:", out_path, "rows:", len(df))

Saved: data/processed/chunks_v0.parquet rows: 214


Step 4 — Embeddings + FAISS

In [17]:
# Step 4.1 — prepare inputs for embedding

texts = [c["text"] for c in all_chunks]
metas = [
    {"doc_id": c["doc_id"], "chunk_id": c["chunk_id"], "tokens": c["tokens"]}
    for c in all_chunks
]

print("num chunks:", len(texts))
print("sample text:", texts[0][:200].replace("\n", " "))
print("sample meta:", metas[0])

num chunks: 214
sample text: Apache License  Version 2.0, January 2004  http://www.apache.org/licenses/  TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION  1. Definitions.  "License" shall mean the terms and conditions
sample meta: {'doc_id': 'apache_license_2.0', 'chunk_id': 0, 'tokens': 648}


In [18]:
# 4.2 Load embedding model

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(embed_model_name, device=device)

print("model:", embed_model_name)
print("embedding dim:", model.get_sentence_embedding_dimension())

device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model: sentence-transformers/all-MiniLM-L6-v2
embedding dim: 384


In [19]:
# 4.3 Encode embeddings
import numpy as np

emb = model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
).astype("float32")

print("emb shape:", emb.shape)
print("emb dtype:", emb.dtype)
print("first vector norm:", np.linalg.norm(emb[0]))

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

emb shape: (214, 384)
emb dtype: float32
first vector norm: 1.0


In [20]:
# 4.4 Create FAISS index + add vectors

dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)  # cosine if vectors are normalized
index.add(emb)

print("index ntotal:", index.ntotal)

index ntotal: 214


In [21]:
# 4.5 Quick retrieval test
def search(query, k=5):
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    scores, ids = index.search(q, k)
    results = []
    for rank, (idx, score) in enumerate(zip(ids[0], scores[0]), start=1):
        results.append({
            "rank": rank,
            "score": float(score),
            "doc_id": metas[idx]["doc_id"],
            "chunk_id": metas[idx]["chunk_id"],
            "text_preview": texts[idx][:220].replace("\n", " ")
        })
    return results

search("What does HTTP status code 200 mean?", k=5)

[{'rank': 1,
  'score': 0.6828978061676025,
  'doc_id': 'rfc2616',
  'chunk_id': 32,
  'text_preview': '; Section 10.5.1: Internal Server Error  | "501" ; Section 10.5.2: Not Implemented  | "502" ; Section 10.5.3: Bad Gateway  | "503" ; Section 10.5.4: Service Unavailable  | "504" ; Section 10.5.5: Gateway Time-out  | "505'},
 {'rank': 2,
  'score': 0.6172425746917725,
  'doc_id': 'rfc2616',
  'chunk_id': 44,
  'text_preview': 'class of status code. Since HTTP/1.0 did not define any 1xx status  codes, servers MUST NOT send a 1xx response to an HTTP/1.0 client  except under experimental conditions.  A client MUST be prepared to accept one or mor'},
 {'rank': 3,
  'score': 0.5855302214622498,
  'doc_id': 'rfc2616',
  'chunk_id': 49,
  'text_preview': 'longer used, and the code is reserved.  Fielding, et al. Standards Track [Page 64] \x0c RFC 2616 HTTP/1.1 June 1999  10.3.8 307 Temporary Redirect  The requested resource resides temporarily under a different URI.  Since th'},
 {'rank': 4,


In [22]:
# 4.6 Save index + metadata to disk
import os, json

INDEX_DIR = "/content/indexes/faiss_v1"
os.makedirs(INDEX_DIR, exist_ok=True)

faiss.write_index(index, f"{INDEX_DIR}/index.faiss")

with open(f"{INDEX_DIR}/metas.json", "w", encoding="utf-8") as f:
    json.dump(metas, f, ensure_ascii=False, indent=2)

with open(f"{INDEX_DIR}/texts.json", "w", encoding="utf-8") as f:
    json.dump(texts, f, ensure_ascii=False, indent=2)

print("saved to:", INDEX_DIR)
print("files:", os.listdir(INDEX_DIR))

saved to: /content/indexes/faiss_v1
files: ['texts.json', 'metas.json', 'index.faiss']


Step 5 — Save FAISS index + metadata

In [26]:
#5.1
from pathlib import Path
OUT_DIR = Path("/content/indexes")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR


PosixPath('/content/indexes')

In [27]:
import faiss

faiss.write_index(index, str(OUT_DIR / "faiss.index"))
print("Saved:", OUT_DIR / "faiss.index")


Saved: /content/indexes/faiss.index


In [28]:
# 5.3 Save chunks metadata
import json

meta_path = OUT_DIR / "chunks.json"
with meta_path.open("w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

print("Saved:", meta_path)
print("Chunks:", len(all_chunks))

Saved: /content/indexes/chunks.json
Chunks: 214


In [29]:
# 5.4 Save model name + settings


manifest = {
    "embed_model": embed_model_name,
    "index_type": "IndexFlatIP",
    "normalized_embeddings": True,
    "chunking": {
        "target_tokens": 700,
        "overlap_tokens": 120
    },
    "count_chunks": len(all_chunks)
}

with (OUT_DIR / "manifest.json").open("w", encoding="utf-8") as f:
    json.dump(manifest, f, ensure_ascii=False, indent=2)

print("Saved:", OUT_DIR / "manifest.json")


Saved: /content/indexes/manifest.json


In [30]:
# 5.5 Quick sanity check
for p in ["faiss.index", "chunks.json", "manifest.json"]:
    fp = OUT_DIR / p
    print(p, "exists:", fp.exists(), "size:", fp.stat().st_size if fp.exists() else None)

faiss.index exists: True size: 328749
chunks.json exists: True size: 726641
manifest.json exists: True size: 221


In [31]:
# Step 5.6 Reload test

OUT_DIR = Path("/content/indexes")

index2 = faiss.read_index(str(OUT_DIR / "faiss.index"))

with (OUT_DIR / "chunks.json").open("r", encoding="utf-8") as f:
    chunks2 = json.load(f)

print("Reloaded index:", type(index2))
print("Reloaded chunks:", len(chunks2))


Reloaded index: <class 'faiss.swigfaiss_avx512.IndexFlatIP'>
Reloaded chunks: 214


Step 6 — Build retrieve(query, k) with citations

In [32]:
# 6.1 Reload

OUT_DIR = Path("/content/indexes")

# Load FAISS index
index = faiss.read_index(str(OUT_DIR / "faiss.index"))

# Load chunks
with (OUT_DIR / "chunks.json").open("r", encoding="utf-8") as f:
    chunks = json.load(f)

# Load manifest
with (OUT_DIR / "manifest.json").open("r", encoding="utf-8") as f:
    manifest = json.load(f)

embed_model_name = manifest["embed_model"]
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(embed_model_name, device=device)

print("device:", device)
print("embed_model:", embed_model_name)
print("chunks:", len(chunks))
print("index ntotal:", index.ntotal)

device: cuda
embed_model: sentence-transformers/all-MiniLM-L6-v2
chunks: 214
index ntotal: 214


In [33]:
# 6.2 Define a clean retriever
def retrieve(query: str, k: int = 5):
    q_emb = model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True
    ).astype("float32")

    scores, ids = index.search(q_emb, k)

    results = []
    for rank, (idx, score) in enumerate(zip(ids[0], scores[0]), start=1):
        ch = chunks[int(idx)]
        results.append({
            "rank": rank,
            "score": float(score),
            "doc_id": ch["doc_id"],
            "chunk_id": ch["chunk_id"],
            "citation": f'[{ch["doc_id"]}#{ch["chunk_id"]}]',
            "text": ch["text"],
            "text_preview": ch["text"][:240].replace("\n", " ")
        })
    return results

In [34]:
# 6.3 Retrieval smoke tests
retrieve("What is HTTP status code 200?", k=3)

[{'rank': 1,
  'score': 0.6778225898742676,
  'doc_id': 'rfc2616',
  'chunk_id': 32,
  'citation': '[rfc2616#32]',
  'text': '; Section 10.5.1: Internal Server Error\n | "501" ; Section 10.5.2: Not Implemented\n | "502" ; Section 10.5.3: Bad Gateway\n | "503" ; Section 10.5.4: Service Unavailable\n | "504" ; Section 10.5.5: Gateway Time-out\n | "505" ; Section 10.5.6: HTTP Version not supported\n | extension-code\n\nextension-code = 3DIGIT\n Reason-Phrase = *<TEXT, excluding CR, LF>\n\nHTTP status codes are extensible. HTTP applications are not required\n to understand the meaning of all registered status codes, though such\n understanding is obviously desirable. However, applications MUST\n understand the class of any status code, as indicated by the first\n digit, and treat any unrecognized response as being equivalent to the\n x00 status code of that class, with the exception that an\n unrecognized response MUST NOT be cached. For example, if an\n unrecognized status code of 431 is 

In [35]:
retrieve("What does the Apache License require when distributing software?", k=3)

[{'rank': 1,
  'score': 0.6665295362472534,
  'doc_id': 'apache_license_2.0',
  'chunk_id': 3,
  'citation': '[apache_license_2.0#3]',
  'text': 'file format. We also recommend that a\n file or class name and description of purpose be included on the\n same "printed page" as the copyright notice for easier\n identification within third-party archives.\n\nCopyright [yyyy] [name of copyright owner]\n\nLicensed under the Apache License, Version 2.0 (the "License");\n you may not use this file except in compliance with the License.\n You may obtain a copy of the License at\n\nhttp://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\n distributed under the License is distributed on an "AS IS" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n See the License for the specific language governing permissions and\n limitations under the License.',
  'text_preview': 'file format. We also recommend that a  f

In [36]:
retrieve("Who is the Time Traveller talking to at the beginning?", k=3)

[{'rank': 1,
  'score': 0.5544801354408264,
  'doc_id': 'time_machine',
  'chunk_id': 10,
  'citation': '[time_machine#10]',
  'text': 'said the Editor hilariously, “these chaps here say you have\nbeen travelling into the middle of next week! Tell us all about little\nRosebery, will you? What will you take for the lot?”\n\nThe Time Traveller came to the place reserved for him without a word.\nHe smiled quietly, in his old way. “Where’s my mutton?” he said. “What\na treat it is to stick a fork into meat again!”\n\n“Story!” cried the Editor.\n\n“Story be damned!” said the Time Traveller. “I want something to eat. I\nwon’t say a word until I get some peptone into my arteries. Thanks. And\nthe salt.”\n\n“One word,” said I. “Have you been time travelling?”\n\n“Yes,” said the Time Traveller, with his mouth full, nodding his head.\n\n“I’d give a shilling a line for a verbatim note,” said the Editor. The\nTime Traveller pushed his glass towards the Silent Man and rang it with\nhis fingernail; 

In [37]:
# 6.4 Add one guardrail: “no evidence”
def retrieve_with_threshold(query: str, k: int = 5, min_score: float = 0.25):
    res = retrieve(query, k)
    res = [r for r in res if r["score"] >= min_score]
    return res

In [38]:
retrieve_with_threshold("What is the capital of Mongolia?", k=5, min_score=0.35)

[]

Step 7 — RAG Answering (Augment) with citations

In [39]:
# 7.2 Install + set up Gemini
!pip -q install -U google-generativeai

In [48]:
from google.colab import userdata
import google.generativeai as genai

GOOGLE_API_KEY = userdata.get("Gemini")

if not GOOGLE_API_KEY:
    raise ValueError("GEMINI_API_KEY not found in Colab Secrets")

genai.configure(api_key=GOOGLE_API_KEY)

model_llm = genai.GenerativeModel("gemini-2.5-flash")
print("Gemini ready ✅")



Gemini ready ✅


In [49]:
# 7.3 Build the “grounded prompt”
def build_context(passages):
    lines = []
    for p in passages:
        # keep context concise; we don't need the full chunk always
        lines.append(f'{p["citation"]} {p["text"]}')
    return "\n\n".join(lines)

SYSTEM_RULES = """You are a careful assistant.
Use ONLY the provided context passages to answer.
If the answer is not supported by the context, say: "I couldn't find that in the provided documents."
Always include citations like [doc_id#chunk_id] next to each claim.
Do NOT invent citations."""

In [63]:
def rag_answer(question: str, k: int = 5, min_score: float = 0.35):
    passages = retrieve_with_threshold(question, k=k, min_score=min_score)

    # HARD GUARDRAIL: no evidence => no LLM call
    if not passages:
        return {
            "answer": "I couldn't find that in the provided documents.",
            "citations": [],
            "passages": []
        }

    context = build_context(passages)

    # Define a strict citation format the model must follow
    # (Assumes each passage has p["citation"] like "rfc2616#32")
    prompt = f"""{SYSTEM_RULES}

You MUST answer using ONLY the context passages below.
If the answer is not contained in the context, say:
"I couldn't find that in the provided documents."

Cite sources inline using the provided citation IDs in square brackets.
Example: ... [rfc2616#32]

Context passages:
{context}

Question:
{question}

Write a short answer (3–6 sentences max)."""

    resp = model_llm.generate_content(prompt)

    # Robust extraction
    answer = ""
    if resp is not None:
        answer = getattr(resp, "text", "") or ""
    answer = answer.strip()

    # Fallback if the model returns nothing
    if not answer:
        answer = "I couldn't generate an answer. Please try again."

    return {
        "answer": answer,
        "citations": [p["citation"] for p in passages],
        "passages": passages
    }

In [64]:
# 7.5 Test on 3 queries
print(rag_answer("What is HTTP status code 200?", k=5, min_score=0.35)["answer"])

The 200 OK status code indicates that the request has succeeded [rfc2616#44]. The information returned with the response depends on the method used in the request [rfc2616#44]. For a GET request, an entity corresponding to the requested resource is sent [rfc2616#44]. A HEAD request returns only the entity-header fields without a message-body, while a POST request provides an entity describing the action's result [rfc2616#44]. For a TRACE request, an entity containing the received request message is sent [rfc2616#44].


In [65]:
print(rag_answer("What does the Apache License require when distributing software?", k=5, min_score=0.35)["answer"])

When distributing software under the Apache License, Version 2.0, you may reproduce and distribute copies of the Work or Derivative Works in any medium, with or without modifications, and in Source or Object form [apache_license_2.0#1]. However, you must provide recipients with a copy of the License [apache_license_2.0#1]. Any modified files must carry prominent notices stating that you changed the files [apache_license_2.0#1]. You must also retain all copyright, patent, trademark, and attribution notices from the Source form of the Work in the Source form of any Derivative Works that you distribute [apache_license_2.0#1]. If the Work includes a "NOTICE" text file, a readable copy of its attribution notices must be included in distributed Derivative Works in one of the specified locations [apache_license_2.0#1].


In [66]:
print(rag_answer("Who is the Time Traveller talking to at the beginning?", k=5, min_score=0.35)["answer"])

At the beginning of the interaction, the Time Traveller is speaking to a group of people, including the Editor and the narrator ("I") [time_machine#10]. Other individuals present in the room are the Silent Man, the Journalist, and the Medical Man [time_machine#10]. He later acknowledges three "new guests" named Blank, Dash, and Chose [time_machine#10].


In [67]:
# 7.6 Hallucination test
print(rag_answer("What is the capital of Mongolia?", k=5, min_score=0.35)["answer"])

I couldn't find that in the provided documents.


In [68]:
# 7.7
llm_model_name = "gemini-2.5-flash"

In [69]:
manifest_path = Path("/content/indexes/manifest.json")

with open(manifest_path, "r") as f:
    manifest = json.load(f)

manifest

{'embed_model': 'sentence-transformers/all-MiniLM-L6-v2',
 'index_type': 'IndexFlatIP',
 'normalized_embeddings': True,
 'chunking': {'target_tokens': 700, 'overlap_tokens': 120},
 'count_chunks': 214,
 'llm_model': 'gemini-2.5-flash',
 'rag_strategy': 'top-k cosine + min_score gating'}

In [70]:
manifest["llm_model"] = llm_model_name

In [71]:
manifest["rag_strategy"] = "top-k cosine + min_score gating"

In [72]:
with open(manifest_path, "w") as f:
    json.dump(manifest, f, indent=2)

print("manifest.json updated ✅")

manifest.json updated ✅


In [73]:
with open(manifest_path) as f:
    print(f.read())

{
  "embed_model": "sentence-transformers/all-MiniLM-L6-v2",
  "index_type": "IndexFlatIP",
  "normalized_embeddings": true,
  "chunking": {
    "target_tokens": 700,
    "overlap_tokens": 120
  },
  "count_chunks": 214,
  "llm_model": "gemini-2.5-flash",
  "rag_strategy": "top-k cosine + min_score gating"
}
