<a href="https://colab.research.google.com/github/krotov79/05_rag_document_agent/blob/main/notebooks/01_build_faiss_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip -q install sentence-transformers faiss-cpu numpy pandas

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m104.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import torch
import faiss
from sentence_transformers import SentenceTransformer

print("torch:", torch.__version__)
print("faiss:", faiss.__version__)
print("cuda available:", torch.cuda.is_available())

torch: 2.9.0+cu126
faiss: 1.13.2
cuda available: True


In [4]:
from pathlib import Path

RAW_DIR = Path("data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

print("Raw dir:", RAW_DIR.resolve())
print("Files:", list(RAW_DIR.glob("*")))

Raw dir: /content/data/raw
Files: []


In [5]:
import requests

url = "https://www.rfc-editor.org/rfc/rfc2616.txt"
out = RAW_DIR / "rfc2616.txt"

resp = requests.get(url)
resp.raise_for_status()
out.write_text(resp.text, encoding="utf-8")

print("Saved:", out, "chars:", len(resp.text))


Saved: data/raw/rfc2616.txt chars: 422279


In [6]:
url = "https://www.gutenberg.org/cache/epub/35/pg35.txt"
out = RAW_DIR / "time_machine.txt"

resp = requests.get(url)
resp.raise_for_status()
out.write_text(resp.text, encoding="utf-8")

print("Saved:", out, "chars:", len(resp.text))

Saved: data/raw/time_machine.txt chars: 202461


In [7]:
url = "https://www.apache.org/licenses/LICENSE-2.0.txt"
out = RAW_DIR / "apache_license_2.0.txt"

resp = requests.get(url)
resp.raise_for_status()
out.write_text(resp.text, encoding="utf-8")

print("Saved:", out, "chars:", len(resp.text))

Saved: data/raw/apache_license_2.0.txt chars: 11358


In [8]:
def load_text_files(folder: Path):
    docs = []
    for p in sorted(folder.glob("*.txt")):
        text = p.read_text(encoding="utf-8", errors="ignore")
        docs.append({
            "id": p.stem,
            "path": str(p),
            "chars": len(text),
            "preview": text[:200].replace("\n", " ")
        })
    return docs

docs = load_text_files(RAW_DIR)
docs

[{'id': 'apache_license_2.0',
  'path': 'data/raw/apache_license_2.0.txt',
  'chars': 11358,
  'preview': '                                  Apache License                            Version 2.0, January 2004                         http://www.apache.org/licenses/     TERMS AND CONDITIONS FOR USE, REPRODUC'},
 {'id': 'rfc2616',
  'path': 'data/raw/rfc2616.txt',
  'chars': 422279,
  'preview': '      Network Working Group                                      R. Fielding Request for Comments: 2616                                   UC Irvine Obsoletes: 2068                                     '},
 {'id': 'time_machine',
  'path': 'data/raw/time_machine.txt',
  'chars': 198906,
  'preview': '\ufeffThe Project Gutenberg eBook of The Time Machine      This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions wha'}]