# RAG baseline notebook

Краткий запуск:
1) Создай venv и установи зависимости из requirements.txt
2) Скопируй .env.example -> .env и заполни ключи/тарифы при необходимости
3) Restart & Run All

Настройки:
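- Значения по умолчанию заданы в ячейке Config (book_id, top_k, retrieval_mode, флаги rebuild_*); каждое из них переопределяется одноимённой переменной окружения (BOOK_ID, TOP_K, RETRIEVAL_MODE, REBUILD_PAGES, REBUILD_INDEXES), например через .env
- Модели/ключи и тарифы берутся из .env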


## Обоснования выбора

- Провайдер и модели: использован OpenAI-compatible SDK, чтобы при необходимости менять провайдера через `OPENAI_BASE_URL` без правки кода; модели задаются в `.env`, так как это быстрее всего для подбора баланса качества и стоимости.
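- Векторный движок: по умолчанию используется in-memory numpy — для прототипа это минимальные зависимости, прозрачная математика и быстрый старт; Qdrant уже поддержан и включается через `VECTOR_BACKEND=qdrant` (адрес задаётся в `QDRANT_URL`); другие альтернативы для продакшена: FAISS, Milvus, Weaviate, pgvector.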
- Page-wise splitting: 1 страница = 1 чанк соответствует ТЗ и метрикам (gold=page), упрощает ссылки на источники.


In [1]:
from dataclasses import dataclass
from pathlib import Path
import os
import json

from dotenv import load_dotenv

# ".." is resolved against the current working directory, so the project
# root is taken to be the parent of wherever the notebook kernel runs.
PROJECT_DIR = Path("..").resolve()
ENV_PATH = PROJECT_DIR.joinpath(".env")

# The .env file is optional: it is loaded only when it is present.
if ENV_PATH.exists():
    load_dotenv(ENV_PATH)

@dataclass(frozen=True)
class Config:
    """Immutable settings for the RAG baseline notebook.

    Every default is computed once, while the class body executes: values are
    read from environment variables via ``os.getenv`` and the demo questions
    file is read from disk, so a missing or malformed demo file fails at
    class-definition time.

    All directories are derived from a single project root (``project_dir``),
    detected by walking up from the current working directory, so the data,
    index, artifact and eval paths always agree with each other regardless of
    whether the kernel was started from the repo root or from a subdirectory.
    """

    # -- helpers -----------------------------------------------------------
    # They are called as plain functions while the class body executes and
    # are wrapped in ``staticmethod`` further down so that they also work when
    # accessed through the class or an instance.

    def load_questions(path: Path) -> list[dict]:
        """Read a questions JSON file and return its list of records.

        The file may hold either a top-level list or an object with a
        ``"questions"`` key that holds the list.

        Raises:
            ValueError: if the decoded payload is not a list.
        """
        payload = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(payload, dict) and "questions" in payload:
            payload = payload["questions"]
        if not isinstance(payload, list):
            raise ValueError(f"Invalid questions format: {path}")
        return payload

    def find_project_root(start: Path) -> Path:
        """Return the nearest directory at or above *start* holding a root marker.

        At most 10 directories are inspected, beginning with *start* itself.
        When none of them contains ``pyproject.toml``, ``requirements.txt`` or
        ``.git``, the resolved *start* is returned.
        """
        origin = start.resolve()
        markers = ("pyproject.toml", "requirements.txt", ".git")
        for candidate in (origin, *origin.parents)[:10]:
            if any((candidate / marker).exists() for marker in markers):
                return candidate
        return origin

    seed: int = int(os.getenv("SEED", "42"))

    # paths (single source of truth: the detected project root)
    project_dir: Path = find_project_root(Path.cwd())
    data_dir: Path = project_dir / "data"
    indexes_dir: Path = project_dir / "indexes"
    artifacts_dir: Path = project_dir / "artifacts"

    # book/page corpus
    book_id: str = os.getenv("BOOK_ID", "devops_handbook")
    # Only the literal "true" (case-insensitive) enables a flag.
    rebuild_pages: bool = os.getenv("REBUILD_PAGES", "true").lower() == "true"
    rebuild_indexes: bool = os.getenv("REBUILD_INDEXES", "true").lower() == "true"

    # vector backend
    vector_backend: str = os.getenv("VECTOR_BACKEND", "numpy").strip().lower()
    qdrant_url: str = os.getenv("QDRANT_URL", "http://localhost:6333").strip()
    qdrant_collection: str = os.getenv("QDRANT_COLLECTION", "rag").strip()
    rebuild_qdrant: bool = os.getenv("REBUILD_QDRANT", "false").lower() == "true"
    recreate_qdrant_collection: bool = os.getenv("RECREATE_QDRANT_COLLECTION", "false").lower() == "true"

    # retrieval params
    retrieval_mode: str = os.getenv("RETRIEVAL_MODE", "hybrid")  # bm25 | vector | hybrid
    top_k: int = int(os.getenv("TOP_K", "5"))
    vector_top_k: int = int(os.getenv("VECTOR_TOP_K", "5"))
    dense_top_k_subchunks: int = int(os.getenv("DENSE_TOP_K_SUBCHUNKS", "50"))
    subchunk_max_chars: int = int(os.getenv("SUBCHUNK_MAX_CHARS", "2200"))
    subchunk_overlap: int = int(os.getenv("SUBCHUNK_OVERLAP", "200"))
    rrf_k: int = int(os.getenv("RRF_K", "60"))
    max_context_chars: int = int(os.getenv("MAX_CONTEXT_CHARS", "6000"))

    # embeddings / LLM
    embed_batch_size: int = int(os.getenv("EMBED_BATCH_SIZE", "64"))
    embedding_model: str = os.getenv("EMBEDDING_MODEL", "")
    chat_model: str = os.getenv("CHAT_MODEL", "")
    openai_base_url: str = os.getenv("OPENAI_BASE_URL", "").strip()
    llm_temperature: float = float(os.getenv("LLM_TEMPERATURE", "0.0"))
    llm_max_tokens: int = int(os.getenv("LLM_MAX_TOKENS", "600"))

    # evaluation
    eval_ks: tuple[int, ...] = (3, 5)

    # demo
    eval_dir: Path = project_dir / "eval"
    demo_questions_path: Path = Path(os.getenv("DEMO_QUESTIONS_PATH", str(eval_dir / "demo_questions.json")))
    demo_n: int = int(os.getenv("DEMO_N", "10"))

    # Unannotated, so it is a plain class attribute and not a dataclass field.
    _demo = load_questions(demo_questions_path)
    demo_queries: tuple[str, ...] = tuple(q["question"] for q in _demo[:demo_n])
    ask_questions: tuple[str, ...] = demo_queries[:3]

    demo_snippet_chars: int = int(os.getenv("DEMO_SNIPPET_CHARS", "180"))

    # Wrapped only after their last use above: a bare ``staticmethod`` object
    # is not callable inside the class body on older Python versions.
    load_questions = staticmethod(load_questions)
    find_project_root = staticmethod(find_project_root)

    @property
    def book_dir(self) -> Path:
        """Directory of the configured book: ``data_dir/books/<book_id>``."""
        return self.data_dir / "books" / self.book_id

    @property
    def pages_dir(self) -> Path:
        """Directory holding the per-page files of the book."""
        return self.book_dir / "pages"

    @property
    def book_txt_path(self) -> Path:
        """Path of the plain-text source of the book."""
        return self.book_dir / "book.txt"

    @property
    def book_md_path(self) -> Path:
        """Path of the markdown source of the book."""
        return self.book_dir / "book.md"

    @property
    def pages_csv_path(self) -> Path:
        """Path of the exported pages table."""
        return self.artifacts_dir / "pages.csv"

    @property
    def bm25_index_path(self) -> Path:
        """Path of the persisted BM25 index."""
        return self.indexes_dir / "bm25.pkl"

    @property
    def vector_emb_path(self) -> Path:
        """Path of the persisted embedding matrix (numpy backend)."""
        return self.indexes_dir / "vector_embeddings.npy"

    @property
    def vector_meta_path(self) -> Path:
        """Path of the metadata stored next to the embedding matrix."""
        return self.indexes_dir / "vector_meta.json"

    @property
    def qdrant_meta_path(self) -> Path:
        """Path of the per-book Qdrant metadata file."""
        return self.indexes_dir / f"qdrant_{self.book_id}_meta.json"

    @property
    def eval_questions_path(self) -> Path:
        """Path of the evaluation question set."""
        return self.project_dir / "eval" / "questions.json"

    @property
    def eval_out_path(self) -> Path:
        """Path of the retrieval evaluation report."""
        return self.artifacts_dir / "retrieval_eval.csv"


# Build the settings object from the environment-derived defaults.
# The bare name below is an expression statement; Jupyter renders it only
# when it is the last statement of a cell.
cfg = Config()
cfg

import sys
from pathlib import Path

# Start from the kernel's working directory.
ROOT = Path.cwd().resolve()

# If Jupyter was launched from notebooks/, step one level up.
if (ROOT.parent / "src").exists() and not (ROOT / "src").exists():
    ROOT = ROOT.parent

assert ROOT.joinpath("src").exists(), f"Не найден src/ в {ROOT} или {ROOT.parent}"

# Put the directory that contains src/ first on the import path so that
# `from src... import ...` works in the following cells.
sys.path.insert(0, str(ROOT))
print("PYTHONPATH root:", ROOT)


PYTHONPATH root: D:\VS code\Rag_assistant


In [None]:
# The API key is read straight from the environment; the remaining values
# are taken from the already-built cfg.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
OPENAI_BASE_URL, CHAT_MODEL, EMBEDDING_MODEL = (
    cfg.openai_base_url,
    cfg.chat_model,
    cfg.embedding_model,
)

VECTOR_BACKEND = (cfg.vector_backend or "numpy").lower()
QDRANT_URL, QDRANT_COLLECTION = cfg.qdrant_url, cfg.qdrant_collection

# A capability is on only when both the API key and its model name are set.
embeddings_enabled = bool(OPENAI_API_KEY) and bool(EMBEDDING_MODEL)
chat_enabled = bool(OPENAI_API_KEY) and bool(CHAT_MODEL)
llm_enabled = embeddings_enabled or chat_enabled

# Summary of the effective settings, one "label: value" line each.
for _label, _value in (
    ("embeddings_enabled:", embeddings_enabled),
    ("chat_enabled:", chat_enabled),
    ("CHAT_MODEL:", CHAT_MODEL),
    ("EMBEDDING_MODEL:", EMBEDDING_MODEL),
    ("OPENAI_BASE_URL:", OPENAI_BASE_URL or "(default)"),
    ("VECTOR_BACKEND:", VECTOR_BACKEND),
    ("QDRANT_URL:", QDRANT_URL or "(not set)"),
    ("QDRANT_COLLECTION:", QDRANT_COLLECTION),
    ("SUBCHUNK_MAX_CHARS:", cfg.subchunk_max_chars),
    ("SUBCHUNK_OVERLAP:", cfg.subchunk_overlap),
    ("DENSE_TOP_K_SUBCHUNKS:", cfg.dense_top_k_subchunks),
):
    print(_label, _value)


In [None]:
# Create the working directories up front; existing ones are left untouched.
for _directory in (cfg.data_dir, cfg.indexes_dir, cfg.artifacts_dir, cfg.pages_dir):
    _directory.mkdir(parents=True, exist_ok=True)

# Report where everything lives and confirm that each directory exists.
print(f"project_dir: {cfg.project_dir}")
for _name in ("data_dir", "pages_dir", "indexes_dir", "artifacts_dir"):
    _path = getattr(cfg, _name)
    print(f"{_name}: {_path} exists: {_path.exists()}")

In [None]:
import os, sys
from pathlib import Path

# Environment of the running kernel.
print(f"CWD: {os.getcwd()}")
print(f"Python: {sys.executable}")
print(f"Project root guess: {Path.cwd().resolve()}")
print(f"REPO_ROOT: {ROOT}")
print(f"src exists: {(ROOT / 'src').exists()}")

# Where the book source and its pages are expected.
print(f"cfg.book_txt_path: {cfg.book_txt_path}")
print(f"exists: {cfg.book_txt_path.exists()}")
print(f"absolute: {cfg.book_txt_path.resolve()}")
print(f"cfg.book_dir: {cfg.book_dir} exists: {cfg.book_dir.exists()}")
print(f"cfg.pages_dir: {cfg.pages_dir} exists: {cfg.pages_dir.exists()}")

# Cross-check the same path through os.path instead of pathlib.
print(f"os.path.exists: {os.path.exists(str(cfg.book_txt_path))}")

from src.chunking_pages import split_text_by_page_markers, write_pages

# Split the book into per-page files unless pages already exist and no
# rebuild was requested.
page_files = sorted(cfg.pages_dir.glob("page_*.txt"))

if page_files and not cfg.rebuild_pages:
    print(f"Pages already exist: {len(page_files)} files. Set cfg.rebuild_pages=True to rebuild.")
else:
    # Locate and parse the source BEFORE touching the existing page files:
    # a missing book or a parse that yields nothing must not destroy the
    # pages produced by a previous successful run.
    book_path = next(
        (candidate for candidate in (cfg.book_txt_path, cfg.book_md_path) if candidate.exists()),
        None,
    )
    if book_path is None:
        raise FileNotFoundError(
            f"Book not found: expected {cfg.book_txt_path} or {cfg.book_md_path}."
        )

    # Undecodable bytes are dropped rather than failing the whole read.
    text = book_path.read_text(encoding="utf-8", errors="ignore")
    pages = split_text_by_page_markers(text)
    page_numbers = sorted({p for (p, _) in pages})
    if not page_numbers:
        # Previously this surfaced as an IndexError on page_numbers[0].
        raise ValueError(f"No pages parsed from {book_path}: no page markers found.")

    print("pages count:", len(page_numbers))
    print("min page:", page_numbers[0], "max page:", page_numbers[-1])
    print("first 20:", page_numbers[:20])
    print("last 20:", page_numbers[-20:])

    # The new pages are known to be good: only now drop the stale files.
    # (In this branch a non-empty page_files implies rebuild_pages is set.)
    for fp in page_files:
        fp.unlink()

    write_pages(pages, cfg.pages_dir)
    print(f"Written {len(pages)} pages to {cfg.pages_dir}")


In [None]:
import pandas as pd
import numpy as np
from src.chunking_pages import load_page_chunks

# Load the page chunks and mirror them into DataFrames.
chunks = load_page_chunks(cfg.book_id, cfg.pages_dir)
chunks_df = pd.DataFrame([vars(c) for c in chunks])
pages_df = chunks_df[["page", "text"]].copy()

# Character length of every page text (non-string values are stringified).
lengths = [len(str(t)) for t in chunks_df["text"]]
if lengths:
    arr = np.asarray(lengths)
    print("page text lengths (chars):")
    print("  min:", int(arr.min()))
    for _q in (50, 90, 99):
        print(f"  p{_q}:", int(np.percentile(arr, _q)))
    print("  max:", int(arr.max()))
    print("top 10 lengths:", sorted(lengths, reverse=True)[:10])

    # Histogram over inclusive [lo, hi] buckets built from consecutive bin
    # edges; lengths at or above the last edge fall into no bucket.
    bins = [0, 500, 1000, 1500, 2000, 2500, 3000, 4000, 5000, 8000, 12000, 20000]
    print("length histogram (chars):")
    for lo, _next_edge in zip(bins, bins[1:]):
        hi = _next_edge - 1
        count = int(np.count_nonzero((arr >= lo) & (arr <= hi)))
        print(f"  {lo}-{hi}: {count}")

(chunks_df.head(), len(chunks_df))


In [None]:
# Persist the page/text table without the DataFrame index column.
pages_csv = cfg.pages_csv_path
pages_df.to_csv(path_or_buf=pages_csv, index=False)
print(f"saved: {pages_csv}")


In [None]:
# From here on `chunks` is a list of plain dicts, one per DataFrame row.
chunks = chunks_df.to_dict("records")
(chunks_df.head(), len(chunks_df))

In [None]:
from src.retrievers.bm25 import build_bm25_index, load_bm25, save_bm25

# Build and persist the BM25 index when a rebuild is requested or nothing
# has been saved yet; otherwise reuse the stored one.
bm25_path = cfg.bm25_index_path
if cfg.rebuild_indexes or not bm25_path.exists():
    bm25_index = build_bm25_index(chunks)
    save_bm25(bm25_index, bm25_path)
    print("saved:", bm25_path)
else:
    bm25_index = load_bm25(bm25_path)
    print("loaded:", bm25_path)


In [None]:
from src.embeddings import EmbeddingConfig

# Prepare the dense (vector) index for the configured backend.
# Outcome for the following cells: `vector_enabled` tells whether a usable
# index exists, `vector_index` holds it, and `emb_cfg` holds the embedding
# client settings (None when embeddings are disabled).
VECTOR_BACKEND = (cfg.vector_backend or "numpy").lower()
# Any backend name other than the two supported ones degrades to numpy.
if VECTOR_BACKEND not in ("numpy", "qdrant"):
    print(f"Unknown VECTOR_BACKEND '{VECTOR_BACKEND}', falling back to numpy.")
    VECTOR_BACKEND = "numpy"
print("VECTOR_BACKEND:", VECTOR_BACKEND)

# Pessimistic defaults: they stay in place on every skip/failure path below.
vector_enabled = False
vector_index = None
emb_cfg = None

if not embeddings_enabled:
    print("Vector index skipped: embeddings_enabled=False (no API key / embedding model).")
else:
    # An empty base URL is passed as None.
    emb_cfg = EmbeddingConfig(
        api_key=OPENAI_API_KEY,
        base_url=OPENAI_BASE_URL or None,
        model=EMBEDDING_MODEL,
        batch_size=cfg.embed_batch_size,
    )

    if VECTOR_BACKEND == "qdrant":
        # Imported lazily so the numpy path does not need these modules.
        from src.retrievers.vector_qdrant import build_qdrant_index, load_qdrant_meta, QdrantIndex
        from src.vectorstores.qdrant_store import QdrantVectorStore, get_client

        qdrant_url = cfg.qdrant_url
        base_collection = cfg.qdrant_collection
        meta_path = cfg.qdrant_meta_path
        # The global rebuild flag forces a Qdrant rebuild as well.
        rebuild_qdrant = cfg.rebuild_qdrant or cfg.rebuild_indexes
        recreate_qdrant_collection = cfg.recreate_qdrant_collection

        print("QDRANT_URL:", qdrant_url)
        print("QDRANT_COLLECTION_BASE:", base_collection)
        print("qdrant_meta_path:", meta_path)

        if not qdrant_url:
            print("Qdrant URL is empty, skipping vector/hybrid.")
        else:
            # Connectivity probe: any failure disables vector/hybrid modes
            # instead of aborting the notebook.
            try:
                client = get_client(qdrant_url)
                client.get_collections()
                qdrant_ready = True
            except Exception as exc:
                print("Qdrant unavailable, skipping vector/hybrid:", exc)
                qdrant_ready = False

            if qdrant_ready:
                # Page-level lookups handed to QdrantIndex when an existing
                # collection is reused; the chunk id defaults to "p<page>".
                page_text_by_page = {int(ch["page"]): str(ch.get("text") or "") for ch in chunks}
                page_chunk_id_by_page = {int(ch["page"]): str(ch.get("chunk_id", f"p{ch['page']}")) for ch in chunks}

                # Reuse path: considered only when saved metadata exists and
                # neither a rebuild nor a collection re-creation was requested.
                if meta_path.exists() and not rebuild_qdrant and not recreate_qdrant_collection:
                    meta = load_qdrant_meta(meta_path)
                    collection_name = meta.get("collection")
                    print("loaded:", meta_path)
                    print("QDRANT_COLLECTION:", collection_name)

                    # The saved index must match the current embedding model...
                    if meta.get("embedding_model") != EMBEDDING_MODEL:
                        print("Qdrant meta embedding_model mismatch; rebuilding.")
                        rebuild_qdrant = True

                    # ...and the current sub-chunking parameters...
                    params = meta.get("subchunk_params", {})
                    if params.get("max_chars") != cfg.subchunk_max_chars or params.get("overlap") != cfg.subchunk_overlap:
                        print("Qdrant meta subchunk params mismatch; rebuilding.")
                        rebuild_qdrant = True

                    # ...and the collection itself must still exist on the server.
                    if not rebuild_qdrant and collection_name:
                        if not client.collection_exists(collection_name):
                            print("Qdrant collection missing; rebuilding.")
                            rebuild_qdrant = True

                    if not rebuild_qdrant and collection_name:
                        print("Using existing Qdrant collection.")
                        store = QdrantVectorStore(client=client, collection=collection_name)
                        vector_index = QdrantIndex(
                            store=store,
                            emb_cfg=emb_cfg,
                            page_text_by_page=page_text_by_page,
                            page_chunk_id_by_page=page_chunk_id_by_page,
                            top_k_subchunks=cfg.dense_top_k_subchunks,
                            subchunk_max_chars=cfg.subchunk_max_chars,
                            subchunk_overlap=cfg.subchunk_overlap,
                            snippet_chars=cfg.demo_snippet_chars,
                        )
                        vector_enabled = True

                # Build path: reached whenever the reuse path above did not
                # produce an index (no metadata, a forced rebuild, a mismatch,
                # or a missing collection / collection name).
                if not vector_enabled:
                    try:
                        print("Building Qdrant index...")
                        vector_index, meta = build_qdrant_index(
                            client=client,
                            base_collection=base_collection,
                            book_id=cfg.book_id,
                            pages=chunks,
                            emb_cfg=emb_cfg,
                            subchunk_max_chars=cfg.subchunk_max_chars,
                            subchunk_overlap=cfg.subchunk_overlap,
                            recreate_collection=recreate_qdrant_collection,
                            batch_size=cfg.embed_batch_size,
                            snippet_chars=cfg.demo_snippet_chars,
                            top_k_subchunks=cfg.dense_top_k_subchunks,
                            meta_path=meta_path,
                        )
                        print("QDRANT_COLLECTION:", meta.get("collection"))
                        print("saved:", meta_path)
                        vector_enabled = True
                    except Exception as exc:
                        # Best effort: a failed build only disables vector/hybrid.
                        print("Qdrant index build failed, skipping vector/hybrid:", exc)
                        vector_enabled = False

    else:
        # numpy backend: the index lives in two files (embeddings + metadata).
        from src.retrievers.vector_numpy import build_vector_index, load_vector_index, save_vector_index

        vec_emb_path = cfg.vector_emb_path
        vec_meta_path = cfg.vector_meta_path

        # NOTE(review): unlike the Qdrant path, no embedding-model or sub-chunk
        # parameter check is visible here before a saved index is reused;
        # confirm whether load_vector_index validates this itself.
        if vec_emb_path.exists() and vec_meta_path.exists() and not cfg.rebuild_indexes:
            vector_index = load_vector_index(vec_emb_path, vec_meta_path)
            print("loaded:", vec_emb_path)
            print("loaded:", vec_meta_path)
        else:
            vector_index = build_vector_index(
                chunks,
                emb_cfg,
                subchunk_max_chars=cfg.subchunk_max_chars,
                subchunk_overlap=cfg.subchunk_overlap,
                snippet_chars=cfg.demo_snippet_chars,
            )
            save_vector_index(vector_index, vec_emb_path, vec_meta_path)
            print("saved:", vec_emb_path)
            print("saved:", vec_meta_path)

        # Errors on this path are not caught, so reaching this line means the
        # load or build succeeded.
        vector_enabled = True


In [10]:
def print_hits(hits, max_chars: int) -> None:
    """Print one summary line per hit: page, score and a text snippet.

    The snippet is the hit's ``"text"`` with every whitespace run collapsed
    to a single space. A missing or empty text yields an empty snippet.
    Snippets longer than *max_chars* are cut to that length, stripped of
    trailing whitespace and suffixed with ``"..."``. Scores are printed with
    four decimals.
    """
    for hit in hits:
        raw_text = hit.get("text") or ""
        collapsed = " ".join(raw_text.split())
        too_long = len(collapsed) > max_chars
        shown = f"{collapsed[:max_chars].rstrip()}..." if too_long else collapsed
        print(f"- page={hit['page']} score={hit['score']:.4f} | {shown}")


In [11]:
import json
import pandas as pd
from src.retrieval_pipeline import retrieve, build_context
from src.eval import evaluate_questions

# bm25_index already built earlier
# vector_index and emb_cfg exist only if vector_enabled=True

def run_retrieve(query: str, mode: str):
    """Run `retrieve` for *query* in *mode* using the notebook-wide settings.

    The dense index and its embedding config are handed over only while
    ``vector_enabled`` is true; otherwise both are passed as ``None``.
    Returns whatever `retrieve` returns.
    """
    dense_index, dense_cfg = (vector_index, emb_cfg) if vector_enabled else (None, None)
    shared_settings = dict(
        top_k=cfg.top_k,
        bm25=bm25_index,
        vector=dense_index,
        emb_cfg=dense_cfg,
        vector_top_k=cfg.vector_top_k,
        vector_subchunk_k=cfg.dense_top_k_subchunks,
        rrf_k=cfg.rrf_k,
    )
    return retrieve(query=query, mode=mode, **shared_settings)


In [None]:
# BM25 is always demonstrated; the dense modes need a working vector index.
demo_modes = ["bm25", "vector", "hybrid"] if vector_enabled else ["bm25"]

for mode in demo_modes:
    print(f"\nMODE: {mode}")
    for q in cfg.demo_queries:
        hits = run_retrieve(q, mode)
        print(f"\nQUERY: {q}")
        print_hits(hits, cfg.demo_snippet_chars)


In [None]:
import os
from src.cost import Pricing, count_tokens, print_cost

def _f(x: str, default: float = 0.0) -> float:
    try:
        return float(x)
    except Exception:
        return default

# Prices come from the environment; unset or unparsable values count as 0.
pricing = Pricing(
    embed_price_per_1k_usd=_f(os.getenv("EMBED_PRICE_PER_1K_USD", "0")),
    chat_in_price_per_1k_usd=_f(os.getenv("CHAT_IN_PRICE_PER_1K_USD", "0")),
    chat_out_price_per_1k_usd=_f(os.getenv("CHAT_OUT_PRICE_PER_1K_USD", "0")),
)

# The token total is computed over the page texts in chunks_df; without an
# embedding model name there is nothing to count against.
if EMBEDDING_MODEL:
    embedding_tokens_total = sum(
        count_tokens(page_text, EMBEDDING_MODEL) for page_text in chunks_df["text"].tolist()
    )
    print_cost(embedding_tokens=embedding_tokens_total, pricing=pricing)
else:
    embedding_tokens_total = None
    print("Embedding cost skipped: EMBEDDING_MODEL is empty.")


In [14]:
# System prompt for answer generation (Russian). It restricts the assistant
# to the supplied page excerpts, prescribes a fixed reply when the context
# holds no answer, requires page citations in the form "стр. N" and forbids
# invented facts. Callers in this notebook apply .strip() before sending it.
SYSTEM_PROMPT = """
Ты — вопрос-ответный ассистент по одной книге.
Правила:
1) Отвечай ТОЛЬКО на основе предоставленного КОНТЕКСТА (выдержки со страниц).
2) Если в контексте нет ответа — скажи: "В предоставленном контексте ответа нет".
3) Всегда указывай ссылки на страницы в формате "стр. N".
4) Не выдумывай факты, определения, команды и численные значения.
Тон: нейтральный, технический, краткий.
"""


In [None]:
from src.retrieval_pipeline import build_context
from src.llm import LLMConfig, generate_answer

# Dense modes need a working vector index; otherwise degrade to BM25.
ask_mode = cfg.retrieval_mode
if not vector_enabled and ask_mode in ("vector", "hybrid"):
    print(f"Ask RAG mode '{ask_mode}' skipped: vector backend unavailable. Falling back to bm25.")
    ask_mode = "bm25"

# Collected per question, in order, for the cost cell that follows.
rag_contexts = []
rag_usages = []

# The chat client settings exist only when generation is possible.
llm_cfg = (
    LLMConfig(
        api_key=OPENAI_API_KEY,
        base_url=OPENAI_BASE_URL or None,
        model=CHAT_MODEL,
        temperature=cfg.llm_temperature,
        max_tokens=cfg.llm_max_tokens,
    )
    if chat_enabled
    else None
)

for question in cfg.ask_questions:
    print(f"\nQUESTION: {question}")
    hits = run_retrieve(question, ask_mode)
    print_hits(hits, cfg.demo_snippet_chars)
    context = build_context(hits, max_chars=cfg.max_context_chars)
    rag_contexts.append(context)

    if chat_enabled:
        answer, usage = generate_answer(
            question=question,
            context=context,
            system_prompt=SYSTEM_PROMPT.strip(),
            cfg=llm_cfg,
        )
        # Keep one usage entry per question; non-dict usage becomes {}.
        rag_usages.append(usage if isinstance(usage, dict) else {})
        print("\nANSWER:\n", answer)
        if usage:
            print("\nUSAGE:\n", usage)
    else:
        # Retrieval and context are still shown; only generation is skipped.
        print("LLM generation skipped: chat_enabled=False (no API key / chat model).")
        rag_usages.append({})


In [None]:
prompt_tokens_total = None
completion_tokens_total = None
usage_available = False

# Preferred source: token counts reported with the answers. Only entries
# carrying both counters are taken into account.
if isinstance(rag_usages, list):
    reported = [
        entry
        for entry in rag_usages
        if isinstance(entry, dict) and "prompt_tokens" in entry and "completion_tokens" in entry
    ]
    usage_available = bool(reported)
    prompt_sum = sum(int(entry.get("prompt_tokens", 0)) for entry in reported)
    completion_sum = sum(int(entry.get("completion_tokens", 0)) for entry in reported)
    if usage_available:
        prompt_tokens_total, completion_tokens_total = prompt_sum, completion_sum

# Fallback: estimate prompt tokens from system prompt + context + question;
# completion tokens are unknown in this case and counted as 0.
if CHAT_MODEL and not usage_available:
    prompt_tokens_total = sum(
        count_tokens("\n".join((SYSTEM_PROMPT.strip(), ctx, q)), CHAT_MODEL)
        for q, ctx in zip(cfg.ask_questions, rag_contexts)
    )
    completion_tokens_total = 0

chat_cost_known = prompt_tokens_total is not None and completion_tokens_total is not None
if chat_cost_known:
    print_cost(
        embedding_tokens=embedding_tokens_total,
        prompt_tokens=prompt_tokens_total,
        completion_tokens=completion_tokens_total,
        pricing=pricing,
    )
else:
    print("Chat cost skipped: CHAT_MODEL is empty or Ask RAG was not run.")
    print_cost(embedding_tokens=embedding_tokens_total, pricing=pricing)


In [None]:
# Load the evaluation set. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open(cfg.eval_questions_path, "r", encoding="utf-8") as questions_file:
    questions = json.load(questions_file)

# Fail fast when a gold page is absent from the parsed corpus; at most the
# first 10 offending page numbers are reported.
page_set = {int(p) for p in chunks_df["page"].tolist()}
missing_pages = sorted({int(q["gold_page"]) for q in questions if int(q["gold_page"]) not in page_set})
if missing_pages:
    raise ValueError(f"gold_page not found in parsed pages: {missing_pages[:10]}")

# Dense modes are evaluated only when the vector index is available.
modes = ["bm25"]
if vector_enabled:
    modes += ["vector", "hybrid"]

rows = evaluate_questions(
    questions=questions,
    run_retrieve=run_retrieve,
    modes=modes,
    ks=list(cfg.eval_ks),
)

eval_df = pd.DataFrame(rows).sort_values(["mode", "k"])
eval_df

# Persist the report without the DataFrame index column.
out_path = cfg.eval_out_path
eval_df.to_csv(out_path, index=False)
print("saved:", out_path)
