# RAG baseline notebook

Краткий запуск:
1) Создай venv и установи зависимости из requirements.txt
2) Скопируй .env.example -> .env и заполни ключи/тарифы при необходимости
3) Restart & Run All

Настройки:
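- Значения по умолчанию заданы в ячейке Config (book_id, top_k, retrieval_mode, флаги rebuild_*); их можно переопределить переменными из .env (BOOK_ID, TOP_K, RETRIEVAL_MODE, REBUILD_PAGES, REBUILD_INDEXES и т.д.)
- Модели, ключи и тарифы берутся только из .env; после изменения .env перезапусти ядро (Restart & Run All)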


In [None]:
from dataclasses import dataclass
from pathlib import Path
import os

from dotenv import load_dotenv

# PROJECT_DIR is the parent of the current working directory, resolved to an
# absolute path; every other path in the notebook is derived from it.
PROJECT_DIR = Path(os.pardir).resolve()
ENV_PATH = PROJECT_DIR.joinpath(".env")

# The .env file is optional: without it only the process environment is used.
if ENV_PATH.exists():
    load_dotenv(ENV_PATH)

# Spellings accepted as "on" for boolean environment flags.
_TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"})


def _env_str(name: str, default: str) -> str:
    """Return env var *name* stripped of whitespace, or *default* if unset/blank."""
    raw = os.getenv(name)
    if raw is None:
        return default
    raw = raw.strip()
    return raw if raw else default


def _env_int(name: str, default: int) -> int:
    """Return env var *name* as an int, or *default* if unset/blank.

    Raises:
        ValueError: if the variable is set to something that is not an integer.
    """
    raw = _env_str(name, "")
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name}={raw!r} is not a valid integer"
        ) from err


def _env_float(name: str, default: float) -> float:
    """Return env var *name* as a float, or *default* if unset/blank.

    Raises:
        ValueError: if the variable is set to something that is not a number.
    """
    raw = _env_str(name, "")
    if not raw:
        return default
    try:
        return float(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name}={raw!r} is not a valid number"
        ) from err


def _env_bool(name: str, default: bool = False) -> bool:
    """Return env var *name* as a bool, or *default* if unset/blank."""
    raw = _env_str(name, "")
    if not raw:
        return default
    return raw.lower() in _TRUTHY_ENV_VALUES


@dataclass(frozen=True)
class Config:
    """Immutable notebook settings.

    Field defaults are computed from environment variables once, when this
    class body is executed; a variable that is unset or blank (for example a
    ``TOP_K=`` line left empty in ``.env``) falls back to the built-in default
    instead of failing to parse. Individual values can still be overridden by
    passing keyword arguments to ``Config(...)``.
    """

    seed: int = _env_int("SEED", 42)

    # paths
    project_dir: Path = PROJECT_DIR
    data_dir: Path = PROJECT_DIR / "data"
    indexes_dir: Path = PROJECT_DIR / "indexes"
    artifacts_dir: Path = PROJECT_DIR / "artifacts"

    # book/page corpus
    book_id: str = _env_str("BOOK_ID", "devops_handbook")
    rebuild_pages: bool = _env_bool("REBUILD_PAGES")
    rebuild_indexes: bool = _env_bool("REBUILD_INDEXES")

    # retrieval params
    # bm25 | vector | hybrid (lower-cased so "BM25" and "bm25" are equivalent)
    retrieval_mode: str = _env_str("RETRIEVAL_MODE", "bm25").lower()
    top_k: int = _env_int("TOP_K", 5)
    vector_top_k: int = _env_int("VECTOR_TOP_K", 5)
    rrf_k: int = _env_int("RRF_K", 60)
    max_context_chars: int = _env_int("MAX_CONTEXT_CHARS", 6000)

    # embeddings / LLM (an empty model name means "not configured")
    embed_batch_size: int = _env_int("EMBED_BATCH_SIZE", 64)
    embedding_model: str = _env_str("EMBEDDING_MODEL", "")
    chat_model: str = _env_str("CHAT_MODEL", "")
    openai_base_url: str = _env_str("OPENAI_BASE_URL", "")
    llm_temperature: float = _env_float("LLM_TEMPERATURE", 0.0)
    llm_max_tokens: int = _env_int("LLM_MAX_TOKENS", 600)

    # evaluation: cut-off values k passed to the evaluation step
    eval_ks: tuple[int, ...] = (3, 5)

    # demo
    demo_queries: tuple[str, ...] = (
        "что такое RAG",
        "индексация",
        "модель",
    )
    ask_questions: tuple[str, ...] = (
        "Как в книге формулируется цель DevOps и почему она важна?",
        "Какие три пути (The Three Ways) описывает автор и в чем их смысл?",
        "Что такое value stream mapping и для чего он используется?",
    )
    demo_snippet_chars: int = _env_int("DEMO_SNIPPET_CHARS", 180)

    @property
    def book_dir(self) -> Path:
        """Directory of the selected book: ``<data_dir>/books/<book_id>``."""
        return self.data_dir / "books" / self.book_id

    @property
    def pages_dir(self) -> Path:
        """Directory holding the per-page text files of the book."""
        return self.book_dir / "pages"

    @property
    def book_txt_path(self) -> Path:
        """Path of the plain-text version of the book."""
        return self.book_dir / "book.txt"

    @property
    def book_md_path(self) -> Path:
        """Path of the Markdown version of the book."""
        return self.book_dir / "book.md"

    @property
    def pages_csv_path(self) -> Path:
        """Path of the exported page table (CSV) inside ``artifacts_dir``."""
        return self.artifacts_dir / "pages.csv"

    @property
    def bm25_index_path(self) -> Path:
        """Path of the pickled BM25 index inside ``indexes_dir``."""
        return self.indexes_dir / "bm25.pkl"

    @property
    def vector_emb_path(self) -> Path:
        """Path of the stored embedding matrix inside ``indexes_dir``."""
        return self.indexes_dir / "vector_embeddings.npy"

    @property
    def vector_meta_path(self) -> Path:
        """Path of the vector index metadata (JSON) inside ``indexes_dir``."""
        return self.indexes_dir / "vector_meta.json"

    @property
    def eval_questions_path(self) -> Path:
        """Path of the evaluation questions file: ``<project_dir>/eval/questions.json``."""
        return self.project_dir / "eval" / "questions.json"

    @property
    def eval_out_path(self) -> Path:
        """Path of the retrieval evaluation results (CSV) inside ``artifacts_dir``."""
        return self.artifacts_dir / "retrieval_eval.csv"


cfg = Config()
cfg


In [None]:
# The API key is read straight from the environment; the endpoint and model
# names are taken from the already-built cfg.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
OPENAI_BASE_URL, CHAT_MODEL, EMBEDDING_MODEL = (
    cfg.openai_base_url,
    cfg.chat_model,
    cfg.embedding_model,
)

# A feature is enabled only when both the key and its model name are non-empty.
embeddings_enabled = bool(OPENAI_API_KEY) and bool(EMBEDDING_MODEL)
chat_enabled = bool(OPENAI_API_KEY) and bool(CHAT_MODEL)
llm_enabled = chat_enabled or embeddings_enabled

# Summary of the effective settings (the key itself is never printed).
for _label, _value in (
    ("embeddings_enabled", embeddings_enabled),
    ("chat_enabled", chat_enabled),
    ("CHAT_MODEL", CHAT_MODEL),
    ("EMBEDDING_MODEL", EMBEDDING_MODEL),
    ("OPENAI_BASE_URL", OPENAI_BASE_URL or "(default)"),
):
    print(f"{_label}:", _value)


In [None]:
# Create the working directories (parents included); existing ones are kept.
for _directory in (cfg.data_dir, cfg.indexes_dir, cfg.artifacts_dir, cfg.pages_dir):
    _directory.mkdir(parents=True, exist_ok=True)

# Report where everything lives and confirm each directory is present.
print("project_dir:", cfg.project_dir)
for _attr in ("data_dir", "pages_dir", "indexes_dir", "artifacts_dir"):
    _path = getattr(cfg, _attr)
    print(f"{_attr}:", _path, "exists:", _path.exists())

In [ ]:
from src.chunking_pages import split_text_by_page_markers, write_pages

# Split the book into per-page text files, unless pages already exist and a
# rebuild was not requested.
page_files = sorted(cfg.pages_dir.glob("page_*.txt"))

if page_files and not cfg.rebuild_pages:
    # cfg is a frozen dataclass whose rebuild flag is read from REBUILD_PAGES,
    # so the hint points at the environment variable rather than at an
    # attribute assignment (which would raise on a frozen instance).
    print(
        f"Pages already exist: {len(page_files)} files. "
        "Set REBUILD_PAGES=true in .env and re-run the notebook "
        "(Restart & Run All) to rebuild."
    )
else:
    # This branch is reached either with no page files or with
    # cfg.rebuild_pages=True, so any files found here are stale and removed.
    for fp in page_files:
        fp.unlink()

    # Prefer the plain-text book; fall back to the Markdown version.
    if cfg.book_txt_path.exists():
        book_path = cfg.book_txt_path
    elif cfg.book_md_path.exists():
        book_path = cfg.book_md_path
    else:
        raise FileNotFoundError(
            f"Book not found: expected {cfg.book_txt_path} or {cfg.book_md_path}."
        )

    # Undecodable bytes are dropped (errors="ignore") instead of aborting.
    text = book_path.read_text(encoding="utf-8", errors="ignore")
    pages = split_text_by_page_markers(text)
    write_pages(pages, cfg.pages_dir)
    print(f"Written {len(pages)} pages to {cfg.pages_dir}")


In [None]:
import pandas as pd
from src.chunking_pages import load_page_chunks

# Load the page chunks and tabulate their attributes, one row per chunk.
chunks = load_page_chunks(cfg.book_id, cfg.pages_dir)
chunk_records = [vars(chunk) for chunk in chunks]
chunks_df = pd.DataFrame(chunk_records)

# Independent two-column copy used for the CSV export.
pages_df = chunks_df.loc[:, ["page", "text"]].copy()

# Cell output: preview of the table plus its row count.
chunks_df.head(), len(chunks_df)

In [None]:
# Export the page table (columns "page" and "text") to cfg.pages_csv_path,
# without the DataFrame index column.
pages_csv = cfg.pages_csv_path
pages_df.to_csv(pages_csv, index=False)
print("saved:", pages_csv)


In [None]:
# Rebind `chunks` from the loaded chunk objects to a list of plain dicts (one
# per DataFrame row); the index-building cells below receive this list.
chunks = chunks_df.to_dict(orient="records")
# Cell output: preview of the table plus its row count.
chunks_df.head(), len(chunks_df)

In [None]:
from src.retrievers.bm25 import build_bm25_index, save_bm25

# Build the BM25 index over all chunks and save it to cfg.bm25_index_path.
# NOTE(review): cfg.rebuild_indexes is not consulted here, so the index is
# rebuilt and saved on every run — confirm whether that is intended.
bm25_index = build_bm25_index(chunks)
bm25_path = cfg.bm25_index_path
save_bm25(bm25_index, bm25_path)
print("saved:", bm25_path)


In [None]:
from src.embeddings import EmbeddingConfig
from src.retrievers.vector_numpy import build_vector_index, save_vector_index

# The vector index needs an API key and an embedding model; without them the
# step is skipped and `vector_index` / `emb_cfg` stay undefined.
if embeddings_enabled:
    emb_cfg = EmbeddingConfig(
        api_key=OPENAI_API_KEY,
        base_url=OPENAI_BASE_URL or None,  # empty string -> None
        model=EMBEDDING_MODEL,
        batch_size=cfg.embed_batch_size,
    )

    vector_index = build_vector_index(chunks, emb_cfg)

    # Save the index to its two target files and report both paths.
    vec_emb_path, vec_meta_path = cfg.vector_emb_path, cfg.vector_meta_path
    save_vector_index(vector_index, vec_emb_path, vec_meta_path)
    for _saved_path in (vec_emb_path, vec_meta_path):
        print("saved:", _saved_path)
else:
    print("Vector index skipped: embeddings_enabled=False (no API key / embedding model).")


In [None]:
def print_hits(hits, max_chars: int) -> None:
    """Print one summary line per retrieval hit.

    Each hit is a mapping with "page" and "score" keys and an optional "text"
    key. The text is collapsed to single spaces and, when longer than
    *max_chars*, cut to that length (trailing whitespace removed) and
    suffixed with "...".
    """
    for hit in hits:
        raw_text = hit.get("text") or ""  # a missing or None text prints as empty
        collapsed = " ".join(raw_text.split())
        preview = (
            collapsed
            if len(collapsed) <= max_chars
            else collapsed[:max_chars].rstrip() + "..."
        )
        print(f"- page={hit['page']} score={hit['score']:.4f} | {preview}")


In [None]:
import json
import pandas as pd
from src.retrieval_pipeline import retrieve, build_context
from src.eval import evaluate_questions

# bm25_index has already been built in an earlier cell.
# vector_index and emb_cfg exist only when embeddings_enabled=True.

def run_retrieve(query: str, mode: str):
    """Run `retrieve` for *query* in the given *mode* with the notebook settings.

    The vector index and embedding config are forwarded only when embeddings
    are enabled; otherwise None is passed for both. Returns whatever
    `retrieve` returns.
    """
    if embeddings_enabled:
        vector_backend, embedding_config = vector_index, emb_cfg
    else:
        vector_backend, embedding_config = None, None

    return retrieve(
        query=query,
        mode=mode,
        top_k=cfg.top_k,
        bm25=bm25_index,
        vector=vector_backend,
        emb_cfg=embedding_config,
        vector_top_k=cfg.vector_top_k,
        rrf_k=cfg.rrf_k,
    )


In [ ]:
# BM25 is always demonstrated; the vector-based modes need embeddings.
demo_modes = ["bm25", "vector", "hybrid"] if embeddings_enabled else ["bm25"]

for mode in demo_modes:
    print(f"\nMODE: {mode}")
    for demo_query in cfg.demo_queries:
        # Retrieval runs before the header is printed, as in the original order.
        hits = run_retrieve(demo_query, mode)
        print(f"\nQUERY: {demo_query}")
        print_hits(hits, cfg.demo_snippet_chars)


In [ ]:
import os
from src.cost import Pricing, count_tokens, print_cost

def _f(x: str, default: float = 0.0) -> float:
    """Parse *x* as a float, returning *default* when it cannot be converted.

    Args:
        x: Text to convert, typically an environment variable value.
        default: Value returned when *x* is not a valid number.

    Returns:
        ``float(x)`` on success, otherwise *default*.
    """
    try:
        return float(x)
    # Only the failures float() itself produces are treated as "not a number";
    # anything else is a real bug and should surface instead of being hidden.
    except (TypeError, ValueError, OverflowError):
        return default

# Per-1k-token prices from the environment; unparsable values count as 0.
pricing = Pricing(
    embed_price_per_1k_usd=_f(os.getenv("EMBED_PRICE_PER_1K_USD", "0")),
    chat_in_price_per_1k_usd=_f(os.getenv("CHAT_IN_PRICE_PER_1K_USD", "0")),
    chat_out_price_per_1k_usd=_f(os.getenv("CHAT_OUT_PRICE_PER_1K_USD", "0")),
)

# Token count over every chunk text; stays None when no embedding model is set.
if EMBEDDING_MODEL:
    embedding_tokens_total = sum(
        count_tokens(chunk_text, EMBEDDING_MODEL)
        for chunk_text in chunks_df["text"].tolist()
    )
    print_cost(embedding_tokens=embedding_tokens_total, pricing=pricing)
else:
    embedding_tokens_total = None
    print("Embedding cost skipped: EMBEDDING_MODEL is empty.")


In [None]:
# System prompt (in Russian) for the question-answering assistant. It tells
# the model to answer only from the supplied page context, to state when the
# context has no answer, to cite pages as "стр. N", and not to invent facts.
# The text is stripped of surrounding whitespace where it is used.
SYSTEM_PROMPT = """
Ты — вопрос-ответный ассистент по одной книге.
Правила:
1) Отвечай ТОЛЬКО на основе предоставленного КОНТЕКСТА (выдержки со страниц).
2) Если в контексте нет ответа — скажи: "В предоставленном контексте ответа нет".
3) Всегда указывай ссылки на страницы в формате "стр. N".
4) Не выдумывай факты, определения, команды и численные значения.
Тон: нейтральный, технический, краткий.
"""


## Ask RAG

Ниже — пример 2–3 вопросов и ответы системы.


In [None]:
from src.retrieval_pipeline import build_context
from src.llm import LLMConfig, generate_answer

# Use the configured retrieval mode, downgrading to BM25 when the mode needs
# embeddings that are not available.
ask_mode = cfg.retrieval_mode
if not embeddings_enabled and ask_mode in ("vector", "hybrid"):
    print(f"Ask RAG mode '{ask_mode}' skipped: embeddings_enabled=False. Falling back to bm25.")
    ask_mode = "bm25"

# Per-question results, kept in the same order as cfg.ask_questions.
rag_contexts = []
rag_usages = []

# The chat configuration exists only when generation is possible.
llm_cfg = (
    LLMConfig(
        api_key=OPENAI_API_KEY,
        base_url=OPENAI_BASE_URL or None,
        model=CHAT_MODEL,
        temperature=cfg.llm_temperature,
        max_tokens=cfg.llm_max_tokens,
    )
    if chat_enabled
    else None
)

for question in cfg.ask_questions:
    print(f"\nQUESTION: {question}")

    # Retrieval and context building always run, even without a chat model.
    hits = run_retrieve(question, ask_mode)
    print_hits(hits, cfg.demo_snippet_chars)
    context = build_context(hits, max_chars=cfg.max_context_chars)
    rag_contexts.append(context)

    if chat_enabled:
        answer, usage = generate_answer(
            question=question,
            context=context,
            system_prompt=SYSTEM_PROMPT.strip(),
            cfg=llm_cfg,
        )
        # Anything that is not a dict is recorded as "no usage data".
        rag_usages.append(usage if isinstance(usage, dict) else {})
        print("\nANSWER:\n", answer)
        if usage:
            print("\nUSAGE:\n", usage)
    else:
        print("LLM generation skipped: chat_enabled=False (no API key / chat model).")
        rag_usages.append({})


In [None]:
# Prefer token counts reported with the answers; entries that lack either
# counter (e.g. the empty dicts recorded for skipped questions) are ignored.
reported_usages = []
if isinstance(rag_usages, list):
    reported_usages = [
        entry
        for entry in rag_usages
        if isinstance(entry, dict)
        and "prompt_tokens" in entry
        and "completion_tokens" in entry
    ]
usage_available = bool(reported_usages)

if usage_available:
    prompt_tokens_total = sum(int(entry["prompt_tokens"]) for entry in reported_usages)
    completion_tokens_total = sum(int(entry["completion_tokens"]) for entry in reported_usages)
elif CHAT_MODEL:
    # No reported usage: count prompt tokens locally from the text that would
    # be sent, and count completions as zero tokens.
    prompt_tokens_total = sum(
        count_tokens("\n".join((SYSTEM_PROMPT.strip(), ctx, q)), CHAT_MODEL)
        for q, ctx in zip(cfg.ask_questions, rag_contexts)
    )
    completion_tokens_total = 0
else:
    prompt_tokens_total = None
    completion_tokens_total = None

if prompt_tokens_total is not None and completion_tokens_total is not None:
    print_cost(
        embedding_tokens=embedding_tokens_total,
        prompt_tokens=prompt_tokens_total,
        completion_tokens=completion_tokens_total,
        pricing=pricing,
    )
else:
    print("Chat cost skipped: CHAT_MODEL is empty or Ask RAG was not run.")
    print_cost(embedding_tokens=embedding_tokens_total, pricing=pricing)


In [None]:
# Load the evaluation set; the context manager closes the file promptly
# instead of leaving the handle to the garbage collector.
with open(cfg.eval_questions_path, "r", encoding="utf-8") as questions_file:
    questions = json.load(questions_file)

# Fail fast when a question references a page that was not parsed; only the
# first ten offending page numbers are shown.
page_set = {int(p) for p in chunks_df["page"].tolist()}
missing_pages = sorted({int(q["gold_page"]) for q in questions} - page_set)
if missing_pages:
    raise ValueError(f"gold_page not found in parsed pages: {missing_pages[:10]}")

# BM25 is always evaluated; the vector-based modes need embeddings.
modes = ["bm25"]
if embeddings_enabled:
    modes += ["vector", "hybrid"]

rows = evaluate_questions(
    questions=questions,
    run_retrieve=run_retrieve,
    modes=modes,
    ks=list(cfg.eval_ks),
)

eval_df = pd.DataFrame(rows).sort_values(["mode", "k"])

out_path = cfg.eval_out_path
eval_df.to_csv(out_path, index=False)
print("saved:", out_path)

# Jupyter renders a bare expression only when it is the last statement of the
# cell, so the table is shown here rather than in the middle of the cell.
eval_df


## Краткий вывод по таблице метрик (≤150 слов)

В таблице видно, что увеличение k с 3 до 5 повышает recall для всех режимов,
а MRR отражает качество ранжирования на верхних позициях. BM25 дает
устойчивый базовый уровень без зависимости от внешних ключей. При наличии
эмбеддингов векторный и гибридный режимы обычно дают более высокий recall@5
и MRR@5, что особенно полезно для семантических формулировок вопросов.
Гибрид сочетает точность лексического поиска и семантику, поэтому его удобно
использовать как основной режим.


## Общий вывод (≤200 слов)

_Укажи преимущества и 2 улучшения._