# RAG baseline notebook

Краткий запуск:
1) Создай venv и установи зависимости из requirements.txt
2) Скопируй .env.example -> .env и заполни ключи/тарифы при необходимости
3) Перезапусти ядро и выполни все ячейки (Restart & Run All)

Настройки:
- Параметры запуска меняются в ячейке Config (book_id, top_k, retrieval_mode, флаги rebuild_*)
- Модели/ключи и тарифы берутся из .env


In [None]:
from dataclasses import dataclass
from pathlib import Path
import os

@dataclass(frozen=True)
class Config:
    """Notebook-wide settings: seed, directory layout, retrieval and LLM knobs.

    Directory fields left as ``None`` are filled in by ``__post_init__`` from
    ``project_dir`` and ``book_id``. This keeps ``Config()`` identical to the
    previous defaults while making ``Config(book_id="other")`` or
    ``Config(project_dir=...)`` move the derived paths along with them.
    """

    # Read from the process environment once, when the class body is executed.
    seed: int = int(os.getenv("SEED", "42"))

    # paths (None -> derived from project_dir in __post_init__)
    project_dir: Path = Path("..").resolve()
    data_dir: "Path | None" = None
    indexes_dir: "Path | None" = None
    artifacts_dir: "Path | None" = None

    # book/page corpus (pages_dir: None -> derived from project_dir and book_id)
    book_id: str = "devops_handbook"
    pages_dir: "Path | None" = None
    rebuild_pages: bool = False

    # retrieval params
    retrieval_mode: str = "fulltext"  # fulltext | vector | hybrid
    top_k: int = 5
    vector_top_k: int = 5
    rrf_k: int = 60

    # embeddings / LLM
    embed_batch_size: int = 64
    max_context_chars: int = 6000
    llm_temperature: float = 0.0
    llm_max_tokens: int = 600

    def __post_init__(self) -> None:
        """Fill in every directory that was not passed explicitly."""
        derived = {
            "data_dir": self.project_dir / "data",
            "indexes_dir": self.project_dir / "indexes",
            "artifacts_dir": self.project_dir / "artifacts",
            "pages_dir": self.project_dir / "data" / "books" / self.book_id / "pages",
        }
        for field_name, path in derived.items():
            if getattr(self, field_name) is None:
                # frozen=True forbids normal attribute assignment, so go
                # through object.__setattr__ during initialisation.
                object.__setattr__(self, field_name, path)


cfg = Config()
cfg


In [None]:
from dotenv import load_dotenv

# Load variables from the project's .env file when one is present.
env_path = cfg.project_dir / ".env"
if env_path.exists():
    load_dotenv(env_path)

# Unset variables become empty strings; surrounding whitespace is dropped.
OPENAI_API_KEY, OPENAI_BASE_URL, CHAT_MODEL, EMBEDDING_MODEL = (
    os.getenv(var_name, "").strip()
    for var_name in (
        "OPENAI_API_KEY",
        "OPENAI_BASE_URL",
        "CHAT_MODEL",
        "EMBEDDING_MODEL",
    )
)

# Each capability needs the API key plus its own model name.
embeddings_enabled = all((OPENAI_API_KEY, EMBEDDING_MODEL))
chat_enabled = all((OPENAI_API_KEY, CHAT_MODEL))
llm_enabled = any((embeddings_enabled, chat_enabled))

# Summary of the effective settings (the API key itself is not printed).
for label, shown in (
    ("embeddings_enabled:", embeddings_enabled),
    ("chat_enabled:", chat_enabled),
    ("CHAT_MODEL:", CHAT_MODEL),
    ("EMBEDDING_MODEL:", EMBEDDING_MODEL),
    ("OPENAI_BASE_URL:", OPENAI_BASE_URL or "(default)"),
):
    print(label, shown)


In [None]:
# Create the working directories up front; exist_ok=True makes re-runs harmless.
for _dir in (cfg.data_dir, cfg.indexes_dir, cfg.artifacts_dir, cfg.pages_dir):
    _dir.mkdir(parents=True, exist_ok=True)

# Print the resolved layout so a wrong working directory is easy to spot.
print("project_dir:", cfg.project_dir)
for _label, _path in (
    ("data_dir:", cfg.data_dir),
    ("pages_dir:", cfg.pages_dir),
    ("indexes_dir:", cfg.indexes_dir),
    ("artifacts_dir:", cfg.artifacts_dir),
):
    print(_label, _path, "exists:", _path.exists())

In [ ]:
from src.chunking_pages import split_text_by_page_markers, write_pages

# Build the per-page text files for the selected book unless they already exist.
book_dir = cfg.data_dir / "books" / cfg.book_id
book_txt = book_dir / "book.txt"
book_md = book_dir / "book.md"
page_files = sorted(cfg.pages_dir.glob("page_*.txt"))

if page_files and not cfg.rebuild_pages:
    print(f"Pages already exist: {len(page_files)} files. Set cfg.rebuild_pages=True to rebuild.")
else:
    # Locate the source before touching anything: a missing book must not
    # cost us the pages that are already on disk. book.txt wins over book.md.
    book_path = next((p for p in (book_txt, book_md) if p.exists()), None)
    if book_path is None:
        raise FileNotFoundError(
            f"Book not found: expected {book_txt} or {book_md}."
        )
    # errors="ignore" drops undecodable bytes instead of failing on them.
    text = book_path.read_text(encoding="utf-8", errors="ignore")
    pages = split_text_by_page_markers(text)

    # Reaching this branch with existing page files implies rebuild_pages=True;
    # remove the stale files only now that the replacement pages are ready.
    for fp in page_files:
        fp.unlink()
    write_pages(pages, cfg.pages_dir)
    print(f"Written {len(pages)} pages to {cfg.pages_dir}")


In [None]:
import pandas as pd
from src.chunking_pages import load_page_chunks

# Load the page chunks of the selected book and tabulate their attributes,
# one DataFrame row per chunk object.
chunks = load_page_chunks(cfg.book_id, cfg.pages_dir)
chunk_records = [chunk.__dict__ for chunk in chunks]
chunks_df = pd.DataFrame(chunk_records)
# Independent two-column copy, exported to CSV later in the notebook.
pages_df = chunks_df.loc[:, ["page", "text"]].copy()

(chunks_df.head(), len(chunks_df))

In [None]:
# Store the page table next to the other run artifacts (no index column).
pages_csv = cfg.artifacts_dir.joinpath("pages.csv")
pages_df.to_csv(path_or_buf=pages_csv, index=False)
print("saved:", pages_csv)


In [None]:
# Rebind `chunks` to plain dicts (one per DataFrame row); the index builders
# in the following cells receive this list.
chunks = chunks_df.to_dict("records")
(chunks_df.head(), len(chunks_df))

In [None]:
from src.retrievers.bm25 import build_bm25_index, save_bm25

# Build the BM25 index over all chunks and save it under indexes_dir.
bm25_path = cfg.indexes_dir / "bm25.pkl"
bm25_index = build_bm25_index(chunks)
save_bm25(bm25_index, bm25_path)
print("saved:", bm25_path)

# Smoke-test the index with a few demo queries.
demo_queries = ("что такое RAG", "индексация", "модель")
for q in demo_queries:
    hits = bm25_index.search(q, k=cfg.top_k)
    print("\nQUERY:", q)
    for h in hits:
        print(f"- score={h['score']:.4f} page={h['page']} chunk_id={h['chunk_id']}")
        # First 180 characters of the chunk, flattened onto a single line.
        preview = h["text"][:180].replace("\n", " ")
        print("  ", preview, "...")

In [None]:
from src.embeddings import EmbeddingConfig
from src.retrievers.vector_numpy import build_vector_index, embed_query, save_vector_index

# Build, save and smoke-test the embedding-based index. The whole cell is
# skipped when embeddings are disabled, so `emb_cfg` and `vector_index`
# exist only when embeddings_enabled is True.
if not embeddings_enabled:
    print("Vector index skipped: embeddings_enabled=False (no API key / embedding model).")
else:
    emb_cfg = EmbeddingConfig(
        api_key=OPENAI_API_KEY,
        base_url=OPENAI_BASE_URL or None,  # empty string -> None
        model=EMBEDDING_MODEL,
        batch_size=cfg.embed_batch_size,
    )

    vector_index = build_vector_index(chunks, emb_cfg)

    vec_emb_path = cfg.indexes_dir / "vector_embeddings.npy"
    vec_meta_path = cfg.indexes_dir / "vector_meta.json"
    save_vector_index(vector_index, vec_emb_path, vec_meta_path)
    print("saved:", vec_emb_path)
    print("saved:", vec_meta_path)

    # Demo queries: embed each query, then search the index with it.
    for q in ["пример запроса", "определение", "алгоритм"]:
        qv = embed_query(q, emb_cfg)
        hits = vector_index.search(q, qv, k=cfg.vector_top_k)
        print("\nQUERY:", q)
        for h in hits:
            print(f"- score={h['score']:.4f} page={h['page']} chunk_id={h['chunk_id']}")
            # Collapse newlines so the preview stays on one line, matching
            # the BM25 demo (the previous " \n" replacement kept the breaks).
            print("  ", h["text"][:180].replace("\n", " "), "...")


In [None]:
from src.retrievers.hybrid_rrf import rrf_fuse

# Hybrid demo: fuse BM25 and vector hits with rrf_fuse. Relies on `emb_cfg`
# and `vector_index`, which the vector-index cell defines only when
# embeddings are enabled, hence the same guard here.
if not embeddings_enabled:
    print("Hybrid skipped: embeddings_enabled=False.")
else:
    for q in ["пример запроса", "определение", "алгоритм"]:
        bm25_hits = bm25_index.search(q, k=cfg.top_k)
        qv = embed_query(q, emb_cfg)
        vec_hits = vector_index.search(q, qv, k=cfg.vector_top_k)

        fused = rrf_fuse(bm25_hits=bm25_hits, vec_hits=vec_hits, k=cfg.top_k, rrf_k=cfg.rrf_k)

        print("\nQUERY:", q)
        for h in fused:
            print(f"- score_rrf={h['score_rrf']:.6f} page={h['page']} chunk_id={h['chunk_id']}")
            # Collapse newlines so the preview stays on one line, matching
            # the BM25 demo (the previous " \n" replacement kept the breaks).
            print("  ", h["text"][:180].replace("\n", " "), "...")



In [None]:
import json
import pandas as pd
from src.retrieval_pipeline import retrieve, build_context
from src.eval import evaluate_questions

# bm25_index уже построен ранее
# vector_index и emb_cfg существуют только если embeddings_enabled=True

def run_retrieve(query: str, mode: str):
    """Retrieve hits for *query* using the notebook's indexes and Config values.

    Args:
        query: Search text handed to ``retrieve``.
        mode: Retrieval mode name, forwarded to ``retrieve`` unchanged.

    Returns:
        Whatever ``retrieve`` returns for these arguments.
    """
    # The vector index and its embedding config are only defined when
    # embeddings are enabled; pass None for both otherwise.
    if embeddings_enabled:
        vector_backend, embedding_settings = vector_index, emb_cfg
    else:
        vector_backend = embedding_settings = None

    retrieve_kwargs = dict(
        query=query,
        mode=mode,
        top_k=cfg.top_k,
        bm25=bm25_index,
        vector=vector_backend,
        emb_cfg=embedding_settings,
        vector_top_k=getattr(cfg, "vector_top_k", None),
        rrf_k=getattr(cfg, "rrf_k", 60),
    )
    return retrieve(**retrieve_kwargs)


In [None]:
# System prompt for the single-book Q&A assistant. It instructs the model to
# answer only from the supplied context, to say so when the context lacks the
# answer, to cite pages as "стр. N", and not to invent facts. The answer cell
# passes it to generate_answer after .strip().
SYSTEM_PROMPT = """
Ты — вопрос-ответный ассистент по одной книге.
Правила:
1) Отвечай ТОЛЬКО на основе предоставленного КОНТЕКСТА (выдержки со страниц).
2) Если в контексте нет ответа — скажи: "В предоставленном контексте ответа нет" и кратко уточни, чего не хватает.
3) Всегда указывай ссылки на страницы: формат "стр. N" (где N — номер страницы из контекста).
4) Не выдумывай факты, определения, команды и численные значения.
Тон: нейтральный, технический, краткий.
"""

In [None]:
from src.retrieval_pipeline import build_context
from src.llm import LLMConfig, generate_answer

# Placeholder text: replace with a real question about the book.
question = "Напиши сюда реальный вопрос к книге"

# Vector-based modes need embeddings; fall back to full-text search without them.
mode = cfg.retrieval_mode
if mode in ("vector", "hybrid") and not embeddings_enabled:
    print(f"Retrieval mode '{mode}' skipped: embeddings_enabled=False. Falling back to fulltext.")
    mode = "fulltext"

hits = run_retrieve(question, mode)
context = build_context(hits, max_chars=cfg.max_context_chars)

print("MODE:", mode)
# The hybrid demo prints fused hits via `score_rrf`, so do not assume that a
# `score` key is always present.
# NOTE(review): confirm which score key `retrieve` returns in hybrid mode.
top_hits = []
for h in hits[:5]:
    score = h.get("score", h.get("score_rrf"))
    top_hits.append((h["page"], h["chunk_id"], None if score is None else round(score, 4)))
print("TOP HITS:", top_hits)

# Defined up front so later cells can rely on both names even when the
# generation step is skipped.
answer = None
usage = {}
if not chat_enabled:
    print("LLM generation skipped: chat_enabled=False (no API key / chat model).")
else:
    llm_cfg = LLMConfig(
        api_key=OPENAI_API_KEY,
        base_url=OPENAI_BASE_URL or None,  # empty string -> None
        model=CHAT_MODEL,
        temperature=cfg.llm_temperature,
        max_tokens=cfg.llm_max_tokens,
    )
    answer, usage = generate_answer(
        question=question,
        context=context,
        system_prompt=SYSTEM_PROMPT.strip(),
        cfg=llm_cfg,
    )
    print("\nANSWER:\n", answer)
    print("\nUSAGE:\n", usage)


In [None]:
import os
from src.cost import Pricing, count_tokens, print_cost

def _f(x: str, default: float = 0.0) -> float:
    try:
        return float(x)
    except Exception:
        return default

# Per-1k-token prices come from the environment; unset or malformed values
# are treated as 0 by _f.
pricing = Pricing(
    embed_price_per_1k_usd=_f(os.getenv("EMBED_PRICE_PER_1K_USD", "0")),
    chat_in_price_per_1k_usd=_f(os.getenv("CHAT_IN_PRICE_PER_1K_USD", "0")),
    chat_out_price_per_1k_usd=_f(os.getenv("CHAT_OUT_PRICE_PER_1K_USD", "0")),
)

# Rough token count for embedding the whole corpus; stays None unless an LLM
# backend is enabled and an embedding model is configured.
embedding_tokens_est = (
    sum(count_tokens(chunk_text, EMBEDDING_MODEL) for chunk_text in chunks_df["text"].tolist())
    if llm_enabled and EMBEDDING_MODEL
    else None
)

# Token counts for the generation step (only when a chat model is configured).
prompt_tokens_est = None
completion_tokens_est = None
if llm_enabled and CHAT_MODEL:
    full_prompt = "\n".join((SYSTEM_PROMPT, context, question))
    prompt_tokens_est = count_tokens(full_prompt, CHAT_MODEL)
    # Completion size is unknown before the answer arrives; prefer the figure
    # reported in `usage` when there is one, otherwise count it as 0.
    has_reported_usage = isinstance(usage, dict) and "completion_tokens" in usage
    completion_tokens_est = int(usage["completion_tokens"]) if has_reported_usage else 0

print_cost(
    embedding_tokens=embedding_tokens_est,
    prompt_tokens=prompt_tokens_est,
    completion_tokens=completion_tokens_est,
    pricing=pricing,
)

In [None]:
# Load the evaluation questions; the context manager closes the file handle
# that a bare json.load(open(...)) would leave open.
questions_path = cfg.project_dir / "eval" / "questions.json"
with open(questions_path, "r", encoding="utf-8") as fh:
    questions = json.load(fh)

# Full-text search always runs; vector-based modes only with embeddings enabled.
modes = ["fulltext"]
if embeddings_enabled:
    modes += ["vector", "hybrid"]

rows = evaluate_questions(
    questions=questions,
    run_retrieve=run_retrieve,
    modes=modes,
    ks=[3, 5],
)

eval_df = pd.DataFrame(rows).sort_values(["mode", "k"])

out_path = cfg.artifacts_dir / "retrieval_eval.csv"
eval_df.to_csv(out_path, index=False)
print("saved:", out_path)

# A bare expression is rendered by the notebook only as the last statement of
# the cell, so the table is shown here rather than before the save.
eval_df


## Краткий вывод по таблице метрик (≤150 слов)

_Заполнить после расчета метрик._

## Общий вывод (≤200 слов)

_Укажи преимущества и 2 улучшения._