# RAG baseline notebook

This notebook is designed to run top-to-bottom without manual edits.

How to run:
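1) Create a virtual environment and install the requirements
2) Copy `.env.example` to `.env` and fill in the API keys if you want LLM calls
3) Put the source PDF at `data/book.pdf` (the ingest cell reads it from there)
4) Restart the kernel and run all cells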


In [None]:
from dataclasses import dataclass
from pathlib import Path
import os

@dataclass(frozen=True)
class Config:
    """Immutable notebook settings: seed, project paths and retrieval parameters.

    ``data_dir``, ``indexes_dir`` and ``artifacts_dir`` default to the
    ``data``, ``indexes`` and ``artifacts`` sub-directories of ``project_dir``.
    They are resolved per instance, so ``Config(project_dir=other)`` relocates
    all three unless one of them is passed explicitly.
    """

    # Read from the SEED environment variable (default "42") once, when the
    # class body is executed.
    seed: int = int(os.getenv("SEED", "42"))

    # paths
    # Default root: parent of the working directory at class-definition time.
    project_dir: Path = Path("..").resolve()
    # None means "derive from project_dir"; __post_init__ replaces it with a
    # Path, so readers of a constructed Config always see a Path here.
    data_dir: Path = None  # type: ignore[assignment]
    indexes_dir: Path = None  # type: ignore[assignment]
    artifacts_dir: Path = None  # type: ignore[assignment]

    # retrieval params (placeholders for now)
    top_k: int = 5
    chunk_size: int = 800
    chunk_overlap: int = 150

    def __post_init__(self) -> None:
        """Fill every path left as None with its sub-directory of project_dir."""
        derived = (
            ("data_dir", "data"),
            ("indexes_dir", "indexes"),
            ("artifacts_dir", "artifacts"),
        )
        for attr, subdir in derived:
            if getattr(self, attr) is None:
                # frozen=True blocks normal attribute assignment, so go
                # through object.__setattr__ during initialisation.
                object.__setattr__(self, attr, self.project_dir / subdir)

# Shared settings object used by the cells below (all defaults; seed comes
# from the SEED environment variable when it is set).
cfg = Config()
# Bare expression on the last line so the notebook displays the Config repr.
cfg


In [None]:
from dotenv import load_dotenv

# Load <project_dir>/.env when that file is present; without it, only
# variables already in the process environment are visible below.
env_path = cfg.project_dir.joinpath(".env")
if env_path.exists():
    load_dotenv(env_path)

# Each setting falls back to the empty string when the variable is unset.
OPENAI_API_KEY, OPENAI_BASE_URL, CHAT_MODEL, EMBEDDING_MODEL = (
    os.getenv(var_name, "")
    for var_name in (
        "OPENAI_API_KEY",
        "OPENAI_BASE_URL",
        "CHAT_MODEL",
        "EMBEDDING_MODEL",
    )
)

# LLM calls need an API key plus at least one configured model name.
llm_enabled = bool(OPENAI_API_KEY) and bool(CHAT_MODEL or EMBEDDING_MODEL)

# Status report; the API key itself is not printed.
print(f"llm_enabled: {llm_enabled}")
print(f"CHAT_MODEL: {CHAT_MODEL}")
print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}")
print(f"OPENAI_BASE_URL: {OPENAI_BASE_URL or '(default)'}")


In [None]:
# Make sure every working directory exists; missing parents are created and
# directories that are already there are left alone.
for _workdir in (cfg.data_dir, cfg.indexes_dir, cfg.artifacts_dir):
    _workdir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations as a quick sanity check.
print(f"project_dir: {cfg.project_dir}")
print(f"data_dir: {cfg.data_dir} exists: {cfg.data_dir.exists()}")
print(f"indexes_dir: {cfg.indexes_dir} exists: {cfg.indexes_dir.exists()}")
print(f"artifacts_dir: {cfg.artifacts_dir} exists: {cfg.artifacts_dir.exists()}")


In [None]:
import pandas as pd
from src.ingest import read_pdf_pages, pages_to_rows

# The source document is expected at <data_dir>/book.pdf.
pdf_path = cfg.data_dir.joinpath("book.pdf")
# NOTE(review): presumably yields one entry per PDF page — confirm in src.ingest.
pages = read_pdf_pages(pdf_path)

# Tabulate whatever rows pages_to_rows produces from the extracted pages.
pages_df = pd.DataFrame(data=pages_to_rows(pages))
# Cell output: the first rows together with the total row count.
(pages_df.head(), pages_df.shape[0])

In [None]:
# Persist the page table as CSV (index column omitted) under artifacts_dir.
pages_csv = cfg.artifacts_dir.joinpath("pages.csv")
pages_df.to_csv(path_or_buf=pages_csv, index=False)
print(f"saved: {pages_csv}")


In [None]:
from src.chunking import pages_to_chunks

# Split the page records into chunks using the sizes configured on cfg.
# NOTE(review): `prefix` presumably namespaces the generated chunk ids —
# confirm in src.chunking.
chunks = pages_to_chunks(
    pages_df.to_dict("records"),
    prefix="book",
    chunk_overlap=cfg.chunk_overlap,
    chunk_size=cfg.chunk_size,
)

# Tabulate the chunks; cell output is the first rows plus the chunk count.
chunks_df = pd.DataFrame(data=chunks)
(chunks_df.head(), chunks_df.shape[0])


In [None]:
from src.retrievers.bm25 import build_bm25_index, save_bm25

# Build the BM25 index from the chunk records and store it under indexes_dir.
bm25_index = build_bm25_index(chunks_df.to_dict("records"))
bm25_path = cfg.indexes_dir.joinpath("bm25.pkl")
save_bm25(bm25_index, bm25_path)
print(f"saved: {bm25_path}")

# demo queries: print the top_k hits for a few sample questions
for q in ("что такое RAG", "индексация", "модель"):
    hits = bm25_index.search(q, k=cfg.top_k)
    print(f"\nQUERY: {q}")
    for hit in hits:
        print("- score={score:.4f} page={page} chunk_id={chunk_id}".format_map(hit))
        # Preview: first 180 characters of the chunk, flattened onto one line.
        snippet = hit["text"][:180].replace("\n", " ")
        print("  ", snippet, "...")
