In [1]:
import os
import json
from pathlib import Path

from gitsource import chunk_documents
from minsearch import Index
import tiktoken
from pydantic import BaseModel, Field
from typing import Literal
import requests
import pandas as pd
import fitz 

In [2]:
CSV_URL = "https://raw.githubusercontent.com/alexeygrigorev/ai-engineering-buildcamp-code/main/01-foundation/homework/books.csv"
CSV_PATH = "books.csv"

response = requests.get(CSV_URL)
response.raise_for_status()

with open(CSV_PATH, "w", encoding="utf-8") as f:
    f.write(response.text)

In [3]:
df = pd.read_csv(CSV_PATH)
df

Unnamed: 0,title,book_url,pdf_url
0,Think Python 2e,https://greenteapress.com/wp/think-python-2e/,http://greenteapress.com/thinkpython2/thinkpyt...
1,Think DSP,https://greenteapress.com/wp/think-dsp/,http://greenteapress.com/thinkdsp/thinkdsp.pdf
2,Think Complexity 2e,https://greenteapress.com/wp/think-complexity/,http://greenteapress.com/complexity2/thinkcomp...
3,Think Java 2e,https://greenteapress.com/wp/think-java-2e/,http://greenteapress.com/thinkjava7/thinkjava2...
4,Physical Modeling in MATLAB,https://greenteapress.com/wp/physical-modeling...,https://github.com/AllenDowney/PhysicalModelin...
5,Think OS,https://greenteapress.com/wp/think-os/,http://greenteapress.com/thinkos/thinkos.pdf
6,Think C++,https://greenteapress.com/wp/think-c/,https://raw.githubusercontent.com/tscheffl/Thi...


In [4]:
PDF_DIR = Path("books_pdf")
PDF_DIR.mkdir(exist_ok=True)

In [5]:
PDF_DIR = Path("books_pdf")
PDF_DIR.mkdir(exist_ok=True)

In [6]:
def download_pdf(title, url):
    filename = title.lower().replace(" ", "_") + ".pdf"
    path = PDF_DIR / filename

    if path.exists():
        return path

    r = requests.get(url)
    r.raise_for_status()

    with open(path, "wb") as f:
        f.write(r.content)

    return path

In [7]:
pdf_files = []

for _, row in df.iterrows():
    pdf_path = download_pdf(row["title"], row["book_url"])
    pdf_files.append(pdf_path)

In [8]:
MD_DIR = Path("books_text")
MD_DIR.mkdir(exist_ok=True)

In [9]:
def pdf_to_markdown(pdf_path):
    doc = fitz.open(pdf_path)
    md_lines = []

    for page in doc:
        text = page.get_text("text")
        for line in text.splitlines():
            line = line.strip()
            if line:
                md_lines.append(line)

    md_path = MD_DIR / (pdf_path.stem + ".md")
    md_path.write_text("\n".join(md_lines), encoding="utf-8")

    return md_path

In [10]:
md_files = []

for pdf in pdf_files:
    md_files.append(pdf_to_markdown(pdf))

In [11]:
def prepare_books(path="books_text"):
    books = []

    for file in Path(path).glob("*.md"):
        lines = file.read_text(encoding="utf-8").splitlines()
        lines = [l.strip() for l in lines if l.strip()]

        books.append({
            "source": file.name,
            "content": lines
        })

    return books

In [12]:
books = prepare_books()

In [13]:
chunks = chunk_documents(
    books,
    size=100,
    step=50
)

In [14]:
think_python_chunks = [
    c for c in chunks if "think_python" in c["source"].lower()
]

len(think_python_chunks)

208

In [15]:
def prepare_documents(chunks):
    docs = []

    for c in chunks:
        docs.append({
            "source": c["source"],
            "content": "\n".join(c["content"])
        })

    return docs

In [16]:
documents = prepare_documents(chunks)
len(documents)

1008

In [18]:
index = Index(
    text_fields=["content"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7ce967313bf0>

In [19]:
results = index.search("python function definition", num_results=5)
results[0]["source"]

'think_python_2e.md'

In [20]:
instructions = """
You're a course assistant, your task is to answer the QUESTION from the
course students using the provided CONTEXT
"""

In [21]:
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

In [22]:
def build_prompt(question, search_results):
    return prompt_template.format(
        question=question,
        context=json.dumps(search_results, indent=2)
    )

In [23]:
prompt = build_prompt("python function definition", results)

In [24]:
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

def count_tokens(text):
    return len(encoding.encode(text))

In [25]:
unstructured_tokens = (
    count_tokens(instructions) +
    count_tokens(prompt)
)

unstructured_tokens

6562

In [26]:
class RAGResponse(BaseModel):
    answer: str = Field(description="The main answer in markdown")
    found_answer: bool
    confidence: float
    confidence_explanation: str
    answer_type: Literal[
        "how-to", "explanation", "troubleshooting", "comparison", "reference"
    ]
    followup_questions: list[str]

In [27]:
schema_prompt = f"""
You must respond using this JSON schema:

{RAGResponse.schema_json(indent=2)}
"""

/tmp/ipykernel_17941/786101429.py:4: PydanticDeprecatedSince20: The `schema_json` method is deprecated; use `model_json_schema` and json.dumps instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  {RAGResponse.schema_json(indent=2)}


In [28]:
structured_tokens = (
    count_tokens(instructions) +
    count_tokens(prompt) +
    count_tokens(schema_prompt)
)

structured_tokens

6826

In [29]:
structured_tokens - unstructured_tokens

264