In [20]:
!uv add sentence-transformers
!uv add tqdm

[2K[2mResolved [1m150 packages[0m [2min 2.68s[0m[0m                                       [0m
[2K[2mPrepared [1m29 packages[0m [2min 59.96s[0m[0m                                           
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2K[2mInstalled [1m29 packages[0m [2min 1m 56s[0m[0m                             [0m
 [32m+[39m [1mfilelock[0m[2m==3.20.0[0m
 [32m+[39m [1mfsspec[0m[2m==2025.9.0[0m
 [32m+[39m [1mhf-xet[0m[2m==1.1.10[0m
 [32m+[39m [1mhuggingface-hub[0m[2m==0.35.3[0m
 [32m+[39m [1mmpmath[0m[2m==1.3.0[0m
 [32m+[39m [1mnetworkx[0m[2m==3.5[0m
 [32m+[39m [1mnvidia-cublas-cu12[0m[2m==12.8.4.1[0m
 [32m+[39m [1mnvidia-cuda-cupti-cu12[0m[2m==12.8.90[0m
 [32m+[39m [1mnvidia-cuda-nvrtc-cu12[0m[2m==12.8.93[0m
 [32m+[39m [1mnvidia-cuda-runtime-cu12[0m[2m==12.8.90[0m
 [32m+[39m [1mnvidia-cudnn-cu12[0m[2m==9.10.2.21[0m
 [32m+[39m [1mn

In [28]:
import requests 
import json 
import numpy as np
from tqdm.auto import tqdm
from openai import OpenAI
from minsearch import Index
from sentence_transformers import SentenceTransformer
from minsearch import VectorSearch

In [None]:
# Shared OpenAI client used by llm(); constructed with default arguments.
# NOTE(review): credentials are presumably read from the environment — confirm.
openai_client = OpenAI()

In [2]:
def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send a single-turn request through ``openai_client`` and return the reply text.

    Args:
        user_prompt: Content of the user message.
        instructions: Optional system message; omitted when falsy.
        model: Model name forwarded to the API call.

    Returns:
        The ``output_text`` attribute of the response object.
    """
    user_message = {"role": "user", "content": user_prompt}

    # The system message, when present, must precede the user message.
    if instructions:
        conversation = [{"role": "system", "content": instructions}, user_message]
    else:
        conversation = [user_message]

    reply = openai_client.responses.create(model=model, input=conversation)
    return reply.output_text

In [3]:
# Smoke test: call the model with no course context supplied.
llm('When the course starts?')

"Could you please provide more details about the course you're referring to? This way, I can assist you better!"

In [4]:
def rag(query):
    """Answer ``query`` using the keyword-search RAG pipeline.

    Retrieves documents with ``search``, renders them into a prompt with
    ``build_prompt`` and sends it to ``llm`` together with the module-level
    ``instructions``.

    Returns:
        The value returned by ``llm``.
    """
    retrieved = search(query)
    user_prompt = build_prompt(query, retrieved)
    return llm(user_prompt, instructions=instructions)

In [5]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# The timeout keeps the cell from hanging indefinitely; raise_for_status() reports
# an HTTP error here instead of as a confusing JSON decode failure further down.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list, tagging every entry with the
# name of its course. New dicts are built so documents_raw stays unmodified and
# re-running the cell is idempotent.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [6]:
# Peek at one flattened entry to check the record layout.
documents[11]

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
 'section': 'General course-related questions',
 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
# Number of flattened entries across all courses.
len(documents)

948

In [8]:
# Text index over the flattened documents: "question", "text" and "section" are
# the searchable text fields; "course" is a keyword field used for filtering.
index = Index(text_fields=["question", "text", "section"], keyword_fields=["course"])
index.fit(documents)

<minsearch.minsearch.Index at 0x7c98c7062ab0>

In [9]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Run a keyword search against the module-level ``index``.

    Args:
        query: Free-text search query.
        course: Value the ``course`` keyword field is filtered on.
        num_results: Number of results requested from the index.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Per-field boost factors handed to the index.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )



In [10]:
# Example user question reused by the cells below.
question = 'I just discovered this course. Can I join it? '

In [11]:
# Keyword-search hits for the example question.
search_results = search(question)

In [17]:
# System instructions that rag() passes to llm(): answer only from the supplied context.
instructions = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
""".strip()

# User-message template; build_prompt() fills in {question} and {context}.
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """Render the user prompt from a question and its retrieved documents.

    Args:
        question: Text substituted for ``{question}`` in ``prompt_template``.
        search_results: JSON-serialisable search results; their JSON dump is
            substituted for ``{context}``.

    Returns:
        The formatted prompt string.
    """
    return prompt_template.format(
        question=question,
        context=json.dumps(search_results),
    )

In [18]:
# Run the RAG pipeline on the example question.
rag(question)

'Yes, you can join the course even if it has already started. You are eligible to submit the homeworks, but keep in mind that there will be deadlines for turning in the final projects.'

In [23]:
# Sentence-embedding model used to encode both the documents and the queries.
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:

# Embed "question + text" for every document. Passing the whole list to a single
# encode() call lets the model batch its inputs instead of running one forward
# pass per document; the result has one embedding row per document.
texts = [d['question'] + ' ' + d['text'] for d in documents]
embeddings = np.asarray(embedding_model.encode(texts, show_progress_bar=True))

  0%|          | 0/948 [00:00<?, ?it/s]

In [26]:
# Expect (number of documents, embedding dimension).
embeddings.shape

(948, 768)

In [29]:
# Vector index over the document embeddings; "course" is kept as a keyword field for filtering.
vindex = VectorSearch(keyword_fields=['course'])
vindex.fit(embeddings, documents)

<minsearch.vector.VectorSearch at 0x7c97bade7a70>

In [30]:
def vector_search(question, course='data-engineering-zoomcamp', num_results=5):
    """Run an embedding-based search against the module-level ``vindex``.

    Args:
        question: Text encoded with ``embedding_model`` to form the query vector.
        course: Value the ``course`` keyword field is filtered on.
        num_results: Number of results requested from the index.

    Returns:
        Whatever ``vindex.search`` returns for these arguments.
    """
    query_vector = embedding_model.encode(question)

    return vindex.search(
        query_vector,
        filter_dict={'course': course},
        num_results=num_results,
    )

In [31]:
def rag(question):
    """Answer ``question`` using the vector-search RAG pipeline.

    Replaces the earlier keyword-search ``rag`` defined in this notebook:
    retrieval goes through ``vector_search``, while prompt construction
    (``build_prompt``) and the model call (``llm`` with ``instructions``)
    are the same.

    Returns:
        The value returned by ``llm``.
    """
    retrieved = vector_search(question)
    answer = llm(build_prompt(question, retrieved), instructions=instructions)
    return answer

In [32]:
# Run the vector-search version of rag() on the example question.
rag(question)

'Yes, you can still join the course even after the start date. You are eligible to submit homework, but be aware of the deadlines for the final projects.'

In [33]:
def hybrid_search(question):
    """Combine keyword and vector search results for ``question``.

    Keyword hits come first, followed by vector hits. A document returned by
    both searches is kept only once, at its first position, so the context
    sent to the LLM does not contain repeated entries.

    Returns:
        List of result documents in first-seen order, without duplicates.
    """
    combined = search(question) + vector_search(question)

    seen_keys = set()
    unique_results = []
    for doc in combined:
        # Result documents are dicts (unhashable), so a canonical JSON dump
        # serves as the identity key.
        key = json.dumps(doc, sort_keys=True, default=str)
        if key not in seen_keys:
            seen_keys.add(key)
            unique_results.append(doc)

    return unique_results