In [1]:
from openai import OpenAI
import requests
import json
from minsearch import Index, VectorSearch

# Shared OpenAI client; llm() sends its requests through it.
# NOTE(review): no API key is passed here, so credentials presumably come
# from the environment — confirm before running on a fresh machine.
openai_client = OpenAI()

In [2]:
!uv add sentence-transformers

[2mResolved [1m150 packages[0m [2min 36ms[0m[0m
[2mAudited [1m145 packages[0m [2min 962ms[0m[0m


In [2]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode both the FAQ documents and the
# user queries, so that they live in the same vector space.
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [1]:
# !uv add tqdm

In [3]:
import numpy as np
from tqdm import tqdm

In [4]:
def search(query):
    """Keyword-search the FAQ index for entries matching ``query``.

    The lookup is restricted to the ``data-engineering-zoomcamp`` course and
    asks the index for five results, passing per-field boost weights for the
    ``question`` and ``section`` fields.

    Args:
        query: Free-text query string.

    Returns:
        Whatever ``index.search`` returns for this query.
    """
    field_weights = {'question': 3.0, 'section': 0.5}
    course_filter = {'course': 'data-engineering-zoomcamp'}

    return index.search(
        query=query,
        filter_dict=course_filter,
        boost_dict=field_weights,
        num_results=5,
    )

In [5]:
def build_prompt(question, search_results):
    """Render the user prompt from a question and its retrieved context.

    Args:
        question: The user's question, inserted verbatim.
        search_results: JSON-serialisable search hits; they are dumped with
            ``json.dumps`` and inserted as the context.

    Returns:
        ``prompt_template`` with its ``question`` and ``context``
        placeholders filled in.
    """
    fields = {
        'question': question,
        'context': json.dumps(search_results),
    }
    return prompt_template.format(**fields)

In [6]:
def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send a prompt to the OpenAI Responses API and return the text reply.

    Args:
        user_prompt: Content of the ``user`` message.
        instructions: Optional content for a leading ``system`` message.
            Falsy values (``None``, ``""``) mean no system message is sent.
        model: Model name forwarded to ``openai_client.responses.create``.

    Returns:
        The ``output_text`` attribute of the API response.
    """
    user_message = {"role": "user", "content": user_prompt}

    if instructions:
        system_message = {"role": "system", "content": instructions}
        conversation = [system_message, user_message]
    else:
        conversation = [user_message]

    reply = openai_client.responses.create(model=model, input=conversation)
    return reply.output_text

In [7]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation (keyword search).

    Retrieves context with ``search``, renders the prompt with
    ``build_prompt`` and sends it to ``llm`` together with the module-level
    ``instructions``.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``llm``.
    """
    prompt = build_prompt(query, search(query))
    return llm(prompt, instructions=instructions)


In [8]:
# System prompt handed to llm() as `instructions` by rag().
# NOTE: .strip() only trims the outer whitespace of the whole literal, so the
# second line keeps its leading indentation inside the resulting string.
instructions = """
        You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
        Use only the facts from the CONTEXT when answering the QUESTION.
        """.strip()

# User-message template; build_prompt() fills the {question} and {context}
# placeholders via str.format. As above, inner lines keep their indentation.
prompt_template = """
    <QUESTION>
    {question}
    </QUESTION>

    <CONTEXT>
    {context}
    </CONTEXT>
    """.strip()

In [9]:
def search(query):
    """Keyword-search the FAQ index for entries matching ``query``.

    The lookup is restricted to the ``data-engineering-zoomcamp`` course and
    asks the index for five results, passing per-field boost weights for the
    ``question`` and ``section`` fields.

    Args:
        query: Free-text query string.

    Returns:
        Whatever ``index.search`` returns for this query.
    """
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    # Hand the hits back to the caller: without this return the function
    # yields None and rag() ends up prompting the LLM with an empty context.
    return results

In [10]:
# Download the FAQ documents (a JSON list with one entry per course).
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail fast on HTTP errors instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of documents, tagging each
# document with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)


In [11]:
# Keyword-search index over the flattened FAQ documents.
# `question`, `text` and `section` are registered as text fields; `course` is
# registered as a keyword field, which search() uses in its filter_dict.
index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)
# Fit the index on the documents; left as the last expression so the cell
# displays the fitted Index object.
index.fit(documents)

<minsearch.minsearch.Index at 0x7ac068448a40>

In [12]:
# Try the keyword-search RAG pipeline on a sample question.
rag('Can I install kafka in python?')

'Based on the provided context, there is no information available regarding the installation of Kafka in Python.'

In [13]:
# Encode every FAQ entry (question followed by answer text) into an embedding
# vector, showing progress with tqdm, and stack the vectors into a NumPy
# array with one row per document.
embeddings = np.array([
    embedding_model.encode(d['question'] + ' ' + d['text'])
    for d in tqdm(documents)
])

100%|██████████| 948/948 [03:09<00:00,  5.01it/s]


In [15]:
# Example sentences to embed and compare.
docs = [
    "I just discovered the course, can I still join?",
    "I just found out about this program. Can I still enroll?",
    "you can join the course any time."
]

# One embedding per sentence, in the same order as `docs`.
vectors = [embedding_model.encode(sentence) for sentence in docs]

In [26]:
# Show the example sentences that were embedded.
docs

['I just discovered the course, can I still join?',
 'I just found out about this program. Can I still enroll?',
 'you can join the course any time.']

In [16]:
# Inspect the raw embedding vectors (one array per sentence in `docs`).
vectors

[array([ 9.03510898e-02, -4.22856919e-02,  4.13428545e-02, -2.25168355e-02,
         8.22591484e-02, -4.70755100e-02, -2.03889655e-03,  3.05278283e-02,
        -5.56562319e-02,  3.35262679e-02, -2.89196055e-03, -1.61779802e-02,
         3.52300070e-02,  1.06729278e-02,  4.28561755e-02,  2.56012590e-03,
         4.80097346e-02, -3.33815031e-02, -3.23886052e-02, -2.89528333e-02,
        -2.41320487e-02,  1.54270642e-02, -2.48449463e-02,  3.01835742e-02,
        -4.14910950e-02,  6.65498599e-02,  2.94065531e-02,  8.71115271e-03,
        -1.20314630e-02,  5.29993465e-03,  6.79888800e-02, -3.38245593e-02,
         7.79914185e-02, -6.71395508e-04, -1.09961882e-04, -2.76266206e-02,
        -7.76979886e-03,  8.61174706e-03,  2.73799058e-02,  1.28756100e-02,
        -4.10926118e-02,  4.64550331e-02, -1.97823904e-02,  4.94695604e-02,
         1.38017051e-02, -2.82891225e-02, -4.86314520e-02, -2.40617339e-02,
         7.35455146e-03,  2.34309044e-02,  3.26605961e-02, -7.84962904e-03,
         2.8

In [17]:
# Unpack the three example embeddings; they follow the order of `docs`.
q1, q2, q3 = vectors

In [27]:
# Dot product of q2 with itself, i.e. its squared L2 norm; a result close to
# 1 means the embedding is (approximately) unit-length.
q2.dot(q2)

np.float32(0.99999976)

In [28]:
# Sanity check: (number of documents, embedding dimension).
embeddings.shape

(948, 768)

In [20]:
# Inspect the embedding of the third example sentence.
q3

array([ 7.19089657e-02, -1.08710835e-02,  3.67264673e-02, -4.39401250e-03,
        1.07474118e-01, -2.71018269e-03,  2.29759999e-02,  5.97709827e-02,
       -3.39687839e-02,  2.82044578e-02, -2.42995396e-02, -1.32792322e-02,
        4.90809195e-02,  1.18848393e-02,  4.20249738e-02,  5.34892920e-03,
        9.02673379e-02, -1.68406051e-02, -2.30942760e-02, -7.94781521e-02,
       -3.73180024e-02,  3.45861837e-02,  3.36482711e-02,  2.87630949e-02,
       -5.30024292e-03,  8.65944326e-02,  3.07331085e-02,  4.50767763e-02,
        4.06874297e-03, -1.51615404e-02,  9.86641180e-03, -2.94827037e-02,
        7.14853555e-02,  2.13490184e-02,  1.42295333e-03, -2.22517140e-02,
       -1.20183611e-02,  8.62596149e-04,  3.28632817e-03,  1.17208054e-02,
       -1.77335013e-02,  3.57097052e-02,  7.58906128e-03, -2.75518373e-03,
        1.57686975e-02, -3.60514000e-02, -1.84773989e-02, -3.04791369e-02,
        5.34979142e-02, -2.29049544e-03,  1.20340902e-02,  3.31147723e-02,
        4.58058231e-02, -

In [22]:
# Vector-search index; `course` is registered as a keyword field, which
# vector_search() uses in its filter_dict.
vindex = VectorSearch(keyword_fields=["course"])

In [23]:
# Fit the vector index with the precomputed embeddings and their source
# documents. embeddings[i] belongs to documents[i] because the embeddings
# were built by iterating `documents` in order.
vindex.fit(embeddings, documents)

<minsearch.vector.VectorSearch at 0x7ac05e5859a0>

In [24]:
def vector_search(question):
    """Find FAQ entries for ``question`` using embedding-based search.

    The question is encoded with ``embedding_model`` and the resulting vector
    is looked up in ``vindex``, restricted to the
    ``data-engineering-zoomcamp`` course and asking for five results.

    Args:
        question: Free-text question.

    Returns:
        Whatever ``vindex.search`` returns for the encoded question.
    """
    query_vector = embedding_model.encode(question)
    course_filter = {'course': 'data-engineering-zoomcamp'}

    hits = vindex.search(query_vector, filter_dict=course_filter, num_results=5)
    return hits

def rag(question):
    """Answer ``question`` with retrieval-augmented generation (vector search).

    Retrieves context with ``vector_search``, renders the prompt with
    ``build_prompt`` and sends it to ``llm`` together with the module-level
    ``instructions``.

    Args:
        question: The user's question.

    Returns:
        The value returned by ``llm``.
    """
    hits = vector_search(question)
    return llm(build_prompt(question, hits), instructions=instructions)

In [25]:
# Try the vector-search RAG pipeline on a sample question.
rag('can I still join this course? I just discovered it.')

"Yes, you can still join the course even if you've just discovered it. You are eligible to submit the homework assignments, but be mindful that there will be deadlines for turning in the final projects, so it's best not to leave everything until the last minute."