In [1]:
## Download minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-06-20 06:23:51--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.2’


2024-06-20 06:23:52 (23.3 MB/s) - ‘minsearch.py.2’ saved [3832/3832]



In [2]:
## Download documents.json
!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json

--2024-06-20 06:23:52--  https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json.1’


2024-06-20 06:23:53 (101 MB/s) - ‘documents.json.1’ saved [658332/658332]



In [3]:
import json
import minsearch
from openai import OpenAI

![rag_chart.png](./imgs/rag_chart.png)

In [30]:
def build_documents_from_json(json_path):
    '''
    Load the course FAQ JSON file and flatten it into one list of documents.

    The file must contain a list of course entries, each with a 'course'
    value and a 'documents' list of dicts. Each returned dict is a copy of
    one entry of 'documents' with the parent entry's 'course' value stored
    under the 'course' key (so, for the FAQ data, the keys are expected to be
    text, question, section and course).

    Args:
        json_path: path of the JSON file to read.

    Returns:
        A flat list of document dicts, in the order they appear in the file.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
        KeyError: if a course entry lacks 'documents' or 'course'.
    '''
    # Pin the encoding: without it open() falls back to the locale default,
    # which is not UTF-8 on every platform and garbles non-ASCII FAQ text.
    with open(json_path, 'rt', encoding='utf-8') as f_in:
        docs_raw = json.load(f_in)

    # Build new dicts rather than mutating the freshly parsed ones.
    return [
        {**doc, 'course': course_dict['course']}
        for course_dict in docs_raw
        for doc in course_dict['documents']
    ]


def build_minsearch(text_fields, keyword_fields):
    """Create a new, not-yet-fitted minsearch index.

    Args:
        text_fields: forwarded to minsearch.Index as `text_fields`.
        keyword_fields: forwarded to minsearch.Index as `keyword_fields`.
            NOTE(review): presumably these are the fields that support
            exact-match filtering, SQL-style
            (WHERE course = 'data-engineering-zoomcamp') - confirm
            against minsearch.

    Returns:
        The minsearch.Index instance; call its fit() before searching.
    """
    return minsearch.Index(text_fields=text_fields, keyword_fields=keyword_fields)


def build_db(documents, search_engine):
    """Fit `search_engine` on `documents` by calling its fit() method.

    Args:
        documents: the collection of documents handed to fit().
        search_engine: any object exposing fit(documents).

    Returns:
        None. Whatever fit() returns is discarded.
    """
    search_engine.fit(documents)


def build_minsearch_engine(query, index, filter_dict, boost_dict, num_results):
    """Run one search against `index` and return its results.

    Despite the name, nothing is built here: every argument except `index`
    is forwarded as a keyword argument to `index.search`.

    Args:
        query: the search query.
        index: object exposing search(query=, filter_dict=, boost_dict=,
            num_results=).
        filter_dict: forwarded unchanged as `filter_dict`.
        boost_dict: forwarded unchanged as `boost_dict`.
        num_results: forwarded unchanged as `num_results`.

    Returns:
        Whatever `index.search` returns.
    """
    search_kwargs = {
        'query': query,
        'filter_dict': filter_dict,
        'boost_dict': boost_dict,
        'num_results': num_results,
    }
    return index.search(**search_kwargs)


def build_prompt(query, search_results):
    """Render the LLM prompt for `query` from the retrieved FAQ entries.

    Args:
        query: the user's question; substituted for {question}.
        search_results: iterable of dicts, each with 'section', 'question'
            and 'text' keys; rendered in order into the CONTEXT part.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.

    Raises:
        KeyError: if a result lacks one of the three keys.
    """
    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
QUESTION: {question}
CONTEXT: {context}
""".strip()

    # One block per hit, each ending in a blank line.
    entries = (
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )
    filled = template.format(question=query, context="".join(entries))
    # The final strip removes the trailing blank line left by the last entry.
    return filled.strip()


def build_llm(base_url, api_key):
    """Create an OpenAI client bound to the given endpoint.

    Args:
        base_url: forwarded to OpenAI(...) as `base_url`.
        api_key: forwarded to OpenAI(...) as `api_key`.

    Returns:
        The constructed OpenAI client.
    """
    return OpenAI(base_url=base_url, api_key=api_key)


def query_llm(prompt, client, model_name):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: content of the one 'user' message that is sent.
        client: object exposing chat.completions.create(model=, messages=).
        model_name: forwarded as `model`.

    Returns:
        The `message.content` of the first entry in the response's `choices`.
    """
    user_message = {'role': 'user', 'content': prompt}
    completion = client.chat.completions.create(
        model=model_name,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [26]:
%%time
# End-to-end RAG run for a single query:
# load FAQ documents -> build and fit the index -> search -> build prompt -> ask the LLM.
# The names assigned here (minsearch_index, filter_dict, boost_dict, num_results,
# phi3_client, model_name) are read as globals by minsearch_rag() further down.
query = 'how do I run kafka?'

# Flatten the FAQ JSON into one list of documents.
json_doc_path = 'documents.json'
cvt_documents = build_documents_from_json(json_doc_path)

# Field lists handed to minsearch.Index.
# NOTE(review): presumably text fields are searched and keyword fields are used for filtering - confirm against minsearch.
text_fields = ["question", "text", "section"]
keyword_fields = ["course"]
minsearch_index = build_minsearch(text_fields, keyword_fields)

# Fit the index on the documents (build_db returns nothing).
build_db(cvt_documents, minsearch_index)

# Search settings passed straight through to index.search().
# NOTE(review): filter_dict presumably limits hits to one course and boost_dict weights per-field scores - confirm against minsearch.
filter_dict = {'course': 'data-engineering-zoomcamp'}
boost_dict = {'question': 3.0, 'section': 0.5}
num_results = 5
minsearch_results = build_minsearch_engine(query=query, index=minsearch_index, 
                       filter_dict=filter_dict, boost_dict=boost_dict, 
                       num_results=num_results)

# Render the retrieved entries into the prompt, then query the model.
prompt = build_prompt(query=query, search_results=minsearch_results)
# NOTE(review): port 11434 and the 'ollama' key look like a local Ollama server's OpenAI-compatible API - confirm it is running before executing.
base_url = 'http://localhost:11434/v1/'
api_key = 'ollama'
model_name = 'phi3'
phi3_client = build_llm(base_url, api_key)
response_res = query_llm(prompt=prompt, client=phi3_client, model_name=model_name)

CPU times: user 85.7 ms, sys: 0 ns, total: 85.7 ms
Wall time: 31 s


In [27]:
# Display the answer text returned by query_llm() for the query above.
response_res

' In the project directory, to run Kafka with Java using your specific jar and code example, execute: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`'

In [31]:
def minsearch_rag(query):
    """Answer `query` with the full retrieve -> prompt -> generate pipeline.

    Only `query` is a parameter. Everything else is read from notebook-level
    globals that must already be defined when this is called:
    minsearch_index, filter_dict, boost_dict, num_results, phi3_client and
    model_name.

    Args:
        query: the user's question.

    Returns:
        The answer text returned by query_llm().
    """
    hits = build_minsearch_engine(
        query=query,
        index=minsearch_index,
        filter_dict=filter_dict,
        boost_dict=boost_dict,
        num_results=num_results,
    )
    rag_prompt = build_prompt(query=query, search_results=hits)
    return query_llm(prompt=rag_prompt, client=phi3_client, model_name=model_name)

In [32]:
# Try the packaged pipeline on a different FAQ question.
minsearch_rag(query='the course has already started, can I still enroll?')

" Yes, even if you don't register before the course starts, you're still eligible to submit the homeworks and can follow the course at your own pace after it finishes as we will keep all the materials. However, there will be deadlines for turning in the final projects so avoid leaving everything till the last minute. You can also subscribe to our Google Calendar, join Telegram channel with announcements, register on DataTalks.Club's Slack and continue looking at homeworks while following the course after it finishes."