In [None]:
# Fetch minsearch.py into the working directory so the `import minsearch` cell can find it.
# NOTE(review): the URL tracks the repository's `main` branch, so the downloaded code is not pinned to a version.
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [27]:
 import minsearch

In [2]:
import json

In [3]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 because the FAQ text
# contains non-ASCII punctuation (curly quotes), which would be mis-decoded or
# rejected on platforms whose locale default encoding is not UTF-8.
with open("documents.json", "rt", encoding="utf-8") as f_in:
    docs_raw = json.load(f_in)

In [4]:
# Flatten the per-course groups into one list of FAQ records, tagging each
# record with the course it belongs to. Every record is a fresh dict, so
# docs_raw is left exactly as loaded and re-running this cell is safe.
documents = [
    {**doc, "course": course_dict["course"]}
    for course_dict in docs_raw
    for doc in course_dict["documents"]
]

In [5]:
# Peek at the first flattened record to check its fields.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# Configure the minsearch index: the three free-text FAQ fields are registered
# as text fields, while `course` is a keyword field (search() filters on it
# through filter_dict).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [7]:
# Index the flattened FAQ records; the cell echoes the Index object that fit() returns.
index.fit(documents)

<minsearch.Index at 0x7d010d536ba0>

In [8]:
from openai import OpenAI

In [9]:
# No credentials are passed here. NOTE(review): the OpenAI SDK is expected to
# read the API key from the OPENAI_API_KEY environment variable — confirm it is
# set before running this cell.
client = OpenAI()

In [10]:
def search(query, course="data-engineering-zoomcamp", num_results=10):
    """Look up FAQ entries for ``query`` in the minsearch index.

    Args:
        query: Free-text user question.
        course: Value the ``course`` keyword field must equal. Defaults to
            the data engineering zoomcamp, which used to be hard-coded.
        num_results: Maximum number of hits to request. Defaults to 10.

    Returns:
        Whatever ``index.search`` returns for these arguments; rag() passes
        it straight to build_prompt(), which iterates over it.
    """
    # Per-field boosts passed to minsearch: 3.0 for `question`, 0.4 for
    # `section`; `text` is not given an explicit boost.
    boost = {"question": 3.0, "section": 0.4}

    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=boost,
        num_results=num_results,
    )

In [11]:
# The template is kept at column 0 so that no source-code indentation leaks
# into the text that is sent to the model.
PROMPT_TEMPLATE = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't have the answer, output NONE

QUESTION: {question}

CONTEXT:
{context}
""".strip()


def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; inserted verbatim after ``QUESTION:``.
        search_results: Iterable of dicts, each providing the ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The prompt string with no leading or trailing whitespace. Context
        entries are separated by one blank line; with no search results the
        prompt simply ends at ``CONTEXT:``.

    Raises:
        KeyError: If a search result lacks one of the three expected keys.
    """
    # One entry per hit, joined in a single pass instead of repeated string
    # concatenation; no trailing spaces are emitted.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    # The final strip() removes the dangling newline left when context is empty.
    return PROMPT_TEMPLATE.format(question=query, context=context).strip()


In [12]:
def llm(prompt):
    """Send ``prompt`` to gpt-4o as a single user message and return the reply text."""
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model="gpt-4o", messages=messages)

    # Only the content of the first returned choice is used.
    return completion.choices[0].message.content
    

In [13]:
query = "I have an issue when running stream-kafka?"

def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves FAQ entries with search(), turns them into a prompt with
    build_prompt(), and returns the text produced by llm().
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [14]:
# Run the pipeline end to end on the sample stream-kafka query.
rag(query)

'If you are experiencing an issue when running the stream-kafka script, the context provides several possible solutions based on specific problems:\n\n1. **Psycopg2 Issue**: If your issue is related to "Psycopg2 - issues", replace `psycopg2==2.9.9` with `psycopg2-binary` in the requirements.txt file. Also, remember to run `source command.sh` for each terminal session.\n\n2. **Connection Refused Error**: If the script gets stuck on a loop with "Connection Refused", check the logs of the `message_queue` container in Docker. If you see the error message "insufficient physical memory", lower the memory allocation of the service “message_queue” in your docker-compose file from 4GB to a lower value, like 3GB.\n\n3. **Ingestion Rate**: If the concern is about the number of records ingested at a time, it is noted that it is expected to be 10 at a time for real-time query observation. This was recently increased to 100 at a time, and you may need to pull the latest changes from the repository.\

In [20]:
from elasticsearch import Elasticsearch
from elastic_transport import Transport

# Client for the Elasticsearch node reachable over plain HTTP on localhost:9200.
es_client = Elasticsearch("http://localhost:9200")

In [21]:
# Round-trip to the server to confirm it is reachable; the response reports
# the cluster name and version details.
es_client.info()

ObjectApiResponse({'name': '35e1bdb2e5f4', 'cluster_name': 'docker-cluster', 'cluster_uuid': '_1OAr8nrR-mOsQPu4CDsNw', 'version': {'number': '9.0.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '73f7594ea00db50aa7e941e151a5b3985f01e364', 'build_date': '2025-04-30T10:07:41.393025990Z', 'build_snapshot': False, 'lucene_version': '10.1.0', 'minimum_wire_compatibility_version': '8.18.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'})

In [24]:
# Index definition: one primary shard and no replicas. The three FAQ fields
# are mapped as full text; `course` is mapped as a keyword, which is what the
# term filter in elastic_search() matches against.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },

    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run before creating it: a bare
# create() fails when the index already exists, and the indexing loop that
# follows would otherwise add every document a second time.
# ignore_unavailable=True lets the delete succeed when there is nothing to drop.
# NOTE: this discards whatever the index previously held.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [25]:
from tqdm.auto import tqdm

In [26]:
# Send the FAQ records to Elasticsearch, one index() request per record, with
# a tqdm progress bar.
# NOTE(review): no `id` is passed, so re-running this cell against an index
# that is already populated presumably stores every record again — confirm.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:04<00:00, 232.79it/s]


In [46]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=10):
    """Look up FAQ entries for ``query`` in the Elasticsearch index.

    Args:
        query: Free-text user question.
        course: Exact value the ``course`` field must have. Defaults to the
            data engineering zoomcamp, which used to be hard-coded.
        num_results: Maximum number of hits to request. Defaults to 10.

    Returns:
        A list with the ``_source`` document of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Full-text match across the three text fields; the `^3`
                # suffix boosts the question field.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                # Restrict hits to a single course.
                "filter": {"term": {"course": course}},
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; scores and metadata are dropped.
    return [hit["_source"] for hit in response["hits"]["hits"]]

In [55]:
query = "How to identify myself in the leaderboard?"

def rag(query, search_fn=elastic_search):
    """Answer ``query`` with retrieval-augmented generation.

    This definition replaces the earlier rag() in this notebook, which
    retrieved through search(). The retrieval step is now a parameter, so
    switching backends no longer requires redefining the function.

    Args:
        query: The user's question.
        search_fn: Callable that takes the query and returns the FAQ records
            to use as context (dicts with ``section``, ``question`` and
            ``text`` keys, as build_prompt() requires). Defaults to
            elastic_search; pass ``search`` to use the minsearch index.

    Returns:
        The answer text returned by llm().
    """
    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [56]:
# Run the pipeline end to end on the sample leaderboard query.
rag(query)

"To identify yourself in the leaderboard, go to the Homework submission link at https://courses.datatalks.club/de-zoomcamp-2024/homework/hw2. Log in, click on 'Data Engineering Zoom Camp 2024,' and then click on 'Edit Course Profile' to see or change your display name. You are automatically assigned a random name like “Lucid Elbakyan” when setting up your account."