In [1]:
from openai import OpenAI
import minsearch
import json


In [2]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 so the read does not
# depend on the platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course structure into a single list of documents, tagging
# each document with the course it belongs to so it can be filtered on later.
# NOTE: the 'course' key is written onto the dicts held by docs_raw (in place).
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [3]:
# Create the (not yet fitted) search index.
# "course" is registered as a keyword field; search() later filters on it.
# NOTE(review): presumably text_fields are the free-text fields that get
# scored against the query -- confirm against the minsearch docs.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

In [4]:
# Fit the index on the flattened documents. The call evaluates to the Index
# object itself, which is why the cell displays its repr.
index.fit(documents)

<minsearch.Index at 0x214965bfc10>

In [5]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up the FAQ entries in the notebook-level ``index`` that match ``query``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field must match. Defaults to the
            data-engineering course that was previously hard-coded.
        num_results: Passed through to ``index.search`` as ``num_results``
            (defaults to the previously hard-coded 5).

    Returns:
        Whatever ``index.search`` returns. Elsewhere in this notebook the
        result is iterated as dicts with 'section', 'question' and 'text' keys.
    """
    # Per-field weights handed to the index: "question" is boosted, "section"
    # is damped. NOTE(review): fields not listed here (e.g. "text") presumably
    # keep the library's default weight -- confirm against minsearch.
    boost = {"question": 3.0, "section": 0.5}
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [9]:
def build_prompt(query, search_results):
    """Build the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of dicts, each with 'section', 'question'
            and 'text' keys. May be empty, in which case the context is blank.

    Returns:
        The prompt string, stripped of leading/trailing whitespace.

    Raises:
        KeyError: If an entry lacks one of the three expected keys.
    """
    # The instruction and the data label must use the same word (CONTEXT),
    # otherwise the model is told to rely on a section that is never labelled.
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}
CONTEXT: {context}
""".strip()

    # One block per retrieved entry, separated by a blank line.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [10]:
# Shared API client. No key is passed here, so the client has to obtain its
# credentials from its environment (see the openai library docs).
client = OpenAI()


def llm(prompt, model='gpt-3.5-turbo'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the content of one "user" message.
        model: Chat model name passed to the API. Defaults to the previously
            hard-coded 'gpt-3.5-turbo'.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # Only the first choice is read; no other choices are inspected.
    return response.choices[0].message.content

In [11]:
# End-to-end example, one step at a time: retrieve, build the prompt, ask the LLM.
query = 'how do I run kafka'
search_results = search(query)
# Reuse `query` rather than repeating the literal, so the prompt always asks
# the same question that was searched for.
prompt = build_prompt(query, search_results)
answer = llm(prompt)

In [12]:
# Show the model's reply; print() renders its newlines as real line breaks.
print(answer)

To run Kafka, in the project directory, you can use the following commands based on your specific situation:
- For Java Kafka producer/consumer/kstreams, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

- For Python Kafka if you encounter a "Module 'kafka' not found" error, create a virtual environment and run requirements.txt and the python files in that environment. Use the following commands:
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt

- If you're working on Workshop 1 - dlthub and need to install necessary dependencies, ensure 'dlt[duckdb]' package is installed. Use:
!pip install dlt[duckdb]

- In case you face a Permission denied error when running ./build.sh, run:
chmod +x build.sh

- If you encounter the error "ModuleNotFoundError: No module named 'kafka.vendor.six.moves'", it is suggested to use the kafka-python-ng library temporarily instead of kafka-python with the command:
pip install kaf

## 最後，將搜尋、提示詞建構與 LLM 呼叫打包成一個 `rag` 函式

In [13]:
def rag(query):
    """Answer ``query`` by chaining retrieval, prompt construction and the LLM call.

    Args:
        query: The user's question; used both for the search and in the prompt.

    Returns:
        Whatever ``llm`` returns for the prompt built from the search results.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [14]:
# Try the packaged pipeline on an enrollment question.
query = 'the course has already started, can I still enroll?'
print(rag(query))

Yes, even if the course has already started, you can still enroll and participate. You will be eligible to submit the homework assignments, but make sure to keep up with the deadlines for turning in the final projects. If you join after the start date, you can also follow the course at your own pace after it finishes and continue working on the materials and assignments. Just make sure to stay updated with the course announcements and resources provided.
