### 1. Importing all required libraries

In [1]:
import json
import minsearch


from dotenv import load_dotenv
from langchain_ollama import ChatOllama
from langchain_mistralai.chat_models import ChatMistralAI

load_dotenv()

True

#### 2. Loading documents

In [2]:
# Load the raw FAQ data. The encoding is pinned to UTF-8 so that decoding does
# not depend on the platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [3]:
documents = []

# Flatten the per-course structure into a single list, tagging every
# document with the course it came from.
for course_entry in docs_raw:
    for faq in course_entry['documents']:
        # Note: this writes the 'course' key into the dict held by docs_raw too.
        faq['course'] = course_entry['course']
        documents.append(faq)

In [7]:
# Preview the first three flattened documents to sanity-check their structure.
documents[:3]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines 

#### 3. Indexing the documents

In [8]:
# Free-text search runs over question/text/section; "course" is registered
# separately as the keyword (filtering) field.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

In [9]:
# Fit the index on the flattened documents. As the last expression of the
# cell, the call's return value (the Index object) is echoed as the output.
index.fit(documents)

<minsearch.minsearch.Index at 0x1b11946d0f0>

In [None]:
def search(query):
    """Look up the FAQ entries most relevant to ``query``.

    The lookup goes through the module-level ``index``. Matches in the
    'question' field are boosted by 3.0 and matches in 'section' by 0.5;
    no explicit boost is given for 'text'. At most three results are
    requested.

    Parameters
    ----------
    query : str
        The user's question.

    Returns
    -------
    Whatever ``index.search`` returns (consumed downstream as an iterable
    of document dicts).
    """
    return index.search(
        query=query,
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=3,
    )

In [24]:
def build_prompt(query, retrieved_docs):
    """Assemble the LLM prompt from a question and its retrieved FAQ entries.

    Parameters
    ----------
    query : str
        The user's question; substituted for ``{question}`` in the template.
    retrieved_docs : iterable of dict
        Each entry must provide the keys 'section', 'question' and 'text'.

    Returns
    -------
    str
        The filled-in prompt. Continuation lines of the template keep their
        four-space indentation because the literal is only stripped at its
        ends, not dedented.

    Raises
    ------
    KeyError
        If an entry lacks one of the required keys.
    """
    template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT doesn't contain the answer, output I don't Know.

    QUESTION: {question}

    CONTEXT: {context}

    ANSWER:
    """.strip()

    # One text block per retrieved entry, each terminated by a blank line.
    context_blocks = [
        f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        for entry in retrieved_docs
    ]

    return template.format(question=query, context="".join(context_blocks)).strip()
    

In [25]:
query = "How do I run Kafka?"

# Retrieve with the same `query` that goes into the prompt, so the retrieval
# step and the prompt can never drift apart when the question is edited.
retrieved_docs = search(query)

prompt = build_prompt(query, retrieved_docs)

In [26]:
# Show the exact prompt text built above before it is passed to the model.
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT doesn't contain the answer, output I don't Know.

    QUESTION: How do I run Kafka?

    CONTEXT: section: Module 6: streaming with kafka
question: Java Kafka: How to run producer/consumer/kstreams/etc in terminal
answer: In the project directory, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

section: Module 6: streaming with kafka
question: Module “kafka” not found when trying to run producer.py
answer: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.
To create a virtual env and install packages (run only once)
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
To activate it (you'll need to run it every time you need the virtual env):
source env/bin/

In [37]:
def get_llm(model_name):
    """Return a chat model client for a short model name.

    Parameters
    ----------
    model_name : str
        "mistral" selects ``ChatMistralAI()``; "llama" selects
        ``ChatOllama(model="llama3.2-3b")``.

    Returns
    -------
    The constructed chat model instance.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. (Previously this
        case printed a message and then failed with UnboundLocalError on the
        ``return``.)
    """
    if model_name == "mistral":
        # Constructed without arguments; presumably credentials come from the
        # environment populated by load_dotenv() -- TODO confirm.
        return ChatMistralAI()
    if model_name == "llama":
        # NOTE(review): Ollama model tags usually look like "llama3.2:3b";
        # confirm that "llama3.2-3b" exists on the local Ollama server.
        return ChatOllama(model="llama3.2-3b")
    raise ValueError(
        f"Invalid model name: {model_name!r}. Expected 'mistral' or 'llama'."
    )

In [38]:
def get_llm_response(prompt, model_name="mistral"):
    """Send ``prompt`` to the selected chat model and return the reply text.

    Parameters
    ----------
    prompt : str
        The full prompt to pass to the model's ``invoke``.
    model_name : str, optional
        Name forwarded to ``get_llm``; defaults to "mistral".

    Returns
    -------
    The ``content`` attribute of the model's response.
    """
    chat_model = get_llm(model_name=model_name)
    return chat_model.invoke(prompt).content

In [39]:
# Ask the default model for an answer to the prompt built above.
answer = get_llm_response(prompt)

In [36]:
# print() renders the newlines in the reply instead of showing the raw repr.
print(answer)

The context does not provide instructions on how to run Kafka in general. However, it does provide specific instructions for running Kafka-related tasks in certain contexts. 

For running a Python Kafka producer, the context suggests creating a virtual environment and installing the necessary dependencies using a `requirements.txt` file. The command to create and activate the virtual environment is:
```bash
python -m venv env
source env/bin/activate
```
And to install the dependencies, run:
```bash
pip install -r ../requirements.txt
```
However, these instructions do not directly answer the question of how to run Kafka. Therefore, I must output:

I don't Know.


In [44]:
def rag(query):
    """Answer ``query`` by running retrieval, prompt building and generation.

    Parameters
    ----------
    query : str
        The user's question.

    Returns
    -------
    The value returned by ``get_llm_response`` for the built prompt.
    """
    # Retrieval: fetch the FAQ entries to ground the answer in.
    context_docs = search(query)

    # Augmentation: combine the question and the entries into one prompt.
    augmented_prompt = build_prompt(query, context_docs)

    # Generation: let the LLM answer from the augmented prompt.
    return get_llm_response(augmented_prompt)

In [45]:
# Run the whole pipeline (search -> build_prompt -> get_llm_response) on a
# sample question and show the generated answer.
answer = rag(query="the course has already started, can I still enroll?")

print(answer)

Yes, you can still enroll in the course even if it has already started. You won't be able to submit some of the homeworks, but you can still participate in the course. To earn a certificate, you need to submit 2 out of 3 course projects and review 3 peers' projects by the deadline. Therefore, if you join the course late and complete the required projects, you will still be eligible for a certificate.
