In [21]:
import minsearch

In [22]:
import json

In [7]:
# Load the raw FAQ dump. The FAQ text contains non-ASCII characters
# (typographic quotes, accented names), so the file is decoded explicitly as
# UTF-8 instead of relying on the platform's default text encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [8]:
# Flatten the per-course structure into one list of FAQ entries.
# Each entry is tagged (in place) with the name of the course it came from,
# so it can later be filtered by the `course` field.
documents = []

for course_entry in docs_raw:
    for faq in course_entry['documents']:
        faq.update(course=course_entry['course'])
        documents.append(faq)

In [9]:
len(documents)

948

In [10]:
# In-memory search index over the FAQ entries: `question`, `text` and
# `section` are declared as text fields, `course` as a keyword field
# (it is the field the later searches pass in `filter_dict`).
index = minsearch.Index(
    keyword_fields=['course'],
    text_fields=['question', 'text', 'section'],
)

In [11]:
index.fit(documents)

<minsearch.Index at 0x7f76c2c18dd0>

In [12]:
q = 'the course has already started, can I still enroll?'

In [17]:
index.search(q, filter_dict={'course': 'machine-learning-zoomcamp'}, num_results=3)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nOr you can just use this lin

In [15]:
index.search("boosting", filter_dict={'course': 'machine-learning-zoomcamp'})

[{'text': 'For ensemble algorithms, during the week 6, one bagging algorithm and one boosting algorithm were presented: Random Forest and XGBoost, respectively.\nRandom Forest trains several models in parallel. The output can be, for example, the average value of all the outputs of each model. This is called bagging.\nXGBoost trains several models sequentially: the previous model error is used to train the following model. Weights are used to ponderate the models such as the best models have higher weights and are therefore favored for the final output. This method is called boosting.\nNote that boosting is not necessarily better than bagging.\nMélanie Fouesnard\nBagging stands for “Bootstrap Aggregation” - it involves taking multiple samples with replacement to derive multiple training datasets from the original training dataset (bootstrapping), training a classifier (e.g. decision trees or stumps for Random Forests) on each such training dataset, and then combining the the prediction

____

In [46]:
from openai import OpenAI

In [47]:
client = OpenAI()

In [32]:
# Baseline: send the raw question `q` to the model as a single user message,
# without any retrieved FAQ context.
response = client.chat.completions.create(
    messages=[dict(role="user", content=q)],
    model='gpt-4o',
)

In [36]:
response

ChatCompletion(id='chatcmpl-9aQ0FsMCiysOD7RXHmweR1mfokcAg', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="The possibility of enrolling in a course that has already started depends on several factors, including the specific institution's policies, the instructor's discretion, and the structure of the course. Here are some steps you can take to find out if late enrollment is still an option:\n\n1. **Check the Institution’s Policies:** Review the enrollment policies on the institution's website or in the course catalog. Some institutions allow late enrollment within a certain timeframe after the course has begun.\n\n2. **Contact the Registrar’s Office:** Reach out to the registrar’s office to inquire about the possibility of enrolling late. They can provide detailed information about deadlines and any additional steps required.\n\n3. **Speak with the Instructor:** Contact the course instructor directly. They may allow late enrollments

In [42]:
print(response.choices[0].message.content)

The possibility of enrolling in a course that has already started depends on several factors, including the specific institution's policies, the instructor's discretion, and the structure of the course. Here are some steps you can take to find out if late enrollment is still an option:

1. **Check the Institution’s Policies:** Review the enrollment policies on the institution's website or in the course catalog. Some institutions allow late enrollment within a certain timeframe after the course has begun.

2. **Contact the Registrar’s Office:** Reach out to the registrar’s office to inquire about the possibility of enrolling late. They can provide detailed information about deadlines and any additional steps required.

3. **Speak with the Instructor:** Contact the course instructor directly. They may allow late enrollments, especially if the course material is self-paced or if you can catch up with the content you missed.

4. **Consider the Course Format:** Some courses, particularly on

In [18]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Retrieve FAQ entries matching `query` from the minsearch `index`.

    Args:
        query: Free-text question to search for.
        course: Value of the `course` field that results are restricted to.
            Defaults to 'data-engineering-zoomcamp', the course this function
            was previously hard-wired to.
        num_results: Maximum number of entries requested from the index
            (default 5).

    Returns:
        The value returned by `index.search` for these arguments.
    """
    # Field boosts passed to the index: `question` is boosted 3.0 and
    # `section` 0.5; `text` is not listed here.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [40]:
def build_prompt(query, search_results):
    """Render the prompt that asks the model to answer from FAQ context only.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of dicts, each providing the keys
            'section', 'question' and 'text'.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.
    """
    # One "section / question / answer" paragraph per retrieved entry, each
    # followed by a blank line.
    context_entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    # Written with explicit escapes so the trailing space after "CONTEXT:"
    # is visible and cannot be lost to editor whitespace trimming.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    filled = template.format(question=query, context="".join(context_entries))
    return filled.strip()

In [41]:
def llm(prompt, model='gpt-4o'):
    """Send `prompt` to the chat-completions API and return the reply text.

    Args:
        prompt: Text sent as the content of a single "user" message.
        model: Name of the model to request. Defaults to 'gpt-4o', the model
            this function was previously hard-wired to.

    Returns:
        The `content` of the message in the first returned choice.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # Only the first choice is read; any further choices are ignored.
    return response.choices[0].message.content

In [23]:
query = 'how do I run kafka?'

In [29]:
search_results = search(query)
prompt = build_prompt(query, search_results)
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: how do I run kafka?

CONTEXT: 
section: Module 6: streaming with kafka
question: Java Kafka: How to run producer/consumer/kstreams/etc in terminal
answer: In the project directory, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

section: Module 6: streaming with kafka
question: Module “kafka” not found when trying to run producer.py
answer: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.
To create a virtual env and install packages (run only once)
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
To activate it (you'll need to run it every time you need the virtual env):
source env/bin/activate
To deactivate it:
deactivate
This works on MacOS, Linux and Windows - 

In [30]:
def rag(query):
    """Answer `query` with retrieval-augmented generation over minsearch.

    Runs `search` to fetch FAQ entries, feeds them with the question into
    `build_prompt`, and returns what `llm` produces for that prompt.
    """
    return llm(build_prompt(query, search(query)))

In [31]:
rag(query)

'To run Kafka-based applications like producers, consumers, or kstreams in Java, follow these steps provided in the context:\n\nIn the project directory, you need to execute the following command in the terminal:\n\n```sh\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nReplace `<jar_name>` with the actual name of your compiled jar file.'

In [32]:
rag('the course has already started, can I still enroll?')

"Yes, even if the course has already started, you can still enroll and are eligible to submit the homework. However, be mindful of the deadlines for turning in the final projects. It's important not to leave everything for the last minute."

In [36]:
rag('how to use spark to make joins')

'The provided CONTEXT does not contain specific information on how to perform joins using PySpark. If you need to perform joins in PySpark, you typically use the DataFrame API which allows you to perform various types of joins such as inner, outer, left, and right joins. Here is a basic example of how to perform a join in PySpark:\n\nAssuming you have two DataFrames, `df1` and `df2`:\n\n```python\n# Inner Join\nresult = df1.join(df2, df1["key"] == df2["key"], "inner")\n\n# Left Outer Join\nresult = df1.join(df2, df1["key"] == df2["key"], "left_outer")\n\n# Right Outer Join\nresult = df1.join(df2, df1["key"] == df2["key"], "right_outer")\n\n# Full Outer Join\nresult = df1.join(df2, df1["key"] == df2["key"], "outer")\n\n# Cross Join\nresult = df1.crossJoin(df2)\n```\n\nIn these examples:\n- `df1` and `df2` are DataFrames.\n- `"key"` is the column on which you are performing the join.\n- `"inner"`, `"left_outer"`, `"right_outer"`, and `"outer"` specify the type of join.\n\nThis should giv

_____

In [23]:
from elasticsearch import Elasticsearch

In [24]:
es_client = Elasticsearch('http://localhost:9200') 

In [25]:
es_client.info()

{'name': '3cbc31fe41ed',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'sfyC9chHRoSSF3OqTT6f1g',
 'version': {'number': '8.4.3',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73',
  'build_date': '2022-10-04T07:17:24.662462378Z',
  'build_snapshot': False,
  'lucene_version': '9.3.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [31]:
# Explicit settings and mappings for the FAQ index. `course` is mapped as a
# `keyword` so it can be matched exactly; the free-text fields are `text`.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

# Create the index only when it does not exist yet. This keeps the cell safe
# to re-run (an unconditional create fails once the index is there) while
# still applying the mappings above on a fresh cluster, where, by default,
# the first indexed document would otherwise create the index with
# dynamically inferred mappings instead of these.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)

In [28]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [29]:
from tqdm.auto import tqdm

In [32]:
# Index every FAQ entry under a deterministic ID (its position in
# `documents`). Re-running this cell then overwrites the same entries instead
# of adding another copy of each one under a freshly generated ID, which
# would make searches return duplicated results.
for doc_id, doc in enumerate(tqdm(documents)):
    es_client.index(index=index_name, id=str(doc_id), body=doc)

100%|██████████████████████████████████████████████████████| 948/948 [00:06<00:00, 150.67it/s]


In [42]:
query = 'I just disovered the course. Can I still join it?'

In [48]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve FAQ entries matching `query` from the Elasticsearch index.

    Args:
        query: Free-text question to search for.
        course: Exact value of the `course` field used in the `term` filter.
            Defaults to "data-engineering-zoomcamp", the course this function
            was previously hard-wired to.
        num_results: Value sent as the request's `size` (default 5).

    Returns:
        A list with the `_source` of every hit in the response, in the order
        the hits were returned.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Full-text match across the three text fields; "question^3"
                # boosts matches in the question field.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict hits to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [49]:
def rag(query):
    """Answer `query` with retrieval-augmented generation over Elasticsearch.

    Same pipeline shape as the minsearch-based `rag` defined earlier in this
    notebook, which this definition replaces once the cell runs: retrieve
    with `elastic_search`, render with `build_prompt`, answer with `llm`.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [50]:
rag(query)

"Yes, you can still join the course even if it has already started. You don't need to register beforehand, and you are still eligible to submit the homework assignments. Just be mindful of the deadlines for turning in the final projects."

In [52]:
rag("How to install Spark?")

'The provided FAQ database does not have specific instructions about installing Spark. However, I can give you a general guide on how to approach it:\n\n1. **Download Spark**: \n   Visit the [Apache Spark website](https://spark.apache.org/downloads.html) and download the pre-built package for Hadoop. Select the version matching your Hadoop version.\n\n2. **Extract the Spark Archive**:\n   ```bash\n   tar -xvf spark-<version>-bin-hadoop<version>.tgz\n   ```\n\n3. **Set Environment Variables**:\n   Update your `.bashrc` or `.bash_profile` to include the following lines:\n   ```bash\n   export SPARK_HOME=~/path/to/spark\n   export PATH=$SPARK_HOME/bin:$PATH\n   ```\n\n4. **Start Spark**:\n   Navigate to the Spark directory:\n   ```bash\n   cd $SPARK_HOME\n   ./bin/spark-shell\n   ```\nThis will launch the Spark shell with Scala.\n\n5. **Launching a Cluster** (if needed):\n   For running a standalone cluster, follow the steps mentioned in the CONTEXT:\n   ```bash\n   # Start the Spark Mast