In [34]:
import minsearch
import json
import requests
from elasticsearch import Elasticsearch

In [2]:
# Load the raw FAQ dump. The flattening cell below iterates it as a list of
# per-course dicts, each carrying a 'course' value and a 'documents' list.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [3]:
# Flatten the per-course groups in `docs_raw` into a single list, stamping
# every entry (in place) with the name of the course it belongs to.
documents = []

for course_group in docs_raw:
    for faq_entry in course_group['documents']:
        faq_entry.update(course=course_group['course'])
        documents.append(faq_entry)

In [4]:
# In-memory minsearch index over the flattened FAQ entries: "question",
# "text" and "section" are registered as text fields, "course" as a keyword
# field (used as a filter by `search` below).
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)
# Kept as the last expression so the cell still displays the fitted index.
index.fit(documents)

<minsearch.Index at 0x7ff5b9617170>

In [43]:
# Sample user question used for the first end-to-end `rag` run below.
q = "the course has already started, can I still enroll?"

In [6]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Query the module-level minsearch ``index`` for FAQ entries.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must match. Defaults to
            'data-engineering-zoomcamp', the value that was previously
            hard-coded, so existing ``search(query)`` calls are unchanged.
        num_results: Maximum number of results requested from the index
            (default 5, the previously hard-coded value).

    Returns:
        Whatever ``index.search`` returns, passed through unchanged.
    """
    # Matches on "question" are boosted 3.0 and matches on "section" 0.5;
    # "text" is not listed and so keeps the index's default weight.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of mappings that each provide 'section',
            'question' and 'text' keys.

    Returns:
        The filled-in prompt with surrounding whitespace stripped. Each
        entry contributes a "section / question / answer" stanza followed by
        a blank line to the CONTEXT part.
    """
    # Same text as a stripped triple-quoted template, spelled with explicit
    # escapes so the trailing space after "CONTEXT:" is visible.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    context_blocks = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = prompt_template.format(question=query, context="".join(context_blocks))
    return filled.strip()

In [22]:
def llm_llama(prompt: str, model: str = 'llama3') -> dict:
    """Generate a completion for ``prompt`` through the local HTTP generate endpoint.

    Posts ``{"model": ..., "prompt": ...}`` to
    ``http://localhost:11434/api/generate`` (presumably a local Ollama
    server -- confirm) and reads the body as newline-delimited JSON, one
    object per line. Each object carries a boolean ``done`` flag; the
    ``response`` fragments are concatenated in order and the metadata is
    taken from the object whose ``done`` is true.

    Args:
        prompt: Full prompt text to send.
        model: Name of the model to request (default 'llama3').

    Returns:
        A dict with the keys 'model', 'response', 'context',
        'total_duration', 'load_duration' and 'eval_duration'. 'response' is
        the joined generated text; the other values are copied from the
        final stream object (``None`` if the server omitted a field).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If a line is not valid JSON, reports an ``error``, has a
            non-boolean ``done`` value, or the stream ends without a
            ``done: true`` object.
    """
    url = 'http://localhost:11434/api/generate'
    payload = {
        "model": model,
        "prompt": prompt,
    }

    http_response = requests.post(url, json=payload)
    # Fail on HTTP errors here instead of with a KeyError while parsing an
    # error body further down.
    http_response.raise_for_status()

    fragments = []
    final_chunk = None

    # Split on '\n' only: str.splitlines() would also break on characters
    # such as U+2028 that may legally appear unescaped inside JSON strings.
    for line in http_response.text.strip().split('\n'):
        if not line.strip():
            continue

        chunk = json.loads(line)
        if 'error' in chunk:
            raise ValueError(f"generation failed: {chunk['error']}")

        done = chunk.get('done')
        if done is not True and done is not False:
            raise ValueError(f"unexpected 'done' value in stream line: {done!r}")

        # Collect text from every line, including the final one, so a
        # single-object (non-streamed) reply is handled as well.
        fragments.append(chunk.get('response', ''))
        if done:
            final_chunk = chunk

    if final_chunk is None:
        raise ValueError("response stream ended without a 'done': true line")

    return {
        "model": final_chunk.get('model'),
        "response": "".join(fragments),
        "context": final_chunk.get('context'),
        "total_duration": final_chunk.get('total_duration'),
        "load_duration": final_chunk.get('load_duration'),
        "eval_duration": final_chunk.get('eval_duration'),
    }

In [38]:
# Index queried by `elastic_search` below.
index_name = "course-questions"
# Client for the Elasticsearch instance at http://localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve FAQ entries for ``query`` from Elasticsearch.

    Runs a ``bool`` query against the module-level ``es_client`` /
    ``index_name``: a ``best_fields`` ``multi_match`` over "question"
    (boosted ^3), "text" and "section", with a ``term`` filter on "course".

    Args:
        query: Free-text question to search for.
        course: Value the "course" field must equal. Defaults to
            "data-engineering-zoomcamp", the value that was previously
            hard-coded, so existing ``elastic_search(query)`` calls are
            unchanged.
        num_results: Value sent as the query's "size" (default 5, the
            previously hard-coded value).

    Returns:
        A list with the ``_source`` of every hit, in the order returned.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                "filter": {
                    "term": {
                        "course": course,
                    }
                },
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [44]:
def rag(query):
    """Answer ``query`` end to end: retrieve, build the prompt, generate.

    Retrieval goes through ``elastic_search``; the minsearch-backed
    ``search`` takes the same single argument and can be swapped in here.

    Returns:
        The dict produced by ``llm_llama`` for the assembled prompt.
    """
    retrieved = elastic_search(query)
    return llm_llama(build_prompt(query, retrieved))

In [47]:
%%time
# Time one full retrieve -> prompt -> generate round trip for the sample question `q`.
r = rag(q)

CPU times: user 20.4 ms, sys: 4.12 ms, total: 24.5 ms
Wall time: 1min 41s


In [48]:
# print() rather than a bare expression so newlines in the generated text render as line breaks.
print(r['response'])

Based on the context, the answer to your question "the course has already started, can I still enroll?" is:

Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


In [49]:
%%time

# Second timed end-to-end run with a different question; rebinds `r` and
# prints only the generated text from the returned dict.
query = 'What installations I should do?'
r = rag(query)
print(r['response'])

Based on the CONTEXT from the FAQ database, I would answer your QUESTION as follows:

Since you're asking about installations to do, I assume you're referring to Module 2: Workflow Orchestration. According to the provided information, you should focus on the following installations:

1. Download each .py/.sql file that corresponds to each block you created for the pipeline. These files can be found under 'data loaders', 'data transformers', and 'data exporters' folders.
2. Move the downloaded files to your GitHub repo folder and commit your changes.

These steps should help you complete the installations required for Module 2: Workflow Orchestration.
CPU times: user 33.7 ms, sys: 4.98 ms, total: 38.7 ms
Wall time: 3min 43s
