In [16]:
import minsearch
import json
from openai import OpenAI
from dotenv import load_dotenv
import os

In [17]:
# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is read from the environment further down in this notebook.
load_dotenv()

True

In [18]:
def prepare_data(path):
    """Load the FAQ JSON file and flatten it into a single list of documents.

    The file is expected to contain a list of course entries, each a dict with
    a 'course' value and a 'documents' list. Every document is tagged with the
    'course' value of the entry it came from.

    Args:
        path: Location of the JSON file.

    Returns:
        A flat list of the document dicts, each with a 'course' key set.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If a course entry lacks 'documents', or lacks 'course'
            while having at least one document.
    """
    # JSON is UTF-8 by specification; be explicit so the result does not
    # depend on the platform's locale encoding (e.g. cp1252 on Windows).
    with open(path, 'r', encoding='utf-8') as f:
        docs_raw = json.load(f)

    documents = []
    for course_dict in docs_raw:
        for doc in course_dict['documents']:
            # Tag each document with its course so searches can filter on it.
            doc['course'] = course_dict['course']
            documents.append(doc)
    return documents

In [19]:
def search(query, index, course='data-engineering-zoomcamp', num_results=5, boost_dict=None):
    """Query the index for the documents most relevant to `query`.

    Args:
        query: Free-text question to search for.
        index: Object exposing
            ``search(query=..., boost_dict=..., num_results=..., filter_dict=...)``
            (here, a fitted ``minsearch.Index``).
        course: Value the 'course' field must match. Pass ``None`` to send an
            empty filter instead.
        num_results: Maximum number of results to request.
        boost_dict: Per-field weights; a value > 1 makes a field more
            important, < 1 less important. Defaults to weighting the
            'question' field 3x.

    Returns:
        Whatever ``index.search`` returns.
    """
    if boost_dict is None:
        # Built per call so the default is never a shared mutable object.
        boost_dict = {'question': 3.0}
    filter_dict = {} if course is None else {'course': course}
    return index.search(
        query=query,
        boost_dict=boost_dict,
        num_results=num_results,
        filter_dict=filter_dict,
    )

In [20]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user's question and retrieved FAQ entries.

    Each entry in `search_results` must provide 'section', 'question' and
    'text'; they are rendered one after another into the CONTEXT part of the
    prompt. The result has leading and trailing whitespace stripped.
    """
    faq_entries = "".join(
        f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        for entry in search_results
    )

    # Layout of the template: role, instructions, then the question and the context
    template = """
        You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. Use only the facts from the CONTEXT when answering the QUESTION. If the CONTEXT doesn't contain
        the answer, output NONE
    
    QUESTION: {query}

    CONTEXT: 
    {context}
    """

    filled = template.format(context=faq_entries, query=query)
    return filled.strip()

In [24]:
def llm(prompt, client, model='gpt-4o-mini'):
    """Send `prompt` as a single user message and return the model's reply.

    Args:
        prompt: Full prompt text to send.
        client: Object exposing ``chat.completions.create(model=..., messages=...)``
            (here, an ``openai.OpenAI`` client).
        model: Name of the chat model to use.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.choices[0].message.content

In [22]:
def rag(query, client, index):
    """Run the full retrieve-then-generate pipeline for one question.

    Looks up documents for `query` in `index`, turns them into a prompt, and
    returns the answer produced by the model behind `client`.
    """
    retrieved = search(query=query, index=index)
    return llm(
        prompt=build_prompt(query=query, search_results=retrieved),
        client=client,
    )

In [25]:
# End-to-end demo: build the client and the search index, then answer one question.
query = 'can I enroll in the course after it got started?'
# The key comes from the environment; os.getenv returns None if it is not set.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=OPENAI_API_KEY)

documents = prepare_data(path='documents.json')
# text_fields ('question', 'text', 'section') are the fields the free-text query is searched in;
# keyword_fields ('course') are the fields results can be filtered on - search() filters on 'course'.
index = minsearch.Index(text_fields=['question','text','section'],keyword_fields=['course'])
index.fit(documents)
print(rag(query=query,client=client,index=index))

Yes, you can still join the course after the start date. However, be aware that there will be deadlines for turning in the final projects.
