In [3]:
import minsearch
import json

In [4]:
# Load the raw FAQ export. The encoding is passed explicitly because the data
# contains non-ASCII characters (e.g. curly quotes), and open() would otherwise
# fall back to a platform-dependent default encoding.
with open('documents.json', 'rt', encoding='utf-8') as f:
    docs_raw = json.load(f)

In [5]:
# Flatten the per-course groups in docs_raw into one list of documents.
# Each group provides a 'course' value and a 'documents' list; every document
# is tagged with the course of the group it came from.
documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        # Tag a copy rather than the original dict so docs_raw stays exactly
        # as it was loaded, no matter how often this cell is re-run.
        documents.append({**doc, 'course': course_dict['course']})
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# Declare which document keys minsearch treats as text fields and which as
# keyword fields ('course' is the key the later searches filter on).
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=['course'],
)

In [7]:
# Fit the index on the flattened documents; the value returned by fit() is
# what the cell displays.
index.fit(documents)

<minsearch.Index at 0x7d9be0c0f230>

In [8]:
# Question we want the FAQ index to answer.
q = 'the course has started, can i still enroll'
# Per-field weights passed to the search: 'question' is weighted 3.0 and
# 'section' 0.5.
boost = {'question': 3.0, 'section': 0.5}
results = index.search(
    query=q,
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5,
)

In [9]:
from groq import Groq
import os

In [10]:
# Groq API client used by the completion calls below.
# NOTE(review): constructed without arguments, so credentials presumably come
# from the environment (e.g. a GROQ_API_KEY variable) — confirm.
client = Groq()

In [11]:
# Baseline: send the bare question to the model, with no retrieved context
# in the message.
q = 'the course has started, can i still enroll?'
response = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[{'role': 'user', 'content': q}],
)

In [12]:
#print(response.choices[0].message.content)

In [13]:
# Prompt skeleton with two placeholders, {question} and {context}, filled in
# later via str.format(). Written as explicit adjacent literals so every
# newline (and the trailing space on the first line) is visible.
prompt_template = (
    "You are a teaching assistant. Answer the QUESTION based on CONTEXT. \n"
    "Use only the facts from CONTEXT for answering QUESTION\n"
    "If the CONTEXT does not contain the answer, output NONE\n"
    "QUESTION:{question}\n"
    "CONTEXT:{context}"
)

In [14]:
# Concatenate the retrieved documents into one text block: three labelled
# lines per document, with a blank line after each entry.
context = ""

for doc in results:
    context += (
        f"section: {doc['section']}\n"
        f"question: {doc['question']}\n"
        f"answer:{doc['text']}\n\n"
    )

In [15]:
# Fill both placeholders of the template, then trim leading/trailing
# whitespace from the result.
prompt = prompt_template.format(
    question=q,
    context=context,
).strip()

In [16]:
# Same model as the baseline call, but this time the user message is the
# context-augmented prompt.
response = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[{'role': 'user', 'content': prompt}],
)

In [17]:
# Print the text of the first choice in the completion response.
print(response.choices[0].message.content)

Yes, even if you don't register, you're still eligible to submit the homeworks.


In [18]:
## Putting all the functionality in functions to add structure

In [19]:
def search(query):
    """Search the module-level minsearch `index` for `query`.

    The search is filtered to course 'data-engineering-zoomcamp', weights the
    'question' field 3.0 and the 'section' field 0.5, and asks for 5 results.

    Returns whatever `index.search` returns.
    """
    return index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

In [20]:
def build_prompt(query, search_results):
    """Build the LLM prompt for `query` from the retrieved documents.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of dicts, each with 'section', 'question'
            and 'text' keys. May be empty, in which case the prompt ends
            with an empty ``CONTEXT:``.

    Returns:
        The formatted prompt with leading/trailing whitespace stripped.
    """
    # Built from adjacent literals instead of an indented triple-quoted
    # string: inside a function body the triple-quoted form carries the
    # source indentation into every line after the first, and .strip() only
    # removes whitespace at the two ends of the string.
    prompt_template = (
        "You are a teaching assistant. Answer the QUESTION based on CONTEXT. \n"
        "Use only the facts from CONTEXT for answering QUESTION\n"
        "If the CONTEXT does not contain the answer, output NONE\n"
        "QUESTION:{question}\n"
        "CONTEXT:{context}"
    )

    # One entry per document, with a blank line after each entry.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer:{doc['text']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [21]:
def llm(prompt, model="llama-3.3-70b-versatile"):
    """Send `prompt` to the chat-completions API and return the reply text.

    Args:
        prompt: Text sent as the content of a single 'user' message.
        model: Name of the model to query; defaults to the model used
            throughout this notebook.

    Returns:
        The message content of the first choice in the response.
    """
    # Reuse the notebook's module-level `client` rather than constructing a
    # new Groq() on every call, in the same way search() relies on the
    # module-level `index`.
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.choices[0].message.content

In [22]:
def rag_minisearch(query):
    """Answer `query` with the minsearch-backed pipeline.

    Retrieves documents with search(), turns them into a prompt with
    build_prompt(), and returns what llm() produces for that prompt.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

# Example question run through the minsearch pipeline; the returned answer
# is the cell's output.
query = 'how do i run kafka?'
rag_minisearch(query=query)

"To run Kafka, the context provides the following information:\n\n1. For Java Kafka: In the project directory, run `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`\n2. For Python Kafka: Create a virtual environment, install requirements.txt, and run the python files in that environment. \n\nHowever, these are specific to running the producer or other components, not the general process of running Kafka itself. For general Kafka setup and running, there's no information provided in the context. \n\nTherefore, based on the context provided, the answer is: NONE"

In [23]:
## Elasticsearch

In [24]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

In [None]:
es_client = Elasticsearch('http://localhost:9200')
index_name = 'course-questions'

# Explicit index definition. 'course' is mapped as a keyword so that the
# exact-value `term` filter used at query time matches the stored value.
index_setting = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

# The index must exist with the mapping above BEFORE any document is indexed:
# indexing into a missing index makes Elasticsearch auto-create it with
# dynamically inferred mappings, and a later create call then fails because
# the index already exists.
# The exists() guard also makes the cell safe to re-run without indexing every
# document a second time. To rebuild from scratch (e.g. after changing the
# mapping), delete the index first.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_setting)

    for doc in tqdm(documents):
        es_client.index(index=index_name, document=doc)

In [29]:
def elastic_search(es_query):
    """Run `es_query` against the Elasticsearch index `index_name`.

    Uses the module-level `es_client` and `index_name`. Returns a list with
    the `_source` of every hit in the response.
    """
    # Full-text clause over three fields; "question^3" gives the question
    # field three times the weight of text and section.
    text_clause = {
        "multi_match": {
            "query": es_query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }

    # Exact-value filter on the course; same key/value as the filter_dict
    # used with minsearch.
    course_clause = {
        "term": {
            "course": "data-engineering-zoomcamp"
        }
    }

    search_query = {
        "size": 5,  # number of results requested from Elasticsearch
        "query": {
            "bool": {
                "must": text_clause,
                "filter": course_clause
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [30]:
def rag_elastic(es_query):
    """Answer `es_query` with the Elasticsearch-backed pipeline.

    Retrieves documents with elastic_search(), turns them into a prompt with
    build_prompt(), and returns what llm() produces for that prompt.
    """
    retrieved = elastic_search(es_query)
    return llm(build_prompt(es_query, retrieved))
    

In [31]:
# Example question run through the Elasticsearch pipeline; the returned
# answer is the cell's output.
es_query = 'I just discovered about the course, can i still join?'
rag_elastic(es_query=es_query)

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."