In [None]:
import json 

In [None]:
with open('document.json') as f_in: 
    docs_raw = json.load(f_in)

In [None]:
documents = []
for course_dict in docs_raw : 
    for doc in course_dict['documents'] : 
        doc['course'] = course_dict['course']
        documents.append(doc)

In [None]:
documents[0]

In [12]:
from elasticsearch import Elasticsearch

# Connect to the Elasticsearch cluster
es_client = Elasticsearch("http://localhost:9200")

if es_client.ping():
    print("Connected to Elasticsearch")
else:
    print("Could not connect to Elasticsearch")

Connected to Elasticsearch


In [13]:
es_client.indices.delete(index='course-questions', ignore=[400, 404])


  es_client.indices.delete(index='course-questions', ignore=[400, 404])


ObjectApiResponse({'error': {'root_cause': [{'type': 'index_not_found_exception', 'reason': 'no such index [course-questions]', 'resource.type': 'index_or_alias', 'resource.id': 'course-questions', 'index_uuid': '_na_', 'index': 'course-questions'}], 'type': 'index_not_found_exception', 'reason': 'no such index [course-questions]', 'resource.type': 'index_or_alias', 'resource.id': 'course-questions', 'index_uuid': '_na_', 'index': 'course-questions'}, 'status': 404})

In [14]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = 'course-questions' 

es_client.indices.create(index = index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [15]:
from tqdm.auto import tqdm 

In [16]:
for doc in tqdm(documents) : 
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [55]:
query = "How do i register the course"

In [56]:
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^5", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [57]:
response = es_client.search(index=index_name, body=search_query)

In [58]:
result_docs = [] 
for hit in response['hits']['hits'] : 
    result_docs.append(hit['_source'])

In [60]:
result_docs

[{'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on your computer will make it easy for you to access the instructors’ code and make pull requests (if you want to add your own notes or make changes to the course content).\nYou will probably also create your own repositories that host your notes, versions of your file, to do this. Here is a great tutorial that shows you how to do 

In [74]:
def elastic_search(query) : 
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^5", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)
    result_docs = [] 
    for hit in response['hits']['hits'] : 
        result_docs.append(hit['_source'])
    return result_docs

In [75]:
def build_prompt(query, search_result) : 
    prompt_template = """
        You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
        Use only the facts from the CONTEXT when answering the QUESTION. 

        QUESTION: {question}

        CONTEXT: 
        {context}
        """.strip()
    context = ""

    for doc in search_result: 
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt
    

In [76]:
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',)
def llm(prompt):
    response = client.chat.completions.create(
        model='llama3.2',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [77]:
def rag(query): 
    results = elastic_search(query)
    prompt = build_prompt(query, results)
    answer = llm(prompt)
    return answer

In [70]:
print(query)

How do i register the course


In [80]:
print(rag('Who teach this course'))

Based on the provided CONTEXT, I cannot find any information about who teaches this course. The context only covers general questions about the course itself, such as prerequisites, how to use Git/GitHub, expected time commitment, course start date and preparation before the course begins
