In [1]:
from openai import OpenAI

# OpenAI-compatible client pointed at a server on this machine instead of the
# hosted OpenAI API.
# NOTE(review): port 11434 and the 'ollama' key suggest a local Ollama server,
# with the key acting as a placeholder -- confirm against your setup.
client = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1/')

In [2]:
from elasticsearch import Elasticsearch

In [3]:
# Client for the Elasticsearch node at localhost:9200; the later cells reuse it
# for index management, document indexing and search.
es_client = Elasticsearch('http://localhost:9200') 

In [4]:

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed
# and always ends with a freshly created, empty index.
if es_client.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists. Deleting it.")
    es_client.indices.delete(index=index_name)

# Index definition: a single shard with no replicas; the three free-text
# fields are analysed as "text", while "course" is a "keyword" field so it can
# be matched exactly in a term filter.
index_settings = {
    "settings": dict(number_of_shards=1, number_of_replicas=0),
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
        }
    },
}

es_client.indices.create(index=index_name, body=index_settings)
print(f"Index '{index_name}' created successfully.")


Index 'course-questions' already exists. Deleting it.
Index 'course-questions' created successfully.


In [5]:
import requests 

# Location of the FAQ dump.
# NOTE(review): the `?raw=1` suffix presumably makes GitHub serve the file
# contents rather than the HTML viewer page -- confirm.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an explicit exception instead of
# a confusing JSON decode failure on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of FAQ entries, tagging every
# entry with the name of the course it belongs to. New dicts are built so that
# `documents_raw` stays exactly as downloaded and the cell can be re-run.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [6]:
from tqdm.auto import tqdm

In [7]:
# Send every FAQ entry to Elasticsearch as its own document (one request per
# entry), with a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

# Newly indexed documents only become searchable after a refresh. Forcing one
# here makes a search issued right after this cell (e.g. on "Run All") see the
# complete data set. The trailing `;` keeps the API response out of the cell
# output.
es_client.indices.refresh(index=index_name);

  0%|          | 0/948 [00:00<?, ?it/s]

In [8]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Return the FAQ entries that best match ``query`` for one course.

    Args:
        query: Free-text user question to search for.
        course: Exact value the ``course`` field must have. Defaults to the
            course this notebook was written for, so existing calls that pass
            only ``query`` behave as before.
        size: Maximum number of hits to request. Defaults to 5.

    Returns:
        A list with the ``_source`` of every hit, in the order Elasticsearch
        returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                # Scored part: match the query against the three text fields;
                # `question^3` boosts matches in the question field.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Non-scored part: restrict hits to the requested course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [9]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Each entry in ``search_results`` must provide the keys ``section``,
    ``question`` and ``text``; they are listed under CONTEXT in the order
    given. The returned string has surrounding whitespace stripped.
    """
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per FAQ entry, each terminated by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]
    context = "".join(entries)

    return prompt_template.format(question=query, context=context).strip()


def llm(prompt, model='phi3'):
    """Send ``prompt`` as a single user message and return the full reply text.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Name of the model to query. Defaults to ``'phi3'``, so existing
            calls that pass only ``prompt`` behave as before.

    Returns:
        The concatenation of all non-empty content pieces received from the
        streamed response; an empty string if none arrived.
    """
    # The reply is requested as a stream and reassembled here.
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )

    # Skip chunks that carry no choices or no content; collecting the pieces
    # with join avoids rebuilding the string once per chunk.
    return "".join(
        chunk.choices[0].delta.content
        for chunk in stream
        if chunk.choices and chunk.choices[0].delta.content
    )


In [10]:
def rag(query):
    """Answer ``query`` end to end: retrieve, build the prompt, ask the LLM.

    The query is first used to fetch matching FAQ entries, then combined with
    those entries into a prompt, and the LLM's reply to that prompt is
    returned.
    """
    return llm(build_prompt(query, elastic_search(query)))

In [11]:
# Sample user question fed to the RAG pipeline.
# NOTE(review): "disovered" is misspelled; left untouched because the recorded
# answer in this notebook was produced from this exact text.
query = 'I just disovered the course. Can I still join it?'

In [12]:
# Run the whole pipeline (search -> prompt -> LLM) on the sample question; the
# returned answer string is displayed as the cell output.
rag(query)

"Based on my role as a course teaching assistant, here are some important details about joining or continuing with our Data Engineering Bootcamp:\n\n1. If you've discovered this opportunity and wish to join after its official starting day but still want access to homework assignments even without registration because their work is not verified against any registered lists; we encourage doing so, although there are deadlines for final projects that prompt students not to wait until the very last minute.\n\n2. You can indeed follow this course at your own pace well after it concludes as all materials remain accessible and you could potentially continue with homework submissions or even start working on a capstone project which seems most beneficial if done within an organized setting of our FAQ document, slack channel discussions, and @ZoomcampQABot's support.\n\n3. Before enrolling in the course (even self-paced) it is encouraged to prepare by ensuring that you understand all dependenci