In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py

In [None]:
import minsearch

In [None]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [None]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [None]:
index.fit(documents)

In [None]:
from openai import OpenAI

In [None]:
def search(q):
    boost={'question':3.0,'section':0.5}
    results= index.search(
        query=q,
        filter_dict={'course':'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )
    return results


In [None]:
def llm(prompt):
    client = OpenAI()
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
    )
    results = response.choices[0].message.content
    return results

In [None]:
def build_context(query, search_results):
    prompt_template ="""
    You are the course teaching assistant. Answer the QUESTION based on the CONTEXT. 
    Only use facts from CONTEXT when answering the QUESTION.
    If CONTEXT does not contain an answer, return NULL.
    
    QUESTION:{question}
    
    CONTEXT:
    {context}
    """.strip()
    context =""
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion:{doc['question']}\nanswer:{doc['text']}\n\n"
    prompt = prompt_template.format(question=query,context=context).strip()
    return prompt

In [None]:
def  rag(query):
    search_results = search(query)
    prompt=build_context(query,search_results)
    llm_output = llm(prompt)
    return llm_output

In [None]:
query = "How do I run a Kafka?"
rag(query)

In [None]:
from elasticsearch import Elasticsearch

In [None]:
es_client = Elasticsearch("http://localhost:9200")

In [None]:
index_settings={
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}
index_name ="corse-index"

In [None]:
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
from tqdm.auto import tqdm

In [None]:
for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

In [None]:
query = "How do execute a command on a Kubernetes pod?"

In [None]:
def elastic_search(query):    
    search_quesry={
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    es_results=es_client.search(index=index_name,body=search_quesry)
    res_documents=[]
    for hit in es_results['hits']['hits']:
        res_documents.append(hit['_source'])
    return res_documents

In [None]:
search_quesry={
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}
es_results=es_client.search(index=index_name,body=search_quesry)

In [None]:
es_results['hits']

In [None]:
elastic_search(query)

In [None]:
def es_rag(query):
    search_results = elastic_search(query)
    prompt=build_context(query,search_results)
    llm_output = llm(prompt)
    return llm_output

In [None]:
es_rag(query)

In [None]:
!curl localhost:9200

In [157]:
def elastic_search2(query):    
    search_quesry2={
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                            "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }
    es_results=es_client.search(index=index_name,body=search_quesry2)
    res_documents=[]
    for hit in es_results['hits']['hits']:
        res_documents.append(hit['_source'])
    return res_documents

In [161]:
query = "How do copy a file to a Docker container?"

In [162]:
search_results2=elastic_search2(query)

In [163]:
def build_context2(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    context =""
    for doc in search_results:
        context = context + f"Q:{doc['question']}\n\nA:{doc['text']}\n\n".strip()
    prompt = prompt_template.format(question=query,context=context)
    return prompt

In [165]:
prompt2=build_context2(query,search_results2)

In [166]:
prompt2

'You\'re a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n    Use only the facts from the CONTEXT when answering the QUESTION.\n\n    QUESTION: How do copy a file to a Docker container?\n\n    CONTEXT:\n    Q:Java Kafka: How to run producer/consumer/kstreams/etc in terminal\n\nA:In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.javaQ:Module “kafka” not found when trying to run producer.py\n\nA:Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.\nTo create a virtual env and install packages (run only once)\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\nTo activate it (you\'ll need to run it every time you need the virtual env):\nsource env/bin/activate\nTo deactivate it:\ndeactivate\nThis works on MacOS, Linux and Windows - but for Windows the path is slightly different (