In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py

In [None]:
import minsearch

In [None]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [None]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [None]:
index.fit(documents)

In [None]:
from openai import OpenAI

In [None]:
def search(q):
    boost={'question':3.0,'section':0.5}
    results= index.search(
        query=q,
        filter_dict={'course':'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )
    return results


In [None]:
def llm(prompt):
    client = OpenAI()
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
    )
    results = response.choices[0].message.content
    return results

In [None]:
def build_context(query, search_results):
    prompt_template ="""
    You are the course teaching assistant. Answer the QUESTION based on the CONTEXT. 
    Only use facts from CONTEXT when answering the QUESTION.
    If CONTEXT does not contain an answer, return NULL.
    
    QUESTION:{question}
    
    CONTEXT:
    {context}
    """.strip()
    context =""
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion:{doc['question']}\nanswer:{doc['text']}\n\n"
    prompt = prompt_template.format(question=query,context=context).strip()
    return prompt

In [None]:
def  rag(query):
    search_results = search(query)
    prompt=build_context(query,search_results)
    llm_output = llm(prompt)
    return llm_output

In [None]:
query = "How do I run a Kafka?"
rag(query)

In [None]:
from elasticsearch import Elasticsearch

In [None]:
es_client = Elasticsearch("http://localhost:9200")

In [None]:
index_settings={
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}
index_name ="corse-index"

In [None]:
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
from tqdm.auto import tqdm

In [None]:
for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

In [None]:
query = "How do execute a command on a Kubernetes pod?"

In [None]:
def elastic_search(query):    
    search_quesry={
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    es_results=es_client.search(index=index_name,body=search_quesry)
    res_documents=[]
    for hit in es_results['hits']['hits']:
        res_documents.append(hit['_source'])
    return res_documents

In [None]:
search_quesry={
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}
es_results=es_client.search(index=index_name,body=search_quesry)

In [None]:
es_results['hits']

In [None]:
elastic_search(query)

In [None]:
def es_rag(query):
    search_results = elastic_search(query)
    prompt=build_context(query,search_results)
    llm_output = llm(prompt)
    return llm_output

In [None]:
es_rag(query)

In [None]:
!curl localhost:9200

In [167]:
query = "How do copy a file to a Docker container?"

In [168]:
def elastic_search2(query):    
    search_quesry2={
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                            "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }
    es_results=es_client.search(index=index_name,body=search_quesry2)
    res_documents=[]
    for hit in es_results['hits']['hits']:
        res_documents.append(hit['_source'])
    return res_documents

In [169]:
search_results2=elastic_search2(query)

In [170]:
def build_context2(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    context =""
    for doc in search_results:
        context = context + f"Q:{doc['question']}\n\nA:{doc['text']}\n\n".strip()
    prompt = prompt_template.format(question=query,context=context)
    return prompt

In [171]:
prompt2=build_context2(query,search_results2)

In [174]:
len(prompt2)

1455

In [177]:
pip install tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [178]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")

In [185]:
encoding.encode(prompt2)

[63842,
 261,
 4165,
 14029,
 29186,
 13,
 30985,
 290,
 150339,
 4122,
 402,
 290,
 31810,
 8099,
 591,
 290,
 40251,
 7862,
 558,
 271,
 7649,
 1606,
 290,
 19719,
 591,
 290,
 31810,
 8099,
 1261,
 55959,
 290,
 150339,
 364,
 271,
 150339,
 25,
 3253,
 621,
 5150,
 261,
 1974,
 316,
 261,
 91238,
 9282,
 1715,
 271,
 31810,
 8099,
 734,
 271,
 1486,
 25,
 5299,
 621,
 357,
 15199,
 261,
 62275,
 9282,
 1715,
 32,
 25,
 35423,
 290,
 9282,
 3621,
 306,
 25383,
 6766,
 326,
 151187,
 290,
 7251,
 4859,
 11,
 813,
 484,
 480,
 13217,
 261,
 38615,
 6348,
 558,
 68923,
 2461,
 533,
 278,
 2230,
 7962,
 4859,
 38615,
 464,
 3365,
 523,
 3335,
 290,
 9282,
 382,
 4279,
 6788,
 11,
 15792,
 261,
 6348,
 306,
 290,
 4857,
 9282,
 734,
 68923,
 10942,
 350,
 6555,
 290,
 9282,
 26240,
 446,
 68923,
 25398,
 533,
 278,
 464,
 6896,
 26240,
 29,
 38615,
 198,
 6103,
 277,
 10732,
 391,
 79771,
 8,
 48,
 25,
 5299,
 621,
 357,
 5150,
 6291,
 591,
 922,
 2698,
 7342,
 316,
 62275,
 9282,
 1715,