In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

## Index docs

In [2]:
from elasticsearch import Elasticsearch
client = Elasticsearch('http://localhost:9200')

client.info()

# resp = client.index(index="test-index", id=1, document=doc)
# print(resp['result'])

ObjectApiResponse({'name': 'e712d8d9d768', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'jphSPa9QTK69xs4el7bwWw', 'version': {'number': '8.17.6', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'dbcbbbd0bc4924cfeb28929dc05d82d662c527b7', 'build_date': '2025-04-30T14:07:12.231372970Z', 'build_snapshot': False, 'lucene_version': '9.12.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [3]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"
response = client.indices.create(index=index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/UsJ17lKwT2mJGqNxMcPW5w] already exists')

In [4]:
# Index all documents
for doc in documents:
    client.index(index=index_name, document=doc)

# Verify the indexing
doc_count = client.count(index=index_name)
print(f"Indexed {doc_count['count']} documents")

Indexed 1896 documents


In [5]:
# Search query with boost on question field
query = "How do execute a command on a Kubernetes pod?"

search_query = {
    "size": 5,
    "query": {
        "multi_match": {
            "query": query,
            "fields": ["question^4", "text"],
            "type": "best_fields"
        }
    }
}

response = client.search(index=index_name, body=search_query)

# Display the top result and its score
top_result = response['hits']['hits'][0]
print(f"Top result score: {top_result['_score']}")
print(f"Question: {top_result['_source']['question']}")
print(f"Text: {top_result['_source']['text'][:200]}...")

Top result score: 44.55304
Question: How do I debug a docker container?
Text: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.
docker run -it --entrypoint bash <image>
If the container is already running, execute a ...


In [6]:
# Search with course filter
query = "How do copy a file to a Docker container?"

search_query = {
    "size": 3,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "machine-learning-zoomcamp"
                }
            }
        }
    }
}

response = client.search(index=index_name, body=search_query)

# Display all 3 results
for i, hit in enumerate(response['hits']['hits']):
    print(f"Result {i+1}:")
    print(f"  Score: {hit['_score']}")
    print(f"  Question: {hit['_source']['question']}")
    print(f"  Course: {hit['_source']['course']}")
    print()

# Show the 3rd question specifically
if len(response['hits']['hits']) >= 3:
    third_question = response['hits']['hits'][2]['_source']['question']
    print(f"The 3rd question returned: {third_question}")

Result 1:
  Score: 73.50458
  Question: How do I debug a docker container?
  Course: machine-learning-zoomcamp

Result 2:
  Score: 73.50458
  Question: How do I debug a docker container?
  Course: machine-learning-zoomcamp

Result 3:
  Score: 66.94253
  Question: How do I copy files from my local machine to docker container?
  Course: machine-learning-zoomcamp

The 3rd question returned: How do I copy files from my local machine to docker container?


In [7]:
# Build context from search results
context_template = """
Q: {question}
A: {text}
""".strip()

# Get the results from the previous search
search_results = response['hits']['hits']

# Build context entries
context_entries = []
for hit in search_results:
    context_entry = context_template.format(
        question=hit['_source']['question'],
        text=hit['_source']['text']
    )
    context_entries.append(context_entry)

# Join context entries with double linebreaks
context = "\n\n".join(context_entries)

# Build the final prompt
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

final_prompt = prompt_template.format(
    question="How do copy a file to a Docker container?",
    context=context
)

# Calculate and display the length
prompt_length = len(final_prompt)
print(f"Length of the resulting prompt: {prompt_length}")
print(f"\nFirst 500 characters of the prompt:\n{final_prompt[:500]}...")

Length of the resulting prompt: 1383

First 500 characters of the prompt:
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: How do copy a file to a Docker container?

CONTEXT:
Q: How do I debug a docker container?
A: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.
docker run -it --entrypoint bash <image>
If the container is already running, execute a command in the specific contain...


In [8]:
import tiktoken

In [9]:
encoding = tiktoken.encoding_for_model("gpt-4o")

In [11]:
len(encoding.encode(final_prompt))

308