# Homework 1

Importo los módulos necesarios.

In [53]:
import requests 
import tiktoken

from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from openai import OpenAI

## Q1. Running Elastic

Ejecutar el siguiente comando en la terminal:

```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.17.6
```

In [2]:
# Shell out to curl and hit the HTTP endpoint on port 9200 to confirm the server answers.
! curl http://localhost:9200

{
  "name" : "41840e7257b4",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "Urhkcu8HTXiNrf4EUjGxdw",
  "version" : {
    "number" : "8.17.6",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "dbcbbbd0bc4924cfeb28929dc05d82d662c527b7",
    "build_date" : "2025-04-30T14:07:12.231372970Z",
    "build_snapshot" : false,
    "lucene_version" : "9.12.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


In [3]:
# Client bound to the local endpoint (same URL that the curl check above queried).
client_elasticsearch = Elasticsearch('http://localhost:9200')

In [4]:
# Fetch the cluster metadata once; the bare `info` on the last line displays it.
info = client_elasticsearch.info()
info

ObjectApiResponse({'name': '41840e7257b4', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'Urhkcu8HTXiNrf4EUjGxdw', 'version': {'number': '8.17.6', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'dbcbbbd0bc4924cfeb28929dc05d82d662c527b7', 'build_date': '2025-04-30T14:07:12.231372970Z', 'build_snapshot': False, 'lucene_version': '9.12.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [5]:
# Q1 answer: the build hash reported inside the `version` section of the cluster info.
info['version']['build_hash']

'dbcbbbd0bc4924cfeb28929dc05d82d662c527b7'

### Getting the data

In [6]:
# Download the FAQ dataset. `?raw=1` is kept from the original URL so the
# request is answered with the JSON file itself.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list, tagging every document with
# the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in tqdm(course['documents']):
        doc['course'] = course_name
        documents.append(doc)

  0%|          | 0/435 [00:00<?, ?it/s]

  0%|          | 0/375 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

In [7]:
# Peek at the last flattened document to confirm the `course` key was attached.
documents[-1]

{'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
 'section': 'Module 6: Best practices',
 'question': 'How to destroy infrastructure created via GitHub Actions',
 'course': 'mlops-zoomcamp'}

## Q2. Indexing the data

Defino el nombre del índice y su configuración para luego crearlo.
Después indexo los documentos en el índice creado.

In [8]:
# Name of the index used for every indexing and search call in this notebook.
index_name = "course-questions-hw1"

# Index definition: a single shard with no replicas. `text`, `section` and
# `question` are mapped as `text`; `course` is mapped as `keyword`.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
        },
    },
}

In [9]:
# Drop any index left over from a previous run so this cell (and the indexing
# cell after it) can be re-executed without an "already exists" error or
# duplicated documents. `ignore_unavailable=True` makes the delete a no-op
# when the index does not exist yet.
client_elasticsearch.indices.delete(index=index_name, ignore_unavailable=True)
client_elasticsearch.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions-hw1'})

In [10]:
# Index the FAQ entries one request at a time, with a progress bar.
for doc in tqdm(documents):
    client_elasticsearch.index(index=index_name, document=doc)

# Freshly indexed documents only become searchable after a refresh. Force one
# so the search cells below see every document even when the notebook is
# executed with "Run All". The trailing `;` hides the response object.
client_elasticsearch.indices.refresh(index=index_name);

  0%|          | 0/948 [00:00<?, ?it/s]

## Q3. Searching

Busco en los campos `question` y `text`, dando a `question` un peso cuatro veces mayor (`question^4`).

In [11]:
# User question searched for in Q3.
query_q3 = "How do execute a command on a Kubernetes pod?"

In [12]:
# Q3 request body: ask for the five best hits for `query_q3`, matching on
# `question` (boosted with `^4`) and `text`.
search_query_q3 = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query_q3,
                    "fields": ["question^4", "text"],
                    "type": "best_fields",
                },
            },
        },
    },
}

In [13]:
# Run the Q3 query against the homework index.
response_q3 = client_elasticsearch.search(index=index_name, body=search_query_q3)

In [14]:
# Summarise every Q3 hit as its relevance score plus the course it came from.
[
    {'score': hit['_score'], 'course': hit['_source']['course']}
    for hit in response_q3['hits']['hits']
]

[{'score': 44.50556, 'course': 'machine-learning-zoomcamp'},
 {'score': 35.433445, 'course': 'machine-learning-zoomcamp'},
 {'score': 33.70974, 'course': 'machine-learning-zoomcamp'},
 {'score': 33.2635, 'course': 'machine-learning-zoomcamp'},
 {'score': 32.589073, 'course': 'machine-learning-zoomcamp'}]

In [15]:
# Show the first two hits in full (index, id, score and source document).
response_q3['hits']['hits'][:2]

[{'_index': 'course-questions-hw1',
  '_id': '9c4RdpcBnwHzfe4Ct7WD',
  '_score': 44.50556,
  '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
   'section': '5. Deploying Machine Learning Models',
   'question': 'How do I debug a docker container?',
   'course': 'machine-learning-zoomcamp'}},
 {'_index': 'course-questions-hw1',
  '_id': 'hM4SdpcBnwHzfe4CAbbM',
  '_score': 35.433445,
  '_source': {'text': 'Deploy and Access the Kubernetes Dashboard\nLuke',
   'section': '10. Kubernetes and TensorFlow Serving',
   'question': 'Kubernetes-dashboard',
   'course': 'machine-learning-zoomcamp'}}]

## Q4. Filtering

Repito la búsqueda con la misma prioridad de campos, pero filtrando por el curso `machine-learning-zoomcamp`.

In [16]:
# User question searched for in Q4; also reused later as the QUESTION of the prompt.
query_q4 = "How do copy a file to a Docker container?"

In [17]:
# Q4 request body: same field matching as Q3, limited to three hits and
# filtered with a `term` clause on `course`.
search_query_q4 = {
    "size": 3,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query_q4,
                    "fields": ["question^4", "text"],
                    "type": "best_fields",
                },
            },
            "filter": {
                "term": {"course": "machine-learning-zoomcamp"},
            },
        },
    },
}

In [18]:
# Run the filtered Q4 query against the homework index.
response_q4 = client_elasticsearch.search(index=index_name, body=search_query_q4)

In [19]:
# List the FAQ question of each Q4 hit, in ranking order.
[
    {'question': hit['_source']['question']}
    for hit in response_q4['hits']['hits']
]

[{'question': 'How do I debug a docker container?'},
 {'question': 'How do I copy files from my local machine to docker container?'},
 {'question': 'How do I copy files from a different folder into docker container’s working directory?'}]

## Q5. Building a prompt

In [36]:
# Keep only the two fields the prompt needs from each Q4 hit.
results = [
    {field: hit['_source'][field] for field in ('question', 'text')}
    for hit in response_q4['hits']['hits']
]

# Display the first entry as a sanity check.
results[0]

{'question': 'How do I debug a docker container?',
 'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)'}

In [37]:
# Layout of a single FAQ entry inside the CONTEXT section.
context_template = """
Q: {question}
A: {text}
""".strip()

# Overall prompt: instructions, the user's question, then the retrieved context.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# Render every retrieved entry, trimming surrounding whitespace from each one.
list_context = [
    context_template.format(question=entry['question'], text=entry['text']).strip()
    for entry in results
]
# Entries are separated by one blank line.
context = "\n\n".join(list_context)

prompt = prompt_template.format(question=query_q4, context=context).strip()

print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: How do copy a file to a Docker container?

CONTEXT:
Q: How do I debug a docker container?
A: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.
docker run -it --entrypoint bash <image>
If the container is already running, execute a command in the specific container:
docker ps (find the container-id)
docker exec -it <container-id> bash
(Marcos MJD)

Q: How do I copy files from my local machine to docker container?
A: You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:
To copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:
docker cp /path/to/local/file_or_directory container_id:/path/in/contain

In [47]:
# Q5 answer: length of the prompt in characters.
print(f"Longitud del prompt: {len(prompt)}")

Longitud del prompt: 1446


## Q6. Tokens

In [54]:
# Look up the tokenizer tiktoken associates with the "gpt-4o-mini" model name.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

In [55]:
# Tokenize the prompt and check what kind of container comes back.
tokens = encoding.encode(prompt)
type(tokens)

list

In [62]:
# Q6 answer: number of tokens in the prompt.
print(f"Longitud de los tokens: {len(tokens)}") 

Longitud de los tokens: 320


In [67]:
# Show the first nine token ids of the encoded prompt.
tokens[:9]

[63842, 261, 4165, 14029, 29186, 13, 30985, 290, 150339]

In [72]:
# Short alias for the encoder's single-token decoding method.
f_decode = encoding.decode_single_token_bytes

In [75]:
# Decode four individual token ids back to their byte strings.
tuple(f_decode(token_id) for token_id in (63842, 261, 4165, 14029))

(b"You're", b' a', b' course', b' teaching')

## Bonus

In [57]:
# OpenAI client built with no explicit arguments.
# NOTE(review): no API key is passed here, so credentials presumably come from
# the environment — confirm before running on a fresh machine.
client = OpenAI()

In [58]:
# Send the assembled prompt as a single user message.
# NOTE(review): `store=True` is forwarded to the API as-is; presumably it asks
# the service to retain the completion — confirm this is intended.
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    store=True,
    messages=[{"role": "user", "content": prompt}],
)

In [60]:
# Print the text of the first choice in the response.
print(completion.choices[0].message.content)

To copy a file to a Docker container, you can use the `docker cp` command. The basic syntax is as follows:

```
docker cp /path/to/local/file_or_directory container_id:/path/in/container
```

Replace `/path/to/local/file_or_directory` with the path of the file or directory on your local machine, `container_id` with the ID of your running Docker container, and `/path/in/container` with the destination path inside the container where you want to copy the file.
