In [21]:
from dotenv import load_dotenv
import openai
import os

In [22]:
from openai import OpenAI

In [23]:
# Load environment variables from the .env file
load_dotenv()

True

In [24]:
# Access the variables
api_key = os.getenv('OPENAI_API_KEY')

In [25]:
client = openai.OpenAI(api_key=api_key)

In [29]:
response=client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role":"user","content":"is it too late to join the course?"}]
)

In [30]:
response

ChatCompletion(id='chatcmpl-9c2b7dsOPsDWqUN5geq7RPRO7S1Rm', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="To determine whether it's too late to join a course, consider the following steps:\n\n1. **Check Enrollment Dates:** Look at the course catalog or the institution’s website to find key dates, such as enrollment deadlines.\n  \n2. **Contact the Instructor or Registrar:** Reach out directly to the course instructor or the institution's registrar office to ask if late enrollment is possible. Sometimes they can make exceptions.\n\n3. **Review Course Progress:** Determine how much of the course has already been completed. If only a small portion of the course has elapsed, catching up might be feasible.\n\n4. **Assess Personal Schedule:** Evaluate your own availability and ability to catch up on missed material if you are allowed to join late.\n\n5. **Understand the Implications:** Be aware that joining late might mean missing key in

In [31]:
response.choices[0].message.content

"To determine whether it's too late to join a course, consider the following steps:\n\n1. **Check Enrollment Dates:** Look at the course catalog or the institution’s website to find key dates, such as enrollment deadlines.\n  \n2. **Contact the Instructor or Registrar:** Reach out directly to the course instructor or the institution's registrar office to ask if late enrollment is possible. Sometimes they can make exceptions.\n\n3. **Review Course Progress:** Determine how much of the course has already been completed. If only a small portion of the course has elapsed, catching up might be feasible.\n\n4. **Assess Personal Schedule:** Evaluate your own availability and ability to catch up on missed material if you are allowed to join late.\n\n5. **Understand the Implications:** Be aware that joining late might mean missing key introductory material, which could impact your performance in the course.\n\nEach institution has its own policies regarding late enrollment, so it's best to get 

## Q1. Running Elastic 

Run Elastic Search 8.4.3, and get the cluster information. If you run it on localhost, this is how you do it:

```bash
curl localhost:9200
```

What's the `version.build_hash` value?

>Answer: 42f05b9372a9a4a470db3b52817899b99a76ee73

## Getting the data

Now let's get the FAQ data. You can run this snippet:

```python
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)
```

Note that you need to have the `requests` library:

```bash
pip install requests
```

In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

## Q2. Indexing the data

Index the data in the same way as was shown in the course videos. Make the `course` field a keyword and the rest should be text. 

Don't forget to install the ElasticSearch client for Python:

```bash
pip install elasticsearch
```

Which function do you use for adding your data to elastic?

* `insert`
* `index`
* `put`
* `add`

> Answer: `index`

In [2]:
from elasticsearch import Elasticsearch

In [3]:
es_client=Elasticsearch('http://localhost:9200')

In [4]:
index_settings={
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name='course-questions'

es_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [5]:
from tqdm.auto import tqdm

In [6]:
for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [40]:
query='i just discovered the course. Can i still join it'

In [41]:
def elastic_search(query):
    search_query={
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
    }
    response=es_client.search(index=index_name, body=search_query)
    
    result_docs=[]
    
    for response in response['hits']['hits']:
        result_docs.append(response['_source'])
    return result_docs

In [42]:
elastic_search(query)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta

In [43]:
def build_prompt(query,search_results):
    prompt_template="""
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
    Use only the facts from CONTEXT when answering the QUESTION.

    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    
    context=""
    
    for doc in search_results:
        context=context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        
    prompt=prompt_template.format(question=query,context=context).strip()
    return prompt
        


In [44]:
def llm(prompt):
    response=client.chat.completions.create(model='gpt-4o', messages=[{"role":"user","content":prompt}])
    return response.choices[0].message.content

In [45]:
def rag(query):
    search_results=elastic_search(query)
    prompt=build_prompt(query,search_results)
    answer=llm(prompt)
    return answer

In [46]:
rag(query)

'Yes, you can still join the course even if you have discovered it after the start date. You are eligible to submit the homework assignments, but be mindful of the deadlines for turning in the final projects. It is important not to leave everything for the last minute.'

## Q3. Searching

Now let's search in our index. 

We will execute a query "How do I execute a command in a running docker container?". 

Use only `question` and `text` fields and give `question` a boost of 4, and use `"type": "best_fields"`.

What's the score for the top ranking result?

* 94.05
* 84.05
* 74.05
* 64.05

Look at the `_score` field.

> Answer: 84.05

In [47]:
query='How do I execute a command in a running docker container?'

In [57]:
def elastic_search(query):
    search_query={
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            }
        }
    }
    }
    response=es_client.search(index=index_name, body=search_query)

    top_score = response['hits']['hits'][0]['_score']
    
    # result_docs=[]
    
    # for response in response['hits']['hits']:
    #     result_docs.append(response['_source'])
        
    return top_score

In [58]:
elastic_search(query)

84.050095

## Q4. Filtering

Now let's only limit the questions to `machine-learning-zoomcamp`.

Return 3 results. What's the 3rd question returned by the search engine?

* How do I debug a docker container?
* How do I copy files from a different folder into docker container’s working directory?
* How do Lambda container images work?
* How can I annotate a graph?

In [59]:
def elastic_search(query):
    search_query={
    "size": 3,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            }
        }
    }
    }
    response=es_client.search(index=index_name, body=search_query)

    top_score = response['hits']['hits'][0]['_score']
    
    result_docs=[]
    
    for response in response['hits']['hits']:
        result_docs.append(response['_source'])
        
    return result_docs

In [60]:
elastic_search(query)

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'In case running pgcli  locally causes issues or you do not want to install it locally you can use it running in a Docker container instead.\nBelow the usage with values used in the videos of the course for:\nnetwork name (docker network)\npostgres related variables for pgcli\nHostname\nUsername\nPort\nDatabase name\n$ docker run -it --rm --network pg-network ai2ys/dockerized-pgcli:4.0.1\n175dd47cda07:/# pgcli -h pg-database -U root -p 5432 -d ny_taxi\nPassword for root:\nServer: PostgreSQL 16.1 (Debia

In [None]:
query = "How do I execute a command in a running docker container?"
results = elastic_search(query)