### Importing Required Libraries

In [1]:
import minsearch 

import json

from dotenv import load_dotenv

from openai import OpenAI

from minsearch import AppendableIndex

In [2]:
# load environment variables

load_dotenv()

True

### Loading documents

In [3]:
# Read the raw FAQ dump. The encoding is given explicitly because the FAQ text
# contains non-ASCII punctuation (curly quotes, apostrophes) and the platform
# default encoding used by open() is not guaranteed to be UTF-8.
with open('../documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)


# Flat list of FAQ entries; every entry is tagged with the course it belongs to.
documents = []

# docs_raw is a list of {'course': ..., 'documents': [...]} records
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        # the entry dict is annotated in place and then collected
        doc['course'] = course_dict['course']
        documents.append(doc)

In [4]:
documents[2] 

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

### Indexing the documents

In [5]:
# Build the search index over the flattened FAQ entries in `documents`.
index = AppendableIndex(
    text_fields = ["question", "text", "section"], # fields searched with the query text
    keyword_fields = ["course"] # field used for filtering (see `filter_dict` in `search`)
)

index.fit(documents)

<minsearch.append.AppendableIndex at 0x23382273220>

In [6]:
index.search("can I still join the course ?", num_results=3)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Here’s how you join a in Slack: https://slack.com/

In [6]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the FAQ index for entries relevant to `query`.

    Args:
        query: Free-text search query.
        course: Value the `course` field is filtered on. Defaults to the
            course that was previously hard-coded, so existing
            `search(query)` calls behave exactly as before.
        num_results: Maximum number of hits to return (default 5, as before).

    Returns:
        Whatever `index.search` returns for these arguments. In this notebook
        that is a list of FAQ dicts, each carrying an `_id` key because
        `output_ids=True` is passed.
    """
    # Matches in the question field weigh more, matches in the section field less.
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
        output_ids=True
    )

    return results

In [8]:
question = 'Can I still join in the course ?'

search(question)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  '_id': 2},
 {'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
  'section': 'General course-related questions',
  'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
  'course': 'data-engineering-zoomcamp',
  '_id': 11},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the cou

In [9]:
# Dedicated name for the plain-RAG template. Later cells rebind `prompt_template`
# to the agentic templates, which use different placeholders, so `build_prompt`
# must not read that shared global: otherwise `rag()` would send the wrong prompt
# (or raise KeyError) once those cells have been executed.
RAG_PROMPT_TEMPLATE = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

# Kept so that code reading `prompt_template` right after this cell still works.
prompt_template = RAG_PROMPT_TEMPLATE

def build_prompt(query, search_results):
    """Render the plain-RAG prompt for `query`.

    Args:
        query: The student's question; substituted for `{question}`.
        search_results: Iterable of dicts with the keys `section`, `question`
            and `text`; each becomes one entry of the CONTEXT.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.

    Raises:
        KeyError: If a search result lacks one of the three keys.
    """
    # Each entry ends with a blank line so that consecutive entries are separated.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    return RAG_PROMPT_TEMPLATE.format(question=query, context=context).strip()

In [10]:
search_results = search(question)

In [11]:
prompt = build_prompt(question, search_results)

In [12]:
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
Can I still join in the course ?
</QUESTION>

<CONTEXT>
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Certificate - Can I follow the course in a self-paced mode and get a certificate?
answer: No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.

section: Gen

In [7]:
# Shared OpenAI client, created with default settings.
# NOTE(review): presumably picks up credentials from the environment populated
# by load_dotenv() - confirm.
client = OpenAI()

def llm(prompt):
    """Send `prompt` as a single user message to `gpt-4o-mini`.

    Returns the text content of the first choice in the completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [22]:
answer = llm(prompt)

print(answer)

Yes, you can still join the course after the start date. Even if you don't register, you are still eligible to submit the homework. However, be mindful of the deadlines for turning in the final projects.


In [21]:
def rag(query):
    """Answer `query` with the basic RAG pipeline.

    Steps: retrieve FAQ entries with `search`, render them into a prompt with
    `build_prompt`, and return the text produced by `llm` for that prompt.
    """
    retrieved_docs = search(query)
    rag_prompt = build_prompt(query, retrieved_docs)
    return llm(rag_prompt)

In [23]:
rag("How do I run Kafka in docker ?")

'To run Kafka in Docker, follow these steps:\n\n1. Navigate to the folder containing your docker-compose YAML file.\n2. Use the command `docker compose up -d` to start all the instances.\n\nMake sure to check if your Kafka broker Docker container is running by using `docker ps`. If you encounter any errors regarding availability, this might indicate that your Kafka broker is not working.'

## Irrelevant Question 

Here we ask our RAG pipeline a question that our knowledge base has no information about.

In [25]:
rag("How do I patch KDE under FreeBSD?")

"I'm sorry, but there is no information provided in the context regarding how to patch KDE under FreeBSD. Please refer to relevant FreeBSD or KDE documentation for guidance."

### But a general LLM can answer it:

In [26]:
llm("How do I patch KDE under FreeBSD?")

"Patching KDE under FreeBSD involves a few steps, including obtaining the source code, applying the patch, and recompiling the software. Keep in mind that you should have the necessary development tools installed and that you're familiar with basic command-line operations. Here's a general guide on how to do this:\n\n### Prerequisites\n\n1. **Install Required Packages:**\n   Make sure you have the necessary tools to build KDE. You can do this by installing `ports` and related packages, as well as the development tools:\n\n   ```sh\n   pkg install ports\n   pkg install git cmake gmake\n   ```\n\n2. **Fetch the Source Code:**\n   You can either download the source from the FreeBSD ports collection or clone it from the KDE repositories. Using ports is typically more straightforward on FreeBSD.\n\n   ```sh\n   cd /usr/ports/x11/kde5\n   ```\n\n3. **Update Ports Tree:**\n   Ensure that your ports tree is up to date. You can use the `portsnap` tool:\n\n   ```sh\n   portsnap fetch update\n   

### Agentic RAG 

In [18]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT> 
{context}
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}
""".strip()

In [19]:
question = "How can I run Docker in windows 10 ?"

context = "EMPTY"

In [20]:
prompt = prompt_template.format(question=question, context=context)

In [21]:
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
How can I run Docker in windows 10 ?
</QUESTION>

<CONTEXT> 
EMPTY
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}

If you can answer the QUESTION using CONTEXT, use this template:

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}

If the context doesn't contain the answer, use your own knowledge to answer the question

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}


In [22]:
answer = llm(prompt)

In [23]:
print(answer)

{
"action": "ANSWER",
"answer": "To run Docker on Windows 10, you need to follow these steps: 1. Ensure your Windows 10 version is 64-bit and supports Hyper-V (Pro, Enterprise, or Education editions). 2. Download Docker Desktop from the official Docker website. 3. Install Docker Desktop by running the downloaded installer. During installation, enable the required features such as WSL 2 (Windows Subsystem for Linux) if prompted. 4. Once the installation is complete, restart your computer if necessary. 5. After rebooting, launch Docker Desktop and wait for it to initialize. 6. You can verify that Docker is running by opening a command prompt or PowerShell and executing the command `docker --version`. This should display the installed version of Docker. You can now use Docker on your Windows 10 machine.",
"source": "OWN_KNOWLEDGE"
}


In [24]:
question = "Can I still join the course ?"

context = "EMPTY"

In [26]:
prompt = prompt_template.format(question=question, context=context)

answer_json = llm(prompt)

answer = json.loads(answer_json)

print(answer)

{'action': 'SEARCH', 'reasoning': 'The question is about joining the course, and I need to check the FAQ for information regarding enrollment deadlines and eligibility.'}


In [27]:
type(answer)

dict

In [28]:
# Now we can branch on the action the model chose

if answer['action'] == "SEARCH":
    # the model asked for FAQ context, so the next step is to run the search
    print("Searching...")

Searching...


In [29]:
def build_context(search_results):
    """Render search results as the CONTEXT text for a prompt.

    Each result (a dict with the keys `section`, `question` and `text`) becomes
    a three-line entry; entries are separated by a blank line. Leading and
    trailing whitespace of the combined text is stripped, so an empty input
    gives an empty string.
    """
    entries = (
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )
    return "".join(entries).strip()

In [30]:
search_results = search(question)

context = build_context(search_results)

prompt = prompt_template.format(question=question, context=context)

print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
Can I still join the course ?
</QUESTION>

<CONTEXT> 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Certificate - Can I follow the course in a self-paced mode and get a certificate?
answer: No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the cou

In [31]:
answer_json = llm(prompt)

print(answer_json)

{
"action": "ANSWER",
"answer": "Yes, you can still join the course even after the start date. You are eligible to submit homework assignments, but be mindful that there are deadlines for final projects, so it's advisable to stay on track and not procrastinate.",
"source": "CONTEXT"
}


In [32]:
def agentic_rag_v1(question):
    """Single-step agentic RAG: let the model decide whether to search first.

    The model is first prompted with the context set to "EMPTY". If it replies
    with the SEARCH action, the FAQ is searched with the original question and
    the model is prompted again with the retrieved context. Every parsed reply
    is printed.

    Returns:
        The last parsed reply (a dict decoded from the model's JSON output).
    """
    def ask(context):
        # Fill the template, query the model, decode its JSON reply and echo it.
        filled_prompt = prompt_template.format(question=question, context=context)
        reply = json.loads(llm(filled_prompt))
        print(reply)
        return reply

    answer = ask("EMPTY")
    if answer['action'] != "SEARCH":
        return answer

    print("need to perform search")
    retrieved_docs = search(question)
    return ask(build_context(retrieved_docs))

In [34]:
agentic_rag_v1("Can I still join the course ?")

{'action': 'SEARCH', 'reasoning': 'The CONTEXT is EMPTY, which means I need to check the FAQ database to find out about the enrollment status for the course.'}
need to perform search
{'action': 'ANSWER', 'answer': "Yes, you can still join the course even after the start date. While it is recommended to register, you are eligible to submit the homework and participate in the course activities. Just keep in mind that there are deadlines for turning in final projects, so it's a good idea to stay on top of your assignments.", 'source': 'CONTEXT'}


{'action': 'ANSWER',
 'answer': "Yes, you can still join the course even after the start date. While it is recommended to register, you are eligible to submit the homework and participate in the course activities. Just keep in mind that there are deadlines for turning in final projects, so it's a good idea to stay on top of your assignments.",
 'source': 'CONTEXT'}

In [35]:
agentic_rag_v1("how to patch KDE under FreeBSD?")

{'action': 'ANSWER', 'answer': "To patch KDE under FreeBSD, you can follow these general steps: 1. First, ensure that you have the necessary tools installed, such as git and the FreeBSD ports or packages system. 2. Identify the specific package or port that you want to patch. You can find KDE ports under /usr/ports/x11/kde or similar directories. 3. Download the patch file that you want to apply. 4. Navigate to the directory of the specific KDE port you wish to modify. 5. Use the 'patch' command to apply the patch file, typically with a command like `patch < /path/to/your/patchfile.patch`. 6. Once the patch is applied, you can rebuild the port to ensure the changes take effect. Use `make install` or `portmaster` as needed. Check the FreeBSD Handbook or the KDE documentation for more detailed instructions and potential issues.", 'source': 'OWN_KNOWLEDGE'}


{'action': 'ANSWER',
 'answer': "To patch KDE under FreeBSD, you can follow these general steps: 1. First, ensure that you have the necessary tools installed, such as git and the FreeBSD ports or packages system. 2. Identify the specific package or port that you want to patch. You can find KDE ports under /usr/ports/x11/kde or similar directories. 3. Download the patch file that you want to apply. 4. Navigate to the directory of the specific KDE port you wish to modify. 5. Use the 'patch' command to apply the patch file, typically with a command like `patch < /path/to/your/patchfile.patch`. 6. Once the patch is applied, you can rebuild the port to ensure the changes take effect. Use `make install` or `portmaster` as needed. Check the FreeBSD Handbook or the KDE documentation for more detailed instructions and potential issues.",
 'source': 'OWN_KNOWLEDGE'}

### Part 2: Agentic Search

So far we had two actions only: `search` and `answer`. 

But we can let our `agent` formulate one or more search queries - and do it for a few iterations until we find an answer. 

In [36]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number 
of iterations, give the best possible answer with the provided information.

Output templates:

If you want to perform search, use this template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>",
"keywords": ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER_CONTEXT",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}

<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT> 
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()

In [38]:
question = "How do I do well on module 1?"

context = "EMPTY"

max_iterations = 3 

iteration_number = 0 

search_queries = []

search_results = []

previous_actions = []

In [40]:
context = build_context(search_results)

In [41]:
prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=max_iterations,
    iteration_number=iteration_number
)

print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration number

In [42]:
answer_json = llm(prompt)

print(answer_json)

{
"action": "SEARCH",
"reasoning": "The question asks for specific strategies or recommendations on how to perform well in module 1. It's beneficial to gather insights or advice related to study tips, module objectives, or common challenges faced by students in this module.",
"keywords": ["how to succeed in module 1", "study tips for module 1", "module 1 overview", "common challenges in module 1"]
}


In [43]:
answer = json.loads(answer_json)

answer

{'action': 'SEARCH',
 'reasoning': "The question asks for specific strategies or recommendations on how to perform well in module 1. It's beneficial to gather insights or advice related to study tips, module objectives, or common challenges faced by students in this module.",
 'keywords': ['how to succeed in module 1',
  'study tips for module 1',
  'module 1 overview',
  'common challenges in module 1']}

In [44]:
keywords = answer['keywords']

In [45]:
# Run every search query the model proposed
for kw in keywords:
    # keep a history of the queries that were issued
    search_queries.append(kw)
    # search the FAQ index with this query (kw)
    s_r = search(kw)
    # similarly, accumulate the search results (may contain duplicates across queries)
    search_results.extend(s_r)

In [49]:
search_queries

['how to succeed in module 1',
 'study tips for module 1',
 'module 1 overview',
 'common challenges in module 1']

In [47]:
search_results[:3]

[{'text': 'You need to look for the Py4J file and note the version of the filename. Once you know the version, you can update the export command accordingly, this is how you check yours:\n` ls ${SPARK_HOME}/python/lib/ ` and then you add it in the export command, mine was:\nexport PYTHONPATH=”${SPARK_HOME}/python/lib/Py4J-0.10.9.5-src.zip:${PYTHONPATH}”\nMake sure that the version under `${SPARK_HOME}/python/lib/` matches the filename of py4j or you will encounter `ModuleNotFoundError: No module named \'py4j\'` while executing `import pyspark`.\nFor instance, if the file under `${SPARK_HOME}/python/lib/` was `py4j-0.10.9.3-src.zip`.\nThen the export PYTHONPATH statement above should be changed to `export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH"` appropriately.\nAdditionally, you can check for the version of ‘py4j’ of the spark you’re using from here and update as mentioned above.\n~ Abhijit Chakraborty: Sometimes, even with adding the correct version of p

In [50]:
len(search_queries), len(search_results)

(4, 20)

4 queries × 5 results each = 20 search results.

Each query returns 5 search results.

Problem: different queries can return the same documents, so the search results may contain duplicates.

In [51]:
## to remove duplicates from the search_results
def dedup(seq):
    """Drop elements whose `_id` was already seen, keeping the first occurrence.

    Args:
        seq: Iterable of dicts that each have an `_id` key.

    Returns:
        A new list with one element per distinct `_id`, in order of first
        appearance. The elements themselves are not copied.
    """
    first_by_id = {}
    for item in seq:
        # setdefault stores the item only when this _id has not been stored yet
        first_by_id.setdefault(item['_id'], item)
    return list(first_by_id.values())

search_results = dedup(search_results)

In [52]:
len(search_results)

6

After deduplication, only 6 of the 20 search results remain.

Previous Actions

In [53]:
# keeping history of previous actions
previous_actions.append(answer)

In [54]:
previous_actions

[{'action': 'SEARCH',
  'reasoning': "The question asks for specific strategies or recommendations on how to perform well in module 1. It's beneficial to gather insights or advice related to study tips, module objectives, or common challenges faced by students in this module.",
  'keywords': ['how to succeed in module 1',
   'study tips for module 1',
   'module 1 overview',
   'common challenges in module 1']}]

Now let's rerun the Agentic Search loop once again...

In [55]:
iteration_number = 1 


context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=max_iterations,
    iteration_number=iteration_number
)

print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration number

In [56]:
answer_json = llm(prompt)

print(answer_json)

{
"action": "SEARCH",
"reasoning": "I need to collect more specific information related to study tips and effective practices for succeeding in module 1, as well as any common challenges students might face in this module to provide a tailored response.",
"keywords": ["success strategies for module 1", "effective study techniques for module 1", "module 1 common pitfalls", "best practices for module 1"]
}


### Iteration-2

In [57]:
answer = json.loads(answer_json)

answer

{'action': 'SEARCH',
 'reasoning': 'I need to collect more specific information related to study tips and effective practices for succeeding in module 1, as well as any common challenges students might face in this module to provide a tailored response.',
 'keywords': ['success strategies for module 1',
  'effective study techniques for module 1',
  'module 1 common pitfalls',
  'best practices for module 1']}

In [58]:
keywords = answer['keywords']

# Run every search query the model proposed in this iteration
for kw in keywords:
    # keep a history of the queries that were issued
    search_queries.append(kw)
    # search the FAQ index with this query (kw)
    s_r = search(kw)
    # similarly, accumulate the search results (may contain duplicates across queries)
    search_results.extend(s_r)

In [59]:
len(search_queries), len(search_results)

(8, 26)

In [60]:
## to remove duplicates from the search_results
def dedup(seq):
    """Drop elements whose `_id` was already seen, keeping the first occurrence.

    NOTE(review): this re-defines the `dedup` from the earlier cell with the
    same behaviour; the later definition shadows the earlier one.

    Args:
        seq: Iterable of dicts that each have an `_id` key.

    Returns:
        A new list with one element per distinct `_id`, in order of first
        appearance. The elements themselves are not copied.
    """
    first_by_id = {}
    for item in seq:
        # setdefault stores the item only when this _id has not been stored yet
        first_by_id.setdefault(item['_id'], item)
    return list(first_by_id.values())

search_results = dedup(search_results)

In [61]:
len(search_queries), len(search_results)

(8, 7)

In [62]:
# keeping history of previous actions
previous_actions.append(answer)

In [63]:
iteration_number = 2


context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=max_iterations,
    iteration_number=iteration_number
)

print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration number

In [64]:
answer_json = llm(prompt)

answer = json.loads(answer_json)

print(answer)

{'action': 'ANSWER', 'answer': "To do well in Module 1, 'Docker and Terraform', focus on understanding the core concepts of both technologies. Here are some study tips:\n\n1. **Hands-On Practice**: Set up your own Docker containers and use Terraform to manage infrastructure as code. Real-world practice will solidify your understanding.\n2. **Follow Documentation**: Ensure you read and understand the official documentation for both Docker and Terraform. They contain valuable insights and best practices.\n3. **Engage in Community**: Participate in community forums or study groups. This can help you resolve doubts and learn from peers' experiences.\n4. **Understand Common Issues**: Familiarize yourself with common pitfalls and errors, such as module not found errors or configuration issues that students often encounter. Learning how to troubleshoot these will prepare you for exams and projects.\n5. **Review Course Materials**: Go over video lectures and course notes thoroughly. Pay attent

In [66]:
print(answer['answer'])

To do well in Module 1, 'Docker and Terraform', focus on understanding the core concepts of both technologies. Here are some study tips:

1. **Hands-On Practice**: Set up your own Docker containers and use Terraform to manage infrastructure as code. Real-world practice will solidify your understanding.
2. **Follow Documentation**: Ensure you read and understand the official documentation for both Docker and Terraform. They contain valuable insights and best practices.
3. **Engage in Community**: Participate in community forums or study groups. This can help you resolve doubts and learn from peers' experiences.
4. **Understand Common Issues**: Familiarize yourself with common pitfalls and errors, such as module not found errors or configuration issues that students often encounter. Learning how to troubleshoot these will prepare you for exams and projects.
5. **Review Course Materials**: Go over video lectures and course notes thoroughly. Pay attention to any assignments or practical ap

### Agentic Search Function:

In [68]:
def agentic_search(question, max_iterations=3):
    """Answer `question` by letting the model iteratively search the FAQ.

    On every pass the model sees the question, the queries issued so far, the
    accumulated (deduplicated) FAQ context and its own previous actions. It
    either requests more searches (action "SEARCH" with a `keywords` list) or
    answers. The prompt and each parsed reply are printed.

    Args:
        question: The student's question.
        max_iterations: Iteration budget. The same value is shown to the model
            in the prompt and bounds the loop, which runs passes numbered
            0..max_iterations inclusive. The default of 3 reproduces the
            previous hard-coded behaviour (prompt said 3, loop stopped at 4).

    Returns:
        The last parsed model reply (a dict). If the budget runs out while the
        model is still searching, this is a SEARCH action without an `answer`
        key, so callers should check `answer['action']`.
    """
    search_queries = []
    search_results = []
    previous_actions = []

    iteration = 0

    while True:
        print(f'ITERATION #{iteration}...')

        context = build_context(search_results)
        prompt = prompt_template.format(
            question=question,
            context=context,
            search_queries="\n".join(search_queries),
            previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
            max_iterations=max_iterations,
            iteration_number=iteration
        )

        print(prompt)

        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        print(json.dumps(answer, indent=2))

        previous_actions.append(answer)

        if answer['action'] != 'SEARCH':
            break

        # Keep only queries that were not issued before, in the order the model
        # gave them. An ordered list (instead of a set union) keeps the
        # SEARCH_QUERIES section of the prompt stable between runs, and
        # re-running an old query would only add hits that dedup() removes again.
        new_queries = [
            kw for kw in dict.fromkeys(answer['keywords'])
            if kw not in search_queries
        ]
        search_queries.extend(new_queries)

        for kw in new_queries:
            search_results.extend(search(kw))

        search_results = dedup(search_results)

        iteration = iteration + 1
        # Stop once the pass numbered `max_iterations` has been completed.
        if iteration > max_iterations:
            break

        print()

    return answer

In [69]:
question = "what do I need to do to be successful at module 1?"

agentic_search(question)

ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current 

{'action': 'ANSWER',
 'answer': 'To be successful in Module 1: Docker and Terraform, consider the following strategies:\n\n1. **Understand the Tools**: Familiarize yourself with both Docker and Terraform concepts, as they are fundamental to this module. Make sure you grasp how Docker manages containerization and how Terraform is used for infrastructure as code.\n\n2. **Hands-On Practice**: Engage with practical exercises as much as possible. Set up Docker containers and practice writing Terraform configurations to solidify your understanding.\n\n3. **Follow Documentation**: Utilize the official documentation for Docker and Terraform. They provide comprehensive guides and examples that can help clarify usage.\n\n4. **Community Resources**: Leverage forums, community discussions, or study groups to share challenges and solutions with peers. Platforms like Stack Overflow can be particularly useful for troubleshooting specific issues.\n\n5. **Keep Environment Updated**: Make sure your Dock

### Part 3: Function calling (`tool use`)

Traditional approach: we put all this logic inside our prompt.

* But OpenAI and other providers offer a convenient API for adding extra functionality like `search`.

* It's called `function-calling` or `tool usage`: you can define functions that the model can call, and if it decides to make a call, it returns structured output for that.

For example, let's take our `search` function:

In [12]:
def search(query):
    """
    Look up `query` in the FAQ index, restricted to the data-engineering-zoomcamp course.

    Requests at most 5 results from `index.search`, boosting the `question`
    field by 3.0 and the `section` field by 0.5, with `output_ids=True` so
    each returned entry also carries its `_id`.
    """
    return index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
        output_ids=True,
    )

We describe it like that: 

In [13]:
# Description of the `search` function in the format passed to the model via
# `tools=[...]`. The model answers with a call carrying this `name` and a JSON
# string of arguments; the name is later used to look the function up.
search_tool = {
    "type": "function",
    # Must match the name of the Python function that will be called.
    "name": "search",
    "description": "Search the FAQ database",
    # Schema of the keyword arguments the model has to supply.
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        # No arguments other than `query` are accepted.
        "additionalProperties": False
    }
}

In [14]:
question = "How do I do well in module 1?"

developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it. 
""".strip()

tools = [search_tool]

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model="gpt-4o-mini",
    input=chat_messages,
    tools=tools
)

response.output

[ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_cw8bzVd8lRuyOnZZ3c6n0PDK', name='search', type='function_call', id='fc_6874d3e6ffe08196b7b3ba35e7a7f18f062c3b20eb6abbce', status='completed')]

In [15]:
response

Response(id='resp_6874d3e633288196a7272d14b316b1d5062c3b20eb6abbce', created_at=1752486886.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4o-mini-2024-07-18', object='response', output=[ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_cw8bzVd8lRuyOnZZ3c6n0PDK', name='search', type='function_call', id='fc_6874d3e6ffe08196b7b3ba35e7a7f18f062c3b20eb6abbce', status='completed')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[FunctionTool(name='search', parameters={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search query text to look up in the course FAQ.'}}, 'required': ['query'], 'additionalProperties': False}, strict=True, type='function', description='Search the FAQ database')], top_p=1.0, background=False, max_output_tokens=None, previous_response_id=None, reasoning=Reasoning(effort=None, generate_summary=None, summary=None), service_tier='default', sta

In [16]:
calls = response.output

In [17]:
call = calls[0]

In [18]:
call

ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_cw8bzVd8lRuyOnZZ3c6n0PDK', name='search', type='function_call', id='fc_6874d3e6ffe08196b7b3ba35e7a7f18f062c3b20eb6abbce', status='completed')

In [19]:
f_name = call.name 

In [20]:
f_name

'search'

In [21]:
arguments = json.loads(call.arguments)

print(arguments)

{'query': 'how to do well in module 1'}


In [22]:
globals()['search_tool']

{'type': 'function',
 'name': 'search',
 'description': 'Search the FAQ database',
 'parameters': {'type': 'object',
  'properties': {'query': {'type': 'string',
    'description': 'Search query text to look up in the course FAQ.'}},
  'required': ['query'],
  'additionalProperties': False}}

In [23]:
f = globals()[f_name]

In [24]:
search_results = f(**arguments)

In [25]:
search_results

[{'text': 'Even after installing pyspark correctly on linux machine (VM ) as per course instructions, faced a module not found error in jupyter notebook .\nThe solution which worked for me(use following in jupyter notebook) :\n!pip install findspark\nimport findspark\nfindspark.init()\nThereafter , import pyspark and create spark contex<<t as usual\nNone of the solutions above worked for me till I ran !pip3 install pyspark instead !pip install pyspark.\nFilter based on conditions based on multiple columns\nfrom pyspark.sql.functions import col\nnew_final.filter((new_final.a_zone=="Murray Hill") & (new_final.b_zone=="Midwood")).show()\nKrishna Anand',
  'section': 'Module 5: pyspark',
  'question': 'Module Not Found Error in Jupyter Notebook .',
  'course': 'data-engineering-zoomcamp',
  '_id': 322},
 {'text': 'You need to look for the Py4J file and note the version of the filename. Once you know the version, you can update the export command accordingly, this is how you check yours:\n`

And then save both the model's function call and the result of that call:

In [27]:
call

ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_cw8bzVd8lRuyOnZZ3c6n0PDK', name='search', type='function_call', id='fc_6874d3e6ffe08196b7b3ba35e7a7f18f062c3b20eb6abbce', status='completed')

In [33]:
# Keep the model's tool request in the history, followed by the tool's
# JSON-encoded result linked to it through the shared call_id.
chat_messages.append(call)

call_output = dict(
    type="function_call_output",
    call_id=call.call_id,
    output=json.dumps(search_results),
)
chat_messages.append(call_output)

In [34]:
chat_messages

[ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_cw8bzVd8lRuyOnZZ3c6n0PDK', name='search', type='function_call', id='fc_6874d3e6ffe08196b7b3ba35e7a7f18f062c3b20eb6abbce', status='completed'),
 {'type': 'function_call_output',
  'call_id': 'call_cw8bzVd8lRuyOnZZ3c6n0PDK',
  'output': '[{"text": "Even after installing pyspark correctly on linux machine (VM ) as per course instructions, faced a module not found error in jupyter notebook .\\nThe solution which worked for me(use following in jupyter notebook) :\\n!pip install findspark\\nimport findspark\\nfindspark.init()\\nThereafter , import pyspark and create spark contex<<t as usual\\nNone of the solutions above worked for me till I ran !pip3 install pyspark instead !pip install pyspark.\\nFilter based on conditions based on multiple columns\\nfrom pyspark.sql.functions import col\\nnew_final.filter((new_final.a_zone==\\"Murray Hill\\") & (new_final.b_zone==\\"Midwood\\")).show()\\nKrishna Ana

Now `chat_messages` contains both the call description (so it keeps track of history) and the results

Let's make another call to the model:

In [35]:
response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

This time the output should be a message with the answer (but it could also be another function call):

In [36]:
r = response.output[0]
print(r.content[0].text)

Here are some tips for doing well in Module 1 of the course "Data Engineering Zoomcamp," specifically focusing on Docker and Terraform:

1. **Understand Basic Concepts**: Make sure you have a solid grasp of Docker and Terraform. Review any introductory materials provided in the course to help you get started.

2. **Environment Setup**:
   - Be diligent about setting up your environment according to the instructions. This often includes installing the necessary software, such as Docker, and ensuring proper configuration.

3. **Common Errors**:
   - If you encounter errors related to SQLAlchemy (e.g., `TypeError: 'module' object is not callable`), ensure your connection string is formatted correctly.
   - For issues like `ModuleNotFoundError: No module named 'psycopg2'`, you'll need to install the `psycopg2` module using `pip` or `conda`.

4. **Utilize Resources**: Check the course materials, FAQs, or community forums for solutions to common problems. For instance, searching for specific

#### Making multiple calls

What if we want to make multiple calls? Change the developer prompt a little:

In [54]:
# Added a new instruction
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
If you look up something in FAQ, convert the student question into multiple queries. 
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [55]:
response.output 

[ResponseFunctionToolCall(arguments='{"query":"How to do well in module 1?"}', call_id='call_iRCtlo9TqDLxUgQiiXMHAcFv', name='search', type='function_call', id='fc_6874e58faeac8194a066fb11a66044480533b6d072e9dbe6', status='completed'),
 ResponseFunctionToolCall(arguments='{"query":"tips for succeeding in module 1"}', call_id='call_GoYK0Gc20wUdsYzMaqNNNrfD', name='search', type='function_call', id='fc_6874e590195c8194830641a98fcbf9b20533b6d072e9dbe6', status='completed'),
 ResponseFunctionToolCall(arguments='{"query":"module 1 best practices"}', call_id='call_CFKMgh3vBYeopf7oHz29BiH4', name='search', type='function_call', id='fc_6874e5907b14819495231206fa506d990533b6d072e9dbe6', status='completed')]

This time let's organize our code a little: 

First, create a function: `do_call`: 

In [56]:
def do_call(tool_call_response):
    """
    Execute the function requested by a model tool call and wrap its result.

    The function is looked up by name in this module's globals and invoked
    with the JSON-decoded arguments as keyword arguments.

    Parameters
    ----------
    tool_call_response
        A tool-call entry exposing `name`, `arguments` (a JSON string)
        and `call_id`.

    Returns
    -------
    dict
        A `function_call_output` message with the same `call_id` and the
        function's return value serialized as indented JSON.
    """
    target_name = tool_call_response.name
    kwargs = json.loads(tool_call_response.arguments)

    handler = globals()[target_name]
    serialized = json.dumps(handler(**kwargs), indent=2)

    return dict(
        type="function_call_output",
        call_id=tool_call_response.call_id,
        output=serialized,
    )

Now iterate over the entries in the response:

In [57]:
# First iteration: record every output entry in the history and execute
# any function calls the model requested.
for entry in response.output:
    chat_messages.append(entry)
    print(entry.type)

    if entry.type == "function_call":
        result = do_call(entry)
        chat_messages.append(result)

    elif entry.type == "message":
        # The text of a message entry lives in its content parts; the entry
        # itself has no `.text` attribute.
        print(entry.content[0].text)

function_call
function_call
function_call


The first response will probably contain only function calls, so let's make another request:

In [None]:
# Second request: the history now contains the function calls and their
# outputs, so the model has the context it needs to generate an answer.
response = client.responses.create(model='gpt-4o-mini', input=chat_messages, tools=tools)

for entry in response.output:
    chat_messages.append(entry)
    print(entry.type, end='\n\n')

    if entry.type == 'message':
        print(entry.content[0].text)
    elif entry.type == 'function_call':
        # The model may still ask for more searches; run them and keep the results.
        result = do_call(entry)
        chat_messages.append(result)

message

To do well in **Module 1**, here are some specific tips and best practices:

1. **Understand Docker and Terraform**:
   - Familiarize yourself with the fundamental concepts of Docker and Terraform, as they are the primary tools used in this module. Refer to the Docker documentation for best practices (such as storing code in your Linux distro for better performance).

2. **Environment Setup**:
   - Ensure that you have properly set up your environment according to course instructions. If you encounter errors related to modules (like `psycopg2`), the solution often involves installing the necessary Python packages. You can install `psycopg2` with:
     ```bash
     pip install psycopg2-binary
     ```

3. **Troubleshooting Common Errors**:
   - If you face errors like `ModuleNotFoundError`, check that all necessary libraries are installed. If the library is already installed, consider upgrading it:
     ```bash
     pip install psycopg2-binary --upgrade
     ```
   - When using

### Putting Everything Together


Let's make two loops: 

1. First is the main Q&A loop - ask question, get back the answer.
2. Second is the request loop - send requests until there's a message reply from the API.

In [59]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.
When using FAQ, perform deep topic exploration: make one request to FAQ,
and then based on the results, make more requests.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]

In [None]:
while True:  # main Q&A loop: one pass per user question
    question = input()  # How do I do my best for module 1?
    print(f"Question: {question}\n")
    if question == 'stop':
        break

    message = dict(role="user", content=question)
    chat_messages.append(message)

    # Request-response loop: keep querying the API until it replies with a message.
    has_messages = False
    while not has_messages:
        response = client.responses.create(
            model='gpt-4o-mini', input=chat_messages, tools=tools
        )

        for entry in response.output:
            chat_messages.append(entry)

            if entry.type == 'message':
                print("Response: \n")
                print(entry.content[0].text, end='\n\n')
                has_messages = True
            elif entry.type == 'function_call':
                # Run the requested tool and feed its output back into the history.
                print('function_call:', entry, end='\n\n')
                result = do_call(entry)
                chat_messages.append(result)

function_call: ResponseFunctionToolCall(arguments='{"query":"how to do well in module 1"}', call_id='call_cWkzXu8M7jE2v9iqaftoevcL', name='search', type='function_call', id='fc_6874e759aea08197a2aa510d9b5001e40b0b1d4ba72eceb0', status='completed')

function_call: ResponseFunctionToolCall(arguments='{"query":"tips for success in module 1 Docker and Terraform"}', call_id='call_ePH51aUbWJQ3mtGzATW7WFli', name='search', type='function_call', id='fc_6874e75b0ea881978f09bbd0080f18450b0b1d4ba72eceb0', status='completed')

To do well in Module 1: Docker and Terraform, here are some key strategies and tips:

1. **Understand the Concepts**: Make sure you grasp the fundamental concepts of both Docker and Terraform. These tools are essential for creating and managing infrastructure, so a solid understanding will help you in practical applications.

2. **Follow Along with Tutorials**: Engage with any provided tutorials or videos. Since this module covers Docker and Terraform, try to replicate the e

### chat_assistant.py


* Tools - manages function tools for the agent
    * add_tool(function, description): Register a function with its description
    * get_tools(): Return list of registered tool descriptions
    * function_call(tool_call_response): Execute a function call and return result

* ChatInterface - handles user input and display formatting
   * input(): Get user input
   * display(message): Print a message
   * display_function_call(entry, result): Show function calls in HTML format
   * display_response(entry): Display AI responses with markdown


* ChatAssistant - main orchestrator for chat conversations.
    * __init__(tools, developer_prompt, chat_interface, client): Initialize assistant
    * gpt(chat_messages): Make OpenAI API calls
    * run(): Main chat loop handling user input and AI responses

In [61]:
import chat_assistant 

# Tools 
tools = chat_assistant.Tools()
tools.add_tool(search, search_tool)
tools.get_tools()

# Prompt Template
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_interface = chat_assistant.ChatInterface()


chat = chat_assistant.ChatAssistant(
    tools=tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    client=client
)

In [62]:
chat.run()

Chat ended.


### Multiple Tools

#### New tool: `add_entry` -> adds new document in the FAQ database.

What if we also want to use this chat app to add new entries to the FAQ? We'll need another function for it:

In [63]:
def add_entry(question, answer):
    """
    Append a new question/answer pair to the FAQ index.

    The document is stored under the data-engineering-zoomcamp course in the
    'user added' section. Nothing is returned.
    """
    index.append(dict(
        question=question,
        text=answer,
        section='user added',
        course='data-engineering-zoomcamp',
    ))

Description about the tool:

In [64]:
# Tool description for `add_entry`, in the same format as the `search` tool
# description: the function name plus a schema of its keyword arguments.
add_entry_description = {
    "type": "function",
    # Must match the name of the Python function that will be called.
    "name": "add_entry",
    "description": "Add an entry to the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "The question to be added to the FAQ database",
            },
            "answer": {
                "type": "string",
                "description": "The answer to the question",
            }
        },
        # Both arguments are mandatory and no others are accepted.
        "required": ["question", "answer"],
        "additionalProperties": False
    }
}

Now let's add the new tool:

In [65]:
tools.add_tool(add_entry, add_entry_description)

tools.get_tools()

[{'type': 'function',
  'name': 'search',
  'description': 'Search the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'query': {'type': 'string',
     'description': 'Search query text to look up in the course FAQ.'}},
   'required': ['query'],
   'additionalProperties': False}},
 {'type': 'function',
  'name': 'add_entry',
  'description': 'Add an entry to the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'question': {'type': 'string',
     'description': 'The question to be added to the FAQ database'},
    'answer': {'type': 'string', 'description': 'The answer to the question'}},
   'required': ['question', 'answer'],
   'additionalProperties': False}}]

In [66]:
# Prompt Template
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_interface = chat_assistant.ChatInterface()


chat = chat_assistant.ChatAssistant(
    tools=tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    client=client
)

In [67]:
chat.run()

Chat ended.


During that chat, we asked the chatbot to add the question `How can I do well in Module 1?` and its answer to the FAQ database.

And check that it's in the index:

In [68]:
index.docs[-1]

{'question': 'How can I do well in Module 1?',
 'text': '1. **Understand the Basics**: Make sure you have a good grasp of Docker and Terraform fundamentals. Review the introductory materials provided in the module.\n\n2. **Hands-On Practice**: Engage in the hands-on exercises. Set up your Docker containers and practice deploying configurations with Terraform. Experimenting will deepen your understanding.\n\n3. **Resolve Common Issues**: If you run into errors—such as `ModuleNotFoundError` for required packages (like `psycopg2` or similar)—follow the recommended solutions. For instance, you might need to install certain Python modules that are essential for execution.\n\n4. **Connect with Peers**: Discuss concepts and share problems with classmates on forums or group chats. Collaborative learning can provide different perspectives and solutions.\n\n5. **Utilize Resources**: Use any additional resources, such as documentation, tutorials, or the FAQ section for this course, to support you

### Part-4: PydanticAI 

There are frameworks that make it easier for us to create `Agents`:



Tools 

In [8]:
def search(query):
    """
    Search the FAQ index for `query` within the data-engineering-zoomcamp course.

    Asks `index.search` for at most 5 results, with the `question` field
    boosted by 3.0 and `section` by 0.5, and with `output_ids=True` so the
    returned entries include their `_id`.
    """
    search_options = dict(
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
        output_ids=True,
    )
    return index.search(query=query, **search_options)


def add_entry(question, answer):
    """
    Store a new question/answer pair in the FAQ index.

    The entry is filed under the 'user added' section of the
    data-engineering-zoomcamp course. Returns None.
    """
    index.append({
        'question': question,
        'text': answer,
        'section': 'user added',
        'course': 'data-engineering-zoomcamp',
    })

In [1]:
# installing pydanticAI 
# !pip install pydantic-ai

In [9]:
from pydantic_ai import Agent, RunContext

Create an Agent:

In [10]:
model_name = 'openai:gpt-4o-mini'


system_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

In [11]:
chat_agent = Agent(  
    model_name,
    system_prompt=system_prompt
)

Now we can use it to automate tool description:

In [14]:
from typing import Dict, List 

@chat_agent.tool
def search_tool(ctx: RunContext, query: str) -> List[Dict[str, object]]:
    """
    Search the FAQ for entries relevant to the query.

    Parameters
    ----------
    query : str
        The search query text to look up in the course FAQ.

    Returns
    -------
    list
        Up to 5 matching FAQ entries. Each entry is a dict with the entry's
        text, section, question and course, plus its `_id` in the index.
    """
    # Log each search the agent performs so the tool usage is visible in the output.
    print(f"search('{query}')")
    return search(query)


@chat_agent.tool
def add_entry_tool(ctx: RunContext, question: str, answer: str) -> None: 
    """
    Add a new question-answer entry to the FAQ index.

    This function creates a document with the given question and answer,
    tagging it as user-added content.

    Parameters
    ----------
    question : str
        The question text to be added to the index.

    answer : str
        The answer or explanation corresponding to the question.

    Returns
    -------
    None
    """
    return add_entry(question, answer)

It reads the function's docstring to automatically create function description, so we don't need to worry about it. 

Let's use it:

In [15]:
question = "I just discovered the course, Can I join now?"

agent_run = await chat_agent.run(question)

print(agent_run.output)

search('Can I join the course now?)
Yes, you can still join the course even if it has already started. You are eligible to submit homework assignments without registering, although there will be deadlines for final project submissions. Just make sure not to leave everything until the last minute!

Would you like to know more about the course materials or any deadlines?
