In [1]:
import os
os.chdir("/home/kantundpeterpan/projects/zoomcamp/zcllm/ws_agents1")

In [8]:
import json, requests, dotenv
from openai import OpenAI
try:
    if dotenv.load_dotenv(): 
        OPENROUTER_API_KEY = os.environ['OPENROUTER_API_KEY']
except KeyError:
    raise ValueError("set api key")
# You can use any model that supports tool calling
MODEL = "google/gemini-2.5-flash"
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=OPENROUTER_API_KEY,
)

In [3]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [4]:
from minsearch import AppendableIndex

index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.append.AppendableIndex at 0x7f691d5eecf0>

In [5]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5,
        output_ids=True
    )

    return results

In [6]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

In [7]:
def build_prompt(query, search_results):
    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [10]:
def llm(prompt):
    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content


In [13]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [14]:
print(rag('Can i join after the course started ?'))

Yes, you can still join the course even after the start date. You are eligible to submit homeworks even if you haven't registered. However, be aware of the deadlines for final projects to avoid last-minute rush.


# Agentic RAG

In [54]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT> 
{context}
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database IF you deem the question related to one of the courses
you are responsible for: Machine Learning, MLOps, Data Engineering and LLM Applications.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}
""".strip()

In [55]:
question = 'Can I still join the course ?'
context = "EMPTY"

In [56]:
prompt = prompt_template.format(
    question = question,
    context = context
)

In [57]:
answer_json = llm(prompt)

In [58]:
import json

In [59]:
answer = json.loads(answer_json)

In [60]:
answer['action']

'SEARCH'

In [61]:
def build_context(search_results):
    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    return context.strip()

In [62]:
search_results = search(question)
context = build_context(search_results)
prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
Can I still join the course ?
</QUESTION>

<CONTEXT> 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Certificate - Can I follow the course in a self-paced mode and get a certificate?
answer: No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the cou

In [63]:
answer_json = llm(prompt)

In [64]:
print(answer_json)

{
"action": "ANSWER",
"answer": "Yes, you can still join the course even after the official start date and submit homeworks. However, be mindful of deadlines for the final projects.",
"source": "CONTEXT"
}


## Puttin agentic RAG together

In [65]:
def agentic_rag_v1(question):
    context = "EMPTY"
    prompt = prompt_template.format(question=question, context=context)
    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(answer)

    if answer['action'] == 'SEARCH':
        print('need to perform search...')
        search_results = search(question)
        context = build_context(search_results)
        
        prompt = prompt_template.format(question=question, context=context)
        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        print(answer)

    return answer

In [66]:
agentic_rag_v1('how do I join the course?')

{'action': 'SEARCH', 'reasoning': 'The user is asking a general question about joining a course. This is a common question that might be covered in an FAQ for any of the courses I support (Machine Learning, MLOps, Data Engineering, LLM Applications).'}
need to perform search...
{'action': 'ANSWER', 'answer': 'To join the course, you should register using the provided link before it starts. The course begins on January 15th, 2024, at 17:00 with the first "Office Hours" live session. You can also subscribe to the course\'s public Google Calendar, join the course Telegram channel for announcements, and register in DataTalks.Club\'s Slack to join the channel.', 'source': 'CONTEXT'}


{'action': 'ANSWER',
 'answer': 'To join the course, you should register using the provided link before it starts. The course begins on January 15th, 2024, at 17:00 with the first "Office Hours" live session. You can also subscribe to the course\'s public Google Calendar, join the course Telegram channel for announcements, and register in DataTalks.Club\'s Slack to join the channel.',
 'source': 'CONTEXT'}

In [67]:
agentic_rag_v1('how patch KDE under FreeBSD?')

{'action': 'SEARCH', 'reasoning': "The question is about patching KDE under FreeBSD, which is an operating system and desktop environment topic. This is not directly related to Machine Learning, MLOps, Data Engineering, or LLM Applications, and therefore likely not in our course FAQ database. A search action is chosen to indicate that it's outside the scope of direct course material but a search would be the next logical step if it were related."}
need to perform search...
{'action': 'ANSWER', 'answer': "Patching KDE under FreeBSD typically involves using the FreeBSD Ports or Packages system. The most common and recommended way to update or patch software, including desktop environments like KDE, on FreeBSD is through the `pkg` command for binary packages or by updating and re-installing from the Ports tree. \n\n1.  **Using `pkg` (for binary packages):**\n    If you installed KDE using `pkg install kde5` (or similar), you can update it by running `pkg upgrade`. This command will fetch 

{'action': 'ANSWER',
 'answer': "Patching KDE under FreeBSD typically involves using the FreeBSD Ports or Packages system. The most common and recommended way to update or patch software, including desktop environments like KDE, on FreeBSD is through the `pkg` command for binary packages or by updating and re-installing from the Ports tree. \n\n1.  **Using `pkg` (for binary packages):**\n    If you installed KDE using `pkg install kde5` (or similar), you can update it by running `pkg upgrade`. This command will fetch and install the latest versions of all installed packages, including KDE components, from the official FreeBSD package repositories.\n\n2.  **Using Ports (for source-based installation):**\n    If you built KDE from the Ports tree (i.e., you navigated to `/usr/ports/x11/kde5` and ran `make install` or `make reinstall`), you'll need to update your Ports tree first, and then rebuild/reinstall KDE. \n    *   Update your Ports tree: `portsnap fetch update` (or `git pull` if yo

# Agentic Search

Agentic Search is an extension of Agentic RAG by leveraging an LLM Agent to formulate search queries based on the initial question.

In [68]:
question = 'how do I do well in module 1'

In [72]:
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number 
of iterations, give the best possible answer with the provided information.

Output templates:

If you want to perform search, use this template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>",
"keywords": ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER_CONTEXT",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}

<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT> 
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()

In [73]:
question = "how do I join the course?"

search_queries = []
search_results = []
previous_actions = []
context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=1
)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration number

In [74]:
answer_json = llm(prompt)
answer = json.loads(answer_json)
print(json.dumps(answer, indent=2))

{
  "action": "SEARCH",
  "reasoning": "The user is asking how to join the course. I need to search the FAQ for information on course enrollment or joining procedures.",
  "keywords": [
    "join course",
    "enrollment process",
    "how to register for the course"
  ]
}


In [75]:
previous_actions.append(answer)

In [76]:
previous_actions

[{'action': 'SEARCH',
  'reasoning': 'The user is asking how to join the course. I need to search the FAQ for information on course enrollment or joining procedures.',
  'keywords': ['join course',
   'enrollment process',
   'how to register for the course']}]

In [77]:
keywords = answer['keywords']
search_queries.extend(keywords)

In [78]:
for k in keywords:
    res = search(k)
    search_results.extend(res)

In [79]:
def dedup(seq):
    seen = set()
    result = []
    for el in seq:
        _id = el['_id']
        if _id in seen:
            continue
        seen.add(_id)
        result.append(el)
    return result

search_results = dedup(search_results)

In [80]:
context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=2
)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration number

In [81]:
answer_json = llm(prompt)
answer = json.loads(answer_json)
print(json.dumps(answer, indent=2))

{
  "action": "ANSWER_CONTEXT",
  "answer": "To join the course, you should register using the provided link. You can also subscribe to the course public Google Calendar (from desktop only) and join the course Telegram channel for announcements. Additionally, register in DataTalks.Club's Slack and join the relevant channel. Even if you don't register, you can still submit homeworks, but be aware of deadlines for final projects. If you want a certificate, you must finish the course with a \"live\" cohort, as certificates are not awarded for self-paced mode.",
  "source": "CONTEXT"
}


In [82]:
question = "what do I need to do to be successful at module 1?"

search_queries = []
search_results = []
previous_actions = []


iteration = 0

while True:
    print(f'ITERATION #{iteration}...')

    context = build_context(search_results)
    prompt = prompt_template.format(
        question=question,
        context=context,
        search_queries="\n".join(search_queries),
        previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
        max_iterations=3,
        iteration_number=iteration
    )

    print(prompt)

    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(json.dumps(answer, indent=2))

    previous_actions.append(answer)

    action = answer['action']
    if action != 'SEARCH':
        break

    keywords = answer['keywords']
    search_queries = list(set(search_queries) | set(keywords))
    
    for k in keywords:
        res = search(k)
        search_results.extend(res)

    search_results = dedup(search_results)
    
    iteration = iteration + 1
    if iteration >= 4:
        break

    print()

ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current 

In [83]:
def agentic_search(question):
    # Initialize lists to store search queries, results, and previous agent actions
    search_queries = []
    search_results = []
    previous_actions = []

    # Initialize iteration counter
    iteration = 0
    
    # Start the main loop for iterative search
    while True:
        # Print current iteration number for debugging/tracking
        print(f'ITERATION #{iteration}...')
    
        # Build context from previous search results to inform the LLM
        context = build_context(search_results)
        # Format the prompt using a predefined template and current state
        prompt = prompt_template.format(
            question=question,
            context=context,
            search_queries="\n".join(search_queries),
            previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
            max_iterations=3, # Note: This parameter is in the prompt, but the actual loop limit is 4.
            iteration_number=iteration
        )
    
        # Print the prompt sent to the LLM
        print(prompt)
    
        # Call the LLM with the formatted prompt to get an action
        answer_json = llm(prompt)
        # Parse the JSON response from the LLM
        answer = json.loads(answer_json)
        # Print the LLM's response for debugging/tracking
        print(json.dumps(answer, indent=2))

        # Record the current action in previous_actions for future iterations
        previous_actions.append(answer)
    
        # Get the action type from the LLM's response
        action = answer['action']
        # If the action is not 'SEARCH', break the loop (e.g., if it's 'FINISH' or 'ANSWER')
        if action != 'SEARCH':
            break
    
        # Extract keywords for searching from the LLM's response
        keywords = answer['keywords']
        # Update the list of all search queries, ensuring uniqueness
        search_queries = list(set(search_queries) | set(keywords))

        # Perform searches for each new keyword
        for k in keywords:
            res = search(k)
            # Add new results to the overall search_results list
            search_results.extend(res)
    
        # Deduplicate search results to avoid redundant information
        search_results = dedup(search_results)
        
        # Increment the iteration counter
        iteration = iteration + 1
        # Set a hard limit on the number of iterations to prevent infinite loops
        if iteration >= 4:
            break
    
        # Print a newline for better readability between iterations
        print()

    # Return the final answer from the LLM
    return answer

In [85]:
answer = agentic_search('how do I prepare for the course?')

ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current 

In [89]:
from IPython.display import display_markdown, Markdown

In [91]:
Markdown(answer['answer'])

To prepare for the course, you can start by installing and setting up the necessary dependencies and requirements, which include a Google Cloud account, Google Cloud SDK, Python 3 (installed with Anaconda), Terraform, and Git. It's also recommended to review the prerequisites and syllabus to ensure you are comfortable with the subjects covered.

Additionally, you should clone the course repository from GitHub to your local machine. For using Git and GitHub, resources like "Git for Everybody: How to Clone a Repository from GitHub" and a tutorial on setting up a repository (https://www.atlassian.com/git/tutorials/setting-up-a-repository) are suggested. Remember to use `.gitignore` for large files and never store passwords or keys in a Git repository.

[Brave Search API](https://brave.com/search/api/)

# Function calling

```python
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5,
        output_ids=True
    )

    return results
```

Can be jsonified for OpenAI like :

```json
search_tool = {
    "type": "function",
    "name": "search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }
}
```

Slightly different syntax for OpenRouter (older OpenAI syntax ?):

http://openrouter.ai/docs/features/tool-calling

```json
search_tool = {
    "type": "function",
    "function":{
        "name": "search",
        "description": "Search the FAQ database",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query text to look up in the course FAQ."
                }
            },
            "required": ["query"],
            "additionalProperties": False
        }
    }
}
```

In [122]:
search_tool = {
    "type": "function",
    "function":{
        "name": "search",
        "description": "Search the FAQ database",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query text to look up in the course FAQ."
                }
            },
            "required": ["query"],
            "additionalProperties": False
        }
    }
}

In [281]:
question = "How to do well in Module 1 ?"

developer_prompt = """
You're a course teaching assistant.
You're given a question from a course student and your task is to answer it.
""".strip()

tools = [search_tool]

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.chat.completions.create(
    model=MODEL,
    messages=chat_messages,
    tools=tools
)


In [292]:
call = response.choices[0].message.tool_calls[0]
call

ChatCompletionMessageToolCall(id='tool_0_search', function=Function(arguments='{"query":"how to do well in Module 1"}', name='search'), type='function', index=0)

In [293]:
call_id = call.id
call_id

'tool_0_search'

In [294]:
f_name = call.function.name
arguments  = json.loads(
    call.function.arguments
)
print(f_name)
print(arguments)

search
{'query': 'how to do well in Module 1'}


In [295]:
# get functions by name from globals()
f = globals()[f_name]

In [296]:
results = f(**arguments)

In [297]:
search_results = json.dumps(results, indent=2)
print(search_results[:100])

[
  {
    "text": "Even after installing pyspark correctly on linux machine (VM ) as per course inst


In [298]:
chat_messages.append(call)

chat_messages.append({
    "role": "tool",
    "tool_call_id": call_id,
    "name":f_name,
    "content": json.dumps(
        search_results
    )
})

In [299]:
chat_messages

[{'role': 'developer',
  'content': "You're a course teaching assistant.\nYou're given a question from a course student and your task is to answer it."},
 {'role': 'user', 'content': 'How to do well in Module 1 ?'},
 ChatCompletionMessageToolCall(id='tool_0_search', function=Function(arguments='{"query":"How to do well in Module 1"}', name='search'), type='function', index=0),
 {'role': 'tool',
  'tool_call_id': 'tool_0_search',
  'name': 'search',
  'content': '"[\\n  {\\n    \\"text\\": \\"Even after installing pyspark correctly on linux machine (VM ) as per course instructions, faced a module not found error in jupyter notebook .\\\\nThe solution which worked for me(use following in jupyter notebook) :\\\\n!pip install findspark\\\\nimport findspark\\\\nfindspark.init()\\\\nThereafter , import pyspark and create spark contex<<t as usual\\\\nNone of the solutions above worked for me till I ran !pip3 install pyspark instead !pip install pyspark.\\\\nFilter based on conditions based on

In [300]:
response = client.chat.completions.create(
    model=MODEL,
    messages=chat_messages,
    tools=tools
)

In [303]:
Markdown(
    response.choices[0].message.content
)

I can't provide specific advice on how to "do well" in Module 1 as the search results primarily focus on troubleshooting technical errors related to different modules (Module 5: PySpark, Module 4: dbt, and Module 1: Docker and Terraform). 

However, looking at the Module 1 entries, these are common issues:

*   **`TypeError: 'module' object is not callable` with SQLAlchemy:** This usually means you're trying to call a module as if it were a function. The solution provided suggests changing the connection string to `postgresql+psycopg://` and then correctly creating the engine.
*   **`ModuleNotFoundError: No module named 'psycopg2'`**: This error indicates that the `psycopg2` Python module, necessary for interacting with PostgreSQL, is not installed. The solution is to install it using `conda` or `pip`.

To do well in Module 1, I would recommend:

1.  **Carefully review the module materials:** Understand the core concepts of Docker and Terraform.
2.  **Pay close attention to setup instructions:** Many errors stem from incorrect environments or missing dependencies.
3.  **Troubleshoot methodically:** When you encounter an error, try to understand the error message.
4.  **Look for similar issues:** If you encounter a problem, it's likely others have too. Search the course FAQs, forums, or general programming resources.

If you have a more specific question about a concept or a problem you're facing in Module 1, feel free to ask!

## Making multiple calls

In [346]:
question = "How to do well in Module 1 ?"

developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
If you look up something in FAQ, convert the student question into multiple queries.
""".strip()

tools = [search_tool]

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]
# response will contain multiple tool calls
response = client.chat.completions.create(
    model=MODEL,
    messages=chat_messages,
    tools=tools
)

In [317]:
def do_call(tool_call_response):
    call_id = tool_call_response.id
    function_name = tool_call_response.function.name
    arguments = json.loads(tool_call_response.function.arguments)

    f = globals()[function_name]
    result = f(**arguments)

    return {
        "role": "tool",
        "tool_call_id": call_id,
        "name":function_name,
        "content": json.dumps(
            result, indent = 2
            )
    }

In [318]:
calls = response.choices[0].message.tool_calls

In [320]:
for call in calls:
    chat_messages.append(call)
    
    result = do_call(call)
    chat_messages.append(result)

In [339]:
# response will contain multiple tool calls
response = client.chat.completions.create(
    model=MODEL,
    messages=chat_messages,
    tools=tools
)

In [323]:
Markdown(
    response.choices[0].message.content
)

I can't provide specific advice on how to "do well" in Module 1, as that depends on the course content and your learning style. However, I can help you with specific technical issues you might encounter in Module 1.

Are you running into any particular errors or having trouble with any of the concepts? For example, some common issues in Module 1 of the Data Engineering Zoomcamp course include:

*   **`ModuleNotFoundError: No module named 'psycopg2'`**: This error occurs when the `psycopg2` Python module, needed for PostgreSQL, is not installed. You can usually fix this by running `pip install psycopg2-binary`.
*   **`TypeError: 'module' object is not callable` with SQLAlchemy**: This error often means you're trying to call `create_engine` incorrectly. Make sure you are using `create_engine` with the correct connection string format, e.g., `postgresql+psycopg://user:password@host:port/database_name`.

If you have other questions related to specific errors or topics in Module 1, please let me know!

In [329]:
print(response.choices[0].finish_reason)

tool_calls


In [347]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.
When using FAQ, perform deep topic exploration: make one request to FAQ,
and then based on the results, make more requests.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]

In [350]:
while True: # main Q&A loop
    question = input() # How do I do my best for module 1?
    if question == 'stop':
        break

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    while True: # request-response loop - query API till get a message
        # response will contain multiple tool calls
        response = client.chat.completions.create(
            model=MODEL,
            messages=chat_messages,
            tools=tools
        )

        # check for tool calls
        has_tool_calls = response.choices[0].finish_reason == 'tool_calls'
        
        if has_tool_calls:
            
            # consume all tool calls
            calls = response.choices[0].message.tool_calls
            
            for call in calls:
                chat_messages.append(call)
                
                result = do_call(call)
                chat_messages.append(result)
            
        if response.choices[0].finish_reason == 'stop':
            display(
                Markdown(
                    response.choices[0].message.content
                )
            )
            break
        

Module 1 of the Data Engineering Zoomcamp covers Docker and Terraform. This includes topics like ingesting NY Taxi Data to Postgres, and resolving `psycopg2` and SQLAlchemy-related errors.

Module 4 focuses on Analytics Engineering with dbt. This module addresses issues like connecting dbt Cloud with BigQuery, resolving `NoneType` object errors in dbt macros, and handling `ModuleNotFoundError: No module named 'pytz'` when setting up dbt with Docker. It also covers what to do if `dbt run` doesn't update your BigQuery table as expected.

What else would you like to know about these modules or any other part of the course?

Module 4 of the Data Engineering Zoomcamp covers "Analytics Engineering with dbt". Here are some key takeaways:

*   **dbt Project Setup:** Ensure your `dbt_project.yml` file is correctly configured, especially if your dbt project is in a subdirectory.
*   **Connecting dbt Cloud with BigQuery:** You might encounter "Access Denied" errors. To resolve this, grant your dbt service account the "BigQuery Job User" role in Google Cloud's IAM & Admin. It's also recommended to add "BigQuery Data Owner," "Storage Object Admin," and "Storage Admin" roles to prevent future permission issues.
*   **Handling Data Types:** Be mindful of column data types when working with dbt and BigQuery, especially when converting from CSV to Parquet. If dbt infers incorrect types, define the schema explicitly in your data ingestion pipeline or cast the columns within your dbt models. Pandas `Int64` (with a capital 'I') can be used for nullable integers when reading CSVs.
*   **Variable Usage:** When using variables with `dbt run`, ensure proper YAML dictionary format, e.g., `dbt run --var 'is_test_run: false'`.
*   **CI/CD Jobs in dbt Cloud:** If "Triggered by pull requests" is disabled when setting up a Continuous Integration job, it's likely due to being on the Developer Plan. You'll need to upgrade to a Team Plan or Enterprise Plan to utilize CI Jobs.

Do you have any specific questions about any of these dbt topics, or would you like to explore another module?

`dbt run` compiles and executes all models in your dbt project, creating or updating the corresponding tables or views in your data warehouse. `dbt build` is a superset of `dbt run`; it not only runs your models but also includes `dbt test`, `dbt snapshot`, and `dbt seed` operations. This means `dbt build` will execute your tests, apply any data snapshots, and load any seed files in addition to building your models.

Here's why you might choose one over the other:

*   **`dbt run`**: Ideal for everyday development when you're primarily focused on building and transforming your data models. It's faster as it skips tests and other operations.
*   **`dbt build`**: Recommended for a more comprehensive workflow, especially before deploying to production or when you want to ensure data quality and integrity. It provides a more robust check of your dbt project.

Do you have any specific issues you're encountering with `dbt run` or `dbt build`?

In [352]:
def add_entry(question, answer):
    doc = {
        'question': question,
        'text': answer,
        'section': 'user added',
        'course': 'data-engineering-zoomcamp'
    }
    index.append(doc)

In [354]:
add_entry_description = {
    "type": "function",
    "function": {"name": "add_entry",
    "description": "Add an entry to the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "The question to be added to the FAQ database",
            },
            "answer": {
                "type": "string",
                "description": "The answer to the question",
            }
        },
        "required": ["question", "answer"],
        "additionalProperties": False
    }}
}

In [421]:
import chat_assistant
from importlib import reload
chat_assistant = reload(chat_assistant)

tools = chat_assistant.Tools()
tools.add_tool(search, search_tool)
tools.add_tool(add_entry, add_entry_description)
print(tools.get_tools())

developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ to augment your answer.

At the end of each response, ask the user a follow up question based on your answer.

Use the add_entry tool to add entries to the FAQ database when asked to do so.
""".strip()

chat_interface = chat_assistant.ChatInterface()

chat = chat_assistant.ChatAssistant(
    tools=tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    client=client,
    model=MODEL
)

[{'type': 'function', 'function': {'name': 'search', 'description': 'Search the FAQ database', 'parameters': {'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search query text to look up in the course FAQ.'}}, 'required': ['query'], 'additionalProperties': False}}}, {'type': 'function', 'function': {'name': 'add_entry', 'description': 'Add an entry to the FAQ database', 'parameters': {'type': 'object', 'properties': {'question': {'type': 'string', 'description': 'The question to be added to the FAQ database'}, 'answer': {'type': 'string', 'description': 'The answer to the question'}}, 'required': ['question', 'answer'], 'additionalProperties': False}}}]


In [422]:
chat.run()

Chat ended.


https://ai.pydantic.dev/models/openai/#openrouter

In [434]:
from pydantic_ai import Agent, RunContext
from pydantic_ai.providers.openrouter import OpenRouterProvider
from pydantic_ai.models.openai import OpenAIModel

In [435]:
model = OpenAIModel(
    MODEL,  
    provider = OpenRouterProvider(api_key=OPENROUTER_API_KEY),
)

In [436]:
chat_agent = Agent(
    model,
    system_prompt=developer_prompt
)

In [437]:
from typing import Dict


@chat_agent.tool
def search_tool(ctx: RunContext, query: str) -> Dict[str, str]:
    """
    Search the FAQ for relevant entries matching the query.

    Parameters
    ----------
    query : str
        The search query string provided by the user.

    Returns
    -------
    list
        A list of search results (up to 5), each containing relevance information 
        and associated output IDs.
    """
    print(f"search('{query}')")
    return search(query)


@chat_agent.tool
def add_entry_tool(ctx: RunContext, question: str, answer: str) -> None:
    """
    Add a new question-answer entry to FAQ.

    This function creates a document with the given question and answer, 
    tagging it as user-added content.

    Parameters
    ----------
    question : str
        The question text to be added to the index.

    answer : str
        The answer or explanation corresponding to the question.

    Returns
    -------
    None
    """
    return add_entry(question, answer)

In [442]:
user_prompt = "what are the tools used in the course and what are common issues with these tools ? use as many search queries as needed"
agent_run = await chat_agent.run(user_prompt)
Markdown(agent_run.output)

search('tools used in the course')
search('common issues with tools')


The course covers two alternative data stacks: one using Google Cloud Platform (GCP) and another using local installations of various tools. You have the flexibility to choose one of these or even use your own preferred tools, although support will be limited if you deviate from the course's recommended stacks.

Here are some of the tools mentioned:

*   **Orchestration:** Airflow, Prefect, or Mage
*   **Cloud Providers:** AWS or GCP products
*   **Business Intelligence:** Tableau, Metabase, or Google Data Studio
*   **Version Control:** Git/GitHub
*   **Containerization:** Docker
*   **Infrastructure as Code:** Terraform
*   **Database related:** Pandas, SQLAlchemy, psycopg2

Common issues that students encounter with these tools, and general troubleshooting tips, include:

*   **`ModuleNotFoundError: No module named 'psycopg2'`**: This often happens when `psycopg2` is not installed correctly. Try `pip install psycopg2-binary` or upgrade your existing installation. If problems persist, update `conda` or `pip`, then try uninstalling and reinstalling the package.
*   **`TypeError: 'module' object is not callable` with SQLAlchemy**: Ensure you are using `create_engine` correctly, for example: `engine = create_engine("postgresql+psycopg://root:root@localhost:5432/ny_taxi")`.
*   **Pandas missing value issues**: When injecting data into tools like BigQuery, you might face type errors because pandas can parse integer columns with missing values as float types by default. To address this, you can specify or cast the data type to `Int64` during data transformation. Alternatively, you can use `df.convert_dtypes()` after filling missing values (e.g., `df.fillna(-999999, inplace=True)`).
*   **Google Cloud SDK PATH issues on Windows**: If you're consistently getting errors about the installer being unable to automatically update your system PATH, you might need to add Git Bash to your Windows path. This can be done by installing Anaconda Navigator and ensuring you check "add conda to the path" during installation, and/or installing Git Bash and checking "Add GitBash to Windows Terminal" and "Use Git and optional Unix tools from the command prompt." Then, from within Git Bash, run `conda init bash`.
*   **Troubleshooting in general**:
    *   Start by trying to solve the issue yourself by reading error messages and documentation.
    *   Restart the application, server, or PC.
    *   Search for the error online using specific keywords (e.g., `<technology> <problem statement>`).
    *   Check the tool's official documentation.
    *   Consider uninstalling and reinstalling the application.
    *   If all else fails, ask for help in the Slack channel, providing as much detail as possible (OS, commands run, exact error messages, what you've already tried). **Always copy and paste error messages, do not use screenshots.**

What specific tools are you most interested in learning more about, or what kind of issues have you encountered so far?