### Day 1

In [1]:
import io
import zipfile
import requests
import frontmatter

In [2]:
url = 'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main'
resp = requests.get(url)

In [3]:
repository_data = []

# Create a ZipFile object from the downloaded content
zf = zipfile.ZipFile(io.BytesIO(resp.content))

for file_info in zf.infolist():
    filename = file_info.filename.lower()

    # Only process markdown files
    if not filename.endswith('.md'):
        continue

    # Read and parse each file
    with zf.open(file_info) as f_in:
        content = f_in.read()
        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)

zf.close()


In [4]:
print(len(repository_data))
print(repository_data[1])

1228
{'content': '# DataTalks.Club FAQ\n\nA static site generator for DataTalks.Club course FAQs with automated AI-powered FAQ maintenance.\n\n## Features\n\n- **Static Site Generation**: Converts markdown FAQs to a beautiful, searchable HTML site\n- **Automated FAQ Management**: AI-powered bot that processes new FAQ proposals\n- **Intelligent Triage**: Automatically determines if proposals should create new entries, update existing ones, or are duplicates\n- **GitHub Integration**: Seamless workflow via GitHub Issues and Pull Requests\n\n## Project Structure\n\n```\nfaq/\n├── _questions/              # FAQ content organized by course\n│   ├── machine-learning-zoomcamp/\n│   │   ├── _metadata.yaml   # Course configuration\n│   │   ├── general/         # General course questions\n│   │   ├── module-1/        # Module-specific questions\n│   │   └── ...\n│   ├── data-engineering-zoomcamp/\n│   └── ...\n├── _layouts/                # Jinja2 HTML templates\n│   ├── base.html\n│   ├── cours

In [5]:
repository_data = []

# Create a ZipFile object from the downloaded content
zf = zipfile.ZipFile(io.BytesIO(resp.content))

for file_info in zf.infolist():
    filename = file_info.filename.lower()

    if not (filename.endswith('.md') or filename.endswith('.mdx')):
        continue


    # Read and parse each file
    with zf.open(file_info) as f_in:
        content = f_in.read()
        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)

zf.close()


In [6]:
print(repository_data[1])

{'content': '# DataTalks.Club FAQ\n\nA static site generator for DataTalks.Club course FAQs with automated AI-powered FAQ maintenance.\n\n## Features\n\n- **Static Site Generation**: Converts markdown FAQs to a beautiful, searchable HTML site\n- **Automated FAQ Management**: AI-powered bot that processes new FAQ proposals\n- **Intelligent Triage**: Automatically determines if proposals should create new entries, update existing ones, or are duplicates\n- **GitHub Integration**: Seamless workflow via GitHub Issues and Pull Requests\n\n## Project Structure\n\n```\nfaq/\n├── _questions/              # FAQ content organized by course\n│   ├── machine-learning-zoomcamp/\n│   │   ├── _metadata.yaml   # Course configuration\n│   │   ├── general/         # General course questions\n│   │   ├── module-1/        # Module-specific questions\n│   │   └── ...\n│   ├── data-engineering-zoomcamp/\n│   └── ...\n├── _layouts/                # Jinja2 HTML templates\n│   ├── base.html\n│   ├── course.htm

In [7]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.
    
    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
    
    Returns:
        List of dictionaries containing file content and metadata
    """
    prefix = 'https://codeload.github.com' 
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/main'
    resp = requests.get(url)
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    for file_info in zf.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        if not (filename_lower.endswith('.md') 
            or filename_lower.endswith('.mdx')):
            continue
    
        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue
    
    zf.close()
    return repository_data


In [8]:
dtc_faq = read_repo_data("DataTalksClub", "faq")
evidently_docs = read_repo_data("evidentlyai", "docs")

print(type(dtc_faq), len(dtc_faq))
print(type(evidently_docs), len(evidently_docs))
print(dtc_faq[0].keys())


<class 'list'> 1228
<class 'list'> 95
dict_keys(['content', 'filename'])


In [9]:
doc_45 = dtc_faq[45]

print(doc_45)

{'id': '1b03f3dadf', 'question': 'Set up Chrome Remote Desktop for Linux on Compute Engine', 'sort_order': 45, 'content': 'This [tutorial](https://cloud.google.com/architecture/chrome-desktop-remote-on-compute-engine?hl=en) shows you how to set up the Chrome Remote Desktop service on a Debian Linux virtual machine (VM) instance on Compute Engine. Chrome Remote Desktop allows you to remotely access applications with a graphical user interface.', 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/045_1b03f3dadf_set-up-chrome-remote-desktop-for-linux-on-compute.md'}


### Day 2

In [10]:
def sliding_window(seq, size, step):
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk = seq[i:i+size]
        result.append({'start': i, 'chunk': chunk})
        if i + size >= n:
            break

    return result

In [11]:
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for i, ch in enumerate(chunks):
        chunk_dict = {
            "chunk": ch,          # ch is the string
            "chunk_id": i,
            **doc_copy
        }
        evidently_chunks.append(chunk_dict)

In [12]:
import re
text = evidently_docs[45]['content']
paragraphs = re.split(r"\n\s*\n", text.strip())

In [13]:
import re

def split_markdown_by_level(text, level=2):
    """
    Split markdown text by a specific header level.
    
    :param text: Markdown text as a string
    :param level: Header level to split on
    :return: List of sections as strings
    """
    # This regex matches markdown headers
    # For level 2, it matches lines starting with "## "
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)

    # Split and keep the headers
    parts = pattern.split(text)
    
    sections = []
    for i in range(1, len(parts), 3):
        # We step by 3 because regex.split() with
        # capturing groups returns:
        # [before_match, group1, group2, after_match, ...]
        # here group1 is "## ", group2 is the header text
        header = parts[i] + parts[i+1]  # "## " + "Title"
        header = header.strip()

        # Get the content after this header
        content = ""
        if i+2 < len(parts):
            content = parts[i+2].strip()

        if content:
            section = f'{header}\n\n{content}'
        else:
            section = header
        sections.append(section)
    
    return sections

In [14]:
sections = split_markdown_by_level(text, level=2)

In [15]:
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for i, section in enumerate(sections):
        section_doc = doc_copy.copy()
        section_doc["chunk"] = section      # ✅ now you have chunk
        section_doc["chunk_id"] = i
        section_doc["section"] = section    # optional: keep old name too
        evidently_chunks.append(section_doc)

In [16]:
# from openai import OpenAI

# openai_client = OpenAI()


# def llm(prompt, model='gpt-4o-mini'):
#     messages = [
#         {"role": "user", "content": prompt}
#     ]

#     response = openai_client.responses.create(
#         model='gpt-4o-mini',
#         input=messages
#     )

#     return response.output_text

In [17]:
# prompt_template = """
# Split the provided document into logical sections
# that make sense for a Q&A system.

# Each section should be self-contained and cover
# a specific topic or concept.

# <DOCUMENT>
# {document}
# </DOCUMENT>

# Use this format:

# ## Section Name

# Section content with all relevant details

# ---

# ## Another Section Name

# Another section content

# ---
# """.strip()

In [18]:
# def intelligent_chunking(text):
#     prompt = prompt_template.format(document=text)
#     response = llm(prompt)
#     sections = response.split('---')
#     sections = [s.strip() for s in sections if s.strip()]
#     return sections

In [19]:
# from tqdm.auto import tqdm

# evidently_chunks = []

# for doc in tqdm(evidently_docs):
#     doc_copy = doc.copy()
#     doc_content = doc_copy.pop('content')

#     sections = intelligent_chunking(doc_content)
#     for section in sections:
#         section_doc = doc_copy.copy()
#         section_doc['section'] = section
#         evidently_chunks.append(section_doc)

### Day 3

In [20]:
from minsearch import Index

index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)

index.fit(evidently_chunks)


<minsearch.minsearch.Index at 0x107825fa0>

In [21]:
query = 'What should be in a test dataset for AI evaluation?'
results = index.search(query)


In [22]:
dtc_faq = read_repo_data('DataTalksClub', 'faq')

de_dtc_faq = [d for d in dtc_faq if 'data-engineering' in d['filename']]

faq_index = Index(
    text_fields=["question", "content"],
    keyword_fields=[]
)

faq_index.fit(de_dtc_faq)


<minsearch.minsearch.Index at 0x1078dbec0>

In [23]:
de_dtc_faq

[{'id': '9e508f2212',
  'question': 'Course: When does the course start?',
  'sort_order': 1,
  'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the channel.",
  'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md'},
 {'id': 'bfafa427b3',
  'question': 'Course: What are the prerequisites for this course?',
  'sort_order': 2,
  'content': 'To get the most out of this course, you should have:\n\n- Basic coding experience\n- Familiarity with SQL\n- Experience with Python (helpful but not required)\n\nNo prior data engineering experience is necessary. See [Readme on GitHub](https

In [24]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')


In [25]:
record = de_dtc_faq[2]
text = record['question'] + ' ' + record['content']
v_doc = embedding_model.encode(text)


In [26]:
query = 'I just found out about the course. Can I enroll now?'
v_query = embedding_model.encode(query)


In [27]:
similarity = v_query.dot(v_doc)

In [28]:
from tqdm.auto import tqdm
import numpy as np

faq_embeddings = []

for d in tqdm(de_dtc_faq):
    text = d['question'] + ' ' + d['content']
    v = embedding_model.encode(text)
    faq_embeddings.append(v)

faq_embeddings = np.array(faq_embeddings)


  0%|          | 0/449 [00:00<?, ?it/s]

In [29]:
from minsearch import VectorSearch

faq_vindex = VectorSearch()
faq_vindex.fit(faq_embeddings, de_dtc_faq)


<minsearch.vector.VectorSearch at 0x169e50590>

In [30]:
query = 'Can I join the course now?'
q = embedding_model.encode(query)
results = faq_vindex.search(q)


In [31]:
evidently_embeddings = []

for d in tqdm(evidently_chunks):
    v = embedding_model.encode(d['chunk'])
    evidently_embeddings.append(v)

evidently_embeddings = np.array(evidently_embeddings)

evidently_vindex = VectorSearch()
evidently_vindex.fit(evidently_embeddings, evidently_chunks)


  0%|          | 0/266 [00:00<?, ?it/s]

<minsearch.vector.VectorSearch at 0x15db9a180>

In [32]:
query = 'Can I join the course now?'

text_results = faq_index.search(query, num_results=5)

q = embedding_model.encode(query)
vector_results = faq_vindex.search(q, num_results=5)

final_results = text_results + vector_results


In [33]:
def text_search(query):
    return faq_index.search(query, num_results=5)

def vector_search(query):
    q = embedding_model.encode(query)
    return faq_vindex.search(q, num_results=5)

def hybrid_search(query):
    text_results = text_search(query)
    vector_results = vector_search(query)
    
    # Combine and deduplicate results
    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        if result['filename'] not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)
    
    return combined_results


### Day 4

In [34]:
import openai

openai_client = openai.OpenAI()

user_prompt = "I just discovered the course, can I join now?"

chat_messages = [
    {"role": "user", "content": user_prompt}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
)

print(response.output_text)

It depends on the specific course and its enrollment policies. Many online courses allow for rolling admissions or ongoing enrollment, while others may have specific start dates or deadlines. Check the course website or contact the course administrator for more details.


In [35]:
text_search_tool = {
    "type": "function",
    "name": "text_search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }
}


In [36]:
system_prompt = """
You are a helpful assistant for a course. 
"""

question = "I just discovered the course, can I join now?"

chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=[text_search_tool]
)


In [37]:
response.output

[ResponseFunctionToolCall(arguments='{"query":"Can I join the course now?"}', call_id='call_D8RfAyCQzTMiN2d9TK0FE07l', name='text_search', type='function_call', id='fc_028c19f23b60d6ac00694cae638c9881a184172d45f01275b5', status='completed')]

In [38]:
import json

call = response.output[0]

arguments = json.loads(call.arguments)
result = text_search(**arguments)

call_output = {
    "type": "function_call_output",
    "call_id": call.call_id,
    "output": json.dumps(result),
}


In [39]:
chat_messages.append(call)
chat_messages.append(call_output)

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=[text_search_tool]
)

print(response.output_text)

Yes, you can still join the course even after the start date. You are eligible to submit homework assignments, but keep in mind that there will be deadlines for turning in homework and the final projects. It’s advisable not to leave everything until the last minute.

If you're interested in joining, make sure to register and check the course details to stay updated.


In [40]:
system_prompt = """
You are a helpful assistant for a course. 

Use the search tool to find relevant information from the course materials before answering questions.

If you can find specific information through search, use it to provide accurate answers.
If the search doesn't return relevant results, let the user know and provide general guidance.
"""


In [41]:
system_prompt = """
You are a helpful assistant for a course. 

Always search for relevant information before answering. 
If the first search doesn't give you enough information, try different search terms.

Make multiple searches if needed to provide comprehensive answers.
"""


In [42]:
from typing import List, Any

def text_search(query: str) -> List[Any]:
    """
    Perform a text-based search on the FAQ index.

    Args:
        query (str): The search query string.

    Returns:
        List[Any]: A list of up to 5 search results returned by the FAQ index.
    """
    return faq_index.search(query, num_results=5)

In [43]:
from pydantic_ai import Agent

agent = Agent(
    name="faq_agent",
    instructions=system_prompt,
    tools=[text_search],
    model='gpt-4o-mini'
)


In [44]:
question = "I just discovered the course, can I join now?"

result = await agent.run(user_prompt=question)

In [46]:
result

AgentRunResult(output="Yes, you can still join the course even after it has started. Although you may not be officially registered, you are still eligible to submit homework assignments. However, keep in mind that there are deadlines for submitting homework and final projects, so it’s advisable not to postpone your work until the last minute. \n\nIf you're interested, the next cohort of the course will start on January 13th, 2025, and you can register for it [here](https://airtable.com/shr6oVXeQvSI5HuWD).")

In [47]:
result.new_messages()

[ModelRequest(parts=[UserPromptPart(content='I just discovered the course, can I join now?', timestamp=datetime.datetime(2025, 12, 25, 3, 24, 43, 271819, tzinfo=datetime.timezone.utc))], timestamp=datetime.datetime(2025, 12, 25, 3, 24, 43, 272070, tzinfo=datetime.timezone.utc), instructions="You are a helpful assistant for a course. \n\nAlways search for relevant information before answering. \nIf the first search doesn't give you enough information, try different search terms.\n\nMake multiple searches if needed to provide comprehensive answers.", run_id='523fc8aa-2b25-4020-a0b9-89a484022f4d'),
 ModelResponse(parts=[ToolCallPart(tool_name='text_search', args='{"query":"can I join the course now?"}', tool_call_id='call_BB6Q42vBGhMlNEE9XsVJzq1H')], usage=RequestUsage(input_tokens=146, output_tokens=20, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}), model_name='gpt-4o-mini-2024-07-18', timestamp=datetime.datetime(202

### Day 5

In [48]:
question = "how do I install Kafka in Python?"
result = await agent.run(user_prompt=question)

In [49]:
result

AgentRunResult(output="To install Kafka for Python, you'll want to install specific libraries that enable you to interact with Kafka. Here are the recommended steps:\n\n1. **Install `confluent-kafka`:**\n   You can use either `pip` or `conda` to install this library, which is a high-performance Kafka client for Python.\n\n   - Using pip:\n   ```bash\n   pip install confluent-kafka\n   ```\n\n   - Using conda:\n   ```bash\n   conda install conda-forge::python-confluent-kafka\n   ```\n\n2. **(Optional) Install `fastavro`:**\n   If you're working with Avro data formats or want to optimize serialization/deserialization, you can also install `fastavro`.\n\n   ```bash\n   pip install fastavro\n   ```\n\n3. **Alternative Kafka client:** \n   If you encounter issues with the `kafka-python` library, it's suggested to install `kafka-python-ng`:\n\n   ```bash\n   pip install kafka-python-ng\n   ```\n\nMake sure to choose the library that best suits your project needs!")

In [50]:
from pydantic_ai.messages import ModelMessagesTypeAdapter


def log_entry(agent, messages, source="user"):
    tools = []

    for ts in agent.toolsets:
        tools.extend(ts.tools.keys())

    dict_messages = ModelMessagesTypeAdapter.dump_python(messages)

    return {
        "agent_name": agent.name,
        "system_prompt": agent._instructions,
        "provider": agent.model.system,
        "model": agent.model.model_name,
        "tools": tools,
        "messages": dict_messages,
        "source": source
    }

In [57]:
import json
import secrets
from pathlib import Path
from datetime import datetime


LOG_DIR = Path('logs')
LOG_DIR.mkdir(exist_ok=True)


def serializer(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")


def log_interaction_to_file(agent, messages, source='user'):
    entry = log_entry(agent, messages, source)

    ts = entry['messages'][-1]['timestamp']
    ts_str = ts.strftime("%Y%m%d_%H%M%S")
    rand_hex = secrets.token_hex(3)

    filename = f"{agent.name}_{ts_str}_{rand_hex}.json"
    filepath = LOG_DIR / filename

    with filepath.open("w", encoding="utf-8") as f_out:
        json.dump(entry, f_out, indent=2, default=serializer)

    return filepath

In [59]:
question = input()
result = await agent.run(user_prompt=question)
print(result.output)
log_interaction_to_file(agent, result.new_messages())

 what do I need to do for the certificate?


To obtain a certificate for the course, you need to complete the following requirements:

1. **Peer-Reviewed Capstone Projects**: You must finish the peer-reviewed capstone projects on time. Completing the homeworks is not mandatory if you join late, as long as you submit the projects within the required timeframe.

2. **Course Format**: You need to complete the course with a “live” cohort. Certificates are not awarded for completing the course in self-paced mode, since peer reviews can only occur while the course is actively running.

3. **Certificate Generation**: After the course grading is completed, there will be an announcement on Telegram and in the course channel about:
   - Checking that your full name is displayed correctly on the Certificate.
   - Instructions on how to generate your certificate document yourself. You will find the certificate in your course profile.

   Make sure you're logged in to access your course profile. The link for the 2025 edition is [https://cours

PosixPath('logs/faq_agent_v2_20251225_032939_e451a8.json')

In [55]:
system_prompt = """
You are a helpful assistant for a course.  

Use the search tool to find relevant information from the course materials before answering questions.  

If you can find specific information through search, use it to provide accurate answers.

Always include references by citing the filename of the source material you used.  
When citing the reference, replace "faq-main" by the full path to the GitHub repository: "https://github.com/DataTalksClub/faq/blob/main/"
Format: [LINK TITLE](FULL_GITHUB_LINK)

If the search doesn't return relevant results, let the user know and provide general guidance.  
""".strip()

# Create another version of agent, let's call it faq_agent_v2
agent = Agent(
    name="faq_agent_v2",
    instructions=system_prompt,
    tools=[text_search],
    model='gpt-4o-mini'
)


In [60]:
evaluation_prompt = """
Use this checklist to evaluate the quality of an AI agent's answer (<ANSWER>) to a user question (<QUESTION>).
We also include the entire log (<LOG>) for analysis.

For each item, check if the condition is met. 

Checklist:

- instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)
- instructions_avoid: The agent avoided doing things it was told not to do  
- answer_relevant: The response directly addresses the user's question  
- answer_clear: The answer is clear and correct  
- answer_citations: The response includes proper citations or sources when required  
- completeness: The response is complete and covers all key aspects of the request
- tool_call_search: Is the search tool invoked? 

Output true/false for each check and provide a short explanation for your judgment.
""".strip()

In [61]:
from pydantic import BaseModel

class EvaluationCheck(BaseModel):
    check_name: str
    justification: str
    check_pass: bool

class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck]
    summary: str

In [62]:
eval_agent = Agent(
    name='eval_agent',
    model='gpt-5-nano',
    instructions=evaluation_prompt,
    output_type=EvaluationChecklist
)


In [63]:
user_prompt_format = """
<INSTRUCTIONS>{instructions}</INSTRUCTIONS>
<QUESTION>{question}</QUESTION>
<ANSWER>{answer}</ANSWER>
<LOG>{log}</LOG>
""".strip()

In [64]:
def load_log_file(log_file):
    with open(log_file, 'r') as f_in:
        log_data = json.load(f_in)
        log_data['log_file'] = log_file
        return log_data

In [66]:
log_record = load_log_file('./logs/faq_agent_v2_20251225_032927_164b47.json')

instructions = log_record['system_prompt']
question = log_record['messages'][0]['parts'][0]['content']
answer = log_record['messages'][-1]['parts'][0]['content']
log = json.dumps(log_record['messages'])

user_prompt = user_prompt_format.format(
    instructions=instructions,
    question=question,
    answer=answer,
    log=log
)


In [67]:
result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)

checklist = result.output
print(checklist.summary)

for check in checklist.checklist:
    print(check)

The answer is accurate, well-cited, and complete. It directly addresses joining late and receiving a certificate, includes required citation format, and points the user to relevant FAQ resources.
check_name='instructions_follow' justification='The assistant followed the course instructions to search for relevant information and cite sources, and provided the required citations.' check_pass=True
check_name='instructions_avoid' justification='No disallowed content or actions detected; answered within scope.' check_pass=True
check_name='answer_relevant' justification='Response directly answers whether one can join late and get a certificate and under what conditions.' check_pass=True
check_name='answer_clear' justification='The explanation is clear: you can join late if you complete peer-reviewed capstone on time; no homework requirement; includes steps to check announcements and where to read more.' check_pass=True
check_name='answer_citations' justification='Citations are present with p

In [68]:
def simplify_log_messages(messages):
    log_simplified = []

    for m in messages:
        parts = []
    
        for original_part in m['parts']:
            part = original_part.copy()
            kind = part['part_kind']
    
            if kind == 'user-prompt':
                del part['timestamp']
            if kind == 'tool-call':
                del part['tool_call_id']
            if kind == 'tool-return':
                del part['tool_call_id']
                del part['metadata']
                del part['timestamp']
                # Replace actual search results with placeholder to save tokens
                part['content'] = 'RETURN_RESULTS_REDACTED'
            if kind == 'text':
                del part['id']
    
            parts.append(part)
    
        message = {
            'kind': m['kind'],
            'parts': parts
        }
    
        log_simplified.append(message)
    return log_simplified

In [70]:
async def evaluate_log_record(eval_agent, log_record):
    messages = log_record['messages']

    instructions = log_record['system_prompt']
    question = messages[0]['parts'][0]['content']
    answer = messages[-1]['parts'][0]['content']

    log_simplified = simplify_log_messages(messages)
    log = json.dumps(log_simplified)

    user_prompt = user_prompt_format.format(
        instructions=instructions,
        question=question,
        answer=answer,
        log=log
    )

    result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)
    return result.output 


log_record = load_log_file('./logs/faq_agent_v2_20251225_032927_164b47.json')
eval1 = await evaluate_log_record(eval_agent, log_record)

In [71]:
question_generation_prompt = """
You are helping to create test questions for an AI agent that answers questions about a data engineering course.

Based on the provided FAQ content, generate realistic questions that students might ask.

The questions should:

- Be natural and varied in style
- Range from simple to complex
- Include both specific technical questions and general course questions

Generate one question for each record.
""".strip()

class QuestionsList(BaseModel):
    questions: list[str]

question_generator = Agent(
    name="question_generator",
    instructions=question_generation_prompt,
    model='gpt-4o-mini',
    output_type=QuestionsList
)


In [72]:
import random

sample = random.sample(de_dtc_faq, 10)
prompt_docs = [d['content'] for d in sample]
prompt = json.dumps(prompt_docs)

result = await question_generator.run(prompt)
questions = result.output.questions

In [73]:
from tqdm.auto import tqdm

for q in tqdm(questions):
    print(q)

    result = await agent.run(user_prompt=q)
    print(result.output)

    log_interaction_to_file(
        agent,
        result.new_messages(),
        source='ai-generated'
    )

    print()

  0%|          | 0/10 [00:00<?, ?it/s]

How can I authenticate my Spark application with Google Cloud Storage using credentials?
To authenticate your Spark application with Google Cloud Storage (GCS) using credentials, you can follow these steps:

1. **Create a Service Account and Obtain Key**:
   - Go to the Google Cloud Console.
   - Navigate to "IAM & Admin" > "Service accounts".
   - Create a new service account and assign it the necessary roles (e.g., Storage Object Admin).
   - Generate a JSON key for this service account.

2. **Set Environment Variables**:
   - You can set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` to point to your service account key JSON file. This will allow your Spark application to locate the credentials for authentication.
   ```bash
   export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your/service-account-file.json"
   ```

3. **Use Spark Configuration** (if applicable):
   - When initializing your Spark session, you might also need to set configuration properties to use GCS, espec

In [74]:
eval_set = []

for log_file in LOG_DIR.glob('*.json'):
    if 'faq_agent_v2' not in log_file.name:
        continue

    log_record = load_log_file(log_file)
    if log_record['source'] != 'ai-generated':
        continue

    eval_set.append(log_record)

In [75]:
eval_results = []

for log_record in tqdm(eval_set):
    eval_result = await evaluate_log_record(eval_agent, log_record)
    eval_results.append((log_record, eval_result))

  0%|          | 0/10 [00:00<?, ?it/s]

In [76]:
rows = []

for log_record, eval_result in eval_results:
    messages = log_record['messages']

    row = {
        'file': log_record['log_file'].name,
        'question': messages[0]['parts'][0]['content'],
        'answer': messages[-1]['parts'][0]['content'],
    }

    checks = {c.check_name: c.check_pass for c in eval_result.checklist}
    row.update(checks)

    rows.append(row)

In [77]:
import pandas as pd

df_evals = pd.DataFrame(rows)

In [83]:
df_evals.mean(numeric_only=True)

instructions_follow    1.0
dtype: float64

In [84]:
def evaluate_search_quality(search_function, test_queries):
    results = []
    
    for query, expected_docs in test_queries:
        search_results = search_function(query, num_results=5)
        
        # Calculate hit rate
        relevant_found = any(doc['filename'] in expected_docs for doc in search_results)
        
        # Calculate MRR
        for i, doc in enumerate(search_results):
            if doc['filename'] in expected_docs:
                mrr = 1 / (i + 1)
                break
        else:
            mrr = 0
            
        results.append({
            'query': query,
            'hit': relevant_found,
            'mrr': mrr
        })
    return results