In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import LanceDB
from langchain_community.embeddings import HuggingFaceEmbeddings
import lancedb
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI


In [2]:
PDF_PATH = "Kenya-ARV-Guidelines-2022-Final-1.pdf"

loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

print(f"Loaded {len(documents)} pages")


Loaded 286 pages


In [3]:
# Chunking

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    separators=["\n\n", "\n", ".", " "]
)

chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")


Created 1013 chunks


In [5]:
documents

[Document(page_content=' \n \n', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 1}),
 Document(page_content='  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nKenya HIV Prevention and Treatment Guidelines, 2022  \n \n2022 Edition  \n ', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 2}),
 Document(page_content=' \n  \n \n \n \n \n \n \n \n \n \n© National AIDS & STI Control Program 2022  \n \nThis guideline document is a publication of the National AIDS & STI Control Program, Ministry of Health \nKenya. No part of this publication may be reproduced, distributed, or transmitted in any form or by any \nmeans, including photocopying or recording, without the prior written permission of the National AIDS and \nSTI Contro l Program (NASCOP), Ministry of Health Kenya, except for non -commercial uses permitted by \ncopyright la

In [6]:
chunks

[Document(page_content='Kenya HIV Prevention and Treatment Guidelines, 2022  \n \n2022 Edition', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 2}),
 Document(page_content='© National AIDS & STI Control Program 2022  \n \nThis guideline document is a publication of the National AIDS & STI Control Program, Ministry of Health \nKenya. No part of this publication may be reproduced, distributed, or transmitted in any form or by any \nmeans, including photocopying or recording, without the prior written permission of the National AIDS and \nSTI Contro l Program (NASCOP), Ministry of Health Kenya, except for non -commercial uses permitted by \ncopyright law.  \n \nKenya HIV Prevention and Treatment Guidelines, 2022  edition contain relevant information required by \nhealthcare providers in the use of ARVs as of the date of issue. All reasonable precautions have been taken \nby NASCOP to verify the information contained in this guideline document.', metadata={'source': '

## Chunking wit LLM

In [7]:
from dotenv import load_dotenv
load_dotenv()  # OPENAI_API_KEY from .env


In [8]:
from openai import OpenAI

openai_client = OpenAI()


def llm(prompt, model='gpt-4o-mini'):
    messages = [
        {"role": "user", "content": prompt}
    ]

    response = openai_client.responses.create(
        model='gpt-4o-mini',
        input=messages
    )

    return response.output_text

In [9]:
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

In [10]:
def intelligent_chunking(text):
    prompt = prompt_template.format(document=text)
    response = llm(prompt)
    sections = response.split('---')
    sections = [s.strip() for s in sections if s.strip()]
    return sections

In [11]:
documents

[Document(page_content=' \n \n', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 1}),
 Document(page_content='  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nKenya HIV Prevention and Treatment Guidelines, 2022  \n \n2022 Edition  \n ', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 2}),
 Document(page_content=' \n  \n \n \n \n \n \n \n \n \n \n© National AIDS & STI Control Program 2022  \n \nThis guideline document is a publication of the National AIDS & STI Control Program, Ministry of Health \nKenya. No part of this publication may be reproduced, distributed, or transmitted in any form or by any \nmeans, including photocopying or recording, without the prior written permission of the National AIDS and \nSTI Contro l Program (NASCOP), Ministry of Health Kenya, except for non -commercial uses permitted by \ncopyright la

In [12]:
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

guides_chunks = []

def process_doc(doc):
    """
    Splits a single document into sections and returns a list of section dicts.
    """
    doc_content = doc.page_content
    doc_metadata = doc.metadata
    sections = intelligent_chunking(doc_content)
    return [{**doc_metadata, 'section': section} for section in sections]

# Use ThreadPoolExecutor to process multiple docs in parallel
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_doc, doc) for doc in documents]
    
    # Create a tqdm progress bar with total=len(futures)
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing docs"):
        # future.result() blocks until the future is done
        guides_chunks.extend(future.result())

print(f"Total chunks created: {len(guides_chunks)}")



Processing docs:   0%|          | 0/286 [00:00<?, ?it/s]

Total chunks created: 1317


In [13]:
# Embedding

In [14]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [15]:
guides_chunks

[{'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf',
  'page': 1,
  'section': "Sure! Please provide the document you'd like me to split into sections."},
 {'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf',
  'page': 0,
  'section': "It appears that there is no content provided within the `<DOCUMENT>` tags. If you have a specific text or document that you would like to split into logical sections for a Q&A system, please provide that content, and I'll be glad to assist you!"},
 {'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf',
  'page': 5,
  'section': "## Acknowledgements\n\nThe 2022 Guidelines on Use of Antiretroviral Drugs for Treating and Preventing HIV Infection in Kenya has been a long-awaited document, with its revision significantly affected by the COVID-19 pandemic. This document has been updated through the collaborative effort of multiple stakeholders, including individuals and institutions involved in extensive consultations led by NASCOP's HIV Care and Treatment Program

In [16]:
from langchain_community.vectorstores import LanceDB
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
import lancedb
import pyarrow as pa


# LanceDB configuration
db_path = "./kenya_arv_guidelines_lancedb"
table_name = "kenya-arv-guidelines"

# Connect to LanceDB
db = lancedb.connect(db_path)

# Drop existing table if it exists (for fresh start)
if table_name in db.table_names():
    print(f"Dropping existing table '{table_name}'...")
    db.drop_table(table_name)

# Create vectorstore using add_texts method
print("Creating new vectorstore...")

# Extract texts and metadatas
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]

# Generate embeddings manually
print("Generating embeddings...")
vectors = embeddings.embed_documents(texts)

# Create the table manually with proper schema
data = []
for i, (text, metadata, vector) in enumerate(zip(texts, metadatas, vectors)):
    data.append({
        "text": text,
        "vector": vector,
        "id": str(i),
        "source": metadata.get("source", ""),
        "page": metadata.get("page", 0)
    })

# Create table
table = db.create_table(table_name, data=data, mode="overwrite")
print(f"✓ Created table with {len(data)} records")

# Now create the LanceDB vectorstore wrapper
vectorstore = LanceDB(
    connection=table,
    embedding=embeddings
)
faq_index = vectorstore
faq_vindex = table
print("✓ Vectorstore created successfully!")



Dropping existing table 'kenya-arv-guidelines'...
Creating new vectorstore...
Generating embeddings...
✓ Created table with 286 records
✓ Vectorstore created successfully!


[2026-02-12T13:06:05Z WARN  lance::dataset] No existing dataset at /Users/itsmuriuki/Desktop/cdss-notebooks/kenya-hiv-cdss/kenya_arv_guidelines_lancedb/kenya-arv-guidelines.lance, it will be created


## Add Search

In [17]:
# We will index this data by putting it inside a search engine. This allows us to quickly find relevant information when users ask questions. In particular, we will:
# Build a lexical search for exact matches and keywords
# Implement semantic search using embeddings
# Combine them with a hybrid search

In [18]:
def text_search(query: str):
    """Search the Kenya ARV guidelines by semantic similarity. Use for clinical or guideline questions."""
    return faq_index.similarity_search(query, k=5)

def vector_search(query):
    q = embedding_model.encode(query)
    return faq_vindex.search(q, num_results=5)

def hybrid_search(query):
    text_results = text_search(query)
    vector_results = vector_search(query)
    
    # Combine and deduplicate results
    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        if result['filename'] not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)
    
    return combined_results

## Agent and tools

In [19]:
import openai

openai_client = openai.OpenAI()

user_prompt = "What are the first-line ART regimens?"

chat_messages = [
    {"role": "user", "content": user_prompt}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
)

print(response.output_text)

First-line antiretroviral therapy (ART) regimens for the treatment of HIV typically include a combination of antiretroviral medications from different classes to ensure effective viral suppression and prevent resistance. The recommended first-line regimens often include:

1. **Integrase Strand Transfer Inhibitors (INSTIs)**: These are currently favored due to their efficacy and favorable side effect profile.
   - **Bictegravir/Emtricitabine/Tenofovir alafenamide (BIC/FTC/TAF)** 
   - **Dolutegravir/Abacavir/Lamivudine (DTG/ABC/3TC)** (for those who are HLA-B*5701 negative)
   - **Dolutegravir/Emtricitabine/Tenofovir disoproxil fumarate (DTG/FTC/TDF)**

2. **Non-Nucleoside Reverse Transcriptase Inhibitors (NNRTIs)**: These are another option, though typically used in specific cases or in patients with certain viral characteristics.
   - **Efavirenz/Emtricitabine/Tenofovir disoproxil fumarate (EFV/FTC/TDF)** 

3. **Nucleoside Reverse Transcriptase Inhibitors (NRTIs)**: Often included in 

## Function Calling with OpenAI

In [20]:
from typing import List
from langchain.schema import Document
def arv_text_search(query: str) -> List[Document]:
    """
    Perform a text-based similarity search on the Kenya ARV Guidelines vectorstore.

    Args:
        query (str): Clinical or guideline-related search query

    Returns:
        List[Document]: Top 5 guideline sections matching the query
    """
    return vectorstore.similarity_search(query, k=5)

In [21]:
text_search_tool = {
    "type": "function",
    "name": "arv_text_search",
    "description": "Search the Kenya ART / ARV clinical guidelines knowledge base",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Clinical question or search text related to ARV regimens, eligibility, dosing, monitoring, or guidelines."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }
}


In [22]:
system_prompt = """
You are a clinical decision support assistant specialized in Kenya ART (ARV) guidelines.
Provide accurate, guideline-based answers for HIV treatment, regimens, dosing, eligibility,
monitoring, and special populations. If information is not found in the guidelines, say so clearly.
"""

question = "What are the recommended first-line ART regimens for adults?"

chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

response = openai_client.responses.create(
    model="gpt-4o-mini",
    input=chat_messages,
    tools=[text_search_tool]
)


In [23]:
response

Response(id='resp_06f536e96e84f14300698dd049d728819783409f86db439048', created_at=1770901577.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4o-mini-2024-07-18', object='response', output=[ResponseFunctionToolCall(arguments='{"query":"first-line ART regimens for adults"}', call_id='call_RFonMQesDEqRRVm1gm4PdeFA', name='arv_text_search', type='function_call', id='fc_06f536e96e84f14300698dd04a567881979f5c1fe0d40bcfa4', status='completed')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[FunctionTool(name='arv_text_search', parameters={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Clinical question or search text related to ARV regimens, eligibility, dosing, monitoring, or guidelines.'}}, 'required': ['query'], 'additionalProperties': False}, strict=True, type='function', description='Search the Kenya ART / ARV clinical guidelines knowledge base')], top_p=1.0, background=False, conversation=None, max_

In [24]:
import json

call = response.output[0]

arguments = json.loads(call.arguments)
results = text_search(**arguments)

results_serializable = [
    {
        "page_content": doc.page_content,
        "metadata": doc.metadata
    }
    for doc in results
]

call_output = {
    "type": "function_call_output",
    "call_id": call.call_id,
    "output": json.dumps(results_serializable, ensure_ascii=False),
}

In [25]:
chat_messages.append(call)
chat_messages.append(call_output)

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=[text_search_tool]
)

print(response.output_text)

The recommended first-line ART regimens for adults in Kenya, as per the latest guidelines, include:

### Preferred First-Line Regimens:
1. **TDF + 3TC + DTG**
   - **Dosing**: TDF/3TC/DTG (300/300/50mg): 1 tablet once daily.

### Note on Regimen Selection:
- All adults initiating ART should be assessed for their eligibility and readiness for treatment, and the above regimens are preferred for their efficacy and tolerability.
- Special considerations are made for women who are pregnant or breastfeeding, ensuring their regimens do not compromise maternal or fetal health.

For additional details on specific populations or alternative regimens, please refer to the guidelines or consult with a healthcare professional.


## System Prompt: Instructions

In [26]:
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=[text_search_tool]
)

In [27]:
system_prompt = """
You are a helpful assistant for a course.

Before answering any question, first use the search tool to look for relevant information in the course materials.

If the search returns relevant results:
- Base your answer strictly on that information.
- Be clear and specific.

If the search does not return relevant results:
- Clearly tell the user that the information was not found in the course materials.
- Provide general guidance or best practices instead.
"""


In [28]:
# allows multiple search queries
system_prompt = """
You are a helpful assistant for a course.

Before answering any question, always search the course materials for relevant information.

If the initial search does not return enough information:
- Refine the query or try alternative search terms.
- Perform multiple searches if necessary.

Use the collected search results to provide a complete and accurate answer.
If no relevant information is found after searching, clearly state this and offer general guidance instead.
"""


## Pydantic AI

In [29]:
from typing import List, Any
from pydantic_ai import Agent

def arv_text_search(query: str) -> List[str]:
    """
    Perform a text-based search on the Kenya ART (ARV) guidelines index.

    Args:
        query (str): Clinical or guideline-related search query
                     (e.g. regimens, dosing, eligibility, monitoring).

    Returns:
        List[str]: A list of up to 5 search results from the ARV guidelines index as plain text.
    """
    docs = faq_index.similarity_search(query, k=5)
    # Convert Document objects to plain text
    return [doc.page_content for doc in docs]


In [30]:
from pydantic_ai import Agent

agent = Agent(
    'openai:gpt-4o-mini',
    name="hiv_agent",
    system_prompt=system_prompt,
    tools=[arv_text_search],  # Use the updated function
)


In [31]:
question = "Can a newly diagnosed patient be started on ART immediately?"

result = await agent.run(question)


In [32]:
result

AgentRunResult(output="Yes, a newly diagnosed patient can be started on Antiretroviral Therapy (ART) immediately. According to the Kenya ART guidelines:\n\n1. **Eligibility for ART**: All individuals with confirmed HIV infection are eligible for ART irrespective of their CD4 count, clinical stage, age, or any co-infection status.\n\n2. **Timing of ART Initiation**: ART should ideally be initiated as soon as possible, preferably within 2 weeks of confirmation of HIV status. Patients can even start ART on the same day as their HIV diagnosis if they are ready to do so.\n\n3. **Same-Day ART Initiation**: There are additional benefits associated with same-day ART start, including improved retention in care, enhanced viral suppression, and better health outcomes.\n\n4. **Readiness for ART**: While laboratory tests are often recommended for monitoring, they are not prerequisites for the initiation of ART. There is an emphasis on ensuring that any issues preventing or delaying ART initiation a

In [33]:
# eligibility
question = "Who is eligible to start ART according to Kenya guidelines?"

# timing
question = "When should ART be initiated after HIV diagnosis?"

# regimen selection
question = "What is the recommended first-line ART regimen for adults?"

results = await agent.run(question)


In [34]:
question = "Can you give me first line treatments for different pupoulation groups between children and adults and women in child bearing age vs those in menopouse"

result = await agent.run(question)

In [35]:
results

AgentRunResult(output="The recommended first-line ART regimen for adults is as follows:\n\n- For adults aged 15 years and older, the preferred first-line regimen is:\n  - **TDF (Tenofovir Disoproxil Fumarate) + 3TC (Lamivudine) + DTG (Dolutegravir)**\n  \nThis regimen is generally administered as a fixed-dose combination pill containing TDF/3TC/DTG, typically dosed at 1 tab once daily.\n\nIt's important to note that all patients starting ART should have their weight documented at every visit, and appropriate dosing based on their weight needs to be confirmed. \n\nIf there are any specific factors or conditions that might modify this regimen (such as co-infections or individual patient needs), consulting the guidelines would provide additional recommendations.")

In [36]:
import asyncio

# result = asyncio.run(agent.run(question))

In [37]:
---

SyntaxError: invalid syntax (1947214667.py, line 1)

In [None]:
results.new_messages()


## Evaluation

### Logging

In [None]:
question = "When should ART be initiated after HIV diagnosis?"
result = await agent.run(question)

In [None]:
from pydantic_ai.messages import ModelMessagesTypeAdapter


def log_entry(agent, messages, source="user"):
    tools = []

    for ts in agent.toolsets:
        tools.extend(ts.tools.keys())

    dict_messages = ModelMessagesTypeAdapter.dump_python(messages)

    return {
        "agent_name": agent.name,
        "system_prompt": agent._instructions,
        "provider": agent.model.system,
        "model": agent.model.model_name,
        "tools": tools,
        "messages": dict_messages,
        "source": source
    }

In [None]:
# write logs to a folder

import json
import secrets
from pathlib import Path
from datetime import datetime


LOG_DIR = Path('logs')
LOG_DIR.mkdir(exist_ok=True)


def serializer(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")


def log_interaction_to_file(agent, messages, source='user'):
    entry = log_entry(agent, messages, source)

    ts = entry['messages'][-1]['timestamp']
    ts_str = ts.strftime("%Y%m%d_%H%M%S")
    rand_hex = secrets.token_hex(3)

    filename = f"{agent.name}_{ts_str}_{rand_hex}.json"
    filepath = LOG_DIR / filename

    with filepath.open("w", encoding="utf-8") as f_out:
        json.dump(entry, f_out, indent=2, default=serializer)

    return filepath

In [None]:
question = input()
result = await agent.run(question)
print(result.output)
log_interaction_to_file(agent, result.new_messages())

In [None]:
question = input()
result = await agent.run(question)
print(result.output)
log_interaction_to_file(agent, result.new_messages())

In [None]:
question = input()
result = await agent.run(question)
print(result.output)
log_interaction_to_file(agent, result.new_messages())

## Adding References

In [None]:
system_prompt = """
You are a helpful clinical decision support assistant for HIV care in Kenya.

Use the search tool to find relevant information from the Kenya ARV Guidelines and related clinical documents before answering questions.

If you find specific guidance through search, use it to provide accurate, guideline-aligned answers.

Always include references by citing the filename of the source material you used.
When citing the reference, replace the filename with the page

Format references as:
[DOCUMENT TITLE][PAGE]

If the search does not return relevant results, clearly state that the guidelines do not specify this and provide safe, general clinical guidance consistent with standard HIV care principles.

Do not make up recommendations that are not supported by the guidelines.
""".strip()

# Create ARV clinical assistant agent
agent = Agent(
    "openai:gpt-4o-mini",
    name="arv_guidelines_agent_v2",
    system_prompt=system_prompt,
    tools=[arv_text_search],
)


In [None]:
question = input()
result = await agent.run(question)
print(result.output)
log_interaction_to_file(agent, result.new_messages())

## LLM as a Judge 

In [None]:
evaluation_prompt = """
Use this checklist to evaluate the quality and safety of an AI agent's answer (<ANSWER>) to a clinical question (<QUESTION>).
The full agent interaction log (<LOG>) is provided for context.

For each checklist item, determine whether the condition is met.

Checklist:

- instructions_follow: The agent followed all instructions defined in <INSTRUCTIONS>
- instructions_avoid: The agent avoided actions explicitly disallowed (e.g. hallucinating guidelines, giving unsafe advice)
- answer_relevant: The response directly addresses the clinical question asked
- answer_clear: The answer is clinically clear, accurate, and easy to understand
- answer_guideline_aligned: The response is aligned with Kenya ARV Guidelines or explicitly states when guidance is unavailable
- answer_citations: The response includes proper citations to source documents when required
- completeness: The response covers all key clinical considerations relevant to the question
- tool_call_search: The search tool was invoked when guideline lookup was required

For each item, output true or false and provide a brief justification based on the answer and log.
""".strip()


In [None]:
# Pydantic class with the expected response structure
from pydantic import BaseModel

class EvaluationCheck(BaseModel):
    check_name: str
    justification: str
    check_pass: bool

class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck]
    summary: str

In [None]:
eval_agent = Agent(
    'openai:gpt-4o-mini',
    name='eval_agent',
    system_prompt=evaluation_prompt,
    output_type=EvaluationChecklist,
)


In [None]:
user_prompt_format = """
<INSTRUCTIONS>{instructions}</INSTRUCTIONS>
<QUESTION>{question}</QUESTION>
<ANSWER>{answer}</ANSWER>
<LOG>{log}</LOG>
""".strip()

In [None]:
#  helper function for loading JSON log files
def load_log_file(log_file):
    with open(log_file, 'r') as f_in:
        log_data = json.load(f_in)
        log_data['log_file'] = log_file
        return log_data

In [None]:
log_record = load_log_file("./logs/arv_guidelines_agent_v2_20260210_234813_125259.json")

instructions = log_record['system_prompt']
question = log_record['messages'][0]['parts'][0]['content']
answer = log_record['messages'][-1]['parts'][0]['content']
log = json.dumps(log_record['messages'])

user_prompt = user_prompt_format.format(
    instructions=instructions,
    question=question,
    answer=answer,
    log=log
)

In [None]:
result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)

checklist = result.output
print(checklist.summary)

for check in checklist.checklist:
    print(check)

In [None]:
def simplify_log_messages(messages):
    log_simplified = []

    for m in messages:
        parts = []
    
        for original_part in m['parts']:
            part = original_part.copy()
            kind = part['part_kind']
    
            if kind == 'user-prompt':
                del part['timestamp']
            if kind == 'tool-call':
                del part['tool_call_id']
            if kind == 'tool-return':
                del part['tool_call_id']
                del part['metadata']
                del part['timestamp']
                # Replace actual search results with placeholder to save tokens
                part['content'] = 'RETURN_RESULTS_REDACTED'
            if kind == 'text':
                del part['id']
    
            parts.append(part)
    
        message = {
            'kind': m['kind'],
            'parts': parts
        }
    
        log_simplified.append(message)
    return log_simplified

In [None]:
async def evaluate_log_record(eval_agent, log_record):
    messages = log_record['messages']

    instructions = log_record['system_prompt']
    question = messages[0]['parts'][0]['content']
    answer = messages[-1]['parts'][0]['content']

    log_simplified = simplify_log_messages(messages)
    log = json.dumps(log_simplified)

    user_prompt = user_prompt_format.format(
        instructions=instructions,
        question=question,
        answer=answer,
        log=log
    )

    result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)
    return result.output 


log_record = load_log_file('./logs/arv_guidelines_agent_v2_20260210_234618_fcc937.json')
eval1 = await evaluate_log_record(eval_agent, log_record)

## Data Generation and Evaluation

In [None]:
question_generation_prompt = """
You are helping to create test questions for an AI agent that answers questions about Kenya HIV care and ARV guidelines.

Based on the provided ARV guideline content, generate realistic clinical or guideline-related questions that healthcare workers or students might ask.

The questions should:

- Be natural and varied in style
- Range from simple to complex
- Include both specific guideline questions (e.g., regimens, dosing, monitoring, eligibility) and general HIV care questions

Generate one question for each guideline record provided.
""".strip()

from pydantic import BaseModel

class QuestionsList(BaseModel):
    questions: list[str]

arv_question_generator = Agent(
    "openai:gpt-4o-mini",
    name="arv_question_generator",
    system_prompt=question_generation_prompt,
    output_type=QuestionsList,
)


In [None]:
# Sample 10 records
import random

source = guides_chunks if 'guides_chunks' in dir() and len(guides_chunks) > 10 else chunks
sample = random.sample(source, min(10, len(source)))
prompt_docs = [d['section'] for d in sample if d.get('section')]
prompt = json.dumps(prompt_docs)

result = await arv_question_generator.run(prompt)
questions = result.output.questions

In [None]:
# Iterate over each of the question, ask our agent and log the results
from tqdm.auto import tqdm

for q in tqdm(questions):
    print(q)

    result = await agent.run(q)
    print(result.output)

    log_interaction_to_file(
        agent,
        result.new_messages(),
        source='ai-generated'
    )

    print()

In [None]:
eval_set = []

for log_file in LOG_DIR.glob('*.json'):
    if 'arv_guidelines_agent_v2' not in log_file.name:
        continue

    log_record = load_log_file(log_file)
    if log_record['source'] != 'ai-generated':
        continue

    eval_set.append(log_record)

In [None]:
print(len(eval_set))


In [None]:
# Eavluating AI generated logs
eval_results = []

for log_record in tqdm(eval_set):
    eval_result = await evaluate_log_record(eval_agent, log_record)
    eval_results.append((log_record, eval_result))

In [None]:
# Transform the data to later load it into pandas
rows = []

for log_record, eval_result in eval_results:
    messages = log_record['messages']

    row = {
        'file': log_record['log_file'].name,
        'question': messages[0]['parts'][0]['content'],
        'answer': messages[-1]['parts'][0]['content'],
    }

    checks = {c.check_name: c.check_pass for c in eval_result.checklist}
    row.update(checks)

    rows.append(row)

In [None]:
# each row is a key-value dictionary - Create a pandas dataframe 
import pandas as pd

df_evals = pd.DataFrame(rows)
df_evals.head()

In [None]:
df_evals

In [None]:
df_evals_clean.dtypes

In [None]:
# Convert True/False or string "True"/"False" to numeric (1/0)
df_evals_clean = df_evals.replace({"True": 1, "False": 0, True: 1, False: 0})

# Calculate mean only for numeric columns
pass_rates = df_evals_clean.mean(numeric_only=True)

# Convert to percentages for readability
pass_rates = (pass_rates * 100).round(1)

print(pass_rates)

In [None]:
# Only 81% of responses follow instructions completely
# 100% responses avoid forbidden actions 
# 90% responses are relevant and clear
# 77% include proper citations (great)
# 86% of responses are complete
# 81% responses use the search tool




## Evaluating functions and tools

In [None]:
# Precision and Recall: How many relevant results were retrieved vs. how many relevant results were missed
# Hit Rate: Percentage of queries that return at least one relevant result
# MRR (Mean Reciprocal Rank): Reflects the position of the first relevant result in the ranking

In [None]:
def evaluate_search_quality(search_function, test_queries):
    results = []
    
    for query, expected_docs in test_queries:
        search_results = search_function(query, num_results=5)
        
        # Calculate hit rate
        relevant_found = any(doc['filename'] in expected_docs for doc in search_results)
        
        # Calculate MRR
        for i, doc in enumerate(search_results):
            if doc['filename'] in expected_docs:
                mrr = 1 / (i + 1)
                break
        else:
            mrr = 0
            
        results.append({
            'query': query,
            'hit': relevant_found,
            'mrr': mrr
        })
    return results