In [1]:
import json
import os

import chromadb

import autogen
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Accepted file formats for that can be stored in
# a vector database instance
from autogen.retrieve_utils import TEXT_FORMATS
# When using a single openai endpoint, you can use the following:
config_list = [{"model": "gpt-4", "api_key": os.getenv("OPENAI_API_KEY")}]

In [8]:
# 1. create an RetrieveAssistantAgent instance named "assistant"
assistant = RetrieveAssistantAgent(
    name="assistant",
    system_message="You are a helpful assistant.",
    llm_config={
        "timeout": 600,
        "cache_seed": 42,
        "config_list": config_list,
    },
)

# 2. create the RetrieveUserProxyAgent instance named "ragproxyagent"
# By default, the human_input_mode is "ALWAYS", which means the agent will ask for human input at every step. We set it to "NEVER" here.
# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file. By default,
# it is set to None, which works only if the collection is already created.
# `task` indicates the kind of task we're working on. In this example, it's a `code` task.
# `chunk_token_size` is the chunk token size for the retrieve chat. By default, it is set to `max_tokens * 0.6`, here we set it to 2000.
# `custom_text_types` is a list of file types to be processed. Default is `autogen.retrieve_utils.TEXT_FORMATS`.
# This only applies to files under the directories in `docs_path`. Explicitly included files and urls will be chunked regardless of their types.
# In this example, we set it to ["non-existent-type"] to only process markdown files. Since no "non-existent-type" files are included in the `websit/docs`,
# no files there will be processed. However, the explicitly included urls will still be processed.
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    # human_input_mode="NEVER",
    # max_consecutive_auto_reply=3,
    retrieve_config={
        "task": "qa",
        "docs_path": [
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/BidsAnnotationQuickstart.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/CTaggerGuiTaggingTool.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/DocumentationSummary.md",
            # "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/FileRemodelingQuickstart.md",
            # "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/FileRemodelingTools.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedAndEEGLAB.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedAnnotationQuickstart.md",
            # "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedConditionsAndDesignMatrices.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/llm/docs/source/HedConditionsAndDesignMatricesPart1.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/llm/docs/source/HedConditionsAndDesignMatricesPart2.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedGovernance.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedJavascriptTools.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedMatlabTools.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedOnlineTools.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedPythonTools.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedSchemaDevelopersGuide.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedSchemas.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedSearchGuide.md",
            # "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedSummaryGuide.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedTestDatasets.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HedValidationGuide.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/HowCanYouUseHed.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/IntroductionToHed.md",
            "https://raw.githubusercontent.com/hed-standard/hed-examples/main/docs/source/WhatsNew.md",
        ],
        # "model": config_list[0]["model"],
        "vector_db": "chroma",  # to use the deprecated `client` parameter, set to None and uncomment the line above
        "overwrite": True,  # set to True if you want to overwrite an existing collection
    },
    code_execution_config=False,  # set to False if you don't want to execute the code
)

In [4]:
from autogen.agentchat.contrib.capabilities.text_compressors import LLMLingua
from autogen.agentchat.contrib.capabilities.transforms import TextMessageCompressor
from autogen.agentchat.contrib.capabilities import transform_messages

llm_lingua = LLMLingua()
text_compressor = TextMessageCompressor(text_compressor=llm_lingua)

context_handling = transform_messages.TransformMessages(transforms=[text_compressor])
context_handling.add_to_agent(ragproxyagent)
context_handling.add_to_agent(assistant)

In [11]:
# reset the assistant. Always reset the assistant before starting a new conversation.
assistant.reset()

# given a problem, we use the ragproxyagent to generate a prompt to be sent to the assistant as the initial message.
# the assistant receives the message and generates a response. The response will be sent back to the ragproxyagent for processing.
# The conversation continues until the termination condition is met, in RetrieveChat, the termination condition when no human-in-loop is no code block detected.
# With human-in-loop, the conversation will continue until the user says "exit".

# Wakeman Henson example
# code_problem = "Give example of annotating three face types condition variables using HED. The face types information is detailed below:\n" \
#     "famous_face: A face that should be recognized by the participants.\n", \
#     "unfamiliar_face: A face that should not be recognized by the participants.\n", \
#     "scrambled_face: A scrambled face image generated by taking face 2D FFT."

# Generic example
# code_problem = "Give example of annotating three stimulus condition variables using HED. The stimulus types information is detailed below:\n" \
#     "square_stimulus: A picture of a dark blue square.\n", \
#     "circle_stimulus: A picture of a green circle.\n", \
#     "animal_stimulus: A picture of an animal."

# Generic example with Onset and Offset suggestion
code_problem = "Give example of annotating three stimulus condition variables using HED. The stimulus types information is detailed below:\n" \
    "square_stimulus: A picture of a dark blue square.\n", \
    "circle_stimulus: A picture of a green circle.\n", \
    "animal_stimulus: A picture of an animal."
chat_result = ragproxyagent.initiate_chat(
    assistant, 
    message=ragproxyagent.message_generator, 
    problem=code_problem, 
    search_string="conditions and design matrices"
)

VectorDB returns doc_ids:  [['f0766f55', '85315238', '22a4f1b0', 'c5088cd3']]
[32mAdding content of doc f0766f55 to context.[0m
[33mragproxyagent[0m (to assistant):

You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the
context provided by the user.
If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.
You must give as short an answer as possible.

User's question is: ('Give example of annotating three stimulus condition variables using HED. The stimulus types information is detailed below:\nsquare_stimulus: A picture of a dark blue square.\n', 'circle_stimulus: A picture of a green circle.\n', 'animal_stimulus: A picture of an animal.')

Context is: # HED conditions and design matrices

This tutorial discusses how information from neuroimaging experiments should be
stored and annotated so that the underlying experimental design and experimental conditions
for a dataset can be 

In [34]:
import requests
from io import StringIO
from bs4 import BeautifulSoup

def get_hed_vocab():
    if os.path.exists('HEDLatest_terms'):
        with open('HEDLatest_terms', 'r') as fin:
            return fin.read()
    else:
        # URL of the XML file
        url = "https://raw.githubusercontent.com/hed-standard/hed-schemas/main/standard_schema/hedxml/HEDLatest.xml"
        
        # Send a GET request to the URL
        response = requests.get(url)
        
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the XML content
            xml_content = response.text
            soup = BeautifulSoup(xml_content, "lxml")
        
            # Find all nodes and extract their names
            all_nodes = soup.find_all('node')
            node_names = [node.find('name', recursive=False).string for node in all_nodes]
        
            return node_names
        else:
            print(f"Failed to retrieve data from the URL. Status code: {response.status_code}") 
hed_vocab = ",".join(get_hed_vocab())

  soup = BeautifulSoup(xml_content, "lxml")
