In [1]:
import json
import os
os.chdir("../../")

import chromadb

import autogen
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Accepted file formats that can be stored in
# a vector database instance
from autogen.retrieve_utils import TEXT_FORMATS

# Alternative configuration, currently disabled:
# config_list = [
#     {"model": "gpt-3.5-turbo-0125", "api_type": "openai"},
# ]

# Model endpoints passed to the assistant agent's `llm_config`.
config_list = [{"model": "gpt-4o", "api_type": "openai"}]

# Fail fast when no model has been configured, then report the selection.
assert config_list
print("models to use: ", [entry["model"] for entry in config_list])

models to use:  ['gpt-4o']


In [2]:
# List the file extensions accepted for `docs_path`, header first and the
# list itself on the following line.
print("Accepted file formats for `docs_path`:", TEXT_FORMATS, sep="\n")

Accepted file formats for `docs_path`:
['html', 'odt', 'tsv', 'md', 'ppt', 'xml', 'epub', 'pptx', 'msg', 'docx', 'pdf', 'rst', 'json', 'jsonl', 'xlsx', 'org', 'yaml', 'doc', 'log', 'rtf', 'yml', 'txt', 'csv', 'htm']


In [3]:
# Force the first config entry to use gpt-4o. With the `config_list` defined in
# the first cell this is a no-op (its model is already "gpt-4o"); it only has an
# effect if that list is switched to another model.
config_list[0]["model"] = "gpt-4o"  # mutates the shared config_list in place

In [4]:
# 1. Create a RetrieveAssistantAgent instance named "assistant".
#    Its LLM settings carry a `timeout` of 600, a fixed `cache_seed` of 42 and
#    the model entries from `config_list`.
assistant = RetrieveAssistantAgent(
    name="assistant",
    system_message="You are a helpful assistant.",
    llm_config=dict(timeout=600, cache_seed=42, config_list=config_list),
)


In [7]:
# 2. Create the RetrieveUserProxyAgent instance named "ragproxyagent".
# It uses a new collection for the Legal Contracts dataset.
#
# Entries of `retrieve_config` used here:
# `task` indicates the kind of task we're working on. In this example, it's a `qa` task.
# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file.
#   Here it is a single .docx file (`docx` is one of the accepted TEXT_FORMATS).
# `chunk_token_size` is the chunk token size for the retrieve chat. By default, it is set to `max_tokens * 0.6`, here we set it to 500.
# `client` is a persistent chromadb client created with the storage path /tmp/chromadb.
# `collection_name` is the name of the collection used for this document.
# `chunk_mode` is set to "multi_lines".
# `embedding_model` is the name of the embedding model.
#   NOTE(review): the run log reports "No content embedding is provided. Will use the VectorDB's
#   embedding function" — confirm that this model name is actually the one used for embeddings.

ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    retrieve_config={
        "task": "qa",
        "docs_path": "data/content/Robinson Advisory.docx",
        "chunk_token_size": 500,
        "client": chromadb.PersistentClient(path="/tmp/chromadb"),
        "collection_name": "legal-contracts-113",
        "chunk_mode": "multi_lines",
        "embedding_model": "bert-base-uncased-contracts",
    },
)

In [8]:
# Question to be answered from the indexed document.
qa_problem = "Who are the parties to the Agreement and what are their defined names?"

# Start the retrieve chat with the assistant. The proxy agent's
# `message_generator` builds the opening message from `problem`, and
# `n_results=5` is forwarded with it (the run below retrieves five doc ids).
chat_result = ragproxyagent.initiate_chat(
    assistant,
    message=ragproxyagent.message_generator,
    problem=qa_problem,
    n_results=5,
)

Trying to create collection.


max_tokens is too small to fit a single line of text. Breaking this line:
	- 2- ...
Failed to split docs with must_break_at_empty_line being True, set to False.
2024-07-09 21:55:14,650 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 8 chunks.[0m
2024-07-09 21:55:14,660 - autogen.agentchat.contrib.vectordb.chromadb - INFO - No content embedding is provided. Will use the VectorDB's embedding function to generate the content embedding.[0m


VectorDB returns doc_ids:  [['bf289eef', '5a52d5dd', 'c58851ad', 'e2f78511', '2580a094']]
[32mAdding content of doc bf289eef to context.[0m
[32mAdding content of doc 5a52d5dd to context.[0m
[32mAdding content of doc c58851ad to context.[0m
[32mAdding content of doc e2f78511 to context.[0m
[32mAdding content of doc 2580a094 to context.[0m
[33mragproxyagent[0m (to assistant):

You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the
context provided by the user.
If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.
You must give as short an answer as possible.

User's question is: Who are the parties to the Agreement and what are their defined names?

Context is: Notices: Notices under this Agreement shall be delivered to the party’s email address as follows: Company: info@cloudcorp.com, Advisor: jackrobinson@gmail.com, or in any the other means with a proof of acceptance by

In [9]:
# Dump the whole ChatResult as plain text. This includes the complete
# `chat_history` (prompt plus retrieved context), so the output is long.
print(chat_result)

ChatResult(chat_id=None, chat_history=[{'content': 'You\'re a retrieve augmented chatbot. You answer user\'s questions based on your own knowledge and the\ncontext provided by the user.\nIf you can\'t answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\nYou must give as short an answer as possible.\n\nUser\'s question is: Who are the parties to the Agreement and what are their defined names?\n\nContext is: Notices: Notices under this Agreement shall be delivered to the party’s email address as follows: Company: info@cloudcorp.com, Advisor: jackrobinson@gmail.com, or in any the other means with a proof of acceptance by the other party.\nIN WITNESS WHEREOF the parties have executed this Agreement as of the date first above written.\nCloud Investments Ltd.\t\t\t\tAdvisor \nBy: ________________________\t\tBy:________________________\nName:\tSilvan Joseph\t\t\t\tName:\tJack Robinson\t\t\nTitle: CEO\t\t\t\t\t\n\nConfidentiality, None Compete an