In [8]:
import json
import os
# Move the working directory two levels up; presumably this is the project root
# that the relative `docs_path` used later is resolved against — TODO confirm.
# NOTE(review): the target is relative to the current directory, so re-running
# this cell without restarting the kernel moves up two more levels each time.
os.chdir("../../")

import chromadb

import autogen
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Accepted file formats that can be stored in
# a vector database instance
from autogen.retrieve_utils import TEXT_FORMATS

# LLM configurations handed to the agents below. No API key is listed here;
# presumably it is picked up from the environment — TODO confirm.
config_list = [
    dict(model="gpt-3.5-turbo-0125", api_type="openai"),
]

# Fail fast if no configuration is available, then show which models were selected.
assert config_list
print("models to use: ", [cfg["model"] for cfg in config_list])

models to use:  ['gpt-3.5-turbo-0125']


In [2]:
# Show the file extensions listed in TEXT_FORMATS, i.e. what `docs_path` accepts.
print("Accepted file formats for `docs_path`:", TEXT_FORMATS, sep="\n")

Accepted file formats for `docs_path`:
['yml', 'docx', 'pptx', 'xml', 'epub', 'org', 'html', 'json', 'txt', 'doc', 'log', 'odt', 'rst', 'tsv', 'rtf', 'xlsx', 'md', 'msg', 'ppt', 'csv', 'pdf', 'yaml', 'jsonl', 'htm']


In [3]:
# Switch the first configuration's model to gpt-4o. This mutates `config_list`
# in place, so the model name printed by the earlier cell no longer reflects it.
config_list[0].update(model="gpt-4o")

In [4]:
# 1. Create a RetrieveAssistantAgent instance named "assistant".
# Its LLM settings reuse `config_list`, with a fixed cache seed (42) and a
# timeout of 600 (presumably seconds — TODO confirm).
assistant = RetrieveAssistantAgent(
    name="assistant",
    system_message="You are a helpful assistant.",
    llm_config=dict(timeout=600, cache_seed=42, config_list=config_list),
)


In [11]:
# 2. Create the RetrieveUserProxyAgent instance named "ragproxyagent", backed by a
# collection for the Legal Contracts dataset.
# `task` indicates the kind of task we're working on. In this example, it's a `qa` task.
# `docs_path` is the path to the docs directory. It can also be the path to a single file,
# or the url to a single file. Here it points at a single .docx contract.
# `chunk_token_size` is the chunk token size for the retrieve chat. By default, it is set to
# `max_tokens * 0.6`; here we set it to 1000.
# `custom_text_types` is not passed, so its default (`autogen.retrieve_utils.TEXT_FORMATS`)
# applies. It only affects files under directories in `docs_path`; explicitly included files
# and urls are chunked regardless of their types.
# `client` is a persistent Chroma client stored under /tmp/chromadb, so the
# "legal-contracts" collection outlives the kernel.
# `get_or_create` is set to True so that re-running the notebook reuses that existing
# collection. NOTE(review): without it, a second run against the same store is expected to
# fail because the collection already exists — confirm against the installed autogen version.
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    retrieve_config={
        "task": "qa",
        "docs_path": "data/content/Robinson Advisory.docx",
        "chunk_token_size": 1000,
        "model": config_list[0]["model"],
        "client": chromadb.PersistentClient(path="/tmp/chromadb"),
        "collection_name": "legal-contracts",
        "get_or_create": True,
        "chunk_mode": "one_line",
        "embedding_model": "all-mpnet-base-v2",
    },
)

In [12]:
# Question to be answered from the retrieved contract text.
qa_problem = "Who are the parties to the Agreement and what are their defined names?"

# Start the retrieve-augmented chat with the assistant. The opening message is
# produced by `ragproxyagent.message_generator` from `problem`, and up to
# `n_results` documents are requested from the vector store as context.
chat_result = ragproxyagent.initiate_chat(
    assistant,
    message=ragproxyagent.message_generator,
    problem=qa_problem,
    n_results=10,
)

Trying to create collection.


2024-07-08 10:38:19,391 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 30 chunks.[0m
2024-07-08 10:38:19,396 - autogen.agentchat.contrib.vectordb.chromadb - INFO - No content embedding is provided. Will use the VectorDB's embedding function to generate the content embedding.[0m


VectorDB returns doc_ids:  [['4b9d97a9', '44b89898', '1f113781', 'ac2b31c9', '7c943210', 'fea83f88', 'e4bfe81f', '1d97d312', '8fa7aef2', 'ca48849c']]
[32mAdding content of doc 4b9d97a9 to context.[0m
[32mAdding content of doc 44b89898 to context.[0m
[32mAdding content of doc 1f113781 to context.[0m
[32mAdding content of doc ac2b31c9 to context.[0m
[32mAdding content of doc 7c943210 to context.[0m
[32mAdding content of doc fea83f88 to context.[0m
[32mAdding content of doc e4bfe81f to context.[0m
[32mAdding content of doc 1d97d312 to context.[0m
[32mAdding content of doc 8fa7aef2 to context.[0m
[32mAdding content of doc ca48849c to context.[0m
[33mragproxyagent[0m (to assistant):

You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the
context provided by the user.
If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.
You must give as short an answer as possible.


In [13]:
# Dump the whole ChatResult. NOTE(review): this prints the full chat history,
# including the retrieved document text that was sent as context, so the output
# is very long and may expose document contents — review before sharing.
print(chat_result)

ChatResult(chat_id=None, chat_history=[{'content': 'You\'re a retrieve augmented chatbot. You answer user\'s questions based on your own knowledge and the\ncontext provided by the user.\nIf you can\'t answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\nYou must give as short an answer as possible.\n\nUser\'s question is: Who are the parties to the Agreement and what are their defined names?\n\nContext is: Name:\tSilvan Joseph\t\t\t\tName:\tJack Robinson\t\t\nTitle: CEO\t\t\t\t\t\n\nConfidentiality, None Compete and IP Ownership Undertaking\nAppendix A to Advisory Service Agreement as of June 15th, 2023\nTHIS CONFIDENTIALITY UNDERTAKING (“Undertaking”) is entered into as of June 15th, 2023 (“Effective Date”), by Mr. Jack Robinson, Passport Number 780055578, residing at 1 Rabin st, Tel Aviv, Israel, Email: jackrobinson@gmail.com, (“Advisor”), towards Cloud Investments Ltd (“Company”), as follows:\nDefinitions: (a) Company’s Business: devel