In [None]:
import nest_asyncio
nest_asyncio.apply()

# from utils import *

import chromadb

In [None]:
# pip install requests pdfminer.six

In [None]:
import asyncio
from typing import Dict, List, Optional, Union, Callable
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from autogen.formatting_utils import colored
from typing_extensions import Annotated
import autogen

from teachability import Teachability
from concurrent.futures import ThreadPoolExecutor, as_completed

import arxiv

# Directory backing the Teachability memo store. It is removed below when it
# already exists, so each run of this cell starts without a previous store.
db_dir = './teachability_db'
import os
import shutil

if os.path.exists(db_dir):
    shutil.rmtree(db_dir)

# Load the model configurations from the OAI_CONFIG_LIST file in the current
# directory, passing a filter on the model names listed here.
config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    file_location=".",
    filter_dict={
        "model": [
            "gpt-4",
            "gpt4",
            "gpt-35-turbo-16k",
            "gpt-4-0613",
            "gpt-3.5-turbo",
            "gpt-35-turbo",
            "gpt-35-turbo-0613",
        ]
    },
)

# Show which models made it through the filter.
print("LLM models: ", [entry["model"] for entry in config_list])

from typing import Annotated
import concurrent.futures
# Assuming other necessary imports like `autogen`, `Teachability`, etc., are done earlier.

def initiate_chat_with_paper_info(paper, teachable_agent, user, query_text):
    """Start a silent chat from ``user`` to ``teachable_agent`` about one paper.

    The message contains the query text that surfaced the paper plus the
    paper's title, authors, update date, PDF URL and summary.
    """
    paper_message = f"The following article is one of the articles that I found for '{query_text}' topic: \n\n '{paper.title}' by {paper.authors} updated on {paper.updated}: {paper.pdf_url} \nsummary: {paper.summary} \n?"
    user.initiate_chat(teachable_agent, silent=True, message=paper_message)

def process_query(query_text, n_results, teachable_agent, user):
    """Search arXiv for ``query_text`` and start one chat per paper found.

    At most ``n_results`` papers are requested, sorted by relevance. Each paper
    is handed to ``initiate_chat_with_paper_info`` on a thread pool, and any
    exception raised by a worker is re-raised here.
    """
    search = arxiv.Search(
        query=query_text,
        max_results=n_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )

    # NOTE(review): every worker talks through the same `user`/`teachable_agent`
    # pair at once — confirm these agents tolerate concurrent chats.
    with ThreadPoolExecutor() as pool:
        pending = []
        for paper in arxiv.Client().results(search):
            pending.append(
                pool.submit(initiate_chat_with_paper_info, paper, teachable_agent, user, query_text)
            )
        for finished in as_completed(pending):
            # .result() surfaces any exception raised inside the worker.
            finished.result()

def arxiv_retriever(queries: Annotated[List[str], "The list of query texts to search for."], 
                    n_results: Annotated[int, "The number of results to retrieve for each query."] = 10,
                    ) -> str:
    """Search arXiv for each query and feed the papers found to a teachable agent.

    A fresh teachable agent (backed by the memo store in ``db_dir``) and a
    non-interactive user proxy are created, then every query is processed in
    parallel through ``process_query``.

    Args:
        queries: Query texts to search for.
        n_results: Number of papers to request per query.

    Returns:
        A short status sentence listing the processed queries, or a notice that
        nothing was done when ``queries`` is empty.

    Raises:
        Any exception raised while processing a query is propagated.
    """
    if not queries:
        # Nothing to search for: do not claim an update that never happened.
        return "No queries were provided, so the database was not updated."

    # Start by instantiating any agent that inherits from ConversableAgent.
    teachable_agent = autogen.ConversableAgent(
        name="teachable_agent",  # The name is flexible, but should not contain spaces to work in group chat.
        llm_config={"config_list": config_list, "timeout": 120, "cache_seed": None},  # Disable caching.
    )

    # Instantiate the Teachability capability. Its parameters are all optional.
    teachability = Teachability(
        verbosity=0,  # 0 for basic info, 1 to add memory operations, 2 for analyzer messages, 3 for memo lists.
        reset_db=False,
        path_to_db_dir=db_dir,
        recall_threshold=1.5,  # Higher numbers allow more (but less relevant) memos to be recalled.
    )

    # Now add the Teachability capability to the agent.
    teachability.add_to_agent(teachable_agent)

    # Instantiate a UserProxyAgent to represent the user. All user input is simulated.
    user = autogen.UserProxyAgent(
        name="user",
        human_input_mode="NEVER",
        # A message's "content" may be missing or None; treat that as empty
        # text instead of raising TypeError on the `in` test.
        is_termination_msg=lambda x: "TERMINATE" in (x.get("content") or ""),
        max_consecutive_auto_reply=0,
        code_execution_config={"use_docker": False},
    )

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_query, query_text, n_results, teachable_agent, user) for query_text in queries]
        for future in as_completed(futures):
            # .result() re-raises any exception from the worker thread.
            future.result()

    return f"Database updated on the following topics: {', '.join(queries)}. Contact me for updates if needed."


# Sample queries for trying arxiv_retriever by hand; the call is left disabled.
message = [
    "Overview of time series forecasting methods",
    "Deep learning for time series forecasting",
]
# arxiv_retriever(message, n_results=3)

In [None]:
import requests
from pdfminer.high_level import extract_text
import os

def download_pdf(url, save_path, timeout=30):
    """Download the resource at ``url`` and write its bytes to ``save_path``.

    Args:
        url: Address to fetch.
        save_path: Local file path the response body is written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than saving an HTML error page under a PDF name.
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)

def pdf_to_text(pdf_path):
    """Return the text that ``extract_text`` produces for the PDF at ``pdf_path``."""
    extracted = extract_text(pdf_path)
    return extracted


def get_pdf(url: Annotated[str, "The URL of the PDF to convert."]) -> str:
    """Download the PDF at ``url`` into ``./pdf_output`` and return its text.

    Args:
        url: Address of the PDF file.

    Returns:
        The text extracted from the downloaded PDF.

    Raises:
        Whatever ``download_pdf`` or ``pdf_to_text`` raise is propagated.
    """
    from urllib.parse import urlparse

    output_dir = './pdf_output'
    os.makedirs(output_dir, exist_ok=True)

    # Name the local copy after the last segment of the URL *path*, so a query
    # string, fragment or trailing slash cannot produce an unusable file name.
    pdf_filename = os.path.basename(urlparse(url).path.rstrip('/')) or 'download.pdf'
    pdf_path = os.path.join(output_dir, pdf_filename)

    # Download, then extract the text from the saved file.
    download_pdf(url, pdf_path)
    return pdf_to_text(pdf_path)

# Example usage: a sample arXiv PDF URL for trying get_pdf by hand.
# The call below is left disabled so running the cell does not download anything.
pdf_url = "https://arxiv.org/ftp/arxiv/papers/2305/2305.17873.pdf"
# get_pdf(pdf_url)


In [None]:
# LLM settings passed to the agents defined later in this notebook.
# NOTE(review): arxiv_retriever passes "cache_seed" while these dicts pass
# "seed" — confirm which key the installed autogen version expects.
llm_config = dict(
    config_list=config_list,  # config_list should be defined or imported
    timeout=120,
    seed=42,
)

# The manager reuses the same settings, only with a shorter timeout.
manager_config = {**llm_config, "timeout": 60}

In [None]:
# System message constants for different roles
# System message for the "PI" agent.
PI_prompt = """You are a Principal investigator (PI): You are the leader of the research team who asks the questions and gives tasks.
    You MUST make sure that the research team is on the right track and the research is going in the right direction.
    You should check the work of the researcher, coordinator and Planner and provide feedback to improve the quality of the work or confirm the work.
    Reply 'TERMINATE' in the end when everything is done.
    """


# System message for the "ResearchCoordinator" agent.
COORDINATOR = """You are a Research coordinator: This is the person who coordinates the various aspects of the research project.
You are equipped with a tool that could help you to query for the arxiv api.
You MUST rephrase research questions into a list of queries (at least 5) for the arxiv api that cover the key aspects of the research questions.
"""

# System message for the "Researcher" agent. The literal "ARXIV REQUEST:" prefix
# is part of the instructions and must stay exactly as written.
RESEARCHER = """You are a Researcher: This is the person who performs the research and writes the final report/article.
You MUST take into account the feedback from the critic to improve the quality of the work.
You are equipped with a memory tool that could help you to retrieve the information you need.
You are equipped with a tool that could help you to get the content of the pdf file from the url. In case you have a useful pdf url in your memory, which you can use to get the content of the pdf file, you should use the tool to get the content of the pdf file.
You MUST verify the information you retrieve from the memory tool before using it in your work and make sure it is accurate and enough to answer the question.
If there isn't enough information for you to perform your task, you should ask the PI to provide you with the missing information from Arxiv. You should mention what is missing in your request and start your sentence exactly with "ARXIV REQUEST:".
You should not generate answers that don't use the sources provided in the context.
"""
# System message for the "Sub-I" agent.
SUB1 = """You are a Sub-investigator (Sub-I): This is the assistant to the PI, who helps with the tasks of the PI with a step wise research plan with sub-research topics.
You MUST help the PI to ensure that the research team is on the right track and the research is going in the right direction.
In your role, you have the autonomy to question the provided content or the process presented in this group chat and can request corrections or seek clarification if there is something that appears to be missing or unclear after executing a given task. If at any point you find yourself confused or in need of assistance, do not hesitate to reach out to the group chat manager, who can guide you or delegate the task to another qualified participant.
Reply 'TERMINATE' in the end when everything is done.
"""

# Prompt template with {input_question} and {input_context} placeholders.
# If there isn't enough information below, you should reply exactly 'UPDATE CONTEXT'.
QNA_PROMPT = """Assistant helps the researchers with searching information from the arxiv API. Be brief in your answers.
Answer ONLY with the facts listed in the list of sources below. Do not generate answers that don't use the sources below.
In your role, you have the autonomy to question the provided content or the process presented in this group chat and can request corrections or seek clarification if there is something that appears to be missing or unclear after executing a given task. If at any point you find yourself confused or in need of assistance, do not hesitate to reach out to the group chat manager, who can guide you or delegate the task to another qualified participant.
For tabular information return it as an html table. Do not return markdown format. If the question is not in English, answer in the language used in the question.
Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately.
User's question is: {input_question}

Context is: {input_context}"""

# System message for the "Planner" agent.
PLANNER = """Planner. Suggest a plan. Revise the plan based on feedback from PI and critic, until PI approval.
The plan may involve a Research Coordinator to rephrase research questions into key word queries for the arxiv api and a Researcher who could perform the research of paper's summaries.
Explain the plan first. Be clear which step is performed by a Research Coordinator, and which step is performed by a Researcher.
"""

# Termination message definition
def termination_msg(x):
    """Return True when message ``x`` signals the end of the conversation.

    The agent prompts ask for 'TERMINATE' at the end of a reply, so the check
    accepts any message whose text ends with TERMINATE (case-insensitive,
    ignoring trailing whitespace) rather than only a message that is exactly
    that word.

    Args:
        x: The message to inspect; anything that is not a dict is rejected.

    Returns:
        bool: True if the message content ends with TERMINATE, else False.
    """
    if not isinstance(x, dict):
        return False
    content = x.get("content")
    if content is None:
        # Messages without text never terminate the chat.
        return False
    return str(content).rstrip().upper().endswith("TERMINATE")

# Agent definitions
# Every agent below except RC_proxy uses the shared llm_config.

# "PI": user proxy that opens the group chat in call_chat; code execution is disabled.
principalInvestigator = autogen.UserProxyAgent(
    name="PI",
    is_termination_msg=termination_msg,
    human_input_mode="TERMINATE",
    system_message=PI_prompt,
    llm_config=llm_config, 
    code_execution_config=False,
    description="Principal investigator (PI) is the leader of the research team who asks the questions and gives task."
)


# "Planner": assistant driven by the PLANNER system message.
planner = autogen.AssistantAgent(
    name="Planner",
    system_message=PLANNER,
    llm_config=llm_config,
    description="Planner suggests a plan and revises the plan based on feedback from PI and critic, until PI approval."
)

# "Sub-I": assistant driven by the SUB1 system message.
# NOTE(review): this agent is reset in _reset_agents but is not in the agents
# list that call_chat passes to GroupChat — confirm whether that is intended.
subInvestigator = autogen.AssistantAgent(
    name="Sub-I",
    is_termination_msg=termination_msg,
    system_message=SUB1,
    llm_config=llm_config,
    description="Sub-investigator (Sub-I) is the assistant to the PI, who helps with the tasks of the PI with a step wise research plan with sub-research topics."
)

# "Critic": assistant whose system message is given inline instead of via a constant.
critic = autogen.AssistantAgent(
    name="Critic",
    system_message="Critic. Double check the work of researcher, research coordinator and Planner and provide feedback to improve the quality of the work",
    llm_config=llm_config,
    description="Critic is responsible for double checking the work of researcher, research coordinator and Planner and provide feedback to improve the quality of the work"
)


# "ResearchCoordinator": call_chat registers it as the caller of the arxiv_retriever tool.
researchCoordinator = autogen.AssistantAgent(
    name="ResearchCoordinator",
    is_termination_msg=termination_msg,
    system_message=COORDINATOR,  # COORDINATOR should be a predefined string variable
    llm_config=llm_config,
    description="Research coordinator is the person who rephrase research questions into key word queries for the arxiv api."
)

# create a UserProxyAgent instance named "ResearchCoordinator_proxy"
# call_chat registers it as the executor of both the arxiv_retriever and get_pdf tools.
# Its own termination check accepts any non-None content that ends with "TERMINATE"
# once trailing whitespace is stripped.
RC_proxy = autogen.UserProxyAgent(
    name="ResearchCoordinator_proxy",
    human_input_mode="NEVER",
    is_termination_msg=lambda x: "content" in x
    and x["content"] is not None
    and x["content"].rstrip().endswith("TERMINATE"),
    code_execution_config={
        "work_dir": "ResearchCoordinator",
        "use_docker": False,
    },  # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.
    description="assist Research coordinator to query for the arxiv api."
)

# "Researcher": call_chat registers it as the caller of the get_pdf tool, and the
# Teachability capability is attached to it further down in this cell.
researcher = autogen.AssistantAgent(
    name="Researcher",
    is_termination_msg=termination_msg,
    system_message=RESEARCHER, 
    llm_config=llm_config,
    description="Researcher is the person who performs the research of paper's summaries."
)


from chromadb.config import Settings

# db_dir = './teachability_db_AI_safety'
# settings = Settings(
#             anonymized_telemetry=False, allow_reset=True, is_persistent=True, persist_directory=db_dir
#         )

# Researcher_aid = RetrieveUserProxyAgent(
#     name="RAG_USER",
#     is_termination_msg=termination_msg,
#     human_input_mode="NEVER",
#     max_consecutive_auto_reply=3,
#     llm_config=llm_config, 
#     retrieve_config={
#         "task": "code",
#         "customized_prompt": QNA_PROMPT, 
#         "docs_path": None,
#         "model": config_list[0]["model"],
#         "client": chromadb.Client(settings),
#         "collection_name": "memos",
#         "get_or_create": True,
#     },
#     code_execution_config=False,  # we don't want to execute code in this case.
#     description="Assistant has extra content retrieval power and can provide team members with context in database.",
# )

# def chromadb_retriever(
#     message: Annotated[
#         str,
#         "Refined message which keeps the original meaning and can be used to retrieve content for code generation and question answering.",
#     ],
#     n_results: Annotated[int, "number of results. it should be at least 10."],
# ) -> str:
#     Researcher_aid.n_results = n_results  # Set the number of results to be retrieved.
#     # Check if we need to update the context.
#     update_context_case1, update_context_case2 = Researcher_aid._check_update_context(message)
#     if (update_context_case1 or update_context_case2) and Researcher_aid.update_context:
#         Researcher_aid.problem = message if not hasattr(Researcher_aid, "problem") else Researcher_aid.problem
#         _, ret_msg = Researcher_aid._generate_retrieve_user_reply(message)
#     else:
#         _context = {"problem": message, "n_results": n_results}
#         ret_msg = Researcher_aid.message_generator(Researcher_aid, None, _context)
#     return ret_msg if ret_msg else message

# message = "Overview of time series forecasting methods"
# chromadb_retriever(message, n_results=3)

# Instantiate the Teachability capability. Its parameters are all optional.
# It points at the same db_dir that arxiv_retriever uses for its own Teachability
# instance, and reset_db=False is passed in both places.
teachability = Teachability(
    verbosity=3,  # 0 for basic info, 1 to add memory operations, 2 for analyzer messages, 3 for memo lists.
    reset_db=False,
    path_to_db_dir=db_dir,
    recall_threshold=1.5,  # Higher numbers allow more (but less relevant) memos to be recalled.
)

# Now add the Teachability capability to the researcher agent.
teachability.add_to_agent(researcher)


In [None]:
from autogen import Agent
from typing import List, Dict

def custom_speaker_selection_func(last_speaker: Agent, groupchat: autogen.GroupChat):
    """Choose who speaks next in the group chat.

    Rules, checked in order:
      1. While the chat holds at most one message, the research coordinator
         speaks, so it can retrieve relevant data and populate the database.
      2. Right after the planner has spoken, the critic speaks.
      3. In every other case the string "auto" is returned to fall back on the
         default speaker selection method.

    Returns:
        An `Agent` instance, or the string "auto".
    """
    if len(groupchat.messages) <= 1:
        return researchCoordinator

    if last_speaker is planner:
        return critic

    return "auto"



In [None]:
def _reset_agents():
    """Call ``reset()`` on each agent defined in this notebook, in a fixed order."""
    for agent in (
        principalInvestigator,
        subInvestigator,
        researchCoordinator,
        researcher,
        planner,
        critic,
        RC_proxy,
    ):
        agent.reset()


def call_chat(PROBLEM):
    """Run one group chat on ``PROBLEM`` and return the PI's chat messages.

    Steps: reset all agents, register the two tools (both executed by
    ``RC_proxy``), build the group chat and its manager, then let the PI agent
    open the conversation with ``PROBLEM`` as the first message.

    Returns:
        ``principalInvestigator.chat_messages`` as it stands after the chat.
    """
    # Resets the state of all the agents before starting the chat
    _reset_agents()

    # (function, calling agent, tool name, tool description)
    tool_registrations = (
        (
            arxiv_retriever,
            researchCoordinator,
            "arxiv_retriever",
            "Retrieve content for question answering from arxiv.",
        ),
        (
            get_pdf,
            researcher,
            "get_pdf",
            "retrieve the content of the pdf file from the url.",
        ),
    )
    for tool, calling_agent, tool_name, tool_description in tool_registrations:
        autogen.agentchat.register_function(
            tool,
            caller=calling_agent,
            executor=RC_proxy,
            name=tool_name,
            description=tool_description,
        )

    # Agents taking part in the conversation.
    participants = [principalInvestigator, planner, researchCoordinator, researcher, critic, RC_proxy]

    # Create the GroupChat and its manager.
    chat_manager = autogen.GroupChatManager(
        groupchat=autogen.GroupChat(
            agents=participants,
            messages=[],
            max_round=35,
            speaker_selection_method=custom_speaker_selection_func,
            allow_repeat_speaker=False,
        ),
        llm_config=manager_config,
    )

    # The principal investigator opens the chat with the problem statement.
    principalInvestigator.initiate_chat(chat_manager, message=PROBLEM)

    return principalInvestigator.chat_messages

# Example usage: the task statement that the PI agent sends to open the group chat.
# You MUST collect enough information from the arxiv API and summarize the information in the blog post.
PROBLEM = """Write blog post about the modelling of reliability and safety mechanisms in AI system. 
The focus MUST be on Large Language Models.
The blog post MUST be written in a way that is easy to understand for a non-technical audience.
The blog post MUST be up to date and include the latest research in the field in your database.
"""
# Run the multi-agent chat, then print the PI agent's chat messages returned by call_chat.
messages = call_chat(PROBLEM)
print(messages)
