## Initialize the project 

In [1]:

from typing import Dict, List, Optional, Union, Callable, Literal
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from autogen.formatting_utils import colored
from typing_extensions import Annotated
import autogen
from autogen import Agent
from autogen.token_count_utils import count_token, get_max_token_limit
from autogen.agentchat.contrib.capabilities import transform_messages, transforms

from teachability import Teachability
from concurrent.futures import ThreadPoolExecutor, as_completed

import arxiv

import requests


import os
import shutil
import requests
import pickle
import re
from pathlib import Path

import nest_asyncio
nest_asyncio.apply()

import sqlite3
from utils import *

In [2]:
# Print the installed AutoGen version so the captured outputs in this
# notebook can be associated with a specific library release.
print(autogen.__version__)

0.2.29


### Parameters

In [3]:
# --- Project identity -------------------------------------------------------
version = "0.0.1"
ProjectID = "polymer_rep"
initiate_db = False

# --- Run-time knobs ---------------------------------------------------------
# Alternative credentials file: "OAI_CONFIG_LIST"
config_file = "OAI_CONFIG_LIST-sweden-505"
max_round = 10
silent = False
recall_threshold = 0.7

# --- Derived locations ------------------------------------------------------
# The two directory names are kept as plain strings with forward slashes.
Project_dir = Path(".", ProjectID, version)
output_dir = str(Project_dir) + "/pdf_output"
db_dir = str(Project_dir) + "/memo-db/"


def _load_models(model_names):
    """Return the entries of `config_file` whose "model" is in `model_names`."""
    return autogen.config_list_from_json(
        config_file,
        file_location=".",
        filter_dict={"model": model_names},
    )


def _llm_settings(configs, timeout=120):
    """Bundle a config list and a request timeout into an llm_config dict."""
    return {"config_list": configs, "timeout": timeout}


# --- Model config lists -----------------------------------------------------
config_list = _load_models(["gpt-4-32k", "gpt-4"])
print("LLM models: ", [entry["model"] for entry in config_list])

config_list_32 = _load_models(["gpt-4-32k"])
config_list_4v = _load_models(["gpt-4v"])
config_list_35 = _load_models(["gpt-35-turbo-16k", "gpt-35-turbo"])
print("LLM models 32K: ", [entry["model"] for entry in config_list_32])

# --- LLM configurations (no seed is set) ------------------------------------
llm_config = _llm_settings(config_list)
llm_config_32 = _llm_settings(config_list_32)
llm_config_35 = _llm_settings(config_list_35)

# The group-chat manager shares `config_list` but uses a shorter timeout.
manager_config = _llm_settings(config_list, timeout=60)


def termination_msg(x):
    """Return True when `x` is a dict whose "content" contains TERMINATE.

    The match is case-insensitive and the content is stringified first, so a
    missing or non-string "content" value never raises.
    """
    if not isinstance(x, dict):
        return False
    return "TERMINATE" in str(x.get("content", "")).upper()



LLM models:  ['gpt-4', 'gpt-4-32k']
LLM models 32K:  ['gpt-4-32k']


In [4]:
# Title of the paper the section below belongs to.
title = "Exploring the Intricacies of Polymer Representation: Unraveling Complexity"

# Section of the paper that the diagram request refers to.
section = """
**Case Study: Discovery of Thermally Conductive Polymers**

A significant study in polymer informatics is the 'Machine Learning-Assisted Exploration of Thermally Conductive Polymers Based on High-Throughput Molecular Dynamics Simulations' by Ruimin Ma et al. (2021). This research underscores how ML, combined with molecular dynamics simulations, accelerates the design process to identify polymers with high thermal conductivity, which are pivotal for heat transfer applications. By leveraging the expansive PolyInfo database, the research team demonstrated a paradigm shift from labor-intensive experiments to a data-enriched, algorithmic approach (Ma et al., 2021).
"""

# Template with two placeholders, {title} and {section}, written one output
# line per literal so the line breaks are explicit.
prompt = (
    "carefully read the following section of a paper with the title {title}:\n"
    "{section}\n"
    "\n"
    "to enhance the readibility of this section, generate at least a diagram related to the content of the section.\n"
    "if you need more information, please get help from your MEMOS.\n"
)

# Fully rendered request text.
message = prompt.format(section=section, title=title)

In [5]:
from autogen.agentchat.contrib.agent_builder import AgentBuilder

# Memo store attached to the LLM-backed agents below.
# verbosity: 0 for basic info, 1 to add memory operations, 2 for analyzer
# messages, 3 for memo lists.
# recall_threshold: higher numbers allow more (but less relevant) memos to be
# recalled.
# NOTE(review): `initiate_db` from the parameters cell is not consulted here;
# reset_db is hard-coded to False -- confirm that is intended.
teachability = Teachability(
    verbosity=0,
    reset_db=False,
    path_to_db_dir=db_dir,
    recall_threshold=recall_threshold,
    llm_config=llm_config_35,
)

# Proxy for the human user; it is given a code-execution configuration.
user = autogen.UserProxyAgent(
    name="user",
    human_input_mode="TERMINATE",
    is_termination_msg=termination_msg,
    code_execution_config=dict(
        last_n_messages=2,
        work_dir="./image_generation",
        use_docker=False,
        timeout=60,
    ),
)

# Agent that answers the user's questions; the memo store is attached to it.
agent = autogen.ConversableAgent(
    name="agent",
    is_termination_msg=termination_msg,
    system_message=(
        "\n"
        "        you are a AI assistnat that answer user's questions based on the provided MEMOS.\n"
        "        "
    ),
    llm_config=llm_config,
)
teachability.add_to_agent(agent)

# Reviewer agent; the same Teachability instance is attached to it as well.
critics = autogen.AssistantAgent(
    name="Critics",
    system_message=(
        "Please provide feedback on the generated anwser. \n"
        "    You MUST ask follow-up questions to make sure the answer is correct and conperhensive.\n"
        "    if you satisfied with the answer, please type 'TERMINATE'.  \n"
        "    "
    ),
    llm_config=llm_config,
    human_input_mode="NEVER",
    is_termination_msg=termination_msg,
    max_consecutive_auto_reply=1,
)
teachability.add_to_agent(critics)

# Three agents take part in this chat (user, agent, Critics), so it is not a
# 1:1 conversation; speaker selection is left to the "auto" method.
groupchat = autogen.GroupChat(
    agents=[user, agent, critics],
    messages=[],
    speaker_selection_method="auto",
    allow_repeat_speaker=True,
    max_round=max_round,
)

# Manager that runs the group chat; it uses the shorter-timeout manager_config.
manager = autogen.GroupChatManager(
    groupchat=groupchat,
    is_termination_msg=termination_msg,
    llm_config=manager_config,
    code_execution_config=dict(work_dir="coding", use_docker=False),
)


[92m    Location = polymer_rep/0.0.1/memo-db/uid_text_dict.pkl[0m


In [6]:
# Question about the SMILES / BigSMILES notations, sent to the group chat.
q = "what is SMILES and BigSMILES? give me an example."

# The value returned by initiate_chat is kept in `chat_hist`.
chat_hist = user.initiate_chat(manager, message=q, silent=silent)

[33muser[0m (to chat_manager):

what is SMILES and BigSMILES? give me an example.

--------------------------------------------------------------------------------
[32m
Next speaker: Critics
[0m


Add of existing embedding ID: 156
Add of existing embedding ID: 230
Add of existing embedding ID: 283


[33mCritics[0m (to chat_manager):

SMILES, which stands for "Simplified Molecular Input Line Entry System," is a notation that allows a user to describe the structure of a chemical compound using a line of text. It encodes molecular structures into a series of ASCII characters, enabling easy input, storage, and communication of chemical information. The notation provides a systematic way to depict atoms, bonds, branching, and ring structures.

BigSMILES is an extension of SMILES designed to handle polymers and other large molecules that possess repeating units. It introduces additional syntax to represent these structures, particularly the repeat units that are a characteristic of polymer molecules.

An example of a SMILES string for the caffeine molecule is "CN1C=NC2=C1C(=O)N(C(=O)N2C)C". This structure accounts for all the atoms in caffeine and how they are connected.

A BigSMILES string for a polymer like polyethylene, which has a simple repeating unit, would include placeholder m

In [6]:
# Question asking for a table of polymer databases, sent to the group chat.
q = "list all available polymer databases in a table format."

# The value returned by initiate_chat is kept in `chat_hist`.
chat_hist = user.initiate_chat(manager, message=q, silent=silent)

[33muser[0m (to chat_manager):

list all available polymer databases in a table format.

--------------------------------------------------------------------------------
[32m
Next speaker: agent
[0m


Add of existing embedding ID: 156
Add of existing embedding ID: 230
Add of existing embedding ID: 283


[31m
>>>>>>>> USING AUTO REPLY...[0m
[33magent[0m (to chat_manager):

| Database Name                        | URL                                  | Description                                                                                                                                                                                        |
|--------------------------------------|--------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| PoLyInfo                            | polymer.nims.go.jp                   | Supported by National Institute for Materials Science (NIMS), it includes data from academic literature with 367,711 property data points of polymers built from 18,015 monomers.                   |
| Polymer Genome - Khazana             | khazana.gatech.edu                   | Stores computat

In [7]:
# Question asking for a table of polymer models, sent to the group chat.
q = "list all available polymer models in a table format."

# The value returned by initiate_chat is kept in `chat_hist`.
chat_hist = user.initiate_chat(manager, message=q, silent=silent)

[33muser[0m (to chat_manager):

list all available polymer models in a table format.

--------------------------------------------------------------------------------
[32m
Next speaker: agent
[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33magent[0m (to chat_manager):

Creating a comprehensive list of all available polymer models could be quite extensive, as there are many types based on the specific polymer characteristics, properties, and applications. However, I can provide a general list of broad categories of polymer models that are commonly used within the scientific and engineering communities.

Please note that the following table represents a simplified categorization and is far from exhaustive:

| Category            | Types of Polymer Models                         |
|---------------------|-------------------------------------------------|
| Thermoplastic       | - Polyethylene (PE)                             |
|                     | - Polypropylene (PP)               

In [7]:
# Question asking for a comparison table of polymer machine-learning models,
# sent to the group chat.
q = "list all available polymer machine-learning models in a table format and compare them. For each models provide the following information: model name, model type, model size, refrence, and model performance."

# The value returned by initiate_chat is kept in `chat_hist`.
chat_hist = user.initiate_chat(manager, message=q, silent=silent)

[33muser[0m (to chat_manager):

list all available polymer machine-learning models in a table format and compare them. For each models provide the following information: model name, model type, model size, refrence, and model performance.

--------------------------------------------------------------------------------
[32m
Next speaker: agent
[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33magent[0m (to chat_manager):

As an AI language model, I don't have real-time access to external databases and cannot dynamically generate a list of all available polymer machine-learning models. But I can provide a hypothetical example based on the typical format in which such comparisons are made and pull in relevant information that might be used to construct such a comparison from existing resources.

| Model Name | Model Type | Model Size | Reference | Model Performance  |
|------------|------------|------------|-----------|-------------------|
| PolyNet  | Neural Network | Large | [Referen