In [38]:
import os
import json
import pandas as pd
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from sqlalchemy import create_engine
from langchain.agents import create_json_agent
from langchain_community.tools.json.tool import JsonSpec
from langchain_community.agent_toolkits import JsonToolkit

load_dotenv("../.env")

True

In [39]:
TABLE_NAME = os.getenv("EFORM_TABLE_NAME")
DB_NAME = os.getenv("EFORM_DB_NAME")
USER = "kamalleaf"  # os.getenv('EFORM_DB_USER')
PASS = os.getenv("EFORM_DB_PASS")
HOST = os.getenv("EFORM_DB_HOST")
GPT_4 = "gpt-4-1106-preview"
GPT_3 = "gpt-3.5-turbo-0125"
LLM = ChatOpenAI(temperature=0, model=GPT_4)

In [40]:
ENGINE = create_engine(f"mysql+mysqlconnector://{USER}:{PASS}@{HOST}/{DB_NAME}")

In [58]:
sql_query = pd.read_sql_query(
    f"""
        SELECT *
        FROM {TABLE_NAME}
        WHERE entry_date > '2024-05-10' AND deleted=0 
        LIMIT 20
    """,
    ENGINE,
)

In [59]:
json_dicts = {}
for group_id, group_df in sql_query.groupby("id_group"):
    for _, row in group_df.iterrows():
        restricted = row["restricted"]
        form_title = row["form_title"].replace("/", "-")
        entry_date = int(row["entry_date"].timestamp())
        json_dict = json.loads(row.to_json())
        to_update = json.loads(json_dict.pop("data_components", "{}"))
        json_dict.update(to_update)
        json_name = f"{form_title}_{group_id}_{entry_date}"
        json_dicts[json_name] = json_dict

In [60]:
json_spec = JsonSpec(dict_=json_dicts, max_value_length=10000)
json_toolkit = JsonToolkit(spec=json_spec)

In [68]:
agent = create_json_agent(
    llm=LLM,
    toolkit=json_toolkit,
    verbose=True,
    return_intermediate_steps=True,
    handle_parsing_errors=True,
    prefix="""You are an agent designed to interact with JSON.
    Your goal is to return a final answer by interacting with the JSON.
    You have access to the following tools which help you learn more about the JSON you are interacting with.
    Only use the below tools. Only use the information returned by the below tools to construct your final answer.
    Do not make up any information that is not contained in the JSON.
    Your input to the tools should be in the form of `data["key"][0]` where `data` is the JSON blob you are interacting with, and the syntax used is Python. 
    You should only use keys that you know for a fact exist. You must validate that a key exists by seeing it previously when calling `json_spec_list_keys`. 
    If you have not seen a key in one of those responses, you cannot use it.
    You should only add one key at a time to the path. You cannot add multiple keys at once.
    If you encounter a "KeyError", go back to the previous key, look at the available keys, and try again.

    If the question does not seem to be related to the JSON, just return "I don't know" as the answer.
    Always begin your interaction with the `json_spec_list_keys` tool with input "data" to see what keys exist in the JSON.

    Note that sometimes the value at a given path is large. In this case, you will get an error "Value is a large dictionary, should explore its keys directly".
    In this case, you should ALWAYS follow up by using the `json_spec_list_keys` tool to see what keys exist at that path.
    Do not simply refer the user to the JSON or a section of the JSON, as this is not a valid answer. Keep digging until you find the answer and explicitly return it.
    
    Whenever querying for meeting minutes, if the the filename is queried and user doesn't specify any rule, do not return the original filename. Use their subject instead
    """,
)

In [45]:
# agent.invoke("how many meetings are there?")

In [46]:
# agent.invoke("what are the meetings about?")

In [47]:
# agent.invoke("is there any affected area feedback by us")

In [48]:
def pretty_print(json):
    for key, val in json.items():
        print(key)
        print("\t", val)
        print("====================================")

In [49]:
variables = vars(agent)
chain_obj = variables["agent"]
chain_vars = vars(chain_obj)
llm_obj = vars(chain_vars["llm_chain"])
prompt = vars(llm_obj["prompt"])
print(prompt["template"])

Whenever user query for a meetings name, do not return the filename. return the meeting minute subject in the json file instead as it's filename

json_spec_list_keys: 
    Can be used to list all keys at a given path. 
    Before calling this you should be SURE that the path to this exists.
    The input is a text representation of the path to the dict in Python syntax (e.g. data["key1"][0]["key2"]).
    
json_spec_get_value: 
    Can be used to see value in string format at a given path.
    Before calling this you should be SURE that the path to this exists.
    The input is a text representation of the path to the dict in Python syntax (e.g. data["key1"][0]["key2"]).
    

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [json_spec_list_keys, json_spec_get_value]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Acti

In [69]:
agent.invoke("List me all the meeting minutes")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: json_spec_list_keys
Action Input: data[0m
Observation: [36;1m[1;3m['Vendor Company Registration Form_1_1715685420', 'Vendor Staff Registration Form_1_1715685667', 'Vendor Staff Registration Form_1_1715685770', 'Vendor Staff Registration Form_1_1715685871', 'Vendor Staff Registration Form_1_1715685893', 'Vendor Staff Registration Form_1_1715685917', 'Vendor Staff Registration Form_1_1715768604', 'Walk-in Feedback Form_1_1716268987', 'Walk-in Feedback Form_1_1716269388', 'Unit Owner Registration Form_1_1716269587', 'Meeting Minutes_1_1716271407', 'Meeting Minutes_1_1716953961', 'Meeting Minutes_1_1717038347', 'Meeting Minutes_1_1717139773', 'Checklist_360_1715574634', 'testestset_360_1716892988', 'testestset_360_1716893123', 'testestset_360_1716893197', 'testestset_360_1716974592', 'testestset_360_1716975174'][0m
Thought:[32;1m[1;3mThere are keys that seem to be related to meeting minutes. I should get the values 

{'input': 'List me all the meeting minutes',
 'output': 'The subjects of the meeting minutes are:\n1. JMB Meeting\n2. LEAF Meeting\n3. MC Meeting No.10\n4. LEAF Meeting'}

: 

In [67]:
# agent.invoke(
#     "List me all the meeting minutes related to LEAF together with their meeting dates"
# )



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: json_spec_list_keys
Action Input: data[0m
Observation: [36;1m[1;3m['Vendor Company Registration Form_1_1715685420', 'Vendor Staff Registration Form_1_1715685667', 'Vendor Staff Registration Form_1_1715685770', 'Vendor Staff Registration Form_1_1715685871', 'Vendor Staff Registration Form_1_1715685893', 'Vendor Staff Registration Form_1_1715685917', 'Vendor Staff Registration Form_1_1715768604', 'Walk-in Feedback Form_1_1716268987', 'Walk-in Feedback Form_1_1716269388', 'Unit Owner Registration Form_1_1716269587', 'Meeting Minutes_1_1716271407', 'Meeting Minutes_1_1716953961', 'Meeting Minutes_1_1717038347', 'Meeting Minutes_1_1717139773', 'Checklist_360_1715574634', 'testestset_360_1716892988', 'testestset_360_1716893123', 'testestset_360_1716893197', 'testestset_360_1716974592', 'testestset_360_1716975174'][0m
Thought:[32;1m[1;3mThere are several "Meeting Minutes" files listed in the data. I need to check each 

{'input': 'List me all the meeting minutes related to LEAF together with their meeting dates',
 'output': 'There are two meeting minutes related to LEAF:\n1. Subject: LEAF Meeting, Date and Time: 2024-05-24T12:00:00+08:00\n2. Subject: LEAF Meeting, Date and Time: 2024-05-25T12:00:00+08:00'}