In [1]:
import os
from glob import glob
from tqdm import tqdm
from langchain import hub
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import AgentExecutor, create_react_agent
from langchain_community.document_loaders import PDFMinerLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate

In [2]:
# Environment variables are read from the project-level .env file first
# (presumably the OpenAI API key — confirm), so they are available before the
# chat model client is constructed below.
load_dotenv("../.env")

# --- Model identifiers and the shared chat model --------------------------
GPT_4 = "gpt-4-1106-preview"
GPT_3 = "gpt-3.5-turbo-0125"
LLM = ChatOpenAI(model=GPT_4, temperature=0)

# --- Source document locations (one directory per access level) -----------
MANAGEMENT_DIR = "../data/management"
USER_DIR = "../data/user"

# --- Text chunking parameters ----------------------------------------------
CHUNK_SIZE = 400
CHUNK_OVERLAP = 0

# --- Session / agent state ---------------------------------------------------
USER_ROLE = "management"
SESSION_ID = "test"
TOOLS = []
STORE = {}

In [3]:
# Early draft of the system prompt. NOTE: the next cell rebinds `system_prompt`
# to the ReAct-format template, so this value is never used by the agent.
system_prompt = """
You are a LEAF chatbot agent, you are required to provide answer to the question based on the embedded knowledge. Provide a short, concise and precise answers. Do not give a general answer or trying to fabricate an answer when you don't know the answer or when the question is out of this company context.
"""

In [4]:
# ReAct-format prompt for the agent. It follows the same
# Question/Thought/Action/Observation layout as the "hwchase17/react" hub
# prompt, with a company-specific preamble and final-answer instruction.
# Placeholders: {tools}, {tool_names}, {input}, {agent_scratchpad}.
# This assignment replaces the `system_prompt` defined in the previous cell.
system_prompt = """You are Leaf's company chatbot. Answer the following questions with facts, use the right tool based on the user access level. Do not give a general answer, be specific to the question in context of this company:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question without addressing the user role

Begin!

Question: {input}
Thought:{agent_scratchpad}"""

# Wrap the template; the declared input variables match the four placeholders
# used in the template text above.
prompt = PromptTemplate(
    input_variables=["agent_scratchpad", "input", "tool_names", "tools"],
    template=system_prompt,
)

In [5]:
# Pull the reference ReAct prompt from the LangChain hub (presumably a network
# call). It is only displayed for inspection; the agent is built from the
# custom `prompt` instead.
hub_prompt = hub.pull("hwchase17/react")

In [6]:
# Show the hub prompt so its template can be read alongside the custom one.
hub_prompt

PromptTemplate(input_variables=['agent_scratchpad', 'input', 'tool_names', 'tools'], metadata={'lc_hub_owner': 'hwchase17', 'lc_hub_repo': 'react', 'lc_hub_commit_hash': 'd15fe3c426f1c4b3f37c9198853e4a86e20c425ca7f4752ec0c9b0e97ca7ea4d'}, template='Answer the following questions as best you can. You have access to the following tools:\n\n{tools}\n\nUse the following format:\n\nQuestion: the input question you must answer\nThought: you should always think about what to do\nAction: the action to take, should be one of [{tool_names}]\nAction Input: the input to the action\nObservation: the result of the action\n... (this Thought/Action/Action Input/Observation can repeat N times)\nThought: I now know the final answer\nFinal Answer: the final answer to the original input question\n\nBegin!\n\nQuestion: {input}\nThought:{agent_scratchpad}')

In [7]:
# Sample question sent to the agent (and used by the commented-out retriever
# checks further down).
query = "steps to set up a home alarm system?"

In [8]:
def get_filepaths(dirs, ext):
    """Collect every file ending in ``ext`` under each directory in ``dirs``.

    Each directory is searched recursively (sub-directories included). A tqdm
    progress bar is shown per directory while its matches are appended.

    Args:
        dirs: Iterable of directory paths to search.
        ext: File-name suffix to match, e.g. ``".pdf"``.

    Returns:
        list[str]: Matching paths, grouped by directory in the order given.
    """
    collected = []
    for directory in dirs:
        matches = glob(os.path.join(directory, "**", f"*{ext}"), recursive=True)
        progress = tqdm(matches, desc=f"Loading {ext} files for {directory}")
        # Iterating the tqdm wrapper is what advances the progress bar.
        collected += progress
    return collected

In [9]:
def load_files(user):
    """Load and chunk every PDF the given role is allowed to read.

    Args:
        user: Role name. ``"management"`` reads both ``MANAGEMENT_DIR`` and
            ``USER_DIR``; any other value reads ``USER_DIR`` only.

    Returns:
        list: The chunks returned by the splitter's ``split_documents`` for
        each PDF, concatenated in file order.
    """
    if user == "management":
        source_dirs = [MANAGEMENT_DIR, USER_DIR]
    else:
        source_dirs = [USER_DIR]

    # Built once (it does not depend on the file) and driven by the notebook's
    # CHUNK_SIZE / CHUNK_OVERLAP settings instead of duplicated literals.
    # NOTE(review): "\n\n" is the only separator, so a paragraph longer than
    # CHUNK_SIZE presumably stays as a single oversized chunk — confirm.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, separators=["\n\n"]
    )

    texts = []
    for file in get_filepaths(source_dirs, ".pdf"):
        pages = PDFMinerLoader(file).load()
        texts.extend(text_splitter.split_documents(pages))

    return texts

In [10]:
# Chunked documents per access level. The "management" set is built from both
# the management and the user directories; the "user" set from the user
# directory only (see load_files), so the user PDFs are parsed twice here.
admin_documents = load_files("management")
user_documents = load_files("user")

Loading .pdf files for ../data/management: 100%|██████████| 1/1 [00:00<?, ?it/s]
Loading .pdf files for ../data/user: 100%|██████████| 6/6 [00:00<?, ?it/s]


Loading .pdf files for ../data/user: 100%|██████████| 6/6 [00:00<?, ?it/s]


In [11]:
# for doc in user_documents:
#     try:
#         print(doc.metadata, "\n", doc.page_content[:100], "\n", "====================")
#     except:
#         print("fail", doc)

In [12]:
# Retrieval settings shared by both access levels.
RETRIEVER_TOP_K = 2  # documents returned by each underlying retriever
SIMILARITY_SCORE_THRESHOLD = 0.3  # minimum relevance score for FAISS hits


def _build_retrievers(
    documents,
    embeddings,
    k=RETRIEVER_TOP_K,
    score_threshold=SIMILARITY_SCORE_THRESHOLD,
):
    """Build the FAISS index, BM25 retriever and their ensemble for one corpus.

    Args:
        documents: Chunked documents to index.
        embeddings: Embedding model used to build the FAISS index.
        k: Number of results requested from each underlying retriever.
        score_threshold: Score threshold passed to the FAISS retriever.

    Returns:
        tuple: ``(faiss_index, bm25_retriever, ensemble_retriever)``.
    """
    faiss_index = FAISS.from_documents(documents, embeddings)
    # `k` is forwarded to the retriever by from_documents, so no separate
    # `.k = ...` assignment is needed afterwards.
    bm25_retriever = BM25Retriever.from_documents(documents=documents, k=k)
    ensemble_retriever = EnsembleRetriever(
        retrievers=[
            bm25_retriever,
            faiss_index.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"score_threshold": score_threshold, "k": k},
            ),
        ],
        # Equal weighting, stated explicitly so both ensembles are configured
        # identically (equal weights are also the default when omitted).
        weights=[0.5, 0.5],
    )
    return faiss_index, bm25_retriever, ensemble_retriever


# One embedding client is shared by both indexes.
embeddings = OpenAIEmbeddings()

admin_faiss_index, admin_bm25_retriever, admin_ensemble_retriever = _build_retrievers(
    admin_documents, embeddings
)
user_faiss_index, user_bm25_retriever, user_ensemble_retriever = _build_retrievers(
    user_documents, embeddings
)

In [13]:
# admin_faiss_index.similarity_search_with_score(query, k=4, score_threshold=0.4)

In [14]:
# admin_ensemble_retriever.invoke(query)

In [15]:
# user_ensemble_retriever.invoke(query)

In [16]:
# retriever = admin_faiss_index.as_retriever(
#     search_type="similarity_score_threshold",
#     search_kwargs={
#         "score_threshold": .3,
#         "k": 2}
# )
# retriever.invoke(query)

In [17]:
# Wrap each ensemble retriever as an agent tool. The name and description are
# what the model sees when choosing an action.
admin_ensemble_retriever_tool = create_retriever_tool(
    retriever=admin_ensemble_retriever,
    name="admin_retriever_tool",
    description="useful for retrieving the admin vector store when querying for result based on context",
)

user_ensemble_retriever_tool = create_retriever_tool(
    retriever=user_ensemble_retriever,
    name="user_retriever_tool",
    description="useful for retrieving the user vector store when querying for result based on context",
)

# Enforce access control in code rather than relying on the model to follow
# the prompt: the role only reaches the model as free text in the question, so
# a non-management session must not be handed the admin tool at all.
# The role check mirrors the one in load_files.
if USER_ROLE == "management":
    tools = [admin_ensemble_retriever_tool, user_ensemble_retriever_tool]
else:
    tools = [user_ensemble_retriever_tool]

In [18]:
# Build the ReAct agent from the shared chat model, the `tools` list and the
# custom ReAct-format `prompt` (not the hub prompt pulled earlier).
agent = create_react_agent(llm=LLM, tools=tools, prompt=prompt)

In [19]:
# Executor that runs the agent's Thought/Action/Observation loop.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    # Print the chain trace ("Entering new AgentExecutor chain...") while running.
    verbose=True,
    # Expose the (action, observation) pairs as response["intermediate_steps"].
    return_intermediate_steps=True,
    # Presumably lets the run continue when the model's output cannot be
    # parsed into an action, instead of raising — confirm against the docs.
    handle_parsing_errors=True,
)

In [47]:
# Run the agent on the sample query. The user role is conveyed only as free
# text prepended to the question.
response = agent_executor.invoke({"input": f"as a {USER_ROLE}, {query}"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To provide steps for setting up a home alarm system, I need to access general information that would be relevant to management but not specific to an individual user or admin.
Action: user_retriever_tool
Action Input: steps to set up a home alarm system for management[0m[33;1m[1;3m

As our gesture of goodwill and a token of appreciation for your support, the Developer is pleased to contribute on 
your  behalf,  a  ONE  (1)  month  value  of  Charges  and  Contribution  to  the  Sinking  Fund  for  the  maintenance  and 
management of the estate for period from 4 September 2021 to 3 October 2021.  Charges and contribution to the 
sinking fund shall be due and payable by you with effect from 4 October 2021 onwards.  We shall subsidize partial of 
the Charges imposed by the Management during a period of One year from the date of 1st billing (Charges). 

1.  Visitor Management 
2.  Online Payments 
3.  Community Calen

In [72]:
import json


def get_intermediate_steps(steps):
    """Serialise an agent's intermediate steps to an indented JSON string.

    Args:
        steps: Iterable of ``(action, output)`` pairs, e.g.
            ``response["intermediate_steps"]``. Each ``action`` must support
            ``vars()``; its attributes become the step's fields.

    Returns:
        str: JSON object keyed by step index (as strings, since JSON keys are
        strings). Each value holds the action's attributes plus an
        ``"output"`` entry with the observation. ``"{}"`` for no steps.
    """
    intermediate_steps = {
        idx: {**vars(action), "output": output}
        for idx, (action, output) in enumerate(steps)
    }

    # default=str: fall back to the string form for values json cannot encode
    # (e.g. message or document objects) instead of raising TypeError.
    return json.dumps(intermediate_steps, indent=4, default=str)

In [67]:
# Inspect the second intermediate step (index 1). This assumes the agent took
# at least two steps; with fewer, the indexing raises IndexError.
tool, output = response["intermediate_steps"][1]
# NOTE: this bare expression is not the cell's last line, so it displays nothing.
tool
# `tool` is the step's AgentAction (see the "type" field in the output);
# vars() shows its fields as a dict.
vars(tool)

{'tool': 'admin_retriever_tool',
 'tool_input': 'steps to set up a home alarm system for management',
 'log': 'The observation does not provide the steps for setting up a home alarm system. It seems to be a mix of community management guidelines and building regulations, which are not relevant to the question asked. I need to try a different approach to find the specific steps for setting up a home alarm system for management.\n\nAction: admin_retriever_tool\nAction Input: steps to set up a home alarm system for management',
 'type': 'AgentAction'}

In [74]:
# Serialise every intermediate step of the run to JSON and print it.
# NOTE: `steps` here is the JSON string, not the list of steps.
steps = get_intermediate_steps(response["intermediate_steps"])
print(steps)

{
    "0": {
        "tool": "user_retriever_tool",
        "tool_input": "steps to set up a home alarm system for management",
        "log": "Thought: To provide steps for setting up a home alarm system, I need to access general information that would be relevant to management but not specific to an individual user or admin.\nAction: user_retriever_tool\nAction Input: steps to set up a home alarm system for management",
        "type": "AgentAction",
        "output": "\n\nAs our gesture of goodwill and a token of appreciation for your support, the Developer is pleased to contribute on \nyour  behalf,  a  ONE  (1)  month  value  of  Charges  and  Contribution  to  the  Sinking  Fund  for  the  maintenance  and \nmanagement of the estate for period from 4 September 2021 to 3 October 2021.  Charges and contribution to the \nsinking fund shall be due and payable by you with effect from 4 October 2021 onwards.  We shall subsidize partial of \nthe Charges imposed by the Management during 