In [52]:
import os
from glob import glob
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFLoader, UnstructuredExcelLoader, JSONLoader 


In [6]:
# Name of the chat model used for answer generation.
LLM_MODEL = "gpt-4-turbo"

# Load variables from a local .env file before the OpenAI clients are built
# (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()

# Chat model that answers questions, and the embedding model used to index documents.
LLM = ChatOpenAI(model=LLM_MODEL)
EMBEDDINGS = OpenAIEmbeddings()

In [78]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy selected fields of a JSON record into a document's metadata.

    Used as the ``metadata_func`` callback of ``JSONLoader``.

    Args:
        record: The parsed JSON record; read with ``.get`` so missing keys
            yield ``None`` instead of raising.
        metadata: The metadata dict to enrich. It is mutated in place.

    Returns:
        The same ``metadata`` object, now carrying ``meeting_subject`` and
        ``entry_date``.
    """
    # metadata key -> key to read from the record
    field_map = {
        "meeting_subject": "tf_MeetingMinuteSubject",
        "entry_date": "entry_date",
    }
    for target_key, source_key in field_map.items():
        metadata[target_key] = record.get(source_key)
    return metadata

In [82]:
def load_files(root_folders):
    """Load every PDF, Excel and JSON file found under the given folders.

    Each folder is searched recursively. Files are grouped by type
    (PDF, then XLSX, then JSON) per folder and loaded in sorted path order,
    so the returned list is reproducible between runs.

    Args:
        root_folders: An iterable of folder paths, or a single folder path
            given as a string / path-like object.

    Returns:
        A flat list of the documents produced by the loaders. Empty when no
        matching file is found.
    """
    # A bare string would be iterated character by character, which globs
    # from "/" and the current directory instead of the intended folder.
    if isinstance(root_folders, (str, os.PathLike)):
        root_folders = [root_folders]

    documents = []
    for root_folder in root_folders:
        for ext in ('*.pdf', '*.xlsx', '*.json'):
            pattern = os.path.join(root_folder, '**', ext)
            # glob() returns paths in arbitrary filesystem order; sort them so
            # document order (and any index built from it) is deterministic.
            for file in sorted(glob(pattern, recursive=True)):
                # Skip the reference manifest. Only the file name is checked,
                # so a directory name can never trigger the skip.
                if "file_references.json" in os.path.basename(file):
                    continue

                if "pdf" in ext:
                    loader = PyPDFLoader(file)
                elif "xlsx" in ext:
                    loader = UnstructuredExcelLoader(file)
                else:
                    # jq_schema="." selects the whole JSON value as one record;
                    # text_content=False lets the loader accept non-string content.
                    loader = JSONLoader(file, jq_schema=".", text_content=False, metadata_func=metadata_func)
                documents.extend(loader.load())

    return documents
    

In [83]:
# Chunking parameters for the optional text splitter below. They are unused
# while chunking is disabled and each loaded document is indexed whole.
chunk_size = 1000
chunk_overlap = 200

root_folders = [os.path.join("..", "data", "test", "management", "1")]

docs = load_files(root_folders)

# Fail early with a readable message: a wrong path yields an empty list, and
# building a FAISS index from no documents fails with an opaque error.
assert docs, f"No documents were loaded from {root_folders}"

# Optional chunking step (disabled). To enable it, uncomment the lines below:
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=chunk_size, chunk_overlap=chunk_overlap
# )
# docs = text_splitter.split_documents(docs)
vector = FAISS.from_documents(docs, EMBEDDINGS)

In [61]:
# NOTE(review): `document_pages` is not assigned in any visible cell, so this
# cell presumably relies on stale kernel state and would raise NameError after
# Restart & Run All — confirm, then define it or inspect `docs` instead.
document_pages

[Document(page_content='{"ta_PropertyNameAndDetails": "The Leaf Condominium", "tf_MeetingMinuteSubject": "MC Meeting No.10", "dt_DateandTimeOfMeetingMinute": "2024-05-17T12:00:00+08:00", "tf_JointManagementCommitteeMember": "Lee Sian Lee", "tf_InitialName": "LSL", "tf_CommitteeUnitNo": "T-04-04", "sel_MemberDesignation": "chairman", "tf_MeetingAttendance": "1/1", "tf_JointManagementCommitteeMember1": "Tan Kim Seng", "tf_InitialName1": "TKS", "tf_CommitteeUnitNo1": "T-05-05", "sel_MemberDesignation1": "secretary", "tf_MeetingAttendance1": "1/1", "tf_JointManagementCommitteeMember2": "Wong Wei Kang", "tf_InitialName2": "WWK", "tf_CommitteeUnitNo2": "T-06-06", "sel_MemberDesignation2": "treasurer", "tf_MeetingAttendance2": "1/1", "tf_JointManagementCommitteeMember3": "Teow Wei Yoong", "tf_InitialName3": "TWY", "tf_CommitteeUnitNo3": "T-08-08", "sel_MemberDesignation3": "committee", "tf_MeetingAttendance3": "1/1", "tf_JointManagementCommitteeMember4": "Lai Choon Fai", "tf_InitialName4": "L

In [84]:
# Preview only the first few loaded documents; displaying the whole list
# floods the output and bloats the saved notebook.
docs[:3]

[Document(page_content='{"ta_PropertyNameAndDetails": "The Leaf Condominium", "tf_MeetingMinuteSubject": "MC Meeting No.10", "dt_DateandTimeOfMeetingMinute": "2024-05-17T12:00:00+08:00", "tf_JointManagementCommitteeMember": "Lee Sian Lee", "tf_InitialName": "LSL", "tf_CommitteeUnitNo": "T-04-04", "sel_MemberDesignation": "chairman", "tf_MeetingAttendance": "1/1", "tf_JointManagementCommitteeMember1": "Tan Kim Seng", "tf_InitialName1": "TKS", "tf_CommitteeUnitNo1": "T-05-05", "sel_MemberDesignation1": "secretary", "tf_MeetingAttendance1": "1/1", "tf_JointManagementCommitteeMember2": "Wong Wei Kang", "tf_InitialName2": "WWK", "tf_CommitteeUnitNo2": "T-06-06", "sel_MemberDesignation2": "treasurer", "tf_MeetingAttendance2": "1/1", "tf_JointManagementCommitteeMember3": "Teow Wei Yoong", "tf_InitialName3": "TWY", "tf_CommitteeUnitNo3": "T-08-08", "sel_MemberDesignation3": "committee", "tf_MeetingAttendance3": "1/1", "tf_JointManagementCommitteeMember4": "Lai Choon Fai", "tf_InitialName4": "L

In [85]:
# Sanity-check retrieval: returns (document, relevance score) pairs for the query.
# k=100 asks for up to 100 matches.
vector.similarity_search_with_relevance_scores(
    "How many LEAF meetings are there?",
    k=100,
)

[(Document(page_content='{"ta_PropertyNameAndDetails": "The Leaf Condominium", "tf_MeetingMinuteSubject": "MC Meeting No.10", "dt_DateandTimeOfMeetingMinute": "2024-05-17T12:00:00+08:00", "tf_JointManagementCommitteeMember": "Lee Sian Lee", "tf_InitialName": "LSL", "tf_CommitteeUnitNo": "T-04-04", "sel_MemberDesignation": "chairman", "tf_MeetingAttendance": "1/1", "tf_JointManagementCommitteeMember1": "Tan Kim Seng", "tf_InitialName1": "TKS", "tf_CommitteeUnitNo1": "T-05-05", "sel_MemberDesignation1": "secretary", "tf_MeetingAttendance1": "1/1", "tf_JointManagementCommitteeMember2": "Wong Wei Kang", "tf_InitialName2": "WWK", "tf_CommitteeUnitNo2": "T-06-06", "sel_MemberDesignation2": "treasurer", "tf_MeetingAttendance2": "1/1", "tf_JointManagementCommitteeMember3": "Teow Wei Yoong", "tf_InitialName3": "TWY", "tf_CommitteeUnitNo3": "T-08-08", "sel_MemberDesignation3": "committee", "tf_MeetingAttendance3": "1/1", "tf_JointManagementCommitteeMember4": "Lai Choon Fai", "tf_InitialName4": "

In [73]:
# Prompt for the QA chain. `{context}` and `{question}` are the two template
# placeholders; the leading indentation inside the triple-quoted string is part
# of the template and is sent to the model as-is.
QA_PROMPT = PromptTemplate.from_template(
    """
    Acts as an intelligent LEAF chatbot.
    Do not give vague answers and do not give general knowledge answers and do not make new false information.
    Only give a specific answer related to the context of LEAF knowledge base and be concise.
    
    

    {context}

    Question: {question}
    Helpful Answer:
    """
)

In [86]:
# Retrieval-QA chain: fetch documents from the FAISS index and place them in a
# single prompt ("stuff" chain) built from QA_PROMPT.
qa_chain = RetrievalQA.from_chain_type(
    llm=LLM,
    chain_type="stuff",
    retriever=vector.as_retriever(),
    # Callers pass the query under the "question" key.
    input_key="question",
    # verbose=True prints the fully formatted prompt on every call.
    chain_type_kwargs={"prompt": QA_PROMPT, "verbose": True},
)

In [87]:
# Ask a sample question; the input dict carries the query under the "question" key.
qa_chain.invoke({"question": "Who are the attendees of first LEAF meeting?"})



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    Acts as an intelligent LEAF chatbot.
    Do not give vague answers and do not give general knowledge answers and do not make new false information.
    Only give a specific answer related to the context of LEAF knowledge base and be concise.
    
    

    {"ta_PropertyNameAndDetails": "The Leaf Condominium", "tf_MeetingMinuteSubject": "MC Meeting No.10", "dt_DateandTimeOfMeetingMinute": "2024-05-17T12:00:00+08:00", "tf_JointManagementCommitteeMember": "Lee Sian Lee", "tf_InitialName": "LSL", "tf_CommitteeUnitNo": "T-04-04", "sel_MemberDesignation": "chairman", "tf_MeetingAttendance": "1/1", "tf_JointManagementCommitteeMember1": "Tan Kim Seng", "tf_InitialName1": "TKS", "tf_CommitteeUnitNo1": "T-05-05", "sel_MemberDesignation1": "secretary", "tf_MeetingAttendance1": "1/1", "tf_JointManagementCommitteeMember2": "Wong Wei Kang", "tf_InitialName2": "W

{'question': 'Who are the attendees of first LEAF meeting?',
 'result': 'The attendees of the first LEAF meeting are:\n\n1. Lee Sian Lee (Chairman, Unit T-04-04)\n2. Tan Kim Seng (Secretary, Unit T-05-05)\n3. Wong Wei Kang (Treasurer, Unit T-06-06)\n4. Teow Wei Yoong (Committee Member, Unit T-08-08)\n5. Lai Choon Fai (Developer Representative, Unit T-11-11)\n6. Chong Ken Yung (Developer Representative, Unit T-12-12)\n7. Chia Wei Ping (Senior Property Manager)\n8. Chan Rong Bang (Building Manager)\n9. Yap Sin Yee (Account Manager)\n10. Thomas Yeap (Building Executive)\n11. Jacky Ling (Account Executive)'}