In [44]:
import os
import re
import fitz
import copy
import pathlib
import pandas as pd
from tqdm import tqdm
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.extensions.langchain import (
    WatsonxLLM,
)
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.schema.embeddings import Embeddings
from typing import List
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# Read a local .env file (python-dotenv) into the process environment before
# the os.getenv("IBM_API_KEY") / os.getenv("PROJECT_ID") lookups made when the
# watsonx model is configured below.
load_dotenv()


class MiniLML6V2EmbeddingFunctionLangchain(Embeddings):
    """LangChain ``Embeddings`` adapter around the ``all-MiniLM-L6-v2`` sentence-transformer."""

    # Instantiated once, when the class body runs, and shared by all instances.
    MODEL = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_documents(self, texts):
        """Encode ``texts`` with the shared model and return the result converted via ``.tolist()``."""
        encoded = MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(texts)
        return encoded.tolist()

    def embed_query(self, query):
        """Encode ``query`` as a one-element batch and return the first (only) row."""
        encoded = MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode([query])
        rows = encoded.tolist()
        return rows[0]


# class MultilingualMiniLML6V2EmbeddingFunctionLangchain(Embeddings):
#     MODEL = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

#     def embed_documents(self, texts):
#         return MultilingualMiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(texts).tolist()

#     def embed_query(self, query):
#         return MultilingualMiniLML6V2EmbeddingFunctionLangchain.MODEL.encode([query]).tolist()[0]


# Connection details for the watsonx endpoint; the API key comes from the
# environment rather than being written into the notebook.
watsonx_credentials = {
    "apikey": os.getenv("IBM_API_KEY"),
    "url": "https://us-south.ml.cloud.ibm.com",
}

# Text-generation settings sent with every request.
# NOTE(review): TEMPERATURE and RANDOM_SEED look like sampling knobs and are
# presumably ignored under "greedy" decoding -- confirm against the watsonx docs.
generation_params = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MAX_NEW_TOKENS: 1024,
    GenParams.TEMPERATURE: 0,
    GenParams.RANDOM_SEED: 12345,
}

model = Model(
    model_id=ModelTypes.MT0_XXL,
    credentials=watsonx_credentials,
    params=generation_params,
    project_id=os.getenv("PROJECT_ID"),
)

# LangChain-compatible wrapper; the rest of the notebook calls `llm(prompt)`.
llm = WatsonxLLM(model=model)

# Smoke test: one round trip to confirm credentials and model are usable.
print(llm("hello how are you?"))

very well


In [45]:
# pdf_file = fitz.open("../data/BUKU GARIS PANDUAN 2022 - 03.11.2023 (edit klasifikasi mekanikal - adm1 (EN).pdf")
# docs = []
# for page_index in tqdm(range(len(pdf_file))):
#     page = pdf_file[page_index]
#     tables = "\n\n\n".join([i.to_pandas().to_html() for i in page.find_tables(horizontal_strategy="text").tables])# + f"\n\n\n\nHTML TABLES:\n\n\n\n{tables}"
#     docs.append(Document(page_content=page.get_text(), metadata={"filename":page_index}))
# Load the plain-text guideline and cut it into one Document per section.
filename = "../data/garis_panduan (EN).txt"

# Path.read_text() opens, reads and closes the file; the previous bare
# open(filename).read() left the handle to be closed by the garbage collector.
d = pathlib.Path(filename).read_text()

# A section starts at a newline followed by a "<digit>.<digit>" heading and one
# whitespace character (e.g. "\n1.1 "). Raw string: "\d" and "\s" are invalid
# escapes in a normal string literal and trigger a warning on modern Python.
SECTION_BREAK = r"\n\d\.\d\s"  # \.\d

docs = [
    Document(page_content=section, metadata={"filename": filename})
    for section in re.split(SECTION_BREAK, d)
]

In [46]:
embeddings = MiniLML6V2EmbeddingFunctionLangchain()
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
# docs = text_splitter.split_documents(docs)

# Build the index from the split sections, persist it, then reload it from disk
# so `db` is the same artefact that was written to the target directory.
index_dir = "../code-engine/db"
FAISS.from_documents(docs, embeddings).save_local(index_dir)
db = FAISS.load_local(index_dir, embeddings)

In [47]:
# Spot-check one of the sections produced by the regex split above.
docs[2]

Document(page_content='UPKJ Bumiputera Status Application Checklist\n\n{"Table G1. Bumiputera Status Application Checklist":\n  {\n   "Bhd/ Sdn.Bhd Type Company": "1. The latest SSM Corporate Info*\\n2. Bank statement or financial signatory letter/company cheque that has been confirmed by the Bank\\n(with the following information):*\\ na. Name of company/ cooperative/\\ncorporation with registration no.\\n. Bank name, account number and date\\account opened\\nc. Owner\'s name and holding\\nshares\\nd. Authorized nominee\\nchallenging the cheque\\ne. Signatory condition\\n3. EPF (If applicable)**\\n4. Birth certificate of owner/shareholder/director*\\n5. Photograph\\n. outside the office building (front\\ndoor),\\nb. inside the office,\\nc . outside the building (from a distance-\\nseeing the office area).",\n   "Enterprise Type Company (Individual/Partnership)": "1. Extract Of Business Name\\n2. Bank statement or financial signatory letter/company cheque that has been confirmed by the

In [48]:
# Sanity check that a "\n<digit>.<digit> " heading is matched. Raw string so
# "\d" / "\s" reach the regex engine without invalid-escape warnings.
# Note the "." here is unescaped (matches any character), unlike the "\."
# used in the pattern that splits the document.
re.search(r"\n\d.\d\s", "\n1.1 ")

<re.Match object; span=(0, 5), match='\n1.1 '>

In [49]:
# Prompt used by chat() for a regular question. The {{context}} and
# {{question}} markers are filled in with str.replace(), so the double braces
# are literal placeholders, not str.format fields.
QUESTION_TEMPLATE = """
Context information is below.
---------------------
{{context}}
---------------------

Guidelines for responding:
---------------------
Given ONLY the context information and not prior knowledge respond to the query.

Avoid statements like 'Based on the context, ...' or 'According to the provided context ...' or anything along those lines.

If you don't know the answer to a query, say "I do not know".

Respond to the query in a brief and concise manner, and avoid talking about yourself. However, ensure that the information provided is complete.

In case the inquirer requests information that varies based on types/categories, comprehensively group the answer by the types/categories and return the answer for each and every type separately.
---------------------

Query: {{question}}
 
Response: 

"""


# Prompt used by chat() when the latest user message contains "tell me more".
# chat() substitutes {{question}} with an earlier user question (not the
# follow-up text itself) and {{context}} with the retrieved passage, both via
# str.replace().
TELL_ME_MORE_TEMPLATE = """
Context information is below.
---------------------
{{context}}
---------------------

Query: {{question}} Let us think step by step.

Given ONLY the context information and not prior knowledge respond to the query in a more detailed manner.
Do not repeat the previous answer.
Avoid statements like 'Based on the context, ...' or 'According to the provided context ...' or anything along those lines.
If you don't know the answer to a query, say "I do not know".
Remember: Respond to the query in a more detailed manner, and avoid talking about yourself. Avoid paraphrasing as much as possible.

Response: 

"""

# Number of documents chat() retrieves for a regular question (passed as `k`
# to db.similarity_search).
K_DOCS = 5

def build_prompt(messages):
    """Flatten a chat history into a single prompt string.

    ``messages`` is a list of dicts; each contributes its ``"a"`` value when
    that key is present and its ``"u"`` value otherwise. The default system
    prompt is prepended to the first entry, every entry is stripped and padded
    with one space on each side, and the concatenation is wrapped in newlines.

    Returns ``None`` when ``messages`` is empty.
    """
    if not messages:
        return None

    system_prompt = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Always think step by step."

    # Wrapper tokens for the system block, the even-indexed turns and the
    # odd-indexed turns. Only the system-block suffix is non-empty here.
    sys_open, sys_close = "", "\n\n\n"
    inst_open, inst_close = "", ""
    bos, eos = "", ""

    # Building a fresh list of strings leaves the caller's dicts untouched.
    turns = [entry["a"] if "a" in entry else entry["u"] for entry in messages]
    turns[0] = sys_open + system_prompt + sys_close + turns[0]

    rendered = [
        f"{bos}{inst_open} {text.strip()} {inst_close}"
        if position % 2 == 0
        else f" {text.strip()} {eos}"
        for position, text in enumerate(turns)
    ]
    return "\n" + "".join(rendered) + "\n"


def chat(messages):
    """Answer the latest user message with retrieval-augmented generation.

    Args:
        messages: Conversation history as a list of dicts, user turns stored
            under ``"u"`` and assistant turns under ``"a"``. The last entry
            must be the user turn to answer.

    Returns:
        ``(messages, context, prompt)``: the same list object with the
        assistant reply appended as ``{"a": ...}``, the retrieved context
        text, and the exact prompt sent to the LLM.

    A message containing "tell me more" (any letter case) is treated as a
    follow-up: the most recent earlier user question is re-asked with
    ``TELL_ME_MORE_TEMPLATE`` and a single retrieved document. If there is no
    earlier user question, the message is handled as a regular question
    instead of raising ``IndexError``.

    The caller's history is only modified by the final ``append``. If
    retrieval or the LLM call raises, ``messages`` is left exactly as passed
    in (previously the last user turn could be left holding the full prompt
    template).
    """
    question = messages[-1]["u"]

    # Most recent user turn before the current one (None on the first turn).
    previous_question = next(
        (turn["u"] for turn in reversed(messages[:-1]) if "u" in turn),
        None,
    )

    is_follow_up = "tell me more" in question.lower()
    if is_follow_up and previous_question is not None:
        search_results = db.similarity_search(previous_question, k=1)
        context = " ".join(x.page_content for x in search_results)
        templated = TELL_ME_MORE_TEMPLATE.replace(
            "{{context}}", context
        ).replace("{{question}}", previous_question)
    else:
        search_results = db.similarity_search(question, k=K_DOCS)
        context = "\n\n".join(
            f"Context Document {e+1}:\n" + x.page_content
            for e, x in enumerate(search_results)
        )
        templated = QUESTION_TEMPLATE.replace("{{context}}", context).replace(
            "{{question}}", question
        )

    # Build the prompt from a copy of the history whose last turn carries the
    # templated text, so the caller's dicts are never overwritten.
    prompt_messages = messages[:-1] + [{**messages[-1], "u": templated}]
    prompt = build_prompt(prompt_messages)

    # Normalise bullets and drop code fences from the raw completion.
    assistant = llm(prompt).strip().replace("•", "*").replace("```", "")
    # source = ""
    # if "I do not know" not in assistant:
    #     source = "\n\nSource:\n- " + "\n- ".join(dict.fromkeys([str(x.metadata["filename"]) for x in search_results]))
    messages.append({"a": f"{assistant}"})  # {source}
    return messages, context, prompt


In [50]:
# Evaluation set: the "English" sheet of the test-question workbook.
questions_path = "../data/test_questions.xlsx"
df = pd.read_excel(questions_path, sheet_name="English")
df.head()

Unnamed: 0,Questions,Answers,Source,OpenAI
0,"If I only have 30% share in my company, can I ...",The requirements for UPKJ Bumiputera status ar...,"Page 150, 1.1.1, 1.1.2",
1,What is UPKJ Bumiputera Status Requirement?,The requirements for UPKJ Bumiputera status ar...,"Page 150, 1.1.1, 1.1.2",The requirements for UPKJ Bumiputera status ar...
2,Can we change our Bumiputera interview Date?,"Yes, you can change your Bumiputera interview ...","Page 151, 1.1.3","Yes, you can change your Bumiputera interview ..."
3,If yes how do we change our Bumiputera Intervi...,New date change requests can be made through t...,"Page 151, 1.1.3",To change the date of your Bumiputera Intervie...
4,I have fail my bumiputera status and the syste...,"If your Bumiputera status application fails, y...","Page 151, 1.1.3","If your Bumiputera status application fails, y..."


In [53]:
# Ad-hoc probe: leave exactly one `q` uncommented to try a single question.
#q = "What is UPKJ Bumiputera Status Requirement?"
q = "What are the document needed if I wish to apply for bumi putera status?" #xxx
#q = "What is the ratio of bumi putera requirement for directorship?"
#q = "What is Bumiputera Sarawak?"
#q = "I have fail my bumiputera status and the system does not allow me to apply again?"
#q = "Can my company apply for bumiputera status if the there’s 3 shareholder. The majority shareholder is bumi and the remaining is non bumi."
# One-turn conversation; chat() returns the history (with the reply appended),
# the retrieved context and the exact prompt sent to the LLM.
messages, context, prompt = chat([{"u": q}])
a = messages[-1]["a"]  # the assistant reply chat() appended
print(a)

# Dump the full prompt to see which context documents were retrieved.
print(prompt)

Bank statement or financial signatory letter/company cheque that has been confirmed by the Bank

 You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Always think step by step.



Context information is below.
---------------------
Context Document 1:
Change Application For Companies/ Firms/ Cooperatives/ Corporations That Have UPKJ Bumiputera Status
Bumiputera Status will be revoked if the company/firm/cooperative/corporation no longer complies with the mandatory conditions of Bumiputera Status (Refer to 1.1.1 and 1.1.2).
A Bumiputera Status re-interview is required if there is a major shareholder/equity change in the company/firm/ cooperative/corporation.

Context Document 2:
SECTION G:
BUMIPUTERA STATUS APPLICATION GUIDE
1 UPKJ Bumiputera Status Recognition
UPKJ's Bumiputera status is a privilege given to companies/firms/cooperatives/corporations
if they have met the conditions that have been set and NOT RIGHTS according to t

In [10]:
# Run every test question through chat() as a fresh one-turn conversation and
# collect the generated answers next to the reference data.
generated = []
for question in df["Questions"]:
    messages, context, prompt = chat([{"u": question}])
    a = messages[-1]["a"]
    generated.append(a)
    # Reference answers are in the "Answers" column if a side-by-side print is wanted.
    print(f"Q: {question}")
    print(f"G: {a}")
    print()

# Adds a column to `df` in place before writing the results workbook.
df["Generated"] = generated
df.to_excel("../data/test_questions_generated-nicole.xlsx", index=False)

Q: If I only have 30% share in my company, can I 
apply for bumiputera status? 
G: I do not know.

However, I can provide some information related to the query. According to the context, the requirement for Bumiputera status is that the majority or at least 51% of the shares/equity must be held by Bumiputera Sarawak, and the main shareholder must be Bumiputera Sarawak. Additionally, the majority or at least 51% of the members of the board of directors and management members must consist of Bumiputera Sarawak.

Therefore, if you only have 30% share in your company, you may not meet the requirements for Bumiputera status. However, I suggest you consult the relevant authorities or a professional consultant to determine your eligibility and provide guidance on the application process.

Q: What is UPKJ Bumiputera Status Requirement?
G: The UPKJ Bumiputera Status requirement is a set of conditions that companies/firms/cooperatives/corporations must comply with to be recognized as Bumiputera 

In [224]:
# Spot-check another split section, including its metadata.
docs[1]

Document(page_content="If Shares in Company Are Owned By Individuals:\na. The majority or at least 51% of the shares/equity is held by\nBumiputera Sarawak and the main shareholder must be Bumiputera Sarawak.\nb. The majority or at least 51% of the members of the board of directors are\nconsisting of Bumiputera Sarawak\nc. The majority or at least 51% of the management members are composed\nfrom Bumiputera Sarawak\nd. The majority or at least 51% of the employees consist of\nBumiputera Sarawak.\ne. The majority or at least 51% of financial management is controlled by\nBumiputera Sarawak (signatory of the company's check in the Bank Report or\nCompany Check Signatory Declaration Letter or Director's Resolution)\n", metadata={'filename': '../data/garis_panduan (EN).txt'})

In [468]:
# Hand-written prompt wrapped in explicit <s>[INST] <<SYS>> ... <</SYS>> ... [/INST]
# chat tags, with the context documents pasted in by hand. It is sent straight
# to the LLM (bypassing chat() and build_prompt()) to see how the answer changes.
test_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Always think step by step.
<</SYS>>


Context information is below.
---------------------
Context Document 1:
Change Application For Companies/ Firms/ Cooperatives/ Corporations That Have UPKJ Bumiputera Status
Bumiputera Status will be revoked if the company/firm/cooperative/corporation no longer complies with the mandatory conditions of Bumiputera Status (Refer to 1.1.1 and 1.1.2).
A Bumiputera Status re-interview is required if there is a major shareholder/equity change in the company/firm/ cooperative/corporation.

Context Document 2:
Requirements for UPKJ Bumiputera Status Recognition
Companies/firms/cooperatives/corporations must comply with the Status registration requirements
Bumiputera UPKJ as follows:

1.1.1 If Shares in Company Are Owned By Individuals:
a. The majority or at least 51% of the shares/equity is held by
Bumiputera Sarawak and the main shareholder must be Bumiputera Sarawak.
b. The majority or at least 51% of the members of the board of directors are
consisting of Bumiputera Sarawak
c. The majority or at least 51% of the management members are composed
from Bumiputera Sarawak
d. The majority or at least 51% of the employees consist of
Bumiputera Sarawak.
e. The majority or at least 51% of financial management is controlled by
Bumiputera Sarawak (signatory of the company's check in the Bank Report or
Company Check Signatory Declaration Letter or Director's Resolution)

1.1.2 If Shares in Company Are Owned By Corporations/Nominees
a. The majority or at least 51% of the largest equity/share is held
by companies/ cooperatives/ corporations owned by Bumiputera Sarawak.
b. The majority or at least 51% of the members of the board of directors are
consisting of Bumiputera Sarawak
c. The majority or at least 51% of the management members are composed
from Bumiputera Sarawak
d. The majority or at least 51% of the employees consist of
Bumiputera Sarawak.
e. The majority or at least 51% of financial management is controlled by
Bumiputera Sarawak (signatory of the company's check in the Bank Report or
Company Check Signatory Declaration Letter or Director's Resolution)

Guidelines for responding:
---------------------
Given ONLY the context information and not prior knowledge respond to the query.

Avoid statements like 'Based on the context, ...' or 'According to the provided context ...' or anything along those lines.

If you don't know the answer to a query, say "I do not know".

Respond to the query in a brief and concise manner, and avoid talking about yourself. However, ensure that the information provided is complete.

In case the inquirer requests information that varies based on types/categories, comprehensively group the answer by the types/categories and return the answer for each and every type separately.
---------------------

Query: If I am a Bumiputera Sarawak and I have 60% share in my company, can I apply for bumiputera status? Let us think step by step.
 
Response: [/INST]

"""

# Send the literal prompt as-is; no retrieval or templating happens here.
x = llm(test_prompt)
print(x)

Thank you for your question. To answer your query, we need to refer to the requirements for UPKJ Bumiputera Status Recognition.

According to Context Document 2, 1.1.1, if the shares in a company are owned by individuals, the majority or at least 51% of the shares/equity must be held by Bumiputera Sarawak, and the main shareholder must be Bumiputera Sarawak.

In your case, you have 60% share in your company, which meets the requirement of having the majority of shares held by a Bumiputera Sarawak.

However, we also need to consider the other requirements for Bumiputera Status recognition, such as the composition of the board of directors, management members, employees, and financial management.

Therefore, to determine if you can apply for Bumiputera Status, we need to evaluate your company's compliance with all the requirements mentioned in Context Document 2.

If your company meets all the requirements, you can proceed with the application for Bumiputera Status. However, if your comp

In [12]:
# Check whether the hand-written prompt equals the generated `prompt` wrapped
# in an extra pair of newlines. `prompt` is whatever the most recently executed
# chat() call returned, so the result depends on cell execution order.
test_prompt  == "\n" + prompt+ "\n"

False

In [13]:
# Display the repr of the generated prompt so newlines show up as "\n".
prompt

'\n<s>[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Always think step by step.\n<</SYS>>\n\n\nContext information is below.\n---------------------\nApplication Conditions for UPKJ Bumiputera Status Recognition\nCompanies/firms/cooperatives/corporations must comply with the Status registration requirements\nBumiputera UPKJ as follows:\n\n1.1.1 If Share Ownership Is Individual:\na. The majority or at least 51% of the shares/equity is held by\nBumiputera Sarawak and the main shareholder must be Bumiputera Sarawak.\nb. The majority or at least 51% of the members of the board of directors are\nconsisting of Bumiputera Sarawak\nc. The majority or at least 51% of the management members are composed\nfrom Bumiputera Sarawak\nd. The majority or at least 51% of the employees consist of\nBumiputera Sarawak.\ne

In [14]:
# Display the repr of the hand-written prompt for a side-by-side comparison.
test_prompt

'\n<s>[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Always think step by step.\n<</SYS>>\n\n\nContext information is below.\n---------------------\nIf Share Ownership Is Individual:\na. The majority or at least 51% of the shares/equity is held by\nBumiputera Sarawak and the main shareholder must be Bumiputera Sarawak.\nb. The majority or at least 51% of the members of the board of directors are\nconsisting of Bumiputera Sarawak\nc. The majority or at least 51% of the management members are composed\nfrom Bumiputera Sarawak\nd. The majority or at least 51% of the employees consist of\nBumiputera Sarawak.\ne. The majority or at least 51% of financial management is controlled by\nBumiputera Sarawak (signatory of the company\'s check in the Bank Report or\nCompany Check Signatory Declaration Letter or D

In [159]:
# Inspect the raw retrieval hits and their scores for one test question.
# NOTE(review): the score is presumably a distance (lower = closer) for this
# FAISS index -- confirm against the LangChain FAISS documentation.
db.similarity_search_with_score("What are the document needed if I wish to apply for bumi putera status?")

[(Document(page_content="SECTION G:\nBUMIPUTERA STATUS APPLICATION GUIDE\n1 UPKJ Bumiputera Status Recognition\nUPKJ's Bumiputera status is a privilege given to companies/firms/cooperatives/corporations\nif they have met the conditions that have been set and NOT RIGHTS according to the individual's birth status\nas Bumiputera.\n\n1.1 Application Conditions for UPKJ Bumiputera Status Recognition\nCompanies/firms/cooperatives/corporations must comply with the Status registration requirements\nBumiputera UPKJ as follows:\n", metadata={'filename': '../data/garis_panduan (EN).txt'}),
  0.85951906),
 (Document(page_content='Must Pass Bumiputera Status Interview/ Pass Premises Visit\na. Applicants must pass the bumiputera status interview/premises visit.\nb. For the interview, the applicant needs to attend the interview on the date that has been set\nset. Rescheduling the interview date after confirmation (if unable\nattend the interview) is only allowed once before the period of three (3)\nd