In [1]:
import os
import re
import fitz
import copy
import pathlib
import pandas as pd
from tqdm import tqdm
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.extensions.langchain import (
    WatsonxLLM,
)
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.schema.embeddings import Embeddings
from typing import List
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment; the
# watsonx settings below are read with os.getenv (IBM_API_KEY, PROJECT_ID).
load_dotenv()


class MiniLML6V2EmbeddingFunctionLangchain(Embeddings):
    """LangChain ``Embeddings`` adapter backed by the ``all-MiniLM-L6-v2`` sentence-transformer."""

    # Instantiated once, when the class body runs, and shared by all instances.
    MODEL = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode a batch of texts; returns one embedding per input text."""
        vectors = MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(texts)
        return vectors.tolist()

    def embed_query(self, query: str) -> List[float]:
        """Encode a single query by embedding it as a one-element batch."""
        batch = MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode([query])
        return batch.tolist()[0]


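# Multilingual alternative (disabled). Each method must use this class's own
# MODEL; referencing MiniLML6V2EmbeddingFunctionLangchain.MODEL would silently
# embed with the English model instead.
# class MultilingualMiniLML6V2EmbeddingFunctionLangchain(Embeddings):
#     MODEL = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

#     def embed_documents(self, texts):
#         return MultilingualMiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(texts).tolist()

#     def embed_query(self, query):
#         return MultilingualMiniLML6V2EmbeddingFunctionLangchain.MODEL.encode([query]).tolist()[0]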


# Connection settings for watsonx.ai. The API key comes from the environment
# (os.getenv returns None when the variable is missing).
watsonx_credentials = {
    "apikey": os.getenv("IBM_API_KEY"),
    "url": "https://us-south.ml.cloud.ibm.com",
}

# Text-generation parameters: greedy decoding, up to 1024 new tokens.
# NOTE(review): with greedy decoding the temperature and random seed are
# presumably not used by the service - confirm against the watsonx docs.
generation_params = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MAX_NEW_TOKENS: 1024,
    GenParams.TEMPERATURE: 0,
    GenParams.RANDOM_SEED: 12345,
}

model = Model(
    model_id=ModelTypes.LLAMA_2_70B_CHAT,
    credentials=watsonx_credentials,
    params=generation_params,
    project_id=os.getenv("PROJECT_ID"),
)

# LangChain-compatible wrapper; every later cell calls the model through `llm`.
llm = WatsonxLLM(model=model)

# Smoke test: one round trip to confirm the credentials and model id work.
print(llm("hello how are you?"))



Comment: Hello! I'm doing well, thanks for asking. How about you? Is there anything you'd like to chat about or ask? I'm here to help with any questions you might have.


In [2]:
# pdf_file = fitz.open("../data/BUKU GARIS PANDUAN 2022 - 03.11.2023 (edit klasifikasi mekanikal - adm1 (EN).pdf")
# docs = []
# for page_index in tqdm(range(len(pdf_file))):
#     page = pdf_file[page_index]
#     tables = "\n\n\n".join([i.to_pandas().to_html() for i in page.find_tables(horizontal_strategy="text").tables])# + f"\n\n\n\nHTML TABLES:\n\n\n\n{tables}"
#     docs.append(Document(page_content=page.get_text(), metadata={"filename":page_index}))
filename = "../data/garis_panduan (EN).txt"

# Read the whole guideline text; the context manager closes the file handle
# instead of leaving it to the garbage collector.
# NOTE(review): no encoding is passed, so the platform default applies -
# confirm the file's encoding and pass it explicitly if portability matters.
with open(filename) as guideline_file:
    d = guideline_file.read()

# Split into one Document per section: a new chunk starts wherever a line
# begins with a two-level section number followed by whitespace ("\n1.1 ").
# The matched number itself is consumed by the split. The pattern is a raw
# string so "\d", "\." and "\s" reach the regex engine untouched rather than
# being parsed as (invalid) Python string escapes.
# NOTE(review): the original trailing note "#\.\d" presumably refers to
# extending the pattern to three-level numbers (1.1.1) - confirm before use.
docs = [
    Document(page_content=section, metadata={"filename": filename})
    for section in re.split(r"\n\d\.\d\s", d)
]

In [3]:
# Location of the persisted FAISS index (written and then read back below).
INDEX_DIR = "../code-engine/db"

embeddings = MiniLML6V2EmbeddingFunctionLangchain()
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
# docs = text_splitter.split_documents(docs)

# Embed every chunk and build the index, save it, then reload it so that the
# `db` used by the rest of the notebook is the copy that was written to disk.
db = FAISS.from_documents(docs, embeddings)
db.save_local(INDEX_DIR)
db = FAISS.load_local(INDEX_DIR, embeddings)

In [4]:
# Spot-check one chunk produced by the section split to see where it was cut.
docs[2]

Document(page_content='UPKJ Bumiputera Status Application Checklist\n\n{"Table G1. Bumiputera Status Application Checklist":\n  {\n   "Bhd/ Sdn.Bhd Type Company": "1. The latest SSM Corporate Info*\\n2. Bank statement or financial signatory letter/company cheque that has been confirmed by the Bank\\n(with the following information):*\\ na. Name of company/ cooperative/\\ncorporation with registration no.\\n. Bank name, account number and date\\account opened\\nc. Owner\'s name and holding\\nshares\\nd. Authorized nominee\\nchallenging the cheque\\ne. Signatory condition\\n3. EPF (If applicable)**\\n4. Birth certificate of owner/shareholder/director*\\n5. Photograph\\n. outside the office building (front\\ndoor),\\nb. inside the office,\\nc . outside the building (from a distance-\\nseeing the office area).",\n   "Enterprise Type Company (Individual/Partnership)": "1. Extract Of Business Name\\n2. Bank statement or financial signatory letter/company cheque that has been confirmed by the

In [5]:
# Sanity-check the section-splitting pattern on a sample heading. The dot is
# escaped and the pattern is a raw string, so this exercises the same regex
# that re.split applies when building `docs` (an unescaped "." would also
# accept any character between the two digits).
re.search(r"\n\d\.\d\s", "\n1.1 ")

<re.Match object; span=(0, 5), match='\n1.1 '>

In [6]:
# Both prompts share one skeleton; only the first guideline sentence differs.
# Every line is spelled out as its own string so the significant trailing
# whitespace (the " " line and "Response: ") stays visible in the source.
# "{{context}}" and "{{question}}" are literal markers that chat() fills in
# with str.replace.
_RULE = "---------------------"


def _make_template(grounding_rule):
    """Return the prompt skeleton with `grounding_rule` as its first guideline."""
    return "\n".join(
        [
            "",
            "Context information is below.",
            _RULE,
            "{{context}}",
            _RULE,
            "",
            "Guidelines for responding:",
            _RULE,
            grounding_rule,
            "",
            "Avoid statements like 'Based on the context, ...' or 'According to the provided context ...' or 'According to Context Document 1', or anything along those lines.",
            "",
            "If you don't know the answer to a query, say \"I do not know\".",
            "",
            "Respond to the query in a brief and concise manner, and avoid talking about yourself. However, ensure that the information provided is complete. Do not extrapolate or provide suggestions.",
            "",
            "In case the inquirer requests information that varies based on types/categories, comprehensively group the answer by the types/categories and return the answer for each and every type separately.",
            _RULE,
            "",
            "Query: {{question}} Let us think step by step.",
            " ",
            "Response: ",
            "",
            "",
        ]
    )


# Prompt for a regular question.
QUESTION_TEMPLATE = _make_template(
    "Given ONLY the context information and not prior knowledge respond to the query."
)

# Prompt for a "tell me more" follow-up on the previous question.
# NOTE(review): this asks for "a more detailed manner" while the shared
# guidelines still say "brief and concise" - confirm which one is intended.
TELL_ME_MORE_TEMPLATE = _make_template(
    "Given ONLY the context information and not prior knowledge respond to the query in a more detailed manner."
)

# Number of documents retrieved from the vector store for a regular question.
K_DOCS = 5

def build_prompt(messages):
    """Render a conversation as a single Llama-2 chat prompt string.

    Args:
        messages: list of dicts, each holding the turn text under "u" (user)
            or "a" (assistant). Turns are expected to alternate starting with
            the user: entries at even positions are wrapped as instructions
            and entries at odd positions as completed answers, whichever key
            they carry.

    Returns:
        The prompt wrapped in a leading and trailing newline, or None when
        `messages` is empty. The input list is not modified.
    """
    if not messages:
        return None

    inst_open, inst_close = "[INST]", "[/INST]"
    sys_open, sys_close = "<<SYS>>\n", "\n<</SYS>>\n\n"
    bos, eos = "<s>", "</s>"
    system_prompt = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content."

    # Pull out the raw text of each turn; an "a" key wins over "u" if both exist.
    turns = [turn["a"] if "a" in turn else turn["u"] for turn in messages]

    # The system block is carried inside the first instruction.
    turns[0] = f"{sys_open}{system_prompt}{sys_close}{turns[0]}"

    rendered = [
        f"{bos}{inst_open} {text.strip()} {inst_close}"
        if position % 2 == 0
        else f" {text.strip()} {eos}"
        for position, text in enumerate(turns)
    ]
    return "\n" + "".join(rendered) + "\n"


def _format_context(search_results):
    """Render retrieved documents as numbered "Context Document N" sections."""
    return "\n\n".join(
        f"Context Document {number}:\n" + doc.page_content
        for number, doc in enumerate(search_results, start=1)
    )


def chat(messages):
    """Answer the latest user message using retrieved context.

    Args:
        messages: conversation so far, as a list of dicts keyed "u" (user) or
            "a" (assistant). The last entry must be a user message.

    Returns:
        A tuple ``(messages, context, prompt)``: the same list object with the
        assistant's reply appended as ``{"a": ...}``, the context string built
        from the retrieved documents, and the exact prompt sent to the LLM.

    Side effects:
        On success the last user message is replaced by its stripped text and
        the reply is appended in place. If retrieval or the LLM call raises,
        `messages` is left unchanged.

    Raises:
        KeyError / IndexError: if `messages` is empty or its last entry has no
            "u" key. Errors from the vector store or the LLM propagate.
    """
    question = messages[-1]["u"].strip()

    # A follow-up expands on the previous user turn, which sits three entries
    # back (user, assistant, "tell me more"). When that turn does not exist
    # there is nothing to expand, so the message is handled as a regular
    # question instead of failing on messages[-3].
    # NOTE(review): the trigger is a case-sensitive substring match, so
    # "Tell me more" is treated as a new question - confirm this is intended.
    has_previous_question = len(messages) >= 3 and "u" in messages[-3]
    if "tell me more" in question and has_previous_question:
        previous_question = messages[-3]["u"]
        search_results = db.similarity_search(previous_question.strip(), k=1)
        template, query = TELL_ME_MORE_TEMPLATE, previous_question
    else:
        search_results = db.similarity_search(question, k=K_DOCS)
        template, query = QUESTION_TEMPLATE, question

    context = _format_context(search_results)
    templated = template.replace("{{context}}", context).replace("{{question}}", query)

    # Build the prompt from a copy whose last turn carries the full template,
    # so the caller's history never holds the template text, even on failure.
    prompt_messages = list(messages[:-1]) + [{**messages[-1], "u": templated}]
    prompt = build_prompt(prompt_messages)

    assistant = llm(prompt).strip().replace("•", "*").replace("```", "")
    # Hide internal document labels; "\d+" also covers two-digit numbers.
    assistant = re.sub(r"Context Document \d+", "documentation", assistant)
    assistant = re.sub(r"Context Documents \d+-\d+", "documentation", assistant)
    # source = ""
    # if "I do not know" not in assistant:
    #     source = "\n\nSource:\n- " + "\n- ".join(dict.fromkeys([str(x.metadata["filename"]) for x in search_results]))

    messages[-1]["u"] = question
    messages.append({"a": assistant})  # {source}
    return messages, context, prompt


In [7]:
# Load the English evaluation sheet; its "Questions" column is what the
# evaluation loop further down feeds to chat().
df = pd.read_excel("../data/test_questions.xlsx", sheet_name="English")
df.head()

Unnamed: 0,Questions,Answers,Source,OpenAI
0,"If I only have 30% share in my company, can I ...",The requirements for UPKJ Bumiputera status ar...,"Page 150, 1.1.1, 1.1.2",
1,What is UPKJ Bumiputera Status Requirement?,The requirements for UPKJ Bumiputera status ar...,"Page 150, 1.1.1, 1.1.2",The requirements for UPKJ Bumiputera status ar...
2,Can we change our Bumiputera interview Date?,"Yes, you can change your Bumiputera interview ...","Page 151, 1.1.3","Yes, you can change your Bumiputera interview ..."
3,If yes how do we change our Bumiputera Intervi...,New date change requests can be made through t...,"Page 151, 1.1.3",To change the date of your Bumiputera Intervie...
4,I have fail my bumiputera status and the syste...,"If your Bumiputera status application fails, y...","Page 151, 1.1.3","If your Bumiputera status application fails, y..."


In [8]:
# Hand-picked questions for a quick demo. Each one is sent as a fresh
# single-turn conversation, so no history is shared between them.
demo_questions = [
    "What is UPKJ Bumiputera Status Requirement?",
    "What is the ratio of bumi putera requirement for directorship?",
    "I have fail my bumiputera status and the system does not allow me to apply again?",
    "Can we change our Bumiputera interview Date?",
    "Can my company apply for bumiputera status if the there’s 3 shareholder. The majority shareholder is bumi and the remaining is non bumi.",
]

demo_answers = []
for q in demo_questions:
    messages, context, prompt = chat([{"u": q}])
    a = messages[-1]["a"]
    # Marker line, then the question, then the generated answer.
    print("new question", q, a, sep="\n")
    demo_answers.append(a)

new question
What is UPKJ Bumiputera Status Requirement?
The UPKJ Bumiputera Status Requirement includes the following:

1.1.1 If Shares in Company Are Owned By Individuals:
a. The majority or at least 51% of the shares/equity is held by Bumiputera Sarawak and the main shareholder must be Bumiputera Sarawak.
b. The majority or at least 51% of the members of the board of directors are consisting of Bumiputera Sarawak
c. The majority or at least 51% of the management members are composed from Bumiputera Sarawak
d. The majority or at least 51% of the employees consist of Bumiputera Sarawak.
e. The majority or at least 51% of financial management is controlled by Bumiputera Sarawak (signatory of the company's check in the Bank Report or Company Check Signatory Declaration Letter or Director's Resolution)

1.1.2 If Shares in Company Are Owned By Corporations/Nominees:
a. The majority or at least 51% of the largest equity/share is held by companies/ cooperatives/ corporations owned by Bumipu

In [None]:
# Run every evaluation question through the pipeline as an independent
# single-turn chat, collecting the answers in sheet order.
generated = []
for question in df["Questions"]:
    messages, context, prompt = chat([{"u": question}])
    a = messages[-1]["a"]
    generated.append(a)
    # "Q: ..." line, "G: ..." line, then an empty separator line.
    print(f"Q: {question}\nG: {a}\n")

# Put the answers next to the reference columns and export for review.
df["Generated"] = generated
df.to_excel("../data/test_questions_generated-nicole.xlsx", index=False)

Q: If I only have 30% share in my company, can I 
apply for bumiputera status? 
G: I do not know. However, I can provide you with information on the requirements for Bumiputera Status recognition. According to documentation, companies/firms/cooperatives/corporations must comply with the following requirements:

1.1.1 If Shares in Company Are Owned By Individuals:
a. The majority or at least 51% of the shares/equity is held by Bumiputera Sarawak and the main shareholder must be Bumiputera Sarawak.

Therefore, if you only have 30% share in your company, you may not meet the requirement of having the majority or at least 51% of the shares/equity held by Bumiputera Sarawak. However, I suggest you refer to the complete list of requirements in documentation and assess your eligibility accordingly. Additionally, you may want to consider consulting with a professional consultant or UPKJ representative for further guidance.

Q: What is UPKJ Bumiputera Status Requirement?
G: The UPKJ Bumiputera 

In [None]:
# Inspect another chunk from the section split.
docs[1]

In [None]:
# Hand-written prompt using the same [INST] / <<SYS>> markers that
# build_prompt emits, with two context documents pasted in, so the model can
# be probed without going through retrieval. It differs from the generated
# prompts: the system message is shorter and adds "Always think step by step.",
# and the guidelines omit the "According to Context Document 1" example and
# the "Do not extrapolate or provide suggestions." sentence.
test_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Always think step by step.
<</SYS>>


Context information is below.
---------------------
Context Document 1:
Change Application For Companies/ Firms/ Cooperatives/ Corporations That Have UPKJ Bumiputera Status
Bumiputera Status will be revoked if the company/firm/cooperative/corporation no longer complies with the mandatory conditions of Bumiputera Status (Refer to 1.1.1 and 1.1.2).
A Bumiputera Status re-interview is required if there is a major shareholder/equity change in the company/firm/ cooperative/corporation.

Context Document 2:
Requirements for UPKJ Bumiputera Status Recognition
Companies/firms/cooperatives/corporations must comply with the Status registration requirements
Bumiputera UPKJ as follows:

1.1.1 If Shares in Company Are Owned By Individuals:
a. The majority or at least 51% of the shares/equity is held by
Bumiputera Sarawak and the main shareholder must be Bumiputera Sarawak.
b. The majority or at least 51% of the members of the board of directors are
consisting of Bumiputera Sarawak
c. The majority or at least 51% of the management members are composed
from Bumiputera Sarawak
d. The majority or at least 51% of the employees consist of
Bumiputera Sarawak.
e. The majority or at least 51% of financial management is controlled by
Bumiputera Sarawak (signatory of the company's check in the Bank Report or
Company Check Signatory Declaration Letter or Director's Resolution)

1.1.2 If Shares in Company Are Owned By Corporations/Nominees
a. The majority or at least 51% of the largest equity/share is held
by companies/ cooperatives/ corporations owned by Bumiputera Sarawak.
b. The majority or at least 51% of the members of the board of directors are
consisting of Bumiputera Sarawak
c. The majority or at least 51% of the management members are composed
from Bumiputera Sarawak
d. The majority or at least 51% of the employees consist of
Bumiputera Sarawak.
e. The majority or at least 51% of financial management is controlled by
Bumiputera Sarawak (signatory of the company's check in the Bank Report or
Company Check Signatory Declaration Letter or Director's Resolution)

Guidelines for responding:
---------------------
Given ONLY the context information and not prior knowledge respond to the query.

Avoid statements like 'Based on the context, ...' or 'According to the provided context ...' or anything along those lines.

If you don't know the answer to a query, say "I do not know".

Respond to the query in a brief and concise manner, and avoid talking about yourself. However, ensure that the information provided is complete.

In case the inquirer requests information that varies based on types/categories, comprehensively group the answer by the types/categories and return the answer for each and every type separately.
---------------------

Query: If I am a Bumiputera Sarawak and I have 60% share in my company, can I apply for bumiputera status? Let us think step by step.
 
Response: [/INST]

"""

# Send the raw prompt straight to the LLM wrapper: no templating and none of
# the answer post-processing that chat() applies.
x = llm(test_prompt)
print(x)

In [None]:
# Compare the hand-written prompt with the one returned by the last chat()
# call. build_prompt already wraps its result in a leading and trailing "\n",
# so `prompt` is compared as-is; adding another pair could never match.
test_prompt == prompt

In [None]:
# Show the prompt returned by the most recent chat() call.
prompt

In [None]:
# Show the hand-written prompt for a side-by-side comparison with `prompt`.
test_prompt

In [None]:
# Inspect which chunks are retrieved for a sample question, with their scores.
# NOTE(review): for this FAISS store the score is presumably a distance
# (lower = closer) - confirm before interpreting the numbers.
db.similarity_search_with_score("What are the document needed if I wish to apply for bumi putera status?")

In [None]:
# Run one question on its own as a fresh single-turn conversation and print
# the question followed by the generated answer.
q = "Can my company apply for bumiputera status if the there’s 3 shareholder. The majority shareholder is bumi and the remaining is non bumi. "
messages, context, prompt = chat([{"u": q}])
a = messages[-1]["a"]
print(q)
print(a)