In [1]:
import os
import fitz
import copy
import pathlib
import pandas as pd
from tqdm import tqdm
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.extensions.langchain import (
    WatsonxLLM,
)
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.schema.embeddings import Embeddings
from typing import List
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

load_dotenv()


class MiniLML6V2EmbeddingFunctionLangchain(Embeddings):
    MODEL = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_documents(self, texts):
        return MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(texts).tolist()

    def embed_query(self, query):
        return MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode([query]).tolist()[0]


# class MultilingualMiniLML6V2EmbeddingFunctionLangchain(Embeddings):
#     MODEL = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

#     def embed_documents(self, texts):
#         return MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(texts).tolist()

#     def embed_query(self, query):
#         return MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode([query]).tolist()[0]


model = Model(
    model_id=ModelTypes.LLAMA_2_70B_CHAT,
    credentials={
        "apikey": os.getenv("IBM_API_KEY"),
        "url": "https://us-south.ml.cloud.ibm.com",
    },
    params={
        GenParams.DECODING_METHOD: "sample",
        GenParams.MAX_NEW_TOKENS: 1024,
        GenParams.TEMPERATURE: 0.1,
        GenParams.RANDOM_SEED: 12345,
    },
    project_id=os.getenv("PROJECT_ID"),
)

llm = WatsonxLLM(model=model)

print(llm("hello how are you?"))

Error getting IAM Token.
Reason: <Response [400]>


WMLClientError: Error getting IAM Token.
Reason: <Response [400]>

In [2]:
pdf_file = fitz.open("../data/BUKU GARIS PANDUAN 2022 - 03.11.2023 (edit klasifikasi mekanikal - adm1 (EN).pdf")
docs = []
for page_index in tqdm(range(len(pdf_file))):
    page = pdf_file[page_index]
    tables = "\n\n\n".join([i.to_pandas().to_html() for i in page.find_tables(horizontal_strategy="text").tables])# + f"\n\n\n\nHTML TABLES:\n\n\n\n{tables}"
    docs.append(Document(page_content=page.get_text(), metadata={"filename":page_index}))

  0%|          | 0/176 [00:00<?, ?it/s]100%|██████████| 176/176 [00:04<00:00, 35.85it/s]


In [3]:
embeddings = MiniLML6V2EmbeddingFunctionLangchain()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
docs = text_splitter.split_documents(docs)
db = FAISS.from_documents(docs, embeddings)
db.save_local("../code-engine/db")
db = FAISS.load_local("../code-engine/db", embeddings)

In [4]:
QUESTION_TEMPLATE = """
Context information is below.
---------------------
{{context}}
---------------------

Query: {{question}}
        
Given the context information and not prior knowledge respond to the query in a brief and concise manner using only one sentence. If the question is asked in Malay, answer in Malay. If the question is asked in English, answer in English.
Avoid statements like 'Based on the context, ...' or 'According to the provided context ...' or anything along those lines.

Based on the context, if the meaning/intent of the query is not clear, respond by asking for additional information that will enable you to determine the intent. Then, end the response. 
If the intent behind the query is clear, but you don't know the answer to a query, say "I do not know".

Remember: Respond to the query in a brief and concise manner using only one sentence, and avoid talking about yourself. If the question is asked in Malay, answer in Malay. If the question is asked in English, answer in English.

Let us think step by step.
        
Response: 

"""


TELL_ME_MORE_TEMPLATE = """
Context information is below.
---------------------
{{context}}
---------------------

Query: {{question}}

Given the context information and not prior knowledge respond to the query in a more detailed manner. If the question is asked in Malay, answer in Malay. If the question is asked in English, answer in English.

Do not repeat the previous answer.

Avoid statements like 'Based on the context, ...' or 'According to the provided context ...' or anything along those lines.

Based on the context, if the meaning/intent of the query is not clear, respond by asking for additional information that will enable you to determine the intent. Then, end the response. 
If the intent behind the query is clear, but you don't know the answer to a query, say "I do not know".

Remember: Respond to the query in a more detailed manner, and avoid talking about yourself. If the question is asked in Malay, answer in Malay. If the question is asked in English, answer in English.

Let us think step by step.

Response: 

"""

K_DOCS = 3

def build_prompt(messages):
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
    BOS, EOS = "<s>", "</s>"
    DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content."
    if not messages:
        return None
    messages_ = copy.deepcopy(messages)
    messages_ = [x["a"] if "a" in x else x["u"] for x in messages_]
    messages_[0] = "".join([B_SYS, DEFAULT_SYSTEM_PROMPT, E_SYS, messages_[0]])
    messages_list = []
    for i, x in enumerate(messages_):
        if i % 2 == 0:
            messages_list.append(f"{BOS}{B_INST} {x.strip()} {E_INST}")
        else:
            messages_list.append(f" {x.strip()} {EOS}")
    prompt = "".join(messages_list)
    return prompt


def chat(messages):
    question = messages[-1]["u"]
    if "tell me more" in question:
        search_results = db.similarity_search(messages[-3]["u"], k=1)
        context = " ".join([x.page_content for x in search_results])
        messages[-1]["u"] = TELL_ME_MORE_TEMPLATE.replace(
            "{{context}}", context
        ).replace("{{question}}", messages[-3]["u"])
    else:
        search_results = db.similarity_search(question, k=K_DOCS)
        context = " ".join([x.page_content for x in search_results])
        messages[-1]["u"] = QUESTION_TEMPLATE.replace("{{context}}", context).replace(
            "{{question}}", question
        )

    prompt = build_prompt(messages)
    assistant = llm(prompt).strip().replace("•", "*").replace("```", "")
    # source = ""
    # if "I do not know" not in assistant:
    #     source = "\n\nSource:\n- " + "\n- ".join(dict.fromkeys([str(x.metadata["filename"]) for x in search_results]))
    messages[-1]["u"] = question
    messages.append({"a": f"{assistant}"}) # {source}
    return messages, context, prompt


In [7]:
q = "If I only have 30 percent share in my company, can I apply for bumiputera status?"
messages, context, prompt = chat([{"u": q}])
a = messages[-1]["a"]
print(a)

I do not know.

Please provide more information regarding the company's ownership structure and the individual's role in the company to determine if they meet the requirements for Bumiputera status. Additionally, it would be helpful to know the specific requirements for Bumiputera status in the relevant jurisdiction.


In [12]:
# print(prompt)

PROMPT = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
<</SYS>>


Context information is below.
---------------------
Companies/firms/cooperatives/corporations must comply with the Status registration requirements Bumiputera UPKJ as follows:
1.1.1
If the shareholder is an individual:
a. The majority or at least 51% of the shares/equity is held by Sarawakian Bumiputera and the main shareholder must be Sarawakian Bumiputera.
b. The majority or at least 51% of the members of the board of directors are made up of Bumiputera Sarawak c. The majority
or at least 51% of the management members are composed from Bumiputera Sarawak
d. The majority or at least 51% of the employees consist of Native of Sarawak.
e. The majority or at least 51% of financial management is dominated by Bumiputera Sarawak (company check signatories in the Bank Report or Company Check Signatory Declaration Letter or Director's Resolution)

a. The majority or at least 51% of the largest equity/shares are held by companies/ cooperatives/corporations owned by Bumiputera Sarawak.
b. The majority or at least 51% of the members of the board of directors are composed of Bumiputera Sarawak
c. The majority or at least 51% from Bumiputera Sarawak
d. The majority or at least 51% Native of Sarawak.
of the management members are composed
of the employees consist of
e. The majority or at least 51%
---------------------

Query: If I only have 30 percent share in my company, can I apply for bumiputera status?
        
Given the context information and not prior knowledge respond to the query in a brief and concise manner using only one sentence. If the question is asked in Malay, answer in Malay. If the question is asked in English, answer in English.
Avoid statements like 'Based on the context, ...' or 'According to the provided context ...' or anything along those lines.

Based on the context, if the meaning/intent of the query is not clear, respond by asking for additional information that will enable you to determine the intent. Then, end the response. 
If the intent behind the query is clear, but you don't know the answer to a query, say "I do not know".

Remember: Respond to the query in a brief and concise manner using only one sentence, and avoid talking about yourself. If the question is asked in Malay, answer in Malay. If the question is asked in English, answer in English.

Let us think step by step.
        
Response: [/INST]

"""

print(llm(prompt).strip())

I do not know.

Please provide more information regarding the company's ownership structure and the individual's role in the company to determine if they meet the requirements for Bumiputera status. Additionally, it would be helpful to know the specific requirements for Bumiputera status in the relevant jurisdiction.


In [7]:
df = pd.read_excel("../data/test_questions.xlsx", sheet_name="English")
df.head()

Unnamed: 0,Questions,Answers,Source
0,"If I only have 30% share in my company, can I ...",,"Page 150, 1.1"
1,What is UPKJ Bumiputera Status Requirement?,,
2,Can we change our Bumiputera interview Date?,,
3,If yes how do we change our Bumiputera Intervi...,,
4,I have fail my bumiputera status and the syste...,,


In [8]:
generated = []
for idx, row in df.iterrows():
    messages, context, prompt = chat([{"u": row['Questions']}])
    a = messages[-1]["a"]
    generated.append(a)
    print(f"Q: {row['Questions']}")
    # print(f"A: {row['Answer']}")
    print(f"G: {a}")
    print()

df["Generated"] = generated
df.to_excel("../data/test_questions_generated-randy.xlsx", index=False)

Q: If I only have 30% share in my company, can I 
apply for bumiputera status? 
G: I do not know.

Please provide more information on the company's ownership structure and the individual's role in the company to determine if they meet the requirements for Bumiputera status. Additionally, it would be helpful to know the specific requirements for Bumiputera status in the relevant jurisdiction.

Q: What is UPKJ Bumiputera Status Requirement?
G: UPKJ Bumiputera Status requirement includes submitting an application through the OLAM system, providing required documents such as a business registration certificate, bank statement, and EPF statement, and passing an interview.

Q: Can we change our Bumiputera interview Date?
G: Yes, the interview date can be changed, but it is only allowed within two weeks from the original confirmed date, and a new date can be selected through the OLAM system.

Q: If yes how do we change our Bumiputera Interview date?
G: "To change the Bumiputera interview date