In [None]:
#pip install python-dotenv
#pip install camelot-py
#pip install opencv-python
#pip install PyMuPDF

In [None]:
import os
import copy
import pathlib
import pandas as pd
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.extensions.langchain import (
    WatsonxLLM,
)
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.schema.embeddings import Embeddings
from typing import List
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from tqdm import tqdm

load_dotenv()


class MiniLML6V2EmbeddingFunctionLangchain(Embeddings):
    """LangChain ``Embeddings`` adapter around the ``all-MiniLM-L6-v2`` model.

    The sentence-transformers model is loaded once, when the class body is
    executed, and shared by every instance through the ``MODEL`` attribute.
    """

    MODEL = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_documents(self, texts):
        """Encode a batch of texts and return the vectors as nested lists."""
        vectors = MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(texts)
        return vectors.tolist()

    def embed_query(self, query):
        """Encode a single query string and return its vector as a list."""
        # The query is encoded as a one-element batch; unwrap the only row.
        single_item_batch = [query]
        encoded = MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(single_item_batch)
        return encoded.tolist()[0]


class MultilingualMiniLML6V2EmbeddingFunctionLangchain(Embeddings):
    """LangChain ``Embeddings`` adapter around ``paraphrase-multilingual-MiniLM-L12-v2``.

    The sentence-transformers model is loaded once, when the class body is
    executed, and shared by every instance through the ``MODEL`` attribute.

    Both methods encode with this class's own ``MODEL``. They previously
    referenced ``MiniLML6V2EmbeddingFunctionLangchain.MODEL``, so the
    multilingual model loaded here was never used. Any vector index that was
    persisted while that was the case holds vectors from the other model and
    should be rebuilt.
    """

    MODEL = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

    def embed_documents(self, texts):
        """Encode a batch of texts and return the vectors as nested lists."""
        return MultilingualMiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(texts).tolist()

    def embed_query(self, query):
        """Encode a single query string and return its vector as a list."""
        # Encode as a one-element batch and unwrap the only row.
        return MultilingualMiniLML6V2EmbeddingFunctionLangchain.MODEL.encode([query]).tolist()[0]

In [2]:
def translate(text, llm, direction):
    """Ask ``llm`` to translate ``text`` between Malay and English.

    Args:
        text: The text to translate; it is interpolated into the prompt as is.
        llm: A callable that takes the prompt string and returns the result.
        direction: ``"ms2en"`` requests Malay to English (leaving the
            "bumiputera" spellings untranslated); any other value requests
            English to Malay.

    Returns:
        Whatever ``llm`` returns for the constructed prompt.
    """
    malay_to_english = direction == "ms2en"

    if not malay_to_english:
        request = f"""
        Translate the following from English to Malay:
    
        {text}
        .
        """
        return llm(request)

    request = f"""
        Translate the following from Malay to English, but avoid translating the words "bumiputera", "Bumiputera" and "bumi putera":
    
        {text}
        .
        """
    return llm(request)

In [6]:
# Load the English evaluation questions from the "Sheet1" worksheet.
# The evaluation loop further down reads the "Questions" column of this frame.

import pandas as pd
test_q_eng = pd.read_excel("../data/test_questions_generated-indrajit.xlsx", sheet_name="Sheet1")
# Malay question set (currently disabled):
#test_q_ms = pd.read_excel("../data/test_questions.xlsx", sheet_name="Malay")

In [7]:
import fitz
import camelot
from langchain.docstore.document import Document

class Conversation:
    """Retrieval-augmented chat over the guideline PDF.

    On construction the PDF is read page by page, a FAISS index is built (or
    loaded from disk when it already exists), and two watsonx models are
    created: one for answering and one for translation.

    Messages are dicts with a single key: ``"u"`` for a user turn and ``"a"``
    for an assistant turn.

    Args:
        english_source: When true, use the English ("(EN)") PDF with the
            ``MiniLML6V2EmbeddingFunctionLangchain`` embeddings and the
            ``../code-engine/db`` index; otherwise use the other PDF with
            ``MultilingualMiniLML6V2EmbeddingFunctionLangchain`` and the
            ``../code-engine/db2`` index.
        llama: When true, answer with ``LLAMA_2_70B_CHAT`` and use the Llama
            chat prompt format; otherwise answer with ``MT0_XXL`` and a plain
            concatenated prompt.
        hook_translation: When true, ``chat`` translates the question with
            direction ``"en2ms"`` before retrieval and translates the answer
            with direction ``"ms2en"``.
    """

    # Endpoint shared by every watsonx model created by this class.
    _WATSONX_URL = "https://us-south.ml.cloud.ibm.com"

    # Prompt used for a regular question. ``{{context}}`` and ``{{question}}``
    # are substituted with ``str.replace`` in ``chat``.
    QUESTION_TEMPLATE = """
        Context information is below.
        ---------------------
        {{context}}
        ---------------------
        
        Query: {{question}}
        
        Given the context information and not prior knowledge respond to the query in a brief and concise manner using only one sentence. If the question is asked in Malay, answer in Malay. If the question is asked in English, answer in English.
        Avoid statements like 'Based on the context, ...' or 'According to the provided context ...' or anything along those lines.
        
        Based on the context, if the meaning/intent of the query is not clear, respond by asking for additional information that will enable you to determine the intent. Then, end the response. 
        If the intent behind the query is clear, but you don't know the answer to a query, say "I do not know".
        
        Remember: Respond to the query in a brief and concise manner using only one sentence, and avoid talking about yourself. If the question is asked in Malay, answer in Malay. If the question is asked in English, answer in English.

        Let us think step by step.
                
        Response: 
        
        """

    # Prompt used when the user asks to "tell me more" about the previous question.
    TELL_ME_MORE_TEMPLATE = """
        Context information is below.
        ---------------------
        {{context}}
        ---------------------
        
        Query: {{question}}
        
        Given the context information and not prior knowledge respond to the query in a more detailed manner. If the question is asked in Malay, answer in Malay. If the question is asked in English, answer in English.
        
        Do not repeat the previous answer.
        
        Avoid statements like 'Based on the context, ...' or 'According to the provided context ...' or anything along those lines.
        
        Based on the context, if the meaning/intent of the query is not clear, respond by asking for additional information that will enable you to determine the intent. Then, end the response. 
        If the intent behind the query is clear, but you don't know the answer to a query, say "I do not know".
        
        Remember: Respond to the query in a more detailed manner, and avoid talking about yourself. If the question is asked in Malay, answer in Malay. If the question is asked in English, answer in English.
        
        Let us think step by step.
        
        Response: 
        
        """

    def __init__(self, english_source=True, llama=True, hook_translation=False):
        self.hook_translation = hook_translation
        self.translation_llm = WatsonxLLM(self._make_model(ModelTypes.MT0_XXL))
        if english_source:
            self.pdf = self.read_pdf("../data/BUKU GARIS PANDUAN 2022 - 03.11.2023 (edit klasifikasi mekanikal - adm1 (EN).pdf")
            self.embeddings = MiniLML6V2EmbeddingFunctionLangchain()
        else:
            self.pdf = self.read_pdf("../data/BUKU GARIS PANDUAN 2022 - 03.11.2023 (edit klasifikasi mekanikal - adm1.pdf")
            self.embeddings = MultilingualMiniLML6V2EmbeddingFunctionLangchain()
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        self.K_DOCS = 3  # number of chunks retrieved per question
        self.llama = llama

        answer_model_id = ModelTypes.LLAMA_2_70B_CHAT if llama else ModelTypes.MT0_XXL
        self.llm = WatsonxLLM(model=self._make_model(answer_model_id))

        self.embed_docs(english_source=english_source)

    @staticmethod
    def _make_model(model_id):
        """Create a watsonx ``Model`` for ``model_id`` with the shared generation settings."""
        return Model(
            model_id=model_id,
            credentials={
                "apikey": os.getenv("API_KEY"),
                "url": Conversation._WATSONX_URL,
            },
            params={
                GenParams.MAX_NEW_TOKENS: 1024,
                GenParams.TEMPERATURE: 0,
                GenParams.RANDOM_SEED: 12345,
            },
            project_id=os.getenv("PROJECT_ID"),
        )

    def embed_docs(self, english_source):
        """Load the FAISS index into ``self.db``, building and saving it first if missing.

        Args:
            english_source: Selects the index directory: ``../code-engine/db``
                when true, ``../code-engine/db2`` otherwise.
        """
        index_dir = "../code-engine/db" if english_source else "../code-engine/db2"
        if not os.path.exists(index_dir):
            # Use the instance's splitter; a bare ``text_splitter`` name is not
            # defined anywhere and raised NameError on a fresh build.
            docs = self.text_splitter.split_documents(self.pdf)
            FAISS.from_documents(docs, self.embeddings).save_local(index_dir)
        self.db = FAISS.load_local(index_dir, self.embeddings)

    def read_pdf(self, path):
        """Read the PDF at ``path`` and return one ``Document`` per page.

        Each document holds the page's extracted text. Note that the
        ``"filename"`` metadata key stores the zero-based page index, not a
        file name.
        """
        docs = []
        # The context manager closes the PDF once all pages have been read.
        with fitz.open(path) as pdf_file:
            for page_index in tqdm(range(len(pdf_file))):
                page = pdf_file[page_index]
                docs.append(Document(page_content=page.get_text(), metadata={"filename": page_index}))
        return docs

    def build_prompt(self, messages):
        """Flatten the conversation into a single prompt string.

        Args:
            messages: List of ``{"u": ...}`` / ``{"a": ...}`` dicts. Turns at
                even positions are treated as user turns.

        Returns:
            ``None`` when ``messages`` is empty. Otherwise, for Llama, the
            turns wrapped in ``[INST]`` markers with the system prompt
            prepended to the first turn; for other models, the stripped turns
            concatenated without a separator.
        """
        if not messages:
            return None

        turns = [m["a"] if "a" in m else m["u"] for m in messages]

        if not self.llama:
            return "".join(turn.strip() for turn in turns)

        B_INST, E_INST = "[INST]", "[/INST]"
        B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
        BOS, EOS = "<s>", "</s>"
        DEFAULT_SYSTEM_PROMPT = """
            "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content."
            """
        turns[0] = "".join([B_SYS, DEFAULT_SYSTEM_PROMPT, E_SYS, turns[0]])

        pieces = []
        for position, turn in enumerate(turns):
            if position % 2 == 0:
                # user turn
                pieces.append(f"{BOS}{B_INST} {turn.strip()} {E_INST}")
            else:
                # assistant turn
                pieces.append(f" {turn.strip()} {EOS}")
        return "".join(pieces)

    def chat(self, messages):
        """Answer the last user message and append the reply to ``messages``.

        The last message must be a ``{"u": ...}`` dict. Retrieval uses the
        question itself, or the previous user question (``messages[-3]``) when
        the user says "tell me more" and such a question exists. Without an
        earlier question, "tell me more" is handled like any other question
        instead of raising ``IndexError``.

        Args:
            messages: Conversation history; mutated in place.

        Returns:
            A tuple ``(messages, context, prompt)``: the updated history, the
            retrieved text joined with spaces, and the prompt sent to the LLM.
        """
        raw_question = messages[-1]["u"]
        if self.hook_translation:
            messages[-1]["u"] = translate(raw_question, llm=self.translation_llm, direction="en2ms")
        question = messages[-1]["u"]

        # Check the untranslated text as well: once the hook has translated
        # the message, the English phrase may no longer appear in it.
        wants_more = "tell me more" in raw_question or "tell me more" in question
        has_previous_question = len(messages) >= 3 and "u" in messages[-3]

        if wants_more and has_previous_question:
            retrieval_query = messages[-3]["u"]
            template = self.TELL_ME_MORE_TEMPLATE
        else:
            retrieval_query = question
            template = self.QUESTION_TEMPLATE

        search_results = self.db.similarity_search(retrieval_query, k=self.K_DOCS)
        context = " ".join(doc.page_content for doc in search_results)

        # Temporarily swap the last user turn for the full RAG prompt so that
        # build_prompt sees it; always restore it, even if the LLM call fails.
        messages[-1]["u"] = template.replace("{{context}}", context).replace(
            "{{question}}", retrieval_query
        )
        try:
            prompt = self.build_prompt(messages)
            assistant = self.llm(prompt).strip().replace("•", "*").replace("```", "")
        finally:
            messages[-1]["u"] = question

        if self.hook_translation:
            assistant = translate(assistant, llm=self.translation_llm, direction="ms2en")

        # Source citations are intentionally not appended to the reply.
        messages.append({"a": f"{assistant}"})
        return messages, context, prompt
                 


In [8]:
# Default setting: Conversation() uses the translated (English) PDF, the
# Llama model, and no translation hook.
chatter = Conversation()
generated_default = []
for question in test_q_eng["Questions"]:
    # Each question is asked as a fresh single-turn conversation.
    messages, context, prompt = chatter.chat([{"u": question}])
    # chat() appends the assistant reply as the last message.
    a = messages[-1]["a"]
    generated_default.append(a)
    print(f"Q: {question}")
    print(f"G: {a}")
    print()
    


100%|████████████████████████████████████████████████████████████████████████████████| 176/176 [00:11<00:00, 15.37it/s]


Q: If I only have 30% share in my company, can I 
apply for bumiputera status? 
G: I do not know.

However, based on the information provided, it seems that the requirement for Bumiputera status is that the company/firm/cooperative/corporation must be owned at least 51% by Bumiputera Sarawak. Therefore, if you only have 30% share in your company, you may not be eligible for Bumiputera status. But I'm not sure about the specific details of your case, so I can't give a definitive answer. Can you provide more information about your company's ownership structure and the specific requirements for Bumiputera status in your area?

Q: What is UPKJ Bumiputera Status Requirement?
G: UPKJ Bumiputera Status requirement is a set of criteria that must be met by a company or individual to qualify for Bumiputera status, including minimum requirements for Bumiputera ownership, management, and control.

Q: Can we change our Bumiputera interview Date?
G: Sorry, the given context information is not suffic

In [9]:
# Pair each question with the answer generated in the default configuration.
english_results = pd.DataFrame({"question":test_q_eng["Questions"], "generated_default":generated_default})
# Malay results table (currently disabled):
#malay_results = pd.DataFrame({"question":test_q_ms["Questions"], "generated_answer":generated_ms})

In [10]:
# Intended to show the full question/answer text instead of a truncated preview.
# NOTE(review): pandas documents None as the "unlimited" value for
# display.max_colwidth — confirm that 0 behaves as intended on the installed version.
pd.set_option('display.max_colwidth', 0)
english_results

Unnamed: 0,question,generated_default
0,"If I only have 30% share in my company, can I \napply for bumiputera status?","I do not know.\n\nHowever, based on the information provided, it seems that the requirement for Bumiputera status is that the company/firm/cooperative/corporation must be owned at least 51% by Bumiputera Sarawak. Therefore, if you only have 30% share in your company, you may not be eligible for Bumiputera status. But I'm not sure about the specific details of your case, so I can't give a definitive answer. Can you provide more information about your company's ownership structure and the specific requirements for Bumiputera status in your area?"
1,What is UPKJ Bumiputera Status Requirement?,"UPKJ Bumiputera Status requirement is a set of criteria that must be met by a company or individual to qualify for Bumiputera status, including minimum requirements for Bumiputera ownership, management, and control."
2,Can we change our Bumiputera interview Date?,"Sorry, the given context information is not sufficient to answer your question. Can you provide more information or clarify your query?"
3,If yes how do we change our Bumiputera Interview date?,"""To change the Bumiputera interview date, you can do so through the OLAM system, but it's only allowed once before a period of three days from the original interview date, and only if you have a valid reason."""
4,I have fail my bumiputera status and the system does not allow me to apply again?,"Saya tidak tahu. (I don't know)\n\nHowever, I can suggest some possible reasons why the system may not allow you to apply again:\n\n1. The system may have detected that you have previously applied for Bumiputera status and were rejected.\n2. You may have provided incorrect or incomplete information in your previous application.\n3. You may not meet the eligibility criteria for Bumiputera status.\n\nTo resolve this issue, you can try the following:\n\n1. Check your previous application status and see if there were any errors or missing information.\n2. Ensure that you meet the eligibility criteria for Bumiputera status.\n3. Contact the relevant authorities or the system administrator for assistance.\n\nRemember to always provide accurate and complete information when applying for Bumiputera status, and ensure that you meet the eligibility criteria to avoid any issues or rejections."
5,Do I need to apply for bumiputera status if I allready have bumi status for mechanical category?,"Based on the context, it seems that you are asking if you need to apply for bumiputera status if you already have bumi status for mechanical category. However, to confirm, can you please provide more information on your current bumi status and the specific context you are referring to? This will help me better understand your question and provide a more accurate response."
6,"I wish to apply bumi putera status for my consultancy firm, can you help?","""Saya tidak mengetahui"" (I don't know) as I require more information to determine the intent behind your query. Can you please provide more context or clarify your question?"
7,What are the document needed if I wish to apply for bumi putera status?,Documents needed for Bumiputera status application include:\n\n* Company registration certificate\n* Business license\n* EPF registration certificate (if applicable)\n* Bank statement or financial signature confirmation letter/company check that has been certified by the bank\n* Birth certificate of owner/holder of shares/director\n* Signatory condition\n* Name of owner and share handle.
8,How can I renew my company bumiputera status?,"""You can renew your company's bumiputera status by submitting an application through the OLAM system and ensuring that all required documents and information are up to date and accurate."""
9,What is Bumiputera Sarawak?,"Bumiputera Sarawak refers to a status granted to companies or individuals that are owned or controlled by indigenous people of Sarawak, Malaysia, and meet certain criteria set by the Sarawak government."
