# HPTChat: A RAG Chat Application for General and HPT's Internal regulations of ISO 27001

## [Problem Statement](https://cwbhack.dev/problem-statements/hpt/)
Develop a chatbot that acts as a virtual assistant for all HPT employees, answering questions about general and HPT's internal ISO 27001 regulations, as well as the evidence of applying HPT's regulations.



## Solution
A multilingual RAG chat app that assists employees with general and HPT's internal ISO 27001 regulations, and with evidence specific to HPT.

### Prerequisites
- [VS Code](https://code.visualstudio.com/Download)
- [Python](https://www.python.org/)
- [Azure AI Search](https://learn.microsoft.com/en-us/azure/search/search-create-service-portal)
- [OpenAI API](https://openai.com/index/openai-api/)

### Install Packages & Set-up Environment Variables
Install the required packages by running the `pip install` cell below. Then create a `.env` file containing the following values:
```
search_endpoint = "<YOUR-AZURE-AI-SEARCH-ENDPOINT>"
index_name = "<NAME-OF-INDEX>"
search_api_key="<YOUR-SEARCH-API-KEY>"
openapi_key = "<YOUR-OPEN-AI-API-KEY>"


translate_endpoint = "<YOUR-AZURE-TRANSLATION-ENDPOINT>"
translate_api_key = "<YOUR-AZURE-TRANSLATION-API-KEY>"
translate_region = "eastus"
```

In [None]:
# ! pip install -r requirements.txt --quiet

### Import Libraries and Load Keys

In [1]:
from ragfunctions import (
    get_search_index,
    get_embedding,
    recursive_chunking,
    get_file_stats,
    translate_chunk,
    ask_data, 
    )

from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient

from langchain.document_loaders import PyPDFium2Loader
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.embeddings.openai import OpenAIEmbeddings


import os
from dotenv import dotenv_values


In [3]:
# Read all service settings from the local .env file.
env_name = ".env"
config = dotenv_values(env_name)

# Azure Translator settings
translate_endpoint, translate_api_key, translate_region = (
    config["translate_endpoint"],
    config["translate_api_key"],
    config["translate_region"],
)

# Azure AI Search settings
service_endpoint, index_name, search_key = (
    config["search_endpoint"],
    config["index_name"],
    config["search_api_key"],
)

# Credential object reused by the Azure Search clients below
credential = AzureKeyCredential(search_key)

# OpenAI settings
openapi_key = config["openapi_key"]

### Create Index

In [7]:
# Create the search index from the schema built by `get_search_index`.
# create_or_update_index is used instead of create_index so this cell can be
# re-run without failing when the index already exists.
index_client = SearchIndexClient(service_endpoint, credential)
index_client.create_or_update_index(get_search_index(index_name))

<azure.search.documents.indexes.models._index.SearchIndex at 0x218ab142590>

### Chunk, Translate and Upload Documents
Data sources: 
-  HPT’s internal ISO 27K regulations, processes, and evidence of ISO27K implementation. (Provided to Hackathon Participants)
- [ISO27k standards](https://www.iso27001security.com/html/iso27000.html)

#### English Documents

In [11]:
# Collect the English PDFs (extension match is case-insensitive).
en_dir = "documents/en"
en_pdf_files = [
    entry
    for entry in os.listdir(en_dir)
    if entry.lower().endswith(".pdf")
]

In [7]:
# Document Details 
# Print the stats reported by `get_file_stats` for every English PDF.
for filename in en_pdf_files:
    print(f"{filename} stats:")
    fpath = os.path.join(en_dir, filename)
    stats = get_file_stats(fpath)
    print(stats)
    print()

In [13]:
# Chunk each English PDF, translate every chunk to Vietnamese, embed both
# language versions and upload the resulting documents to the search index.
search_index_client = index_client.get_search_client(index_name)

for filename in en_pdf_files:
    loader = PyPDFium2Loader(os.path.join(en_dir, filename))
    documents = loader.load()
    chunks = recursive_chunking(documents)

    # str.rstrip('.pdf') strips any trailing '.', 'p', 'd' or 'f' characters
    # (e.g. "app.pdf" -> "a"), which mangles ids and can make different files
    # collide. splitext removes only the extension.
    doc_stem = os.path.splitext(filename)[0]

    docs = []
    for i, chunk in enumerate(chunks):
        en_text = chunk.page_content
        # Translate once and reuse the result for both the stored text and
        # its embedding (the translation call was previously made twice).
        vi_text = translate_chunk(
            en_text.strip('"'), translate_api_key, translate_region, translate_endpoint
        )
        docs.append(
            {
                "id": f"{doc_stem}_{i}",
                "metadata": f"Document:{filename} Page:{chunk.metadata['page']}",
                "isInternal": False,
                "en_content": en_text,
                "vi_content": vi_text,
                "en_content_vector": get_embedding(en_text, openapi_key),
                "vi_content_vector": get_embedding(vi_text, openapi_key),
            }
        )

    # Skip PDFs that produced no chunks instead of sending an empty batch.
    if docs:
        search_index_client.upload_documents(docs)


#### Vietnamese Documents


In [24]:
# Collect the Vietnamese PDFs (extension match is case-insensitive).
vi_dir = "documents/vi"
vi_pdf_files = [
    entry
    for entry in os.listdir(vi_dir)
    if entry.lower().endswith(".pdf")
]

In [8]:
# Document Details 
# Print the stats reported by `get_file_stats` for every Vietnamese PDF.
for filename in vi_pdf_files:
    print(f"{filename} stats:")
    fpath = os.path.join(vi_dir, filename)
    stats = get_file_stats(fpath)
    print(stats)
    print()

In [26]:
# Chunk each Vietnamese (internal) PDF, translate every chunk to English,
# embed both language versions and upload the documents to the search index.
search_index_client = index_client.get_search_client(index_name)

for filename in vi_pdf_files:
    loader = PyPDFium2Loader(os.path.join(vi_dir, filename))
    documents = loader.load()
    chunks = recursive_chunking(documents)

    # str.rstrip('.pdf') strips any trailing '.', 'p', 'd' or 'f' characters
    # (e.g. "app.pdf" -> "a"), which mangles ids and can make different files
    # collide. splitext removes only the extension.
    doc_stem = os.path.splitext(filename)[0]

    docs = []
    for i, chunk in enumerate(chunks):
        vi_text = chunk.page_content
        # Translate once and reuse the result for both the stored text and
        # its embedding (the translation call was previously made twice).
        en_text = translate_chunk(
            vi_text.strip('"'),
            translate_api_key,
            translate_region,
            translate_endpoint,
            langFrom="vi",
            langTo="en",
        )
        docs.append(
            {
                "id": f"{doc_stem}_{i}",
                "metadata": f"Document:{filename} Page:{chunk.metadata['page']}",
                "isInternal": True,
                "en_content": en_text,
                "vi_content": vi_text,
                "en_content_vector": get_embedding(en_text, openapi_key),
                "vi_content_vector": get_embedding(vi_text, openapi_key),
            }
        )

    # Skip PDFs that produced no chunks instead of sending an empty batch.
    if docs:
        search_index_client.upload_documents(docs)
    

### Chat with Data

In [9]:
# Quick check of the RAG pipeline: ask one English question and print the reply.
answer = ask_data(
    "What is ISO27k",
    "en",
    service_endpoint,
    index_name,
    search_key,
    openapi_key,
)
print(answer)

### Gradio WebApp

In [12]:
import gradio as gr
from openai import OpenAI
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from azure.core.credentials import AzureKeyCredential
import time

# Module-level OpenAI client used by `predict`; the key comes from the .env
# settings loaded earlier in the notebook.
api_key = openapi_key
client = OpenAI(api_key=openapi_key)

def predict(message, history, system_prompt, tokens, checkbox):
    """Stream a plain (non-RAG) chat completion for a Gradio chat UI.

    Uses the module-level ``client``.

    Args:
        message: Latest user message.
        history: Previous turns as ``(user, assistant)`` pairs.
        system_prompt: Instructions sent as the system message. Previously
            this was accepted but never sent; it is skipped when empty.
        tokens: Accepted for interface compatibility; not used.
        checkbox: Accepted for interface compatibility; not used.

    Yields:
        The reply text accumulated so far, once per streamed content delta.
    """
    history_openai_format = []
    if system_prompt:
        history_openai_format.append({"role": "system", "content": system_prompt})
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    response = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=history_openai_format,
        temperature=1.0,
        stream=True,
    )

    partial_message = ""
    for chunk in response:
        # Guard against stream chunks that carry no choices.
        if chunk.choices and chunk.choices[0].delta.content is not None:
            partial_message += chunk.choices[0].delta.content
            yield partial_message
    
def context():
    """Return the default system prompt shown in the "System Prompt" textbox."""
    prompt = """
    You are an assistant that helps company employees with their ISO27K questions, and questions about Internal regulations of ISO 27001. Be detailed and complete with your answers.
    Answer ONLY with the information above. 
    If there isn't enough information below, say you don't know. 
    Do not make up your own answers. 
    If asking a clarifying question to the user would help, ask the question.
    If the question is not in English, answer in the language used in the question.
"""
    return prompt

def get_embedding(text, key, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for ``text``.

    NOTE: this redefines the ``get_embedding`` imported from ``ragfunctions``
    earlier in the notebook.

    Args:
        text: Input string; newlines are replaced with spaces before sending.
        key: OpenAI API key. A new client is created from it on every call.
        model: Embedding model name.

    Returns:
        The ``embedding`` of the first item in the response data.
    """
    openai_client = OpenAI(api_key=key)
    single_line = text.replace("\n", " ")
    response = openai_client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

def chat_data(message, history, system_prompt, radio, isInternal):
    """Answer a chat message using documents retrieved from Azure AI Search.

    Runs a hybrid (text + vector) semantic search for ``message``, then
    streams a chat completion grounded on the top results.

    Fixes relative to the earlier version: the user's question and the
    system prompt are now actually sent to the model (previously only the
    retrieved text was sent), retrieved references are separated by blank
    lines, and no filter is sent when ``isInternal`` is false (previously an
    empty filter string was passed).

    Args:
        message: Latest user question.
        history: Previous turns as ``(user, assistant)`` pairs.
        system_prompt: Instructions sent as the system message; skipped when empty.
        radio: Index language, ``"en"`` or ``"vi"``; ``None`` falls back to ``"en"``.
        isInternal: When truthy, restrict results to documents with
            ``isInternal eq true``.

    Yields:
        The reply text accumulated so far, once per streamed content delta.
    """
    language = radio if radio is not None else "en"
    content_field = f"{language}_content"

    search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(search_key))
    vector_query = VectorizedQuery(
        vector=get_embedding(message, openapi_key),
        k_nearest_neighbors=3,
        fields=f"{language}_content_vector",
    )
    # None means "no filter"; avoids sending an empty OData expression and
    # avoids shadowing the builtin `filter`.
    odata_filter = "isInternal eq true" if isInternal else None

    results = search_client.search(
        message,
        vector_queries=[vector_query],
        top=3,
        filter=odata_filter,
        query_type="semantic",
        semantic_configuration_name="hpt-semantic-config",
        select=["metadata", content_field],
    )

    # Each reference is the chunk text followed by its "Document:... Page:..." tag.
    references = [f"{result[content_field]} {result['metadata']}" for result in results]
    fulltext = "\n\n".join(references)

    client = OpenAI(api_key=openapi_key)

    history_openai_format = []
    if system_prompt:
        history_openai_format.append({"role": "system", "content": system_prompt})
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    # Send the retrieved sources together with the question itself.
    history_openai_format.append(
        {"role": "user", "content": f"Sources:\n{fulltext}\n\nQuestion: {message}"}
    )

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        stream=True,
        messages=history_openai_format,
    )

    partial_message = ""
    for chunk in completion:
        # Guard against stream chunks that carry no choices.
        if chunk.choices and chunk.choices[0].delta.content is not None:
            partial_message += chunk.choices[0].delta.content
            yield partial_message

# Build the chat UI. The three extra inputs are passed to `chat_data` after
# (message, history), in this order: system_prompt, radio, isInternal.
with gr.Blocks() as demo:
    system_prompt = gr.Textbox(context(), label="System Prompt")
    # Default to "en" so the UI shows the language chat_data falls back to
    # when nothing is selected.
    radio = gr.Radio(["en", "vi"], value="en", label="Search Index", info="Select Index Language")
    checkbox = gr.Checkbox(label="Internal", info="Limit to Internal Documents?")
    # The previously defined slider was never rendered or passed to the chat
    # function, so it has been removed.

    gr.ChatInterface(
        chat_data, additional_inputs=[system_prompt, radio, checkbox]
    )

demo.launch()

Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.


