In [3]:

import openai
import os
from dotenv import load_dotenv
import yaml
from langchain.embeddings.openai import OpenAIEmbeddings
from pyprojroot import here
import shutil

load_dotenv()

class LoadConfig:
    def __init__(self) -> None:
        with open(here("configs/app_config.yml")) as cfg:
            app_config = yaml.load(cfg, Loader=yaml.FullLoader)

        # LLM configs
        self.llm_engine = app_config["llm_config"]["engine"]
        self.llm_system_role = app_config["llm_config"]["llm_system_role"]
        self.persist_directory = str(here(
            app_config["directories"]["persist_directory"]))  # needs to be strin for summation in chromadb backend: self._settings.require("persist_directory") + "/chroma.sqlite3"
        self.custom_persist_directory = str(here(
            app_config["directories"]["custom_persist_directory"]))
        self.embedding_model = OpenAIEmbeddings()

        # Retrieval configs
        self.data_directory = app_config["directories"]["data_directory"]
        self.k = app_config["retrieval_config"]["k"]
        self.embedding_model_engine = app_config["embedding_model_config"]["engine"]
        self.chunk_size = app_config["splitter_config"]["chunk_size"]
        self.chunk_overlap = app_config["splitter_config"]["chunk_overlap"]

        # Summarizer config
        self.max_final_token = app_config["summarizer_config"]["max_final_token"]
        self.token_threshold = app_config["summarizer_config"]["token_threshold"]
        self.summarizer_llm_system_role = app_config["summarizer_config"]["summarizer_llm_system_role"]
        self.character_overlap = app_config["summarizer_config"]["character_overlap"]
        self.final_summarizer_llm_system_role = app_config[
            "summarizer_config"]["final_summarizer_llm_system_role"]
        self.temperature = app_config["llm_config"]["temperature"]

        # Memory
        self.number_of_q_a_pairs = app_config["memory"]["number_of_q_a_pairs"]

        # Load OpenAI credentials
        self.load_openai_cfg()

        # clean up the upload doc vectordb if it exists
        self.create_directory(self.persist_directory)
        self.remove_directory(self.custom_persist_directory)

    def load_openai_cfg(self):
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def create_directory(self, directory_path: str):
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)

    def remove_directory(self, directory_path: str):
        if os.path.exists(directory_path):
            try:
                shutil.rmtree(directory_path)
                print(
                    f"The directory '{directory_path}' has been successfully removed.")
            except OSError as e:
                print(f"Error: {e}")
        else:
            print(f"The directory '{directory_path}' does not exist.")


In [4]:

import openai
import os
from dotenv import load_dotenv
import yaml
from langchain.embeddings.openai import OpenAIEmbeddings
from pyprojroot import here
import shutil

load_dotenv()

# class LoadConfig:
#     def __init__() -> None:
with open(here("configs/app_config.yml")) as cfg:
    app_config = yaml.load(cfg, Loader=yaml.FullLoader)

# LLM configs
llm_engine = app_config["llm_config"]["engine"]
llm_system_role = app_config["llm_config"]["llm_system_role"]
persist_directory = str(here(
    app_config["directories"]["persist_directory"]))  # needs to be strin for summation in chromadb backend: ._settings.require("persist_directory") + "/chroma.sqlite3"
custom_persist_directory = str(here(
    app_config["directories"]["custom_persist_directory"]))
embedding_model = OpenAIEmbeddings()

# Retrieval configs
data_directory = app_config["directories"]["data_directory"]
k = app_config["retrieval_config"]["k"]
embedding_model_engine = app_config["embedding_model_config"]["engine"]
chunk_size = app_config["splitter_config"]["chunk_size"]
chunk_overlap = app_config["splitter_config"]["chunk_overlap"]

# Summarizer config
max_final_token = app_config["summarizer_config"]["max_final_token"]
token_threshold = app_config["summarizer_config"]["token_threshold"]
summarizer_llm_system_role = app_config["summarizer_config"]["summarizer_llm_system_role"]
character_overlap = app_config["summarizer_config"]["character_overlap"]
final_summarizer_llm_system_role = app_config[
    "summarizer_config"]["final_summarizer_llm_system_role"]
temperature = app_config["llm_config"]["temperature"]

# Memory
number_of_q_a_pairs = app_config["memory"]["number_of_q_a_pairs"]

def load_openai_cfg():
    openai.api_key = os.getenv("OPENAI_API_KEY")

def create_directory(directory_path: str):
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

def remove_directory(directory_path: str):
    if os.path.exists(directory_path):
        try:
            shutil.rmtree(directory_path)
            print(
                f"The directory '{directory_path}' has been successfully removed.")
        except OSError as e:
            print(f"Error: {e}")
    else:
        print(f"The directory '{directory_path}' does not exist.")



In [5]:
load_openai_cfg()
create_directory(persist_directory)
remove_directory(custom_persist_directory)

The directory '/app/data/vectordb/uploaded/chroma' does not exist.


In [None]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from typing import List
from langchain.embeddings.openai import OpenAIEmbeddings


class PrepareVectorDB:
    def __init__(
            self,
            data_directory: str,
            persist_directory: str,
            embedding_model_engine: str,
            chunk_size: int,
            chunk_overlap: int
    ) -> None:
        """
        Initialize the PrepareVectorDB instance.

        Parameters:
            data_directory (str or List[str]): The directory or list of directories containing the documents.
            persist_directory (str): The directory to save the VectorDB.
            embedding_model_engine (str): The engine for OpenAI embeddings.
            chunk_size (int): The size of the chunks for document processing.
            chunk_overlap (int): The overlap between chunks.

        """

        # self.embedding_model_engine = embedding_model_engine
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""]
        )
        """Other options: CharacterTextSplitter, TokenTextSplitter, etc."""
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        self.embedding = OpenAIEmbeddings()

    def __load_all_documents(self) -> List:
        """
        Load all documents from the specified directory or directories.

        Returns:
            List: A list of loaded documents.
        """
        doc_counter = 0
        if isinstance(self.data_directory, list):
            print("Loading the uploaded documents...")
            docs = []
            for doc_dir in self.data_directory:
                docs.extend(PyPDFLoader(doc_dir).load())
                doc_counter += 1
            print("Number of loaded documents:", doc_counter)
            print("Number of pages:", len(docs), "\n\n")
        else:
            print("Loading documents manually...")
            document_list = os.listdir(self.data_directory)
            docs = []
            for doc_name in document_list:
                docs.extend(PyPDFLoader(os.path.join(
                    self.data_directory, doc_name)).load())
                doc_counter += 1
            print("Number of loaded documents:", doc_counter)
            print("Number of pages:", len(docs), "\n\n")

        return docs

    def __chunk_documents(self, docs: List) -> List:
        """
        Chunk the loaded documents using the specified text splitter.

        Parameters:
            docs (List): The list of loaded documents.

        Returns:
            List: A list of chunked documents.

        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self):
        """
        Load, chunk, and create a VectorDB with OpenAI embeddings, and save it.

        Returns:
            Chroma: The created VectorDB.
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding,
            persist_directory=self.persist_directory
        )
        print("VectorDB is created and saved.")
        print("Number of vectors in vectordb:",
              vectordb._collection.count(), "\n\n")
        return vectordb


In [None]:
prepare_vectordb_instance = PrepareVectorDB(
    data_directory=data_directory,
    persist_directory=persist_directory,
    embedding_model_engine=embedding_model_engine,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [6]:
print(f"data_directory: {data_directory}")
print(f"persist_directory: {persist_directory}")
print(f"embedding_model_engine: {embedding_model_engine}")
print(f"chunk_size: {chunk_size}")
print(f"chunk_overlap: {chunk_overlap}")


data_directory: data/docs
persist_directory: /app/data/vectordb/processed/chroma
embedding_model_engine: text-embedding-ada-002
chunk_size: 3000
chunk_overlap: 1000


In [7]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from typing import List
from langchain.embeddings.openai import OpenAIEmbeddings


# self.embedding_model_engine = embedding_model_engine
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", " ", ""]
)
"""Other options: CharacterTextSplitter, TokenTextSplitter, etc."""
data_directory = data_directory
persist_directory = persist_directory
embedding = OpenAIEmbeddings()

def load_all_documents(self) -> List:
    """
    Load all documents from the specified directory or directories.

    Returns:
        List: A list of loaded documents.
    """
    doc_counter = 0
    if isinstance(self.data_directory, list):
        print("Loading the uploaded documents...")
        docs = []
        for doc_dir in self.data_directory:
            docs.extend(PyPDFLoader(doc_dir).load())
            doc_counter += 1
        print("Number of loaded documents:", doc_counter)
        print("Number of pages:", len(docs), "\n\n")
    else:
        print("Loading documents manually...")
        document_list = os.listdir(self.data_directory)
        docs = []
        for doc_name in document_list:
            docs.extend(PyPDFLoader(os.path.join(
                self.data_directory, doc_name)).load())
            doc_counter += 1
        print("Number of loaded documents:", doc_counter)
        print("Number of pages:", len(docs), "\n\n")

    return docs

def chunk_documents(self, docs: List) -> List:
    """
    Chunk the loaded documents using the specified text splitter.

    Parameters:
        docs (List): The list of loaded documents.

    Returns:
        List: A list of chunked documents.

    """
    print("Chunking documents...")
    chunked_documents = self.text_splitter.split_documents(docs)
    print("Number of chunks:", len(chunked_documents), "\n\n")
    return chunked_documents




In [11]:
data_directory = '../data/docs'
document_list = os.listdir(data_directory)

In [12]:
document_list

['e.pdf', 'h1.pdf']

In [17]:
PyPDFLoader(os.path.join(data_directory, 'e.pdf')).load()

[Document(page_content='Adjudicator’s Field Manual \nNOTE: The  USCIS Policy Manual  is our centralized online repository for immigration policies. We \nare working quickly to update and move material from the Adjudicator’s Field Manual to the Policy \nManual. Please check that resource, along with our  Policy Memoranda  page, to verify information \nyou find in the Adjudicator’s Field Manual. If y ou have questions or concerns about any \ndiscrepancies among these resources, please contact PolicyFeedback@uscis.dhs.gov. \nChapter 34 Other Employment Authorized Nonimmigrants (E, I & R Classificat ions).  \n34.1  Background  \n34.2  Treaty Traders  \n34.3  Treaty Investors  \n34.4  Representatives of Information Media has been superseded by Volume 2, Part K: Media \nRepresentatives as of November 10, 2015. \n34.5  Nonimmigrant Aliens Employed in Religious Occupations has been superseded by USCIS Policy \nManual, Volume 2: Nonimmigrants as of May 15, 2020. \n34.6  E\n-3 Specialty Occupati

In [18]:
# load all documents
doc_counter = 0
# if isinstance(data_directory, list):
#     print("Loading the uploaded documents...")
#     docs = []
#     for doc_dir in data_directory:
#         docs.extend(PyPDFLoader(doc_dir).load())
#         doc_counter += 1
#     print("Number of loaded documents:", doc_counter)
#     print("Number of pages:", len(docs), "\n\n")
# else:
print("Loading documents manually...")
document_list = os.listdir(data_directory)
docs = []
for doc_name in document_list:
    docs.extend(PyPDFLoader(os.path.join(
        data_directory, doc_name)).load())
    doc_counter += 1
print("Number of loaded documents:", doc_counter)
print("Number of pages:", len(docs), "\n\n")

Loading documents manually...
Number of loaded documents: 2
Number of pages: 116 




In [40]:
chunk_size = 3000
chunk_overlap = 500

# self.embedding_model_engine = embedding_model_engine
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", " ", ""]
)

print("Chunking documents...")
print("Chunk Size = ", chunk_size)
print("chunk overlap = ", chunk_overlap)

chunked_documents = text_splitter.split_documents(docs)
print("Number of chunks:", len(chunked_documents), "\n\n")

Chunking documents...
Chunk Size =  3000
chunk overlap =  500
Number of chunks: 122 




In [41]:
len(docs)

116

In [42]:
len(chunked_documents)

122

In [43]:
docs[:10]

[Document(page_content='Adjudicator’s Field Manual \nNOTE: The  USCIS Policy Manual  is our centralized online repository for immigration policies. We \nare working quickly to update and move material from the Adjudicator’s Field Manual to the Policy \nManual. Please check that resource, along with our  Policy Memoranda  page, to verify information \nyou find in the Adjudicator’s Field Manual. If y ou have questions or concerns about any \ndiscrepancies among these resources, please contact PolicyFeedback@uscis.dhs.gov. \nChapter 34 Other Employment Authorized Nonimmigrants (E, I & R Classificat ions).  \n34.1  Background  \n34.2  Treaty Traders  \n34.3  Treaty Investors  \n34.4  Representatives of Information Media has been superseded by Volume 2, Part K: Media \nRepresentatives as of November 10, 2015. \n34.5  Nonimmigrant Aliens Employed in Religious Occupations has been superseded by USCIS Policy \nManual, Volume 2: Nonimmigrants as of May 15, 2020. \n34.6  E\n-3 Specialty Occupati

In [44]:
chunked_documents[:10]

[Document(page_content='Adjudicator’s Field Manual \nNOTE: The  USCIS Policy Manual  is our centralized online repository for immigration policies. We \nare working quickly to update and move material from the Adjudicator’s Field Manual to the Policy \nManual. Please check that resource, along with our  Policy Memoranda  page, to verify information \nyou find in the Adjudicator’s Field Manual. If y ou have questions or concerns about any \ndiscrepancies among these resources, please contact PolicyFeedback@uscis.dhs.gov. \nChapter 34 Other Employment Authorized Nonimmigrants (E, I & R Classificat ions).  \n34.1  Background  \n34.2  Treaty Traders  \n34.3  Treaty Investors  \n34.4  Representatives of Information Media has been superseded by Volume 2, Part K: Media \nRepresentatives as of November 10, 2015. \n34.5  Nonimmigrant Aliens Employed in Religious Occupations has been superseded by USCIS Policy \nManual, Volume 2: Nonimmigrants as of May 15, 2020. \n34.6  E\n-3 Specialty Occupati

In [47]:
print("Preparing vectordb...")
vectordb = Chroma.from_documents(
    documents=chunked_documents,
    embedding=embedding,
    persist_directory=persist_directory
)
print("VectorDB is created and saved.")
print("Number of vectors in vectordb:",
        vectordb._collection.count(), "\n\n")

Preparing vectordb...
VectorDB is created and saved.
Number of vectors in vectordb: 342 




In [58]:
vectordb.get()

{'ids': ['1ca359d8-c4c1-11ef-8e43-0242ac110002',
  '1ca3607c-c4c1-11ef-8e43-0242ac110002',
  '1ca360d6-c4c1-11ef-8e43-0242ac110002',
  '1ca36108-c4c1-11ef-8e43-0242ac110002',
  '1ca3613a-c4c1-11ef-8e43-0242ac110002',
  '1ca3616c-c4c1-11ef-8e43-0242ac110002',
  '1ca3619e-c4c1-11ef-8e43-0242ac110002',
  '1ca361e4-c4c1-11ef-8e43-0242ac110002',
  '1ca36220-c4c1-11ef-8e43-0242ac110002',
  '1ca3625c-c4c1-11ef-8e43-0242ac110002',
  '1ca36284-c4c1-11ef-8e43-0242ac110002',
  '1ca362ac-c4c1-11ef-8e43-0242ac110002',
  '1ca362de-c4c1-11ef-8e43-0242ac110002',
  '1ca36306-c4c1-11ef-8e43-0242ac110002',
  '1ca36338-c4c1-11ef-8e43-0242ac110002',
  '1ca3636a-c4c1-11ef-8e43-0242ac110002',
  '1ca36392-c4c1-11ef-8e43-0242ac110002',
  '1ca363ba-c4c1-11ef-8e43-0242ac110002',
  '1ca363ec-c4c1-11ef-8e43-0242ac110002',
  '1ca36414-c4c1-11ef-8e43-0242ac110002',
  '1ca36446-c4c1-11ef-8e43-0242ac110002',
  '1ca36496-c4c1-11ef-8e43-0242ac110002',
  '1ca364c8-c4c1-11ef-8e43-0242ac110002',
  '1ca364f0-c4c1-11ef-8e43-

In [55]:
len(vectordb.get()['ids'])

342

In [57]:
vectordb.get()['embeddings']

In [None]:
import os
from utils.prepare_vectordb import PrepareVectorDB
from utils.load_config import LoadConfig
CONFIG = LoadConfig()


def upload_data_manually() -> None:
    """
    Uploads data manually to the VectorDB.

    This function initializes a PrepareVectorDB instance with configuration parameters
    such as data_directory, persist_directory, embedding_model_engine, chunk_size,
    and chunk_overlap. It then checks if the VectorDB already exists in the specified
    persist_directory. If not, it calls the prepare_and_save_vectordb method to
    create and save the VectorDB. If the VectorDB already exists, a message is printed
    indicating its presence.

    Returns:
        None
    """
    prepare_vectordb_instance = PrepareVectorDB(
        data_directory=CONFIG.data_directory,
        persist_directory=CONFIG.persist_directory,
        embedding_model_engine=CONFIG.embedding_model_engine,
        chunk_size=CONFIG.chunk_size,
        chunk_overlap=CONFIG.chunk_overlap,
    )
    if not len(os.listdir(CONFIG.persist_directory)) != 0:
        prepare_vectordb_instance.prepare_and_save_vectordb()
    else:
        print(f"VectorDB already exists in {CONFIG.persist_directory}")
    return None


# utils/chatbot.py

In [None]:
import gradio as gr
import time
from openai import OpenAI
import os
from langchain.vectorstores import Chroma
from typing import List, Tuple
import re
import ast
import html
from utils.load_config import LoadConfig

APPCFG = LoadConfig()
URL = "https://github.com/Farzad-R/LLM-Zero-to-Hundred/tree/master/RAG-GPT"
hyperlink = f"[RAG-GPT user guideline]({URL})"
client = OpenAI()

class ChatBot:
    """
    Class representing a chatbot with document retrieval and response generation capabilities.

    This class provides static methods for responding to user queries, handling feedback, and
    cleaning references from retrieved documents.
    """
    @staticmethod
    def respond(chatbot: List, message: str, data_type: str = "Preprocessed doc", temperature: float = 0.0) -> Tuple:
        """
        Generate a response to a user query using document retrieval and language model completion.

        Parameters:
            chatbot (List): List representing the chatbot's conversation history.
            message (str): The user's query.
            data_type (str): Type of data used for document retrieval ("Preprocessed doc" or "Upload doc: Process for RAG").
            temperature (float): Temperature parameter for language model completion.

        Returns:
            Tuple: A tuple containing an empty string, the updated chat history, and references from retrieved documents.
        """
        if data_type == "Preprocessed doc":
            # directories
            if os.path.exists(APPCFG.persist_directory):
                vectordb = Chroma(persist_directory=APPCFG.persist_directory,
                                  embedding_function=APPCFG.embedding_model)
            else:
                chatbot.append(
                    (message, f"VectorDB does not exist. Please first execute the 'upload_data_manually.py' module. For further information please visit {hyperlink}."))
                return "", chatbot, None

        elif data_type == "Upload doc: Process for RAG":
            if os.path.exists(APPCFG.custom_persist_directory):
                vectordb = Chroma(persist_directory=APPCFG.custom_persist_directory,
                                  embedding_function=APPCFG.embedding_model)
            else:
                chatbot.append(
                    (message, f"No file was uploaded. Please first upload your files using the 'upload' button."))
                return "", chatbot, None

        docs = vectordb.similarity_search(message, k=APPCFG.k)
        print(docs)
        question = "# User new question:\n" + message
        retrieved_content = ChatBot.clean_references(docs)
        # Memory: previous two Q&A pairs
        chat_history = f"Chat history:\n {str(chatbot[-APPCFG.number_of_q_a_pairs:])}\n\n"
        prompt = f"{chat_history}{retrieved_content}{question}"
        print("========================")
        print(prompt)
        response = client.chat.completions.create(
            model=APPCFG.llm_engine,
            messages=[
                {"role": "system", "content": APPCFG.llm_system_role},
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
            # stream=False
        )
        chatbot.append(
            (message, response.choices[0].message.content))
        time.sleep(2)

        return "", chatbot, retrieved_content

    @staticmethod
    def clean_references(documents: List) -> str:
        """
        Clean and format references from retrieved documents.

        Parameters:
            documents (List): List of retrieved documents.

        Returns:
            str: A string containing cleaned and formatted references.
        """
        server_url = "http://localhost:8000"
        documents = [str(x)+"\n\n" for x in documents]
        markdown_documents = ""
        counter = 1
        for doc in documents:
            # Extract content and metadata
            content, metadata = re.match(
                r"page_content=(.*?)( metadata=\{.*\})", doc).groups()
            metadata = metadata.split('=', 1)[1]
            metadata_dict = ast.literal_eval(metadata)

            # Decode newlines and other escape sequences
            content = bytes(content, "utf-8").decode("unicode_escape")

            # Replace escaped newlines with actual newlines
            content = re.sub(r'\\n', '\n', content)
            # Remove special tokens
            content = re.sub(r'\s*<EOS>\s*<pad>\s*', ' ', content)
            # Remove any remaining multiple spaces
            content = re.sub(r'\s+', ' ', content).strip()

            # Decode HTML entities
            content = html.unescape(content)

            # Replace incorrect unicode characters with correct ones
            content = content.encode('latin1').decode('utf-8', 'ignore')

            # Remove or replace special characters and mathematical symbols
            # This step may need to be customized based on the specific symbols in your documents
            content = re.sub(r'â', '-', content)
            content = re.sub(r'â', '∈', content)
            content = re.sub(r'Ã', '×', content)
            content = re.sub(r'ï¬', 'fi', content)
            content = re.sub(r'â', '∈', content)
            content = re.sub(r'Â·', '·', content)
            content = re.sub(r'ï¬', 'fl', content)

            pdf_url = f"{server_url}/{os.path.basename(metadata_dict['source'])}"

            # Append cleaned content to the markdown string with two newlines between documents
            markdown_documents += f"# Retrieved content {counter}:\n" + content + "\n\n" + \
                f"Source: {os.path.basename(metadata_dict['source'])}" + " | " +\
                f"Page number: {str(metadata_dict['page'])}" + " | " +\
                f"[View PDF]({pdf_url})" "\n\n"
            counter += 1

        return markdown_documents


In [14]:
import os

# Print the current working directory
print(os.getcwd())

# Verify the full path to "src/utils"
print(os.path.abspath("app/src/utils"))


/
/app/src/utils


In [16]:
import sys
import os

# Add the src directory to the Python path
sys.path.append(os.path.abspath("app/src"))

# Import from utils
from utils.load_config import LoadConfig


In [25]:

sys.path

['/usr/local/lib/python311.zip',
 '/usr/local/lib/python3.11',
 '/usr/local/lib/python3.11/lib-dynload',
 '',
 '/root/.local/lib/python3.11/site-packages',
 '/usr/local/lib/python3.11/site-packages',
 '/src',
 '/src/utils',
 '/src/utils',
 '/src',
 '/src',
 '/src',
 '/src',
 '/app/src',
 '/app/src',
 '/app/src',
 '/app/src',
 '/app',
 '/app']

In [None]:
import openai
import os
from dotenv import load_dotenv
import yaml
from langchain.embeddings.openai import OpenAIEmbeddings
from pyprojroot import here
import shutil
import os
os.chdir("app")
print(os.listdir())

with open(here("configs/app_config.yml")) as cfg:
    app_config = yaml.load(cfg, Loader=yaml.FullLoader)

In [44]:
def clean_references(documents: List) -> str:
    """
    Clean and format references from retrieved documents.

    Parameters:
        documents (List): List of retrieved documents.

    Returns:
        str: A string containing cleaned and formatted references.
    """
    server_url = "http://localhost:8000"
    documents = [str(x)+"\n\n" for x in documents]
    markdown_documents = ""
    counter = 1
    for doc in documents:
        # Extract content and metadata
        content, metadata = re.match(
            r"page_content=(.*?)( metadata=\{.*\})", doc).groups()
        metadata = metadata.split('=', 1)[1]
        metadata_dict = ast.literal_eval(metadata)

        # Decode newlines and other escape sequences
        content = bytes(content, "utf-8").decode("unicode_escape")

        # Replace escaped newlines with actual newlines
        content = re.sub(r'\\n', '\n', content)
        # Remove special tokens
        content = re.sub(r'\s*<EOS>\s*<pad>\s*', ' ', content)
        # Remove any remaining multiple spaces
        content = re.sub(r'\s+', ' ', content).strip()

        # Decode HTML entities
        content = html.unescape(content)

        # Replace incorrect unicode characters with correct ones
        content = content.encode('latin1').decode('utf-8', 'ignore')

        # Remove or replace special characters and mathematical symbols
        # This step may need to be customized based on the specific symbols in your documents
        content = re.sub(r'â', '-', content)
        content = re.sub(r'â', '∈', content)
        content = re.sub(r'Ã', '×', content)
        content = re.sub(r'ï¬', 'fi', content)
        content = re.sub(r'â', '∈', content)
        content = re.sub(r'Â·', '·', content)
        content = re.sub(r'ï¬', 'fl', content)

        pdf_url = f"{server_url}/{os.path.basename(metadata_dict['source'])}"

        # Append cleaned content to the markdown string with two newlines between documents
        markdown_documents += f"# Retrieved content {counter}:\n" + content + "\n\n" + \
            f"Source: {os.path.basename(metadata_dict['source'])}" + " | " +\
            f"Page number: {str(metadata_dict['page'])}" + " | " +\
            f"[View PDF]({pdf_url})" "\n\n"
        counter += 1

    return markdown_documents

In [45]:

import sys
import os

# Add the src directory to the Python path
sys.path.append(os.path.abspath("app"))

import gradio as gr
import time
from openai import OpenAI
import os
from langchain.vectorstores import Chroma
from typing import List, Tuple
import re
import ast
import html
from src.utils.load_config import LoadConfig

APPCFG = LoadConfig()
URL = "https://github.com/Farzad-R/LLM-Zero-to-Hundred/tree/master/RAG-GPT"
hyperlink = f"[RAG-GPT user guideline]({URL})"
client = OpenAI()

# def respond(chatbot: List, message: str, data_type: str = "Preprocessed doc", temperature: float = 0.0) -> Tuple:
chatbot = []
message = 'What is h1b visa?'
data_type = "Preprocessed doc"
temperature = 0

if data_type == "Preprocessed doc":
    # directories
    if os.path.exists(APPCFG.persist_directory):
        vectordb = Chroma(persist_directory=APPCFG.persist_directory,
                            embedding_function=APPCFG.embedding_model)
    else:
        chatbot.append(
            (message, f"VectorDB does not exist. Please first execute the 'upload_data_manually.py' module. For further information please visit {hyperlink}."))


elif data_type == "Upload doc: Process for RAG":
    if os.path.exists(APPCFG.custom_persist_directory):
        vectordb = Chroma(persist_directory=APPCFG.custom_persist_directory,
                            embedding_function=APPCFG.embedding_model)
    else:
        chatbot.append(
            (message, f"No file was uploaded. Please first upload your files using the 'upload' button."))


docs = vectordb.similarity_search(message, k=APPCFG.k)
print(docs)
question = "# User new question:\n" + message
retrieved_content = clean_references(docs)
# Memory: previous two Q&A pairs
chat_history = f"Chat history:\n {str(chatbot[-APPCFG.number_of_q_a_pairs:])}\n\n"
prompt = f"{chat_history}{retrieved_content}{question}"
print("========================")
print(prompt)
response = client.chat.completions.create(
    model=APPCFG.llm_engine,
    messages=[
        {"role": "system", "content": APPCFG.llm_system_role},
        {"role": "user", "content": prompt}
    ],
    temperature=temperature,
    # stream=False
)
chatbot.append(
    (message, response.choices[0].message.content))
time.sleep(2)

The directory '/app/data/vectordb/uploaded/chroma' does not exist.
[Document(page_content='– Allowed INS (now USCIS) to grant an extension of stay to H -1B nonimmigrant aliens who are the \nbeneficiaries of employment -based petitions under certain circumstances; \n– Modified the method of counting H -1B nonimmigrant aliens; \n– Provided that certain H -1B petitions that are revoked because of fraud or willful misrepresentation shall \nbe subtracted from the numerical count for the year in which the petition was revoked; \n· Public Law 106- 311: \n– Increased the additional filing fee for certain H -1B petitions to $1,000, with some exceptions; \n· Public Law 106- 396: \n– Amended section 214 of the Act to address whether an amended petition is required of an H -1B \npetitioner when the petitioner undergoes corporate restructuring. \nOn November 2, 2002, President Bush signed into law the Twenty -First Century Department of Justice \nAppropriations Authorization Act (21 st Century DOJ 

In [46]:
chatbot

[('What is h1b visa?',
  "The H-1B visa is a nonimmigrant classification for temporary workers in the United States. It is designated for individuals coming to perform services in a specialty occupation, which typically requires a bachelor's degree or equivalent. The H-1B category was created by redesignating the existing H-1 category in 1989. The visa allows U.S. employers to temporarily employ foreign workers in specialty occupations. Over the years, various laws have modified aspects of the H-1B visa, including numerical limitations, filing fees, and conditions under which extensions can be granted.")]

In [None]:
# def respond(chatbot: List, message: str, data_type: str = "Preprocessed doc", temperature: float = 0.0)

file_msg = upload_btn.upload(fn=UploadFile.process_uploaded_files, inputs=[
    upload_btn, chatbot, rag_with_dropdown], outputs=[input_txt, chatbot], queue=False)

txt_msg = input_txt.submit(fn=ChatBot.respond,
                            inputs=[chatbot, input_txt,
                                    rag_with_dropdown, temperature_bar],
                            outputs=[input_txt,
                                    chatbot, ref_output],
                            queue=False).then(lambda: gr.Textbox(interactive=True),
                                                None, [input_txt], queue=False)

txt_msg = text_submit_btn.click(fn=ChatBot.respond,
                                inputs=[chatbot, input_txt,
                                        rag_with_dropdown, temperature_bar],
                                outputs=[input_txt,
                                            chatbot, ref_output],
                                queue=False).then(lambda: gr.Textbox(interactive=True),
                                                    None, [input_txt], queue=False)

In [None]:
"""
    This module uses Gradio to create an interactive web application for a chatbot with various features.

    The application interface is organized into three rows:
    1. The first row contains a Chatbot component that simulates a conversation with a language model, along with a hidden
    reference bar initially. The reference bar can be toggled using a button. The chatbot supports feedback in the form
    of like and dislike icons.

    2. The second row consists of a Textbox for user input. Users can enter text or upload PDF/doc files.

    3. The third row includes buttons for submitting text, toggling the reference bar visibility, uploading PDF/doc files,
    adjusting temperature for GPT responses, selecting the document type, and clearing the input.

    The application processes user interactions:
    - Uploaded files trigger the processing of the files, updating the input and chatbot components.
    - Submitting text triggers the chatbot to respond, considering the selected document type and temperature settings.
    The response is displayed in the Textbox and Chatbot components, and the reference bar may be updated.

    The application can be run as a standalone script, launching the Gradio interface for users to interact with the chatbot.

    Note: The docstring provides an overview of the module's purpose and functionality, but detailed comments within the code
    explain specific components, interactions, and logic throughout the implementation.
"""
import gradio as gr
from utils.upload_file import UploadFile
from utils.chatbot import ChatBot
from utils.ui_settings import UISettings


with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("RAG-GPT"):
            ##############
            # First ROW:
            ##############
            with gr.Row() as row_one:
                with gr.Column(visible=False) as reference_bar:
                    ref_output = gr.Markdown()
                    # ref_output = gr.Textbox(
                    #     lines=22,
                    #     max_lines=22,
                    #     interactive=False,
                    #     type="text",
                    #     label="References",
                    #     show_copy_button=True
                    # )

                with gr.Column() as chatbot_output:
                    chatbot = gr.Chatbot(
                        [],
                        elem_id="chatbot",
                        bubble_full_width=False,
                        height=500,
                        avatar_images=(
                            ("images/AI_RT.png"), "images/openai_.png"),
                        # render=False
                    )
                    # **Adding like/dislike icons
                    chatbot.like(UISettings.feedback, None, None)
            ##############
            # SECOND ROW:
            ##############
            with gr.Row():
                input_txt = gr.Textbox(
                    lines=4,
                    scale=8,
                    placeholder="Enter text and press enter, or upload PDF files",
                    container=False,
                )

            ##############
            # Third ROW:
            ##############
            with gr.Row() as row_two:
                text_submit_btn = gr.Button(value="Submit text")
                sidebar_state = gr.State(False)
                btn_toggle_sidebar = gr.Button(
                    value="References")
                btn_toggle_sidebar.click(UISettings.toggle_sidebar, [sidebar_state], [
                    reference_bar, sidebar_state])
                upload_btn = gr.UploadButton(
                    "📁 Upload PDF or doc files", file_types=[
                        '.pdf',
                        '.doc'
                    ],
                    file_count="multiple")
                temperature_bar = gr.Slider(minimum=0, maximum=1, value=0, step=0.1,
                                            label="Temperature", info="Choose between 0 and 1")
                rag_with_dropdown = gr.Dropdown(
                    label="RAG with", choices=["Preprocessed doc", "Upload doc: Process for RAG", "Upload doc: Give Full summary"], value="Preprocessed doc")
                clear_button = gr.ClearButton([input_txt, chatbot])
            ##############
            # Process:
            ##############
            file_msg = upload_btn.upload(fn=UploadFile.process_uploaded_files, inputs=[
                upload_btn, chatbot, rag_with_dropdown], outputs=[input_txt, chatbot], queue=False)

            txt_msg = input_txt.submit(fn=ChatBot.respond,
                                       inputs=[chatbot, input_txt,
                                               rag_with_dropdown, temperature_bar],
                                       outputs=[input_txt,
                                                chatbot, ref_output],
                                       queue=False).then(lambda: gr.Textbox(interactive=True),
                                                         None, [input_txt], queue=False)

            txt_msg = text_submit_btn.click(fn=ChatBot.respond,
                                            inputs=[chatbot, input_txt,
                                                    rag_with_dropdown, temperature_bar],
                                            outputs=[input_txt,
                                                     chatbot, ref_output],
                                            queue=False).then(lambda: gr.Textbox(interactive=True),
                                                              None, [input_txt], queue=False)


if __name__ == "__main__":
    demo.launch()
