In [1]:
from llm import llm_davinci, llm_azure_gpt35, llm_Vicuna
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
import os
import uuid
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import UnstructuredPowerPointLoader
from langchain.document_loaders import UnstructuredExcelLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from email_class import email

  from tqdm.autonotebook import tqdm


In [None]:
class DB:
    """Document knowledge base: a persisted Chroma vector store plus a RetrievalQA chain.

    Files are loaded with a loader picked from the file extension and added to
    the Chroma collection stored under ``persist_directory``.  ``doc_id_map``
    is an in-memory ``{parent_id: file name}`` record of the files uploaded
    through this instance.
    """

    def __init__(self, persist_directory, device="cuda"):
        """Open (or create) the vector store and build the QA chain.

        Args:
            persist_directory: Directory handed to Chroma as its persist
                location; it is created if it does not exist.
            device: Device forwarded to the embedding model through
                ``model_kwargs``.  Defaults to ``"cuda"``, the value that was
                previously hard-coded.
        """
        self.persist_directory = persist_directory
        os.makedirs(self.persist_directory, exist_ok=True)

        self.embedding = HuggingFaceInstructEmbeddings(
            model_name="hkunlp/instructor-xl", model_kwargs={"device": device})
        self.db = Chroma(persist_directory=self.persist_directory,
                         embedding_function=self.embedding)

        # QA chain.  Alternatives imported at the top of the notebook:
        # llm_davinci, llm_azure_gpt35.
        self.llm = llm_Vicuna
        self.retriever = self.db.as_retriever()
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm, chain_type="stuff", retriever=self.retriever, return_source_documents=True)
        self.doc_id_map = {}

    def _make_loader(self, file_path):
        """Return a loader for ``file_path`` based on its extension, or ``None`` if unsupported."""
        # splitext + lower() so "REPORT.PDF" is treated like "report.pdf" and a
        # dot inside a directory name is never mistaken for an extension.
        extension = os.path.splitext(file_path)[1].lower()
        if extension == ".txt":
            return TextLoader(file_path)
        if extension == ".pdf":
            return PyPDFLoader(file_path)
        if extension == ".docx":
            return UnstructuredWordDocumentLoader(file_path)
        if extension == ".csv":
            return CSVLoader(file_path)
        if extension == ".pptx":
            return UnstructuredPowerPointLoader(file_path)
        if extension == ".xlsx":
            # The original compared the whole path to 'xlsx', so this branch
            # could never be taken and Excel files were rejected.
            return UnstructuredExcelLoader(file_path)
        return None

    def upload(self, file_path):
        """Load a file, add its documents to the vector store and register it.

        Args:
            file_path: Path of the file to ingest.  Supported extensions
                (case-insensitive): txt, pdf, docx, csv, pptx, xlsx.

        Returns:
            ``{"message": "File uploaded successfully"}`` on success, or the
            string ``"File type not supported"`` for any other extension.
        """
        loader = self._make_loader(file_path)
        if loader is None:
            return "File type not supported"

        parent_id = str(uuid.uuid4())
        documents = list(loader.load())
        for doc in documents:
            child_ids = self.db.add_documents([doc])
            # NOTE(review): assigned after add_documents, so this presumably
            # only changes the in-memory document object - confirm whether the
            # stored record is expected to carry these ids.
            doc.metadata["id"] = child_ids

        # Register the file once (not once per document); as before, a file
        # that yields no documents is not registered.
        if documents:
            self.doc_id_map[parent_id] = os.path.basename(file_path)
        return {"message": "File uploaded successfully"}

    def delete(self, id):
        """Remove a file's entry from ``doc_id_map`` and return a confirmation message.

        Only the in-memory map is modified here; this method does not call
        into ``self.db``.

        Raises:
            KeyError: If ``id`` is not present in ``doc_id_map``.
        """
        file_name = self.doc_id_map.pop(id)
        return f"The file of id {id}, file name of {file_name} is deleted successfully"

    def search(self, query):
        """Run the QA chain on ``query`` and format the answer as text.

        Returns:
            ``"Answer: <result>"``; when the chain returned source documents,
            the ``source`` metadata of the first one is appended.
        """
        result = self.qa_chain(query)
        response = "Answer: " + result["result"]
        if result["source_documents"]:
            response += "\nAssistant, please also give back the source reference to the user:\nSources:"
            response += "\n" + \
                str(result["source_documents"][0].metadata["source"])
        return response

    def list_documents(self):
        """Return one ``File name: <name>, id: <id>`` line per registered file (empty string if none)."""
        return "".join(
            f"File name: {info}, id: {parent_id}\n"
            for parent_id, info in self.doc_id_map.items()
        )


# Notebook-wide knowledge-base instance; its data lives in ./database.
database = DB(persist_directory="database")

In [2]:
# Shared holder for the active user's email; the MongoDB helpers in this
# notebook look up records with e.get_email().
e = email()
e.set_email("panda@gmail.com")

In [3]:
from pymongo import MongoClient

# One MongoClient per connection string, reused across calls so each helper
# invocation does not open (and leak) a fresh client.
_MONGO_CLIENTS = {}


def get_mongo_collection():
    """Return the ``chroma_history`` collection of the ``langchain`` database.

    The connection string is read from the ``MONGODB_URI`` environment
    variable so that credentials are never stored in the notebook.

    Raises:
        RuntimeError: If ``MONGODB_URI`` is unset or empty.
    """
    uri = os.environ.get("MONGODB_URI")
    if not uri:
        raise RuntimeError(
            "MONGODB_URI is not set; export the MongoDB connection string "
            "before calling get_mongo_collection()."
        )
    client = _MONGO_CLIENTS.get(uri)
    if client is None:
        client = _MONGO_CLIENTS[uri] = MongoClient(uri)
    return client["langchain"]["chroma_history"]

In [14]:
def get_id_map():
    """Return the active user's ``{id: file name}`` map.

    The user is identified by ``e.get_email()``.  An empty dict is returned
    when the user has no record, and also when the record exists but has no
    ``id_map`` field (previously a ``KeyError``).
    """
    record = get_mongo_collection().find_one({"email": e.get_email()})
    if not record:
        return {}
    return record.get("id_map") or {}

In [None]:
# e.set_email("kenny@fake.com")
# Switch the active user; get_id_map() queries by whatever e.get_email() returns.
e.set_email("testing")

In [None]:
# Fetch the active user's {id: file name} map.  The helper defined in this
# notebook is get_id_map; get_doc_id_map does not exist and raised NameError.
d = get_id_map()

In [None]:
# Display the map fetched into `d`.
d


In [None]:
# Show which email is currently active.
e.get_email()

## Insert or update an entry in the id map

In [8]:
# def insert_id_map(file_name, file_id):
#     # Replace the placeholder values with your actual MongoDB connection details
#     client = MongoClient("mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority")
#     db = client["langchain"]
#     collection = db["chroma_history"]
#     email = e.get_email()
#     existing_user = collection.find_one({"email": email}) # check if user exists
#     if not existing_user:
#         collection.insert_one({
#         "email": email,
#         "vector": []  # assuming vector is an empty list for the new user, modify as needed
#     })
#     # Add new element to vector array
#     # collection.update_one({"email": email}, {"$push": {"vector": {"file_name": file_name, "file_id": file_id}}})
#     collection.update_one({"email": email}, {"$push": {"vector": {file_id: file_name}}})

def insert_into_id_map(id, file):
    """Store ``id -> file`` in the active user's ``id_map``, creating the record if needed.

    Args:
        id: Key under ``id_map``.  It is interpolated into a dotted update
            path, so it must not contain ``.`` (UUID strings are fine).
        file: Value stored for that key (the file name).
    """
    user_email = e.get_email()
    # A single upsert replaces the previous find-then-insert/update sequence:
    # it is one round trip and cannot race with a concurrent insert for the
    # same email.  On insert MongoDB builds {"email": ..., "id_map": {id: file}}.
    get_mongo_collection().update_one(
        {"email": user_email},
        {"$set": {f"id_map.{id}": file}},
        upsert=True,
    )


In [30]:
# Switch the active user to "testing" and echo it back to confirm.
e.set_email("testing")
e.get_email()

'testing'

In [21]:
import uuid

# Fresh random UUID string, used as the key for the next id-map insert.
parent_id = f"{uuid.uuid4()}"

In [31]:
# Record parent_id -> "test3.txt" in the active user's id_map.
insert_into_id_map(parent_id, "test3.txt")

In [32]:
# Read the map back to check the insert.
get_id_map()

{'f01803bb-a13e-4928-aab4-c4ed50429c71': 'test3.txt'}

## Delete an entry from the id map

In [24]:
def delete_from_id_map(id):
    """Drop the ``id`` entry from the ``id_map`` of the currently active email."""
    history = get_mongo_collection()
    selector = {"email": e.get_email()}
    # $unset ignores the supplied value; "" is the conventional placeholder.
    removal = {"$unset": {f"id_map.{id}": ""}}
    history.update_one(selector, removal)


In [25]:
# Confirm which user the following delete will act on.
e.get_email()

'panda@gmail.com'

In [26]:
# Id map before the deletion.
get_id_map()

{'123': 'test.txt',
 '456': 'test2.txt',
 'f01803bb-a13e-4928-aab4-c4ed50429c71': 'test3.txt'}

In [28]:
# Remove the '456' entry from the active user's id_map.
delete_from_id_map('456')

In [29]:
# Id map after the deletion; '456' should no longer be listed.
get_id_map()

{'123': 'test.txt', 'f01803bb-a13e-4928-aab4-c4ed50429c71': 'test3.txt'}