# Loading and embedding additional knowledge into a vector store

In [9]:
# import needed libraries
from pathlib import Path

from dotenv import dotenv_values
from langchain.document_loaders import DirectoryLoader, PyPDFDirectoryLoader, UnstructuredFileLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [10]:
def getOpenAIKey(path=""):
    """Gets OpenAI API key from .env file.

    Args:
        path: Optional path to a .env file. When given (non-empty), only that
            file is consulted and the default locations are ignored.

    Returns:
        The value of ``OPENAI_API_KEY``. Without an explicit ``path``, the
        default locations are tried in order and the first non-empty key found
        is returned; ``None`` is returned when none of them provides one.

    Raises:
        KeyError: If an explicit ``path`` is given but the parsed file does not
            define ``OPENAI_API_KEY``.
    """
    # Machine-specific fallback locations, tried in order.
    paths = ["/Users/samisaf/openai.env", "C:/Users/samis/openai.env", "C:/Users/samisaf/openai.env"]
    if path:
        return dotenv_values(path)['OPENAI_API_KEY']
    for p in paths:
        # Parse each candidate once, and skip files that have entries but no
        # usable key instead of raising KeyError on the first such file.
        key = dotenv_values(p).get('OPENAI_API_KEY')
        if key:
            return key
    return None

In [15]:
def loadDirectorySplitDocs(path: str, chunk_size=400, chunk_overlap=50, loader_cls=UnstructuredFileLoader):
    """Loads files in a directory, then splits them.

    Args:
        path: Directory handed to ``DirectoryLoader``.
        chunk_size: Chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks, passed to the
            text splitter.
        loader_cls: Loader class ``DirectoryLoader`` uses for each file.

    Returns:
        The split documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    documents = DirectoryLoader(path=path, loader_cls=loader_cls).load()
    # Only the recursive splitter ever produced the output; the tiktoken-based
    # CharacterTextSplitter that used to be built here was never used.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n"],
    )
    splitDocs = splitter.split_documents(documents)
    print(f"loaded {path}. Got {len(documents)} documents. Splitted into {len(splitDocs)} parts.")
    return splitDocs

In [16]:
# Load the Markdown documents from the data directory, then split them into chunks.
path = './data-kpmp-oct-23'
docs = loadDirectorySplitDocs(path, loader_cls=UnstructuredMarkdownLoader)
len(docs)

loaded ./data-kpmp-oct-23. Got 2 documents. Splitted into 1128 parts.


1128

In [17]:
# create vector dataset
# then embed the data into a vector store
# last store it locally through chromadb
dbpath = "./db-kpmp-oct-23"
embedding = OpenAIEmbeddings(openai_api_key=getOpenAIKey())
# Re-running Chroma.from_documents against an existing persist directory embeds
# and stores the same chunks again (duplicate hits in similarity search, plus
# repeated embedding API calls). Reuse the store when it already has content;
# delete the directory to force a rebuild.
if Path(dbpath).is_dir() and any(Path(dbpath).iterdir()):
    db = Chroma(persist_directory=dbpath, embedding_function=embedding)
    print(f"reusing existing database at {dbpath}")
else:
    db = Chroma.from_documents(docs, embedding, persist_directory=dbpath)
    db.persist()
    print(f"created database at {dbpath}")

created database at ./db-kpmp-oct-23


# Loading the vector store and performing a similarity search

In [18]:
# Re-open the persisted Chroma store with an OpenAI embedding function,
# then fetch the chunks most similar to a sample query.
embedding = OpenAIEmbeddings(openai_api_key=getOpenAIKey())
readdb = Chroma(embedding_function=embedding, persist_directory='./db-kpmp-oct-23')
res = readdb.similarity_search(query="hemoglobin drop")
print(res)
len(res)

[Document(page_content='a. Note that an Adverse Event should be reported for a hemoglobin drop of more than 1 gram/dL.', metadata={'source': 'data-kpmp-oct-23/Recruitment-Site-MOP.md'}), Document(page_content='a. Note that an Adverse Event should be reported for a hemoglobin drop of more than 1 gram/dL.', metadata={'source': 'data-kpmp-oct-23/Recruitment-Site-MOP.md'}), Document(page_content='\nHemoglobin concentration is serially measured for all participants at 4 hours and 8 hours post biopsy and then daily. A fall in hemoglobin of >2 grams/dL - OR - >1 gram/dL to less than 9 grams/dL requires a repeat measurement if attributable to bleeding from the biopsy site', metadata={'source': 'data-kpmp-oct-23/Recruitment-Site-MOP.md'}), Document(page_content='If admitted, participants must have a stable hemoglobin status and hemodynamics (less than 1g/dL drop) prior to discharge.', metadata={'source': 'data-kpmp-oct-23/Recruitment-Site-MOP.md'})]


4