In [1]:
import chromadb
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.llms import LlamaCpp
chroma_client = chromadb.Client()

In [2]:
collection = chroma_client.create_collection(name="my_collection")

In [3]:
collection.add(
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges"
    ],
    ids=["id1", "id2"]
)


In [4]:
results = collection.query(
    query_texts=["This is a query document about hawaii"], 
    n_results=2 
)
print(results)


{'ids': [['id1', 'id2']], 'distances': [[1.0404009819030762, 1.2430799007415771]], 'metadatas': [[None, None]], 'embeddings': None, 'documents': [['This is a document about pineapple', 'This is a document about oranges']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [5]:
def load_docs(directory):
    loader = DirectoryLoader(directory)
    documents = loader.load()
    
    return documents

med_documents = load_docs("data")  # call the load_docs function
print(len(med_documents))  # check the number of files in the documents folder

  from .autonotebook import tqdm as notebook_tqdm


1


In [6]:
embeddings = SentenceTransformerEmbeddings(model_name="NeuML/pubmedbert-base-embeddings")

print(embeddings)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=70)

texts = text_splitter.split_documents(med_documents)

print(texts)

collection = chroma_client.get_or_create_collection(name="med_collection")

collection.upsert(
    documents=texts[1].page_content,
    ids=["id1"]
)
results = collection.query(
    query_texts=["HIV"], 
    n_results=2 
)
print(results)

  warn_deprecated(


client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
) model_name='NeuML/pubmedbert-base-embeddings' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False
[Document(page_content='By the end of 1998, the number of people living with HIV is estimated to have grown to 33.4 million, according to estimates from UNAIDS and WHO. Most of these people do not know that they are infected. The epidemic has not been overcome anywhere. Virtually every country in the world saw new infections in 1998 and the epidemic is frankly out of control in many places. More than 95 percent of all HIV-infected people now live in the d

Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1


{'ids': [['id1']], 'distances': [[0.9814468026161194]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['years. The multiple repercussions of these deaths are reaching crisis level in some parts of the world. Whether measured against the yardstick of deteriorating child survival, crumbling life expectancy, overburdened health care systems, increasing orphanhood, or bottom-line losses to business, AIDS has never posed a bigger threat to development. According to UNAIDS/WHO estimates, 11 men, women and children around the world were infected per minute during 1998—close to 6 million people in all. One-tenth of newly-infected people were under age 15, which brings the number of children now alive with HIV to 1.2 million. Most of them are thought to have acquired their infection from their mother']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [7]:
results = collection.query(
    query_texts=["AIDS"], 
    n_results=2 
)
print(results)

Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1


{'ids': [['id1']], 'distances': [[0.987060546875]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['years. The multiple repercussions of these deaths are reaching crisis level in some parts of the world. Whether measured against the yardstick of deteriorating child survival, crumbling life expectancy, overburdened health care systems, increasing orphanhood, or bottom-line losses to business, AIDS has never posed a bigger threat to development. According to UNAIDS/WHO estimates, 11 men, women and children around the world were infected per minute during 1998—close to 6 million people in all. One-tenth of newly-infected people were under age 15, which brings the number of children now alive with HIV to 1.2 million. Most of them are thought to have acquired their infection from their mother']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [8]:
local_llm = "biomistral-7b-q4_k_m.gguf"
db = Chroma(persist_directory="./chromadb", embedding_function=embeddings, collection_name="med_collection")

llm = LlamaCpp(
    model_path= local_llm,
    temperature=0.3,
    max_tokens=1024,
    top_p=1,
    
)
prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer. Answer must be detailed and well explained.
Helpful answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
chain_type_kwargs = {"prompt": prompt}
retriever = db.as_retriever(search_kwargs={"k":1})
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True, chain_type_kwargs=chain_type_kwargs, verbose=True)
response = qa("What is AIDS")

llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from biomistral-7b-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = BioMistral-7B
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 32768
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:              llama.attention.head_count_kv u32    



[1m> Entering new RetrievalQA chain...[0m



llama_print_timings:        load time =    9887.32 ms
llama_print_timings:      sample time =     130.93 ms /    80 runs   (    1.64 ms per token,   611.03 tokens per second)
llama_print_timings: prompt eval time =  337271.26 ms /   400 tokens (  843.18 ms per token,     1.19 tokens per second)
llama_print_timings:        eval time =  521576.04 ms /    80 runs   ( 6519.70 ms per token,     0.15 tokens per second)
llama_print_timings:       total time =  864090.11 ms /   480 tokens



[1m> Finished chain.[0m


In [9]:
print(response)

{'query': 'What is AIDS', 'result': 'AIDS is a disease caused by a virus called HIV (human immunodeficiency virus). HIV attacks a person’s immune system and slowly destroys their ability to fight infections. A person infected with HIV may look and feel healthy for several years before developing AIDS. AIDS is not curable and can be fatal within a few months or a few years after symptoms appear.', 'source_documents': [Document(page_content='By the end of 1998, the number of people living with HIV is estimated to have grown to 33.4 million, according to estimates from UNAIDS and WHO. Most of these people do not know that they are infected. The epidemic has not been overcome anywhere. Virtually every country in the world saw new infections in 1998 and the epidemic is frankly out of control in many places. More than 95 percent of all HIV-infected people now live in the developing world, which has experienced 95 percent of all deaths to date from AIDS. These deaths are largely among young a

In [10]:
response = qa("What is the death rate due to AIDS")
print(response)



[1m> Entering new RetrievalQA chain...[0m


Llama.generate: prefix-match hit

llama_print_timings:        load time =    9887.32 ms
llama_print_timings:      sample time =     194.97 ms /   106 runs   (    1.84 ms per token,   543.67 tokens per second)
llama_print_timings: prompt eval time =   32120.12 ms /    29 tokens ( 1107.59 ms per token,     0.90 tokens per second)
llama_print_timings:        eval time =  704357.09 ms /   105 runs   ( 6708.16 ms per token,     0.15 tokens per second)
llama_print_timings:       total time =  743708.02 ms /   134 tokens



[1m> Finished chain.[0m
{'query': 'What is the death rate due to AIDS', 'result': 'The death rate due to AIDS is difficult to estimate as it depends on many factors such as access to health care services, access to antiretroviral treatment and other medical interventions, and the stage of the epidemic in a particular region. However, according to UNAIDS estimates, approximately 20 million people have died from AIDS since the beginning of the epidemic. In 2017 alone, it is estimated that approximately 770,000 people died from AIDS-', 'source_documents': [Document(page_content='By the end of 1998, the number of people living with HIV is estimated to have grown to 33.4 million, according to estimates from UNAIDS and WHO. Most of these people do not know that they are infected. The epidemic has not been overcome anywhere. Virtually every country in the world saw new infections in 1998 and the epidemic is frankly out of control in many places. More than 95 percent of all HIV-infected peo

In [11]:
response = qa("Number of people having AIDS")
print(response)



[1m> Entering new RetrievalQA chain...[0m


Llama.generate: prefix-match hit

llama_print_timings:        load time =    9887.32 ms
llama_print_timings:      sample time =      64.91 ms /   109 runs   (    0.60 ms per token,  1679.14 tokens per second)
llama_print_timings: prompt eval time =   30575.08 ms /    28 tokens ( 1091.97 ms per token,     0.92 tokens per second)
llama_print_timings:        eval time =  135377.33 ms /   108 runs   ( 1253.49 ms per token,     0.80 tokens per second)
llama_print_timings:       total time =  170505.73 ms /   136 tokens



[1m> Finished chain.[0m
{'query': 'Number of people having AIDS', 'result': 'The estimated number of people living with AIDS by the end of 1998 was 33.4 million, according to estimates from UNAIDS and WHO. Most of these people do not know that they are infected. The epidemic has not been overcome anywhere. Virtually every country in the world saw new infections in 1998 and the epidemic is frankly out of control in many places. More than 95 percent of all HIV-infected people now live in the developing', 'source_documents': [Document(page_content='By the end of 1998, the number of people living with HIV is estimated to have grown to 33.4 million, according to estimates from UNAIDS and WHO. Most of these people do not know that they are infected. The epidemic has not been overcome anywhere. Virtually every country in the world saw new infections in 1998 and the epidemic is frankly out of control in many places. More than 95 percent of all HIV-infected people now live in the developing 

In [12]:
#chroma_client.delete_collection("med_collection")

In [13]:
print(chroma_client.count_collections())

2
