## Create a vector database on ChromaDB.
###### Embed the dataset into the vector database on which the query is to be done. This is a local chromaDB instance

#### Read CSV and load dataframe

In [1]:
import pandas as pd

diseaseDB = pd.read_csv("data\medicalInformation.csv")

In [2]:
diseaseDB.head()

Unnamed: 0.1,Unnamed: 0,source,url,disease,text
0,1,webmd,https://www.webmd.com/covid/coronavirus,Coronavirus,Editor's note: For the latest updates on the 2...
1,2,webmd,https://www.webmd.com/cold-and-flu/adult-flu-s...,Influenza,"Common Flu SymptomsUnlike cold symptoms, signs..."
2,3,webmd,https://www.webmd.com/cold-and-flu/common_cold...,Common Cold,"OverviewSneezing, scratchy throat, runny nose ..."
3,4,webmd,https://www.webmd.com/allergies/allergy-basics,Common Cold,What Is an Allergy?It's what happens when your...
4,5,webmd,https://www.webmd.com/lung/rsv-in-babies,Respiratory Syncytial Virus,What Is Respiratory Syncytial Virus (RSV)?Resp...


In [3]:
diseaseDB.drop(diseaseDB.columns[0], axis=1,inplace=True)

diseaseDB.sample(5)

Unnamed: 0,source,url,disease,text
7,webmd,https://www.webmd.com/lung/lung-what-is-sars,Severe acute respiratory syndrome,"Severe acute respiratory syndrome, or SARS, is..."
3,webmd,https://www.webmd.com/allergies/allergy-basics,Common Cold,What Is an Allergy?It's what happens when your...
10,National Health Service,https://www.nhs.uk/conditions/covid-19/how-to-...,Coronavirus,How COVID-19 is spread COVID-19 spreads very e...
15,National Health Service,https://www.nhs.uk/conditions/common-cold/,Common Cold,You can often treat a cold without seeing a GP...
0,webmd,https://www.webmd.com/covid/coronavirus,Coronavirus,Editor's note: For the latest updates on the 2...


In [4]:
# Rearrange serially
diseaseDB.index += 1

diseaseDB.head()

Unnamed: 0,source,url,disease,text
1,webmd,https://www.webmd.com/covid/coronavirus,Coronavirus,Editor's note: For the latest updates on the 2...
2,webmd,https://www.webmd.com/cold-and-flu/adult-flu-s...,Influenza,"Common Flu SymptomsUnlike cold symptoms, signs..."
3,webmd,https://www.webmd.com/cold-and-flu/common_cold...,Common Cold,"OverviewSneezing, scratchy throat, runny nose ..."
4,webmd,https://www.webmd.com/allergies/allergy-basics,Common Cold,What Is an Allergy?It's what happens when your...
5,webmd,https://www.webmd.com/lung/rsv-in-babies,Respiratory Syncytial Virus,What Is Respiratory Syncytial Virus (RSV)?Resp...


In [5]:
diseaseDB.shape

(32, 4)

#### Load data from CSV to ChromaDB

In [6]:
from langchain.schema import Document

# Function to create metadata of the vector database
def create_docs(docs, row):
    """Function to create the document and add metadata to load into ChromaDB"""
    document = Document(
        page_content=row['text'],
        metadata={
            "source":row['source'],
            "url":row['url'],
            "disease":row['disease'],
            "rating":5
        }
    )
    docs.append(document)

In [7]:
# Iterate over medical dataframe
docs = []
for i in diseaseDB.index:
    create_docs(docs, diseaseDB.loc[i])

In [8]:
len(docs)

32

#### Define embeddings

In [9]:
# Use LangChain's embedding model
from langchain.embeddings import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


#### Chunk each document - query on smaller chunks - time taken is less

In [10]:
# Use LangChain's text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 10
)

In [11]:
len(docs)

32

In [12]:
# Test chunking
test_document = docs[0]

len(test_document.page_content)

28957

In [13]:
test_text = text_splitter.split_documents([test_document])

# Check chunk
test_text

[Document(page_content="Editor's note: For the latest updates on the 2023Â\xa0coronavirus outbreak, see our news coverage.What Is COVID-19?COVID-19Â\xa0is a disease caused by SARS-CoV-2 that can trigger what doctors call a", metadata={'source': 'webmd', 'url': 'https://www.webmd.com/covid/coronavirus', 'disease': 'Coronavirus', 'rating': 5}),
 Document(page_content='call a respiratory tract infection. In early 2020, Â\xa0the World Health Organization identified SARS-CoV-2 as a new type ofÂ\xa0coronavirus. The outbreak quickly spread around the world. Â\xa0It can affect', metadata={'source': 'webmd', 'url': 'https://www.webmd.com/covid/coronavirus', 'disease': 'Coronavirus', 'rating': 5}),
 Document(page_content='affect your upper respiratory tract (sinuses, nose, and throat) or lower respiratory tract (windpipe andÂ\xa0lungs).It spreads the same way other coronaviruses do, mainly through person-to-person', metadata={'source': 'webmd', 'url': 'https://www.webmd.com/covid/coronavirus', '

In [14]:
texts = text_splitter.split_documents(docs)

len(texts)

633

#### Create vector database on ChromaDB

In [15]:
# Define ids
ids = [str(i) for i in range(1, len(texts)+1)]

In [16]:
print(ids)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '158', '

In [17]:
# Add to chroma
from langchain.vectorstores import Chroma

vectorDb = Chroma.from_documents(documents=texts, embedding=embeddings,
                                 ids =ids,
                                 persist_directory="./medical_chroma_db",
                                 collection_name="MedicalInformation")

In [18]:
# Save the vector database
vectorDb.persist()
vectorDb = None

In [19]:
# Save the vector database to a particular directory
vectorDb = Chroma(persist_directory='./medical_chroma_db',
                  embedding_function=embeddings,
                  collection_name="MedicalInformation")

In [20]:
# Test to fetch a particular datapoint
document = vectorDb.get(ids=['151'])

document

{'ids': ['151'],
 'embeddings': None,
 'metadatas': [{'disease': 'Coronavirus',
   'rating': 5,
   'source': 'webmd',
   'url': 'https://www.webmd.com/covid/coronavirus'}],
 'documents': ['died from MERS, which first appeared in Saudi Arabia and then in other countries in the Middle East, Africa, Asia, and Europe. In April 2014, the first American was hospitalized for MERS in Indiana,']}

In [21]:
# Create a doc using schema to update a datapoint in the vector database

doc = Document(
    ids=document['ids'][0],
    page_content=document['documents'][0],
    metadata={
        'source':document['metadatas'][0]['source'],
        'url':document['metadatas'][0]['url'],
        'disease':document['metadatas'][0]['disease'],
        'rating':document['metadatas'][0]['rating'],
    }
)

# Check the doc created
doc

Document(page_content='died from MERS, which first appeared in Saudi Arabia and then in other countries in the Middle East, Africa, Asia, and Europe. In April 2014, the first American was hospitalized for MERS in Indiana,', metadata={'source': 'webmd', 'url': 'https://www.webmd.com/covid/coronavirus', 'disease': 'Coronavirus', 'rating': 5})

In [22]:
# Upadate vector database with the new doc created
vectorDb.update_document('151', doc)

In [23]:
# Fetch updated doc
vectorDb.get(ids=['151'])

{'ids': ['151'],
 'embeddings': None,
 'metadatas': [{'disease': 'Coronavirus',
   'rating': 5,
   'source': 'webmd',
   'url': 'https://www.webmd.com/covid/coronavirus'}],
 'documents': ['died from MERS, which first appeared in Saudi Arabia and then in other countries in the Middle East, Africa, Asia, and Europe. In April 2014, the first American was hospitalized for MERS in Indiana,']}

#### Check if information retrieval is working fine using OpenAI

In [24]:
# Set up OpenAI
from langchain.llms import OpenAI
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

llm = OpenAI()

In [25]:
# Set retriever
retriever = vectorDb.as_retriever()

In [26]:
# Define sources to search in
sources =['CDC','webmd', 'National Health Service']

# Initialise filters list
or_filters = []

# Create temporary dictionary of sources
for source in sources:
    temp_dict = {
        'source':source
    }
    or_filters.append(temp_dict)

or_filters

[{'source': 'CDC'}, {'source': 'webmd'}, {'source': 'National Health Service'}]

In [27]:
# Define search kwargs (for metadata filtering)
search_kwargs = {
    'k':len(sources),
    'filter':{
        "$or": or_filters
    }
}

In [28]:
# Set up retrieval chain
from langchain.chains import RetrievalQA

retrievalqaChain =RetrievalQA.from_chain_type(llm = llm, chain_type="stuff",
                  retriever=vectorDb.as_retriever(search_kwargs=search_kwargs))

In [29]:
# Fetch information
retrievalqaChain({"query":"I have symptoms like coronavirus, what should I do?"})

{'query': 'I have symptoms like coronavirus, what should I do?',
 'result': " If you are experiencing any of the symptoms of coronavirus (runny nose, coughing, sore throat, etc.), consider getting tested for COVID-19. If you have already tested positive for COVID-19, learn more about CDC's isolation recommendations."}

In [30]:
from langchain.chains import ConversationalRetrievalChain

In [31]:
qa = ConversationalRetrievalChain.from_llm(llm =llm, retriever=vectorDb.as_retriever(), return_source_documents=True)

In [32]:
chat_history=[]
query="I have symptoms like coronavirus, what should I do?"
result = qa({"question": query, "chat_history": chat_history})

In [33]:
result['answer']

' If you are experiencing any of the symptoms associated with coronavirus, consider getting tested for COVID-19. Depending on the results of the tests and how sick you are, your doctor may recommend medications to treat the virus, reduce an overactive immune response, or treat COVID-19 complications.'

In [34]:
result['source_documents']

[Document(page_content='COVID-19 SYMPTOM CHECKER\n\t\t\t\t\t\t\t\t\t\t\t\n\n\n\nFind Out if You Have Symptoms of Coronavirus (COVID-19)See what to do about your symptoms and whether to call a doctor', metadata={'disease': 'Coronavirus', 'rating': 5, 'source': 'webmd', 'url': 'https://www.webmd.com/covid/coronavirus'}),
 Document(page_content='come down with a coronavirus infection at any time.The symptoms of most coronaviruses are similar to any other upper respiratory infection, including a runny nose, coughing, sore throat, and', metadata={'disease': 'Coronavirus', 'rating': 5, 'source': 'webmd', 'url': 'https://www.webmd.com/covid/coronavirus'}),
 Document(page_content='Feeling Sick? If you are experiencing any of these symptoms, consider the following options: Get tested for COVID-19 If you have already tested positive for COVID-19, learn more about CDC’s isolation', metadata={'disease': 'Coronavirus', 'rating': 5, 'source': 'CDC', 'url': 'https://www.cdc.gov/coronavirus/2019-ncov/