In [2]:
import pinecone # vector database designed to store, search, and manage large amounts of vector embeddings 
from langchain import PromptTemplate # It provides integrations with models and tools for chaining different components together to create applications such as question-answering systems, chatbots, and document processing pipelines.
                                     # define structured prompts for language models
from langchain.chains import RetrievalQA # It's used to build question-answering systems 
from langchain.embeddings import HuggingFaceBgeEmbeddings #  convert documents or queries into embeddings, which can then be stored in a vector database like Pinecone or used for similarity search.
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader #  can use it to load all files in a folder for document-based tasks
from langchain.text_splitter import  RecursiveCharacterTextSplitter      #for intercorpus to  chunks , to create chunks, breaks down long documents into smaller chunks of text
from langchain.llms import CTransformers


from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [3]:
#function for extracting the data
#This function loads all PDF files from the specified directory and prints a preview of the content of the loaded documents
#Returns a list of loaded documents.

def load_data(Data):
    loader = DirectoryLoader(Data, glob="*.pdf", loader_cls=PyPDFLoader)
    doc = loader.load()
    
    print(f"Number of documents loaded: {len(doc)}")
    
    # If documents are loaded, print a preview of the content
    if len(doc) > 0:
        for i, d in enumerate(doc):
            print(f"Document {i} content (first 500 chars): {d.page_content[:500]}...")
    else:
        print("No documents were loaded. Please check the file path and file format.")
    
    return doc

data_extracted = load_data("Data/")


Number of documents loaded: 759
Document 0 content (first 500 chars): TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION...
Document 1 content (first 500 chars): TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION
JACQUELINE L. LONGE, EDITOR
DEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR
VOLUME
C-F2...
Document 2 content (first 500 chars): STAFF
Jacqueline L. Longe, Project Editor
Deirdre S. Blanchfield, Associate Editor
Christine B. Jeryan, Managing Editor
Donna Olendorf, Senior Editor
Stacey Blachford, Associate Editor
Kate Kretschmann, Melissa C. McDade, Ryan
Thomason, Assistant Editors
Mark Springer, Technical Specialist
Andrea Lopeman, Programmer/Analyst
Barbara J. Yarrow, Manager, Imaging and Multimedia
Content
Robyn V . Young, Project Manager, Imaging and
Multimedia Content
Dean Dauphinais, Senior Editor, Imaging and
Multim...
Document 3 content (first 500 chars): Introduction .................................................... ix
Advisory Board .............................................. 

In [4]:
data_extracted

[Document(metadata={'source': 'Data/resource1.pdf', 'page': 0}, page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'Data/resource1.pdf', 'page': 1}, page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F2'),
 Document(metadata={'source': 'Data/resource1.pdf', 'page': 2}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and\nMultimedia Content\nKelly A. Quin, Editor, Imaging and Multimedia 

In [5]:

# Converting the corpus to chunks 
#Converts long documents into smaller chunks for better processing and storage in the vector database. For embedding generation and similarity-based searches.
def textSplit(data_extracted):
    if not data_extracted or len(data_extracted) == 0:
        raise ValueError("No documents found for splitting.")
    
    # Initialize the text splitter
    text_chunk_split = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=20)
    
    # Perform the text split
    chunk = text_chunk_split.split_documents(data_extracted)

    # Debugging: Output the number of chunks and their contents
    print(f"Number of chunks created: {len(chunk)}")
    if len(chunk) > 0:
        for i, ch in enumerate(chunk):
            print(f"Chunk {i} content: {ch.page_content[:200]}...")  # Print first 200 characters of each chunk
    
    return chunk




In [6]:
chunksOfText = textSplit(data_extracted)

Number of chunks created: 11722
Chunk 0 content: TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION...
Chunk 1 content: TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION
JACQUELINE L. LONGE, EDITOR
DEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR
VOLUME
C-F2...
Chunk 2 content: STAFF
Jacqueline L. Longe, Project Editor
Deirdre S. Blanchfield, Associate Editor
Christine B. Jeryan, Managing Editor
Donna Olendorf, Senior Editor
Stacey Blachford, Associate Editor
Kate Kretschman...
Chunk 3 content: Andrea Lopeman, Programmer/Analyst
Barbara J. Yarrow, Manager, Imaging and Multimedia
Content
Robyn V . Young, Project Manager, Imaging and
Multimedia Content
Dean Dauphinais, Senior Editor, Imaging a...
Chunk 4 content: Leitha Etheridge-Sims, Mary K. Grimes, Dave Oblender,
Image Catalogers
Pamela A. Reed, Imaging Coordinator
Randy Bassett, Imaging Supervisor
Robert Duncan, Senior Imaging Specialist
Dan Newell, Imagin...
Chunk 5 content: Margaret A. Chamberlain, Permissions Specialist
Michelle DiMercurio, Seni

In [7]:
len(chunksOfText)

11722

In [8]:
#Have to store in vector DB, but before that we need to convert it to vector
#Retrieves the HuggingFace model for generating embeddings from the text chunks. These embeddings will be stored in Pinecone for later use in similarity-based queries.
def getModelEmbedding():
    embdeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embdeddings

In [9]:
model_embedding  = getModelEmbedding()



In [10]:
model_embedding 

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

In [11]:
result = model_embedding.embed_query("My name is  Joel")
print(len(result))

384


In [12]:
result

[0.015105356462299824,
 0.11961336433887482,
 0.05274144187569618,
 0.004884304013103247,
 -0.019889963790774345,
 0.08689246326684952,
 0.063105508685112,
 -0.03965267539024353,
 0.029514657333493233,
 -0.01047738827764988,
 -0.02369769662618637,
 -0.03502218425273895,
 0.06994754076004028,
 -0.07021068781614304,
 0.06614986807107925,
 0.06002500280737877,
 -0.058605652302503586,
 0.05102072283625603,
 -0.01661394163966179,
 -0.057549938559532166,
 -0.038430456072092056,
 0.08207067847251892,
 -0.04171161353588104,
 -0.027764106169342995,
 -0.06188815087080002,
 -0.020140308886766434,
 -0.0021948397625237703,
 0.018384000286459923,
 0.039919961243867874,
 -0.09872553497552872,
 -0.028862422332167625,
 -0.00979032926261425,
 0.09886303544044495,
 0.0549008883535862,
 0.004105696454644203,
 0.040451083332300186,
 -0.005908538121730089,
 0.021387869492173195,
 0.003074259264394641,
 -0.025264084339141846,
 0.006727087311446667,
 -0.044387590140104294,
 -0.018174996599555016,
 -0.05712436

In [13]:
import os
from pinecone import Pinecone, ServerlessSpec


os.environ["PINECONE_API_KEY"] = "45060129-0c79-4f9a-aa11-60095ca285b2"

# Function to initialize Pinecone
def initialize_pinecone(index_name):
    PINECONE_APIKEY = os.getenv("PINECONE_API_KEY")
    PINECONE_ENV = "us-east-1"
    
    # Initialize Pinecone using the instance-based method
    pc = Pinecone(api_key=PINECONE_APIKEY)

    # Check if the index exists, and create it if necessary
    if index_name not in pc.list_indexes().names():
        print(f"Creating index: {index_name}")
        pc.create_index(
            name=index_name,
            dimension=384,
            metric="cosine",
            spec=ServerlessSpec(cloud='aws', region=PINECONE_ENV)
        )
    else:
        print(f"Using existing index: {index_name}")

    return pc.Index(index_name)


In [14]:
# Converts the text chunks into embeddings and stores them in the Pinecone vector database.

def store_embeddings(chunks_of_text, model_embedding, index_name):
    # Initialize Pinecone index
    pinecone_index = initialize_pinecone(index_name)

    # Generating the embeddings for each chunk of text
    embeddings = model_embedding.embed_documents([chunk.page_content for chunk in chunks_of_text])

    # Prepare and upsert data into Pinecone
    for i, chunk in enumerate(chunks_of_text):
        # Each document is upserted with the embedding vector and metadata (the text)
        pinecone_index.upsert(
            vectors=[
                {
                    'id': str(i),  # Unique ID for each chunk
                    'values': embeddings[i],  # Embedding vector
                    'metadata': {'text': chunk.page_content}  # Store chunk content as metadata
                }
            ]
        )


In [15]:
index_name = "medchatbot"  
store_embeddings(chunksOfText, model_embedding, index_name)

Using existing index: medchatbot


In [16]:
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain.chains import RetrievalQA
from langchain.llms import CTransformers

# Initialize the LLM model 
llm_model = CTransformers(
    model="/Users/joeppan/Desktop/Medical-Chatbot/model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama"
)



In [17]:
#Queries the Pinecone index using the user’s question, retrieves the most relevant text, and passes it to the language model for generating a response.
def query_pinecone(index_name, user_query, model_embedding, llm_model):
    vectorstore = LangchainPinecone.from_existing_index(
        index_name=index_name,
        embedding=model_embedding 
    )
    
    retriever = vectorstore.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm_model,
        chain_type="stuff",
        retriever=retriever
    )
    
    # Perform the query and get the result
    return qa_chain.run(user_query)

In [18]:
# Example query
query = "What are the symptoms of diabetes?"

# Run the query and get the response
response = query_pinecone(index_name="medchatbot", user_query=query, model_embedding=model_embedding, llm_model=llm_model)

print(response)

  return qa_chain.run(user_query)


 According to the provided sources, the symp-
toms of diabetes include frequent urination, lethargy, exces-
sive thirst, and hunger. Other possible symptoms may include
sudden weight loss, slow wound healing, gum disease, blurred


In [19]:
query = "What are caffeine?"

# Run the query and get the response
response = query_pinecone(index_name="medchatbot", user_query=query, model_embedding=model_embedding, llm_model=llm_model)

print(response)

 Caffeine is a stimulant drug that can help people feel more alert, less drowsy, and improve coordination. It is found in many foods and drinks, such as coffee, tea, and chocolate, and is also available in some medications. However, it is important to use caffeine with caution, as excessive use can lead to sleep disorders like insomnia.


In [20]:
# Function to ask the user for a query and return a response
def chat_with_medbot(index_name, model_embedding, llm_model):
    while True:
        # Ask for user input
        query = input("Ask me a medical question (or type 'exit' to stop): ")

        # Break the loop if the user wants to exit
        if query.lower() == 'exit':
            print("Thank you for using the medical chatbot. Goodbye!")
            break

        # Run the query and get the response
        response = query_pinecone(index_name=index_name, user_query=query, model_embedding=model_embedding, llm_model=llm_model)

        # Print the response
        print("Response:", response)





In [25]:
chat_with_medbot(index_name="medchatbot", model_embedding=model_embedding, llm_model=llm_model)

Response:  Calcium channel blockers are a type of medicine used to treat high blood pressure and certain heart conditions. They work by blocking the entry of calcium into the cells of the heart and blood vessels, which relaxes the blood vessels and increases the supply of oxygen-rich blood to the heart, reducing the heart's workload. They are available only with a physician's prescription and come in tablet, capsule, and injectable forms.
Thank you for using the medical chatbot. Goodbye!
