In [1]:
# Smoke test: confirm the kernel is up and executing cells before running anything else.
print("Everything is working fine.")

Everything is working fine.


In [2]:
# Show the kernel's current working directory (IPython %pwd magic) before it is changed.
%pwd

'g:\\Jagadish\\AIDoctor-Advancing-Medical-Care-through-Gen-AI-Technology\\notebook'

In [3]:
# Move from the `notebook/` folder up to the project root so the relative paths
# used later in this notebook (e.g. "data/" and "model/") resolve.
# The guard keeps the cell idempotent: an unconditional os.chdir("../") climbs one
# more directory level every time the cell is re-run without restarting the kernel.
import os

if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
# Show the working directory again to verify the kernel now sits at the project root.
%pwd

'g:\\Jagadish\\AIDoctor-Advancing-Medical-Care-through-Gen-AI-Technology'

**Importing the required libraries for the project**

In [5]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceHubEmbeddings
#from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers


# Load variables from the local .env file BEFORE reading the key; otherwise
# os.getenv returns None whenever the key is defined only in .env and not in
# the shell environment that started the kernel.
from dotenv import load_dotenv

load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

**Defining a function to load PDF files from a directory**

In [6]:
def load_pdf_file(directory_path):
    """
    Read the PDF files matching the `*.pdf` pattern in a directory.

    Parameters:
    directory_path (str): Directory that holds the PDF files.

    Returns:
    list | None: The documents produced by the loader, or None when loading
    fails (the error is printed rather than raised).
    """
    try:
        pdf_loader = DirectoryLoader(
            directory_path,
            glob="*.pdf",
            loader_cls=PyPDFLoader,
        )
        loaded_docs = pdf_loader.load()
    except Exception as error:
        # Report the problem and signal the failure to the caller with None.
        print(f"An error occurred: {error}")
        return None
    else:
        return loaded_docs


In [7]:
# Load the PDFs from the project's data/ folder; the result is None if loading failed.
extracted_text = load_pdf_file(directory_path="data/")

In [8]:
# Preview the first five loaded documents (one Document per PDF page, judging by the `page` metadata).
extracted_text[:5]

[Document(metadata={'source': 'data\\Medical_book.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': 'data\\Medical_book.pdf', 'page': 1}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'data\\Medical_book.pdf', 'page': 2}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': 'data\\Medical_book.pdf', 'page': 3}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow,Manager, Imaging and Multimedia\nContent\nRobyn V . Young,Project Manager, Imaging and\nMultimedia Content\nDean Daup

In [9]:
# splitting the extracted text into chunks (500 characters with an overlap of 20 by default)
def text_split(extracted_text, chunk_size=500, chunk_overlap=20):
    """
    Split loaded documents into smaller, slightly overlapping chunks.

    Parameters:
    extracted_text (list): Documents to split, e.g. the output of load_pdf_file.
    chunk_size (int): Maximum size of each chunk, passed straight to
        RecursiveCharacterTextSplitter. Defaults to 500.
    chunk_overlap (int): Overlap between consecutive chunks, passed straight
        to RecursiveCharacterTextSplitter. Defaults to 20.

    Returns:
    list: The chunked documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_text)

In [10]:
# Split the loaded documents into chunks and report how many chunks were produced.
text_chunks = text_split(extracted_text)
print("Length of text chunks: ", len(text_chunks))

Length of text chunks:  5860


In [11]:
# Preview the first five chunks to sanity-check the split.
text_chunks[:5]

[Document(metadata={'source': 'data\\Medical_book.pdf', 'page': 1}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'data\\Medical_book.pdf', 'page': 2}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': 'data\\Medical_book.pdf', 'page': 3}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow,Manager, Imaging and Multimedia\nContent\nRobyn V . Young,Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and'),
 Document(metadata={'source': 'data\\Medical_boo

In [12]:
# importing the required modules from the langchain package
from langchain.embeddings import HuggingFaceEmbeddings

In [13]:
# defining a function to load a Hugging Face sentence-embedding model (all-MiniLM-L6-v2 by default)
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """
    Creates a LangChain embeddings object backed by a Hugging Face model.

    Parameters:
    model_name (str): Hugging Face model identifier. Defaults to
        'sentence-transformers/all-MiniLM-L6-v2', whose vectors have 384
        dimensions (the size this notebook's Pinecone index is created with).

    Returns:
    HuggingFaceEmbeddings: The embeddings wrapper, or None if it could not be
    created (the error is printed rather than raised).
    """
    try:
        return HuggingFaceEmbeddings(model_name=model_name)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [14]:
# Create the embedding model used both for indexing the chunks and for embedding queries;
# the helper returns None if the model could not be created.
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [15]:
# Embed a sample sentence and print the vector length; this length is the
# dimension the Pinecone index must be created with.
query_result = embeddings.embed_query("Hello, How are you?")
print("Length", len(query_result))

Length 384


In [16]:
# Display the raw embedding vector for the sample sentence.
# NOTE(review): this prints every component of the vector; consider slicing
# (e.g. the first few values) to keep the saved notebook output small.
query_result

[0.019096720963716507,
 0.03446514531970024,
 0.09162795543670654,
 0.0701652318239212,
 -0.02994656376540661,
 -0.08419138938188553,
 0.04581358656287193,
 0.004958597477525473,
 -0.09189330041408539,
 0.017400631681084633,
 -0.008816180750727654,
 -0.000661494501400739,
 -0.028556974604725838,
 -0.02194974571466446,
 0.05516670271754265,
 -0.04983646795153618,
 0.08988092839717865,
 -0.08895713835954666,
 -0.11235621571540833,
 0.039000503718853,
 -0.06607072800397873,
 0.02609509974718094,
 0.036530736833810806,
 0.061390381306409836,
 -0.057124894112348557,
 -0.05463934689760208,
 0.030365517362952232,
 0.032387565821409225,
 0.012644719332456589,
 -0.10568572580814362,
 -0.05834555625915527,
 0.06732939183712006,
 -0.04075585678219795,
 0.00643977802246809,
 0.005698682740330696,
 0.05285322293639183,
 -0.039775267243385315,
 -0.11855249851942062,
 0.0021162184420973063,
 -0.016692860051989555,
 0.0283381175249815,
 -0.03743797168135643,
 -0.021371401846408844,
 -0.041475184261798

In [17]:
# Load environment variables from a .env file so secrets such as API keys stay out of the notebook.
from dotenv import load_dotenv
load_dotenv()

True

In [18]:
# Read the Pinecone API key from the environment (populated by load_dotenv above);
# the value is None if the variable is not defined.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [19]:
import os
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone

# Client for the Pinecone service, authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "aidoctor"

# Create the index only when it is missing so this cell can be re-run safely;
# calling create_index for a name that already exists fails.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding length (all-MiniLM-L6-v2 -> 384)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [20]:
# Embed every chunk with the embedding model and upsert the vectors into the Pinecone index.
# NOTE(review): presumably re-running this cell uploads the same chunks again — confirm
# before re-executing against an index that is already populated.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [21]:
from langchain_pinecone import PineconeVectorStore

# Connect to the already-populated Pinecone index instead of uploading the chunks again.
# No documents are passed here; the embedding model is supplied so the store can
# embed incoming queries (presumably — confirm against the langchain_pinecone docs).
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,  # Name of the existing Pinecone index to connect to
    embedding=embeddings    # Embedding model paired with the index
)

In [22]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x181b26da580>

In [23]:
# Wrap the vector store as a retriever that uses similarity search and is
# configured with k=3, i.e. three results per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [24]:
# Run a sample query through the retriever to check that relevant chunks come back.
retrieved_docs = retriever.invoke("what is Allergies?")

In [25]:
# Inspect the retrieved chunks and their source/page metadata.
retrieved_docs

[Document(metadata={'page': 135.0, 'source': 'data\\Medical_book.pdf'}, page_content='Purpose\nAllergy is a reaction of the immune system. Nor-\nmally, the immune system responds to foreign microor-\nganisms and particles, like pollen or dust, by producing\nspecific proteins called antibodies that are capable of\nbinding to identifying molecules, or antigens, on the\nforeign organisms. This reaction between antibody and\nantigen sets off a series of reactions designed to protect\nthe body from infection. Sometimes, this same series of'),
 Document(metadata={'page': 129.0, 'source': 'data\\Medical_book.pdf'}, page_content='reaction. Allergic rhinitis is characterized by an itchy,\nrunny nose, often with a scratchy or irritated throat due\nto post-nasal drip. Inflammation of the thin membrane\ncovering the eye (allergic conjunctivitis) causes redness,\nirritation, and increased tearing in the eyes. Asthma caus-\nes wheezing, coughing, and shortness of breath. Symp-\ntoms of food allergie

In [27]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the model. "{context}" is the placeholder the
# document chain fills with the retrieved chunks. Each sentence fragment ends
# with a space because adjacent string literals are joined with no separator.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know yet. "
    "Use three sentences maximum and keep the answer concise. "
    "\n\n"
    "{context}"
)

# Chat prompt with two messages: the system instructions (which carry the
# "{context}" placeholder) and the human turn holding the user's "{input}".
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [30]:
# Load the local Llama 2 7B chat model (quantized GGML file under model/) through CTransformers.
# The config passes max_new_tokens=512 and temperature=0.5 to the model.
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.5})

In [31]:
# Build the RAG pipeline in two steps:
# 1) a document chain that combines the LLM with the chat prompt, and
# 2) a retrieval chain that pairs the retriever with that document chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [32]:
# Ask the RAG chain a sample question; the generated text is read from the
# "answer" key of the returned mapping.
response = rag_chain.invoke({"input": "what is Allergies?"})
print(response["answer"])


Assistant: Allergies are abnormal reactions of the immune system. Normally, the immune system responds to foreign microorganisms and particles by producing specific proteins called antibodies that are capable of binding to identifying molecules on the foreign organisms. This reaction.  foreign organisms.
