In [3]:
%pwd

'd:\\Generative AI\\Projects\\Medical-Chatbot\\research'

In [4]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [5]:
## Extract the data from the pdf

def load_pdf_file(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)

    documents = loader.load()
    return documents

In [7]:
import os
os.chdir("../")

In [8]:
%pwd

'd:\\Generative AI\\Projects\\Medical-Chatbot'

In [9]:
extracted_data = load_pdf_file(data='Data/')

In [10]:
## Split the data into chuncks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [11]:
text_chunks = text_split(extracted_data)
print("length of text chunks", len(text_chunks))


length of text chunks 6970


In [12]:
## Download the embedding from hugging face
from langchain_huggingface import HuggingFaceEmbeddings
def download_huggging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2")
    return embeddings


In [13]:
embeddings = download_huggging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
query_result = embeddings.embed_query("Hello World")
print("Length: ", len(query_result))

Length:  384


In [15]:
from dotenv import load_dotenv
load_dotenv()

pinecone_api_key = os.getenv('PINECONE_API_KEY')

In [16]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=pinecone_api_key)

index_name = "medicalbot"

pc.create_index(
  name=index_name,
  dimension=384,
  metric="cosine",
  spec=ServerlessSpec(
    cloud="aws",
    region="us-east-1"
  )
)


In [17]:
os.environ["PINECONE_API_KEY"] = pinecone_api_key

In [None]:
## Embed each chunk and upsert it into your Pinecone Index

from langchain_pinecone import Pinecone

docsearch = Pinecone.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)

In [22]:
## Load the existing Index

from langchain_pinecone import Pinecone
docsearch_retrieveal = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [23]:
docsearch_retrieveal

<langchain_pinecone.vectorstores.Pinecone at 0x260173d2260>

In [25]:
## do some search
retriever = docsearch_retrieveal.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
retrieved_docs = retriever.invoke("What is Chemotherapy")


[Document(metadata={'page': 128.0, 'source': 'Data\\medical_book.pdf.pdf'}, page_content='Chemotherapy\nDefinition\nChemotherapy is treatment of cancer with anti-\ncancer drugs.\nPurpose\nThe main purpose of chemotherapy is to kill cancer\ncells. It is usually used to treat patients with cancer that\nhas spread from the place in the body where it started\n(metastasized). Chemotherapy destroys cancer cells any-\nwhere in the body. It even kills cells that have broken off\nfrom the main tumor and traveled through the blood or\nlymph systems to other parts of the body.'),
 Document(metadata={'page': 24.0, 'source': 'Data\\medical_book.pdf.pdf'}, page_content='Chemotherapy\nChemotherapy is the use of drugs to kill cancer cells.\nIt destroys the hard-to-detect cancer cells that have\nspread and are circulating in the body. Chemotherapeutic\ndrugs can be taken either orally (by mouth) or intra-\nvenously, and may be given alone or in conjunction with\nsurgery, radiation or both.\nWhen chemot

In [32]:
retrieved_docs

[Document(metadata={'page': 128.0, 'source': 'Data\\medical_book.pdf.pdf'}, page_content='Chemotherapy\nDefinition\nChemotherapy is treatment of cancer with anti-\ncancer drugs.\nPurpose\nThe main purpose of chemotherapy is to kill cancer\ncells. It is usually used to treat patients with cancer that\nhas spread from the place in the body where it started\n(metastasized). Chemotherapy destroys cancer cells any-\nwhere in the body. It even kills cells that have broken off\nfrom the main tumor and traveled through the blood or\nlymph systems to other parts of the body.'),
 Document(metadata={'page': 24.0, 'source': 'Data\\medical_book.pdf.pdf'}, page_content='Chemotherapy\nChemotherapy is the use of drugs to kill cancer cells.\nIt destroys the hard-to-detect cancer cells that have\nspread and are circulating in the body. Chemotherapeutic\ndrugs can be taken either orally (by mouth) or intra-\nvenously, and may be given alone or in conjunction with\nsurgery, radiation or both.\nWhen chemot

In [37]:
import os
from dotenv import load_dotenv
load_dotenv()
os.environ["GROQ_API_KEY"] = os.getenv('GROQ_API_KEY')

In [38]:
from langchain_groq import ChatGroq
llm = ChatGroq(model_name="Gemma2-9b-It")

In [41]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

system_prompt = (
    """ 
    "You are an assistant for question-answering tasks."
    "Use the following pieces of retrived context to answer "
    "the question. if you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"

    """
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system",system_prompt),
        ("human", "{input}"),
    ]
)


In [42]:
question_anwering_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_anwering_chain)

In [43]:
response = rag_chain.invoke({"input":"What is Chemotherapy"})
print(response["answer"])

Chemotherapy is a cancer treatment that uses drugs to kill cancer cells. 

It is often used to treat cancer that has spread to other parts of the body.  Chemotherapy drugs can be taken orally or intravenously.  


