In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.llms import Cohere
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import pinecone
import os
load_dotenv()

'd:\\Projects\\Medical_Chatbot_GenAI\\research'

In [3]:
import os

# Step up to the parent directory only when the relative "Data" directory
# (read later via 'Data/') is not already visible from the current directory.
# An unconditional os.chdir("../") climbs one more level every time this cell
# is re-run, which silently breaks every relative path used afterwards.
if not os.path.isdir("Data"):
    os.chdir("../")

In [4]:
# Display the current working directory so the effect of the chdir can be checked.
%pwd

'd:\\Projects\\Medical_Chatbot_GenAI'

In [6]:
# Extract data from the PDF files
def load_pdf_file(data):
    """Load the PDF files matched by ``*.pdf`` inside ``data``.

    Args:
        data: Path handed to ``DirectoryLoader`` as the directory to scan.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory when
        ``PyPDFLoader`` is used as the per-file loader class.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [7]:
# Load the PDFs from the relative "Data/" directory.
extracted_data = load_pdf_file(data="Data/")

In [8]:
# Split the data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
            500, the value that was previously hard-coded.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5860


# text_chunks

In [11]:
# Get the Hugging Face embedding model
def download_hugging_face_embeddings():
    """Build the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [12]:
# Instantiate the embedding model once; it is reused for the sanity check and the vector store.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Sanity check: embed a sample string and print the length of the returned vector.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


True

In [31]:
# Read both service API keys from the environment (each is None when unset).
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
COHERE_API_KEY = os.getenv('COHERE_API_KEY')

In [None]:
# Initialize Pinecone client
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "medicalbot"

# Create the index only when it does not exist yet. Calling create_index for
# a name that is already taken is rejected by Pinecone, which made this cell
# fail whenever the notebook was re-run against an existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # matches the 384-length vectors the embedding model returns
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Show the index description as the cell output, whether or not it was just created.
pc.describe_index(index_name)


{
    "name": "medicalbot",
    "metric": "cosine",
    "host": "medicalbot-wtpzm2h.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [None]:
# Embed every text chunk and upsert the resulting vectors into the Pinecone index.
# NOTE(review): re-running this cell presumably upserts the same chunks again — confirm.
docsearch = PineconeVectorStore.from_documents(
    embedding=embeddings,
    index_name=index_name,
    documents=text_chunks,
)

In [26]:

# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1398e039c60>

In [28]:
# Retriever over the vector store: similarity search with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)


In [29]:
# Smoke-test the retriever with a sample medical question.
retrieved_docs = retriever.invoke("What is Acne?")


In [30]:
# Inspect the retrieved documents (metadata and page content).
retrieved_docs

[Document(id='fb1207bb-8928-44df-8fbf-41b2374162b6', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='b8175304-d08e-4a5f-9fa0-d32db97e52d7', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM -

In [None]:
# Build the Cohere LLM; the API key is read from the COHERE_API_KEY
# environment variable rather than being hard-coded.
llm = Cohere(
    cohere_api_key=os.getenv("COHERE_API_KEY"),
    model="command",
    max_tokens=500,
    temperature=0.4,
)


In [None]:
# System instructions for the assistant, one sentence per literal; the
# "{context}" placeholder sits after a blank line at the end of the string.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)


# Two-message chat prompt: the system message is system_prompt and the human
# message is the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])


In [42]:
# Combine the LLM and the prompt into a question-answering chain, then pair
# that chain with the retriever to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [43]:
# Ask a sample question through the RAG chain and print the "answer" entry of the response.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

 Acromegaly is a disorder caused by an abnormally large release of growth hormones after bone growth has stopped, which causes increased bone and soft tissue growth. It is a relatively rare disorder, occurring in approximately 50 out of every one million people. 

Symptoms of acromegaly appear gradually and include unusual growth in the hands, feet, and face, as well as a variety of other disturbances throughout the body. If the abnormality occurs during childhood before bone growth stops, the disorder is called gigantism. 

Diagnosis of acromegaly is often delayed because the symptoms appear gradually, and most patients are not identified until they are middle-aged. 
