In [1]:
# Kernel smoke test: print a fixed greeting.
print("Hello World")

Hello World


In [2]:
import warnings
# Install a global "ignore" filter so warnings do not clutter cell output.
# NOTE(review): this can also hide useful deprecation notices — confirm it is wanted.
warnings.filterwarnings("ignore")

In [3]:
import os

# Later cells use paths relative to the parent directory of this notebook
# (e.g. 'Data/').  A bare os.chdir("../") climbs one more level every time the
# cell is re-run, so only step up while the expected Data/ folder is not yet
# visible from the current working directory.
if not os.path.isdir("Data"):
    os.chdir("../")

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Extract data from the PDF files in a directory

def load_pdf_file(data):
    """Load the PDF files of a directory through LangChain loaders.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the glob ``*.pdf`` and parsed with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Load the PDFs under Data/ (relative to the current working directory).
extracted_data = load_pdf_file(data='Data/')

In [7]:
# Split the data into text chunks

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split, passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to
            500, the value this notebook previously hard-coded.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter. Defaults
            to 20, the value this notebook previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [8]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("Length of Text Chunks: ", len(text_chunks))

Length of Text Chunks:  5860


In [9]:
from langchain.embeddings import HuggingFaceHubEmbeddings

# Build the Hugging Face embeddings client
# NOTE(review): despite the function name, HuggingFaceHubEmbeddings appears to
# call a hosted Hub endpoint rather than download model weights — confirm.

def download_hugging_face_embeddings():
    """Return a ``HuggingFaceHubEmbeddings`` instance for
    ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceHubEmbeddings(repo_id='sentence-transformers/all-MiniLM-L6-v2')

In [10]:
from dotenv import load_dotenv
# Populate os.environ from a .env file before any client is created.
load_dotenv()

# Embeddings client; reused below for both indexing and querying.
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceHubEmbeddings(repo_id='sentence-transformers/all-MiniLM-L6-v2')


In [11]:
# Embed a sample string and print the vector length; this number has to match
# the `dimension` passed to pc.create_index further down.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [12]:
import os

# Load research/.env BEFORE reading the key: the original order read the
# environment first, so on a fresh kernel the key could still be None.
# os.path.join replaces the literal 'research\.env', which only resolves on
# Windows and contains the invalid escape sequence "\.".
load_dotenv(dotenv_path=os.path.join("research", ".env"))
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medbot"

# create_index fails when the index already exists, so create it only once;
# this keeps the cell safe under Restart & Run All.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding vector length (see the embed_query check)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Keep the index description as the cell's displayed value.
pc.describe_index(index_name)

{
    "name": "medbot",
    "metric": "cosine",
    "host": "medbot-i39we5z.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [13]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# NOTE(review): every run of this cell re-embeds and re-uploads all of
# `text_chunks`; presumably a re-run adds duplicate vectors because no ids are
# passed — confirm, and skip this cell once the index is populated.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [14]:
# Connect to the existing index

from langchain_pinecone import PineconeVectorStore

# Attach to the index named `index_name` without passing any documents;
# `embeddings` is supplied so that query text can be embedded at search time.
docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embeddings
)


In [15]:
# Similarity-search retriever configured with k=5 results per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":5})

In [16]:
# Quick retrieval check: fetch the documents for a sample question and display them.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='ee9953ca-b8ee-4af3-bf56-3ae53698e34f', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='839218d6-d970-4df4-913a-469cab66d16a', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM -

In [17]:
from langchain_anthropic import ChatAnthropic
# Read the key from the environment; os.environ.get yields None when it is unset.
ANTHROPIC_API_KEY=os.environ.get("ANTHROPIC_API_KEY")
# Chat model used to generate answers in the RAG chain.
llm = ChatAnthropic(model = 'claude-3-5-haiku-20241022', 
                    temperature = 0.4, 
                    max_tokens=500,  # caps the length of each generated answer
                    api_key=ANTHROPIC_API_KEY)


In [23]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System message for the RAG chain.  Adjacent string literals are concatenated
# verbatim, so each fragment must carry the whitespace that separates it from
# the next one (the fallback sentence previously ran straight into
# "Use three sentences...").
system_prompt = (
    "You are a medical assistant chatbot for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. In case you do not know the answer, "
    "answer with a single sentence saying - "
    "'Sorry, I do not know the answer to this question.' "
    "Use three sentences maximum and keep the "
    "answer concise.  "
    "\n\n"
    "{context}"
)


# Two-message chat template: the system prompt (which contains the {context}
# placeholder) followed by the user's question in {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)



In [19]:
# Assemble the RAG pipeline from the current `llm`, `prompt` and `retriever`.
# These calls receive the objects bound to those names right now, so re-run
# this cell after redefining any of them; otherwise `rag_chain` keeps using
# the previous objects.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [25]:
# Ask a question through the chain and print the "answer" entry of the result.
response = rag_chain.invoke({"input": "what is Gigantism?"})
print(response["answer"])

Based on the context provided, Gigantism is a variant of acromegaly that occurs in children whose bony growth plates have not yet closed. It is characterized by exceptional growth of long bones due to abnormal release of growth hormone (GH) from the pituitary gland, resulting in significantly increased height and bone growth.


In [24]:
# Second sample question; the recorded output below shows the model falling
# back to the "do not know" sentence from the system prompt.
response = rag_chain.invoke({"input": "what is a reservoir ?"})
print(response["answer"])

Based on the provided context, I cannot find a specific definition for "reservoir" related to medical terminology. Sorry, I do not know the answer to this question.
