In [227]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [228]:
def load_pdf_files(data):
    """Load the PDF files of a directory as documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns, with each matching
        file read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [229]:
# Raw string literal: the Windows path contains backslash sequences
# (\A, \-, \d) that are invalid escapes in a normal string and emit a
# SyntaxWarning on recent Python versions. The resulting value is unchanged.
# NOTE(review): this absolute path is machine-specific; consider a path
# relative to the project root.
extrated_data = load_pdf_files(r"E:\AI,ML,DS\AI Projects\-Medical-Chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS-\data")

In [230]:
# Number of Document objects returned by the loader.
len(extrated_data)

637

In [231]:
from typing import List
from langchain.schema import Document

In [232]:
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build stripped-down copies of the given documents.

    Each returned Document keeps the original ``page_content`` and carries a
    metadata dict with a single ``"source"`` key (``None`` when the input
    metadata has no such key). The input documents are only read.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [233]:
# Keep only page_content and the "source" metadata entry of each loaded document.
minimal_docs = filter_to_minimal_docs(extrated_data)

In [234]:
# minimal_docs

In [235]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [236]:
# Chunk the trimmed documents and report how many chunks were produced.
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 5859


In [237]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings():
    """Create the HuggingFace embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    # This model returns 384-dimensional vectors.
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )
# Instantiate the embedding model once; the same object is reused for indexing and querying.
embeddings = download_embeddings()

In [238]:
# Embed a sample query to sanity-check the model and inspect the vector size.
vector = embeddings.embed_query("hello world")

In [239]:
# Embedding dimensionality; the Pinecone index is created with this same dimension.
len(vector)

384

In [240]:
from dotenv import load_dotenv
import os
# Load settings (API keys) via python-dotenv before they are read with os.getenv.
# NOTE(review): override=True presumably lets values from the .env file replace
# variables already set in the process environment — confirm against the
# python-dotenv documentation.
load_dotenv(override=True)

True

In [None]:
# Read the API keys from the process environment.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# os.environ only accepts str values: assigning None (key not configured)
# raises TypeError. Re-export only the keys that are actually set so a missing
# optional key does not abort the notebook here.
if PINECONE_API_KEY is not None:
    os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
if OPENAI_API_KEY is not None:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
from pinecone import Pinecone
# Create the Pinecone client using the API key read from the environment.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [262]:
from pinecone import ServerlessSpec
# Name of the Pinecone index used by every later cell.
# NOTE(review): "chatobot" looks like a typo for "chatbot"; renaming it would
# point the notebook at a different index, so confirm before changing.
index_name = "medical-chatobot"
# Create the index only when has_index reports that it is missing, so this
# cell can be re-run without attempting to create it twice.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        # Must equal the embedding vector length (384 for the model used above).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")

    )

# Handle to the index, obtained by name from the client.
index = pc.Index(index_name)


In [263]:
from langchain_pinecone import PineconeVectorStore
# Upload the chunks only while the index is still empty. from_documents embeds
# and uploads every chunk on each call, so re-running this cell against a
# populated index would repeat all of that work and most likely store the same
# content again under new ids.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunk,
        embedding=embeddings,
        index_name=index_name
    )
else:
    # The index already holds vectors: attach to it without uploading again.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [264]:
from langchain_pinecone import PineconeVectorStore
# Attach to the index by name with the same embedding model; no documents are
# passed here, so this only rebinds docsearch to the existing index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,

)

In [265]:
# Similarity retriever returning the 3 closest chunks per query.
# The keyword is search_type; the previous spelling "docsearch_type" is not a
# retriever option, so the intended setting was never actually passed.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [266]:
# Smoke-test retrieval on its own, before any LLM is involved.
retrieved_docs = retriever.invoke("What is acne")

In [267]:
# Display the retrieved Documents to eyeball their relevance.
retrieved_docs


[Document(id='46b01aad-5001-4706-8949-ff39893ea019', metadata={'source': 'E:\\AI,ML,DS\\AI Projects\\-Medical-Chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS-\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='6247b875-7531-4b4d-857c-dc49166e7a3f', metadata={'source': 'E:\\AI,ML,DS\\AI Projects\\-Medical-Chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS-\\data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nthe most common skin disease. It affects nearly 17 million\npeople in the United States. While acne can arise at any'),
 Document(id='6e5d6780-1792-41aa-930c-30cf

In [268]:
from langchain_openai import ChatOpenAI
import os

# Chat model reached through the OpenRouter endpoint via the ChatOpenAI client.
# NOTE(review): os.getenv returns None when OPENROUTER_API_KEY is unset —
# confirm the key is present in the environment / .env file.
chatModel = ChatOpenAI(
    model="mistralai/mistral-7b-instruct",   # free OpenRouter model
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    openai_api_base="https://openrouter.ai/api/v1"
)

In [269]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [286]:
# System instructions for the model. The trailing {context} placeholder is
# left in the text for the prompt template to fill.
system_prompt = "".join(
    [
        "You are a medical assistant for question-answering tasks. ",
        "Use the retrieved context to answer the question. ",
        "Always provide the best possible answer using the context, and do not provide empty answer even if brief. ",
        "Use three sentences maximum and keep the answer concise.\n\n{context}",
    ]
)


# Two-message chat template: the system instructions, then the user's
# question supplied through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [287]:
# Build the answering chain from the chat model and the prompt, then combine
# it with the retriever to form the chain queried in the cells below.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [294]:
# Ask the chain a question; the generated text is under the "answer" key.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

 Acromegaly and gigantism are disorders caused by the excessive release of growth hormone from the pituitary gland. Acromegaly occurs when the abnormality happens after bone growth has stopped, leading to increased growth in bones and soft tissues. Gigantism, on the other hand, occurs when the abnormality happens before bone growth has stopped, resulting in unusual height.


In [295]:
# Query a topic that the earlier retrieval smoke test also covered.
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])

 Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. Acne vulgaris, the medical term for common acne, is the most common skin disease, affecting nearly 17 million people in the United States.


In [296]:
# Same topic with different wording, to compare against the previous answer.
response = rag_chain.invoke({"input": "what is the acne?"})
print(response["answer"])

 Acne is a common skin disease characterized by pimples on the face, chest, and back, caused by clogged pores with oil, dead skin cells, and bacteria. It is the most common skin disease, affecting nearly 17 million people in the United States. Acne can occur at any age, but it is most common in adolescents and young adults.


In [299]:
# Another sample question; prints only the generated answer text.
response = rag_chain.invoke({"input": "what is the bone?"})
print(response["answer"])

 Bone is composed of a matrix primarily made up of collagen, strengthened by calcium and phosphate salts known as hydroxyapatite. It also contains various cells, including osteoblasts, osteocytes, and osteoclasts, which are responsible for bone formation, maintenance, and resorption. Bone serves as a structural support for the body, protects organs, and facilitates movement.
