In [227]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [228]:
def load_pdf_files(data):
    """Load the PDF files matched by the "*.pdf" glob under `data`.

    A DirectoryLoader is pointed at `data`, restricted to the "*.pdf"
    pattern, and told to parse each match with PyPDFLoader.

    Args:
        data: Path of the directory to scan for PDF files.

    Returns:
        The result of `DirectoryLoader.load()`; the rest of the notebook
        treats it as a list of Document objects.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [229]:
# Raw string literal: the Windows path contains the backslash sequences
# `\A`, `\-` and `\d`, which are invalid escapes in a normal string literal
# and emit a warning on recent Python versions. The path value is unchanged.
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from a configurable data directory so the notebook runs elsewhere.
extrated_data = load_pdf_files(r"E:\AI,ML,DS\AI Projects\-Medical-Chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS-\data")

In [230]:
# Number of items returned by load_pdf_files for the data directory.
len(extrated_data)

637

In [231]:
from typing import List
from langchain.schema import Document

In [232]:
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build a slimmed-down copy of every Document in `docs`.

    Each returned Document keeps the original page_content, while its
    metadata is reduced to the single key 'source' (looked up with
    `metadata.get("source")`). The input list itself is not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [233]:
# Strip every loaded document down to its page content plus the 'source' metadata key.
minimal_docs = filter_to_minimal_docs(extrated_data)

In [234]:
# minimal_docs

In [235]:
# Split the data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks with RecursiveCharacterTextSplitter.

    Args:
        extracted_data: Documents to split (passed to `split_documents`).
        chunk_size: Value forwarded as the splitter's `chunk_size`.
            Defaults to 500, the value this notebook previously hard-coded.
        chunk_overlap: Value forwarded as the splitter's `chunk_overlap`.
            Defaults to 20, the value this notebook previously hard-coded.

    Returns:
        The chunks produced by `split_documents`.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [236]:
# Chunk the minimal documents and report how many chunks were produced.
# `texts_chunk` is the input later passed to PineconeVectorStore.from_documents.
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 5859


In [9]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the HuggingFace embedding model used by this notebook.

    Args:
        model_name: Model identifier passed to HuggingFaceEmbeddings.
            The default model returns 384-dimensional vectors, which is
            the dimension the Pinecone index in this notebook is created
            with; a model with a different output size needs a matching index.

    Returns:
        The constructed HuggingFaceEmbeddings instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)
# Shared embedding model instance; reused for the vector store and the retriever below.
embeddings = download_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')  #this model return 384 dimensions


In [238]:
# Sanity check: embed a short query so the resulting vector can be inspected.
vector = embeddings.embed_query("hello world")

In [239]:
# Length of the embedding vector; it should equal the `dimension` used when the Pinecone index is created.
len(vector)

384

In [38]:
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv before the API keys are read below
# (override=True is passed through to load_dotenv).
load_dotenv(override=True)

True

In [39]:
# Read the API keys from the environment (populated by load_dotenv above).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# os.environ only accepts string values, so assigning the None that
# os.getenv returns for a missing variable raises TypeError. Write a key
# back only when it is actually set; a missing key is left absent.
if PINECONE_API_KEY is not None:
    os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
if OPENAI_API_KEY is not None:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [40]:
from pinecone import Pinecone
# Create the Pinecone client with the key read from the environment above.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [43]:
from pinecone import ServerlessSpec
# Name of the Pinecone index used by the vector-store cells below.
# NOTE(review): "chatobot" looks like a typo of "chatbot"; renaming it would
# point at a different index, so confirm before changing.
index_name = "medical-chatobot"
# Create the index only when it does not exist yet, so re-running this cell
# does not attempt to create it a second time.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        # 384 matches the vector length reported for the embedding model above.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")

    )

# Handle to the index for direct access through the Pinecone client.
index = pc.Index(index_name)


In [42]:
from langchain_pinecone import PineconeVectorStore
# Build the vector store from the text chunks using the shared embedding model.
# Requires `texts_chunk` from the chunking cell above: the recorded NameError
# shows this cell was run in a kernel where that cell had not been executed.
# NOTE(review): presumably this call also embeds and uploads the chunks to the
# index — confirm, and run the notebook top to bottom so the index is populated.
docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embeddings,
    index_name=index_name
)

NameError: name 'texts_chunk' is not defined

In [44]:
from langchain_pinecone import PineconeVectorStore
# Connect to the index by name without passing any documents.
# This rebinds `docsearch`, replacing the store built by the from_documents cell.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,

)

In [45]:
# Retriever returning the 3 most similar chunks for a query.
# `search_type` is the keyword documented for as_retriever; the previous
# `docsearch_type` spelling is not a documented option, so the search type
# was not actually being selected by this call.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [46]:
# Smoke-test the retriever on its own, before wiring it into the RAG chain.
retrieved_docs = retriever.invoke("What is acne")

In [47]:
# Display the retrieved documents.
# NOTE(review): the recorded output is an empty list, i.e. nothing was retrieved
# for this query — verify the index has been populated before relying on the
# answers produced by the RAG chain further down.
retrieved_docs


[]

In [48]:
from langchain_openai import ChatOpenAI
import os

# Chat model served through OpenRouter's endpoint via the ChatOpenAI client.
# The key is read from OPENROUTER_API_KEY, not from OPENAI_API_KEY set earlier.
# NOTE(review): os.getenv returns None when OPENROUTER_API_KEY is unset — confirm
# how ChatOpenAI behaves in that case.
chatModel = ChatOpenAI(
    model="mistralai/mistral-7b-instruct",   # free OpenRouter model
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    openai_api_base="https://openrouter.ai/api/v1"
)

In [49]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [50]:
# System instructions for the assistant. The trailing "{context}" placeholder
# is presumably where the documents chain inserts the retrieved text — confirm
# against create_stuff_documents_chain.
system_prompt = (
    "You are a medical assistant for question-answering tasks. "
    "Use the retrieved context to answer the question. "
    "Always provide the best possible answer using the context, and do not provide empty answer even if brief. "
    "Use three sentences maximum and keep the answer concise.\n\n{context}"
)


# Two-message chat prompt: the system instructions above, then the user's
# question supplied through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [51]:
# Combine the chat model and prompt into a documents chain, then pair it with
# the retriever to form the chain queried in the cells below.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [52]:
# Ask the RAG chain a question and print the "answer" field of the response.
# NOTE(review): the following cells repeat this pattern with different
# questions; a small helper taking the question would remove the duplication.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

 Acromegaly and gigantism are both conditions caused by excessive growth hormone (GH) production, typically due to a pituitary gland tumor. Acromegaly occurs in adults, leading to enlargement of bones and tissues, particularly in the hands, feet, and face. Gigantism, on the other hand, affects children and adolescents before the growth plates close, resulting in excessive height and overall body growth.


In [53]:
# Ask the RAG chain about acne and print the "answer" field of the response.
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])

 Acne is a common skin condition that occurs when hair follicles become clogged with oil and dead skin cells. It often results in pimples, blackheads, and whiteheads, primarily on the face, chest, and back. Acne can be influenced by factors such as hormones, genetics, and lifestyle.


In [54]:
# Same topic as the previous query with slightly different wording, to compare the answers.
response = rag_chain.invoke({"input": "what is the acne?"})
print(response["answer"])

 Acne is a common skin condition characterized by the presence of pimples, blackheads, and whiteheads, typically occurring on the face, chest, and back. It is caused by the clogging of hair follicles with oil and dead skin cells, which can lead to inflammation and infection. Acne can range from mild to severe and is most common during adolescence but can affect people of all ages.


In [55]:
# Ask the RAG chain about bone and print the "answer" field of the response.
response = rag_chain.invoke({"input": "what is the bone?"})
print(response["answer"])

 The bone is a rigid organ that constitutes part of the vertebrate skeleton. It is composed of collagen and calcium phosphate, which gives it strength and flexibility. Bones serve multiple functions, including support, protection, and facilitating movement.
