In [2]:
# Import required libraries
import os
import requests
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone


In [3]:
%pwd

'c:\\Users\\ASUS\\OneDrive\\Desktop\\skill\\project\\Medical_chatbot\\research'

In [21]:
# Load environment variables via load_dotenv(), then read the two API keys
# this notebook uses. Either value is None when the variable is not set.
load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
GROQ_API_KEY = os.getenv('GROQ_API_KEY')


In [4]:

# Move from the notebook folder ("research", see the %pwd output above) up to
# the project root so relative paths such as "data/" resolve.
# Guarded so that re-running this cell does not climb one directory higher
# on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
%pwd

'c:\\Users\\ASUS\\OneDrive\\Desktop\\skill\\project\\Medical_chatbot'

In [6]:
# 2. Define a function to load PDFs

def load_pdf_file(data):
    """Load the PDF files found under the ``data`` directory.

    A ``DirectoryLoader`` is created for ``data`` with the glob ``*.pdf`` and
    ``PyPDFLoader`` as the per-file loader class.

    Args:
        data: Path of the directory to scan.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()
                             


In [7]:
%pwd

'c:\\Users\\ASUS\\OneDrive\\Desktop\\skill\\project\\Medical_chatbot'

In [9]:
# 3. Extract text from PDF
# Load everything under the project's data/ folder and report how many
# documents came back.
extracted_data = load_pdf_file("data/")
print(f"Extracted Data Length: {len(extracted_data)}")

Extracted Data Length: 4505


In [12]:
# Show only the number of loaded documents instead of dumping the whole list.
print(f"Extracted Data Length: {len(extracted_data)}")

Extracted Data Length: 4505


In [16]:
# 4. Split data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data: Documents to split (for example the result of
            ``load_pdf_file``).
        chunk_size: Passed through as the splitter's ``chunk_size``.
            Defaults to 500, the value this notebook originally hard-coded.
        chunk_overlap: Passed through as the splitter's ``chunk_overlap``.
            Defaults to 20, the value this notebook originally hard-coded.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [17]:
# Chunk the loaded documents with the default splitter settings and report
# how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"Length of Text Chunks {len(text_chunks)}")

Length of Text Chunks 39994


In [None]:
# text_chunks

In [18]:
#5 Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Create the embedding model used by the rest of the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [19]:
# from langchain.embeddings import HuggingFaceEmbeddings

# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# text = "This is a test sentence."
# vector = embeddings.embed_query(text)

# print(vector[:5])  # Print first 5 values to verify output


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



[0.08429645001888275, 0.05795374885201454, 0.004493365995585918, 0.10582111775875092, 0.007083425763994455]


In [34]:
# Instantiate the embedding model once; later cells pass this `embeddings`
# object to the Pinecone vector store.
embeddings = download_hugging_face_embeddings()


  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')





In [20]:
# Sanity-check the embedding size by reusing the `embeddings` model created
# in the cell above, rather than constructing a second HuggingFaceEmbeddings
# instance just for this one query.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))


Length 384


In [None]:
# 6. Initialize Pinecone

# NOTE: this alias rebinds the name `Pinecone`, which the first cell already
# imported from langchain.vectorstores; after this cell `Pinecone` refers to
# the gRPC client class.
from pinecone.grpc import PineconeGRPC as Pinecone 
from pinecone import ServerlessSpec


pc = Pinecone(api_key=PINECONE_API_KEY)

# NOTE(review): "healtcare" looks like a typo for "healthcare". It is left
# unchanged because later cells look the index up by this exact name.
index_name = "healtcare-chatbot"

# Create the index only when it does not exist yet, so the cell can be
# re-run without failing on an already-created index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # matches the embedding length printed above ("Length 384")
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [26]:
# Publish both API keys through the process environment, in the same order
# as before. Assigning None raises TypeError, exactly as a direct
# os.environ[...] assignment would.
os.environ.update(
    {
        "PINECONE_API_KEY": PINECONE_API_KEY,
        "GROQ_API_KEY": GROQ_API_KEY,
    }
)

In [None]:
# 7. Embed chunks and upload to Pinecone
from langchain_pinecone import PineconeVectorStore

# An unconditional upload stores every chunk again each time the cell runs.
# The retrieval output further down shows identical chunks under different
# ids, which is consistent with that having happened. So upload only while
# the index is still empty; otherwise just attach to the existing vectors.
if pc.Index(index_name).describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [27]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Attach to the already-populated index named by `index_name`; nothing is
# uploaded here. The same embedding model is passed so that queries are
# embedded the same way as the stored chunks.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [28]:
# Smoke test of the vector store: run a similarity search for an arbitrary
# query and display the documents that come back.
docsearch.search("Hello world", search_type="similarity")



[Document(id='d5eaf899-bf64-4392-9b47-767a8a52b88d', metadata={'page': 2928.0, 'page_label': '2899', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf'}, page_content='Pica'),
 Document(id='2327cba6-e039-416f-a86c-dbe1e41d1294', metadata={'page': 2482.0, 'page_label': '2453', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf'}, page_content='Minoxidil'),
 Document(id='42f92212-9332-4165-b9ff-36f91317358a', metadata={'page': 1616.0, 'page_label': '1587', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf'}, page_content='genuinely good and beautiful things in life.\n/C15Finding a purpose for one’s life and expressing one’s\nindividuality in fulfilling that purpose.\n/C15Keeping a healthy sense of modesty about one’s\ngoals or achievements.\nResources\nBOOKS\nAmerican Psychiatric Association.Diagnostic and Statistical\nManual of Mental Disorders.4th ed., revi

In [29]:
# Expose the vector store as a retriever using similarity search with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)


In [30]:
# Try the retriever on a sample medical question before wiring it into a chain.
retrieved_docs = retriever.invoke("What is Acne?")


In [31]:
# Display the retrieved documents to inspect what the retriever returned.
retrieved_docs

[Document(id='7dce78e2-0871-413e-9002-04d9cbae586b', metadata={'page': 55.0, 'page_label': '26', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf'}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='1cc63aa1-f709-481e-bdca-8f971debf13c', metadata={'page': 55.0, 'page_label': '26', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf'}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='1b5a1d99-021f-4495-8a96-0301b1345887', metadata={'page': 55.0, 'page_label': '26', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf'}, page_content='Sebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSebum— An oily skin moisturizer produced by\nsebaceous glands.\nT

In [32]:
from langchain_groq import ChatGroq

# Chat model used to generate answers; configured with temperature=0.
groq_llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0,
)



In [34]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate 

# System instructions for the answering model. Adjacent string literals are
# concatenated with no separator, so every fragment that continues a sentence
# must end with an explicit space; otherwise words and sentences fuse
# together (e.g. "keep theanswer"). The {context} placeholder is set apart
# by a blank line so the instructions do not run into the retrieved text.
system_prompt = (
    "You are an assistant for question - answering. You have to answer the questions related to healthcare or medicare. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, you can say 'I don't know'. "
    "Use three sentences maximum and keep the "
    "answer concise and clear."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions first, then the
# user's question supplied through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)


In [35]:
# Build the document-combining chain from the Groq LLM and the prompt template.
question_answering_chain = create_stuff_documents_chain(groq_llm,prompt)
# Pair the retriever with that chain; the result is invoked with an "input" key below.
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [38]:
# Ask the RAG chain a sample question and print only the generated answer.
response = rag_chain.invoke(
    {"input": "What is Acne?,give only medicine name"}
)
print(response["answer"])

Isotretinoin (Accutane) and benzoyl peroxide or tretinoin.
