In [2]:
%pwd

'/Users/krupa/Documents/Python Projects/LLM/MedicalChatbot-/resource'

In [3]:
import os 
os.chdir("../")

In [4]:
%pwd

'/Users/krupa/Documents/Python Projects/LLM/MedicalChatbot-'

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#extract text from pdf file
def load_pdf_files(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls= PyPDFLoader)
    
    documents = loader.load()
    return documents


In [5]:
extracted_data = load_pdf_files("/Users/krupa/Documents/Python Projects/LLM/MedicalChatbot-/Data")

In [6]:
extracted_data
len(extracted_data)

637

In [7]:
#filter operation
from typing import List
from langchain.schema import Document

def filter_to_minimal_documents(documents: List[Document]) -> List[Document]:
    #containing only source in metadata and page content 
    minimal_documents = []
    for doc in documents:
        minimal_doc = Document(
            page_content=doc.page_content,
            metadata={
                "source": doc.metadata.get("source", "")
            }
        )
        minimal_documents.append(minimal_doc)
    return minimal_documents
minimal_docs = filter_to_minimal_documents(extracted_data)
minimal_docs[1]

Document(metadata={'source': '/Users/krupa/Documents/Python Projects/LLM/MedicalChatbot-/Data/Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION')

In [8]:
#chunking the data
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size= 500, chunk_overlap=20)
    text = text_splitter.split_documents(minimal_docs)
    return text

chunked_data = text_split(minimal_docs)
print(f"Number of chunks: {len(chunked_data)}")

Number of chunks: 5859


In [9]:
#embedding model
from langchain.embeddings import HuggingFaceBgeEmbeddings


def download_embeddings():
    # downlload and return the embedding model
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embedding = HuggingFaceBgeEmbeddings(model_name=model_name)
    return embedding

embedding = download_embeddings()

  embedding = HuggingFaceBgeEmbeddings(model_name=model_name)


In [10]:
embedding

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

In [11]:
vector = embedding.embed_query("hello world")
print(vector)
print(f"Vector length: {len(vector)}")

[-0.010300861671566963, 0.18307925760746002, 0.03081122227013111, 0.004452868830412626, -0.0273361224681139, -0.033562541007995605, 0.03763148561120033, -0.03157336637377739, -0.0033910612110048532, -0.008950846269726753, 0.03803616017103195, -0.05129107087850571, 0.0003682838287204504, -0.023727070540189743, 0.09271015971899033, -0.027795815840363503, -0.03515257313847542, -0.003224184736609459, -0.07681784778833389, -0.05761215090751648, 0.07257598638534546, 0.1112855076789856, 0.01605852320790291, 0.015908457338809967, -0.08232702314853668, 0.007007331121712923, 0.029013117775321007, 0.00113868888001889, 0.11671742051839828, -0.03232735022902489, -0.03227164223790169, -0.0012590476544573903, 0.10591621696949005, 0.023600829765200615, 0.009664976969361305, 0.09834087640047073, 0.042936380952596664, -0.019547607749700546, 0.019267885014414787, -0.06417103856801987, 0.02392345666885376, -0.05288001149892807, -0.026469528675079346, 0.0055487132631242275, -0.017025168985128403, -0.030232

In [12]:
from dotenv import load_dotenv
load_dotenv()

True

In [15]:
import os 
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
#set the environment variables for Pinecone and OpenAI API keys
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENROUTER_API_KEY"] = OPENROUTER_API_KEY

In [16]:
from pinecone import Pinecone
pinconeapikey = PINECONE_API_KEY
pc = Pinecone(api_key=pinconeapikey)

In [17]:
pc

<pinecone.pinecone.Pinecone at 0x1792b41f0>

In [None]:
#creating databse
from pinecone import ServerlessSpec
index_name = "medicalchabot"
if not pc.has_index(index_name):
    pc.create_index(
                name=index_name,  
                dimension=384, #embedding vector length
                metric="cosine", #similarity metric
                spec =ServerlessSpec(cloud="aws", region="us-east-1")
           )
    
index = pc.Index(index_name)
    


In [18]:
#load exisiting index 
from langchain_pinecone import PineconeVectorStore


index_name = "medicalchabot"
docserach = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding,  
)  

In [90]:
from langchain_core.documents import Document

clean_docs = []
for d in chunked_data:
    md = dict(d.metadata or {})
    # Remove keys with None values
    md = {k: v for k, v in md.items() if v is not None}

    # Optional: ensure page_number is always an int if present
    if "page_number" in md and md["page_number"] is not None:
        md["page_number"] = int(md["page_number"])

    clean_docs.append(Document(page_content=d.page_content, metadata=md))

In [91]:
from langchain_pinecone import PineconeVectorStore
docserach = PineconeVectorStore.from_documents(
    documents = chunked_data, 
    embedding=embedding, 
    index_name=index_name
    )


In [19]:
retriever = docserach.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [20]:
retrieved_docs = retriever.invoke("What is acne?")
retrieved_docs

[Document(id='efa5575d-325b-4d06-b02f-1ef3573ee47d', metadata={'source': '/Users/krupa/Documents/Python Projects/LLM/MedicalChatbot-/Data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='0ba8e70c-86a8-425a-b059-49e885300740', metadata={'source': '/Users/krupa/Documents/Python Projects/LLM/MedicalChatbot-/Data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='7d3141b5-4676-400f-affa-eeba8670f98a', metadata={'source': '/Users/krupa/Documents/Python Projects/LLM/MedicalChatbot-/Data/Medical_book.pdf'}, page_content='Cliffs, NJ: Prentice Hall, 1995.\nGoldstein, Sanford M., and Rich

In [21]:
from langchain_openai import ChatOpenAI

chatModel = ChatOpenAI(
    model="meta-llama/llama-3-8b-instruct",
    temperature=0,
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    openai_api_base="https://openrouter.ai/api/v1",
)


In [22]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [23]:
system_prompt = """You are a helpful medical assistant. 
Use the following retrieved documents to answer the question. 
If you don't know the answer, say you don't know.
Use three maximum retrieved sentences to answer the question.
Answer cosine similarity question based on the retrieved documents. 
Do not use any information that is not present in the retrieved documents.
"\n\n""
"{context}"
"""
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)


question_answering_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [25]:
response = rag_chain.invoke({"input": "What is acne?"} )
print(response["answer"])

According to the retrieved documents, acne is a skin disorder in which the sebaceous glands become inflamed. It is also referred to as acne vulgaris.


In [27]:
response = rag_chain.invoke({"input": "i have a high fever what should i do"} )
print(response["answer"])


According to the retrieved documents, if you have a high fever, you should try to cool yourself. Additionally, if you suspect you have lymphangitis, you should call your doctor immediately or go to an emergency room.
