In [46]:
import sys
import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.embeddings import HuggingFaceEmbeddings

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore

In [252]:
from dotenv import load_dotenv
load_dotenv()

True

Extraction:

In [29]:
def pdf_loader(pdf):
    loader = DirectoryLoader(pdf, glob='*.pdf', loader_cls=PyPDFLoader)
    return loader.load()

In [30]:
extracted = pdf_loader(pdf = 'data/raw/')

Chunking:

In [31]:
def text_chunking(extracted):
    chunker = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = chunker.split_documents(extracted)
    return text_chunks

In [32]:
chunks = text_chunking(extracted)
print("Number of chunks:", len(chunks))

Number of chunks: 6356


In [33]:
chunks

[Document(metadata={'producer': 'Acrobat Distiller 10.1.16 (Macintosh)', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2017-05-02T19:09:54+01:00', 'gts_pdfxconformance': 'PDF/X-1a:2001', 'author': "Ian B. Wilkinson,Tim Raine,Kate Wiles,Anna Goodhart,Catriona Hall,Harriet O'Neill", 'gts_pdfxversion': 'PDF/X-1:2001', 'moddate': '2017-09-10T00:55:19+06:30', 'title': 'Oxford Handbook of Clinical Medicine', 'trapped': '/False', 'ebx_publisher': 'Oxford University Press USA', 'source': 'data\\raw\\Oxford_Handbook_of_Clinical_Medicine.pdf', 'total_pages': 911, 'page': 1, 'page_label': '2'}, page_content='OXFORD HANDBOOK OF \nCLINICAL\nMEDICINE\nTENTH EDITION\nIan B. Wilkinson\nTim Raine\nKate Wiles\nAnna Goodhart\nCatriona Hall \nHarriet O’Neill\n3\n_OHCM_10e.indb   i_OHCM_10e.indb   i 02/05/2017   19:0602/05/2017   19:06'),
 Document(metadata={'producer': 'Acrobat Distiller 10.1.16 (Macintosh)', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2017-05-02T19:09:54

Embeddings:

In [34]:
def hf_embeds():
    embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-mpnet-base-v2')
    return embeddings

In [35]:
embeddings = hf_embeds()

In [36]:
query_embedding = embeddings.embed_query("the quick brown fox jumps over the lazy dog")
query_embedding

[-0.024103570729494095,
 -0.0020989729091525078,
 -0.0024423568975180387,
 -0.011133186519145966,
 0.019624071195721626,
 0.03319194167852402,
 -0.011965285055339336,
 0.024153107777237892,
 -0.03536922484636307,
 -0.0382525660097599,
 0.01748158037662506,
 0.05159488692879677,
 -0.0037229375448077917,
 0.05871324986219406,
 0.03909457102417946,
 -0.008387899957597256,
 0.05392386019229889,
 0.004543159622699022,
 -0.020542755722999573,
 -0.04200758412480354,
 -0.024228837341070175,
 0.0049254633486270905,
 0.004378827754408121,
 -0.019104639068245888,
 -0.02993282489478588,
 0.03004559315741062,
 0.021910808980464935,
 0.007841000333428383,
 -0.005183733534067869,
 -0.005088144447654486,
 0.0007611141772940755,
 0.014786995016038418,
 -0.03338577598333359,
 0.057806212455034256,
 1.525625521026086e-06,
 -0.025452421978116035,
 0.01325868908315897,
 0.032221611589193344,
 0.11303336173295975,
 0.00641715619713068,
 -0.019862959161400795,
 -0.027733901515603065,
 -0.071344755589962,
 -0

In [37]:
len(query_embedding)

768

Pinecone DB:

In [48]:
PINECONE_DB_KEY = os.environ.get('PINECONE_DB_KEY')
os.environ["PINECONE_API_KEY"] = PINECONE_DB_KEY
PINECONE_DB_KEY

'pcsk_6gD5mB_4nGMrAo4FYKgfLN7cVWgTeyVP9NSmfpw23wvXkwGwxzfhT3YcqFDxbBbLaTiPE9'

In [43]:
pc = Pinecone(api_key=PINECONE_DB_KEY)

In [44]:
index_name = "niramaya"
pc.create_index(
    name = index_name,
    dimension = 768, 
    metric = 'cosine', 
    spec=ServerlessSpec(
        cloud='aws', 
        region = "us-east-1"
    )
)

{
    "name": "niramaya",
    "metric": "cosine",
    "host": "niramaya-94s9kys.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [49]:
doc_search = PineconeVectorStore.from_documents(
    documents = chunks, 
    index_name = index_name, 
    embedding= embeddings
)

In [50]:
doc_search

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x16112b48fa0>

In [56]:
retriever = doc_search.as_retriever(search_type = "similarity", search_kwargs={"k":5})

In [61]:
retriever_docs = retriever.invoke("What is Botulism?")

In [62]:
retriever_docs

[Document(id='2c8e2b0a-d4c8-4d09-a319-9f06dc84faed', metadata={'author': "Ian B. Wilkinson,Tim Raine,Kate Wiles,Anna Goodhart,Catriona Hall,Harriet O'Neill", 'creationdate': '2017-05-02T19:09:54+01:00', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'ebx_publisher': 'Oxford University Press USA', 'gts_pdfxconformance': 'PDF/X-1a:2001', 'gts_pdfxversion': 'PDF/X-1:2001', 'moddate': '2017-09-10T00:55:19+06:30', 'page': 449.0, 'page_label': '450', 'producer': 'Acrobat Distiller 10.1.16 (Macintosh)', 'source': 'data\\raw\\Oxford_Handbook_of_Clinical_Medicine.pdf', 'title': 'Oxford Handbook of Clinical Medicine', 'total_pages': 911.0, 'trapped': '/False'}, page_content='436\nInfectious diseases\nNeurological disease\nBotulism\nNeuroparalytic infection caused by neurotoxin from anaerobic, spore-forming \nClostridium  botulinum (rarely C. butyricum, C. barattii ). Food-borne due to toxin \nproduction in food (botulus is Latin for sausage), or wound botulism due to spore \ngermination in wound (

LLM parsing:

In [68]:
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [85]:
from langchain_google_genai import ChatGoogleGenerativeAI, HarmBlockThreshold, HarmCategory
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

In [209]:
llm = ChatGoogleGenerativeAI(
    model = "gemini-2.0-flash", 
    google_api_key = GEMINI_API_KEY,
    temperature=0.2,
    max_tokens=1000,   
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE
    },
    )

In [253]:
language = "English"

In [254]:
SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT")
SYSTEM_PROMPT

"You are Niramaya, an advanced AI vernacular diagnostic assistant for medical information, \n    specializing in multilingual, verncular, and context-aware healthcare support.\n    Your goal is to provide accurate, evidence-based medical insights while ensuring \n    accessibility and inclusivity for all users across 22 Indian consitutional languages.\n\n    Instructions:\n    Rely only on the context give and on the validate medical literature, including guidelines from WHO, \n    ICMR, papers, and other\n    authoritative sources. DO NOT PROVIDE A DIAGNOSIS. Instead, list probable causes (if not then reassurance), offer risk factors and next steps\n    Respond in {language}, ensuring medical jargon is explained in simple terms. If language is not specified\n    default to English with simple explanations.\n    Ensure responses are empathic, non-alarming and supportive. If symptoms indicate urgency, recommend immediate\n    medical consultation. Avoid providing prescriptions or treatm

In [255]:
system_prompt = (
    str(SYSTEM_PROMPT)+ "CONTEXT : {context}"+"LANGUAGE: {language}"
)

In [257]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [258]:
qa_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, qa_chain)

In [259]:
response = rag_chain.invoke({"input": "I am suffering from high grade fever, runny nose and headache. What should I do?", "language": language})
print(response["answer"])

These symptoms can have different causes, some more serious than others. It could be a common cold, the flu, or another type of infection.

Risk Factors:
*   Exposure to viruses or bacteria
*   Weakened immune system
*   Seasonal changes

Next Steps:

1.  Rest and Hydration: Get plenty of rest and drink lots of fluids to help your body recover.
2.  Over-the-Counter Medication: You can take over-the-counter medications like paracetamol or ibuprofen for fever and headache. Follow the instructions on the label.
3.  Monitor Your Symptoms: Keep an eye on your symptoms. If they worsen or you develop new symptoms like stiff neck, confusion, or difficulty breathing, it's important to seek medical attention right away.
4.  Consult a Doctor: If your symptoms persist for more than a few days or if you are concerned, it's best to consult a doctor for further evaluation and guidance.

**Triage Severity Score (TSS):** 🟡 TSS-2 (Moderate): Persistent or discomforting symptoms. Recommend medical advice