# Setting up Env

In [3]:
import json
import os
import pickle
import numpy as np
from typing import List, Dict
from sentence_transformers import SentenceTransformer, util

In [4]:
!pip install transformers accelerate huggingface-hub sentence-transformers faiss-cpu streamlit python-dotenv --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m100.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m124.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m118.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m120.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6

In [5]:
# using this lightweight sentence transformer
import faiss
MODEL_NAME = 'all-MiniLM-L6-v2'
TOP_K=3

# RAG Pipeline

### Dataloading and Spliting

In [6]:
def load_data(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

kb_items = load_data("/kaggle/input/medicaldataset-nlp-a04/combined_rag_data.json")

medical_kb_texts = []
patient_case_texts = []
medical_kb_items = []
patient_case_items = []

for item in kb_items:
    if "medicalKB" in item:
        medical_kb_texts.append(item["medicalKB"])
        medical_kb_items.append(item)
    elif "patient_case" in item:
        txt = " ".join(item["patient_case"]["inputs"].values())
        patient_case_texts.append(txt)
        patient_case_items.append(item)

In [7]:
model = SentenceTransformer(MODEL_NAME, device="cuda")

def embed(texts):
    return model.encode(
        texts,
        batch_size=16,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    ).astype("float32")

kb_emb = embed(medical_kb_texts)
case_emb = embed(patient_case_texts)

def build_index(emb):
    dim = emb.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(emb)
    return index

kb_index = build_index(kb_emb)
case_index = build_index(case_emb)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [8]:
faiss.write_index(kb_index, "kb_index.faiss")
faiss.write_index(case_index, "patient_index.faiss")

with open("kb_items.pkl", "wb") as f:
    pickle.dump(medical_kb_items, f)

with open("patient_items.pkl", "wb") as f:
    pickle.dump(patient_case_items, f)

In [9]:
def search_top_k(query, k=TOP_K):
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")

    kb_scores, kb_idx = kb_index.search(q, k)
    case_scores, case_idx = case_index.search(q, k)

    kb_res = [(medical_kb_items[i], float(kb_scores[0][j])) for j, i in enumerate(kb_idx[0])]
    case_res = [(patient_case_items[i], float(case_scores[0][j])) for j, i in enumerate(case_idx[0])]

    return kb_res, case_res


In [10]:
def construct_prompt(query, kb_res, case_res):
    ctx = "Relevant Medical Knowledge:\n"
    for item, score in kb_res:
        ctx += f"- {item['medicalKB'][:300]}... (score {score:.2f})\n"

    ctx += "\nRelevant Patient Cases:\n"
    for item, score in case_res:
        txt = " ".join(item["patient_case"]["inputs"].values())
        ctx += f"- {txt[:300]}... (score {score:.2f})\n"

    prompt = f"""
You are a medical expert.
Use the given retrieved information to answer.

{ctx}

Patient Query:
{query}

Explain briefly, then give final answer.
"""
    return prompt


# Preparing HuggingFace LLM

In [11]:
from transformers import AutoTokenizer,AutoModelForCausalLM
import torch
LLM_Model="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
tokenizer=AutoTokenizer.from_pretrained(LLM_Model,use_fast=False)
llm=AutoModelForCausalLM.from_pretrained(LLM_Model,torch_dtype=torch.float16,device_map="auto")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [12]:
def llm_generate(prompt, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    output = llm.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [13]:
def run_rag(patient_case):
    query = patient_case["input_text"]
    kb_res, case_res = search_top_k(query)
    prompt = construct_prompt(query, kb_res, case_res)
    answer = llm_generate(prompt)

    return answer, kb_res, case_res, prompt

In [14]:
def display_retrieved(kb_res, case_res):

    print("\n Retrieved Medical Knowledge ")
    for item, score in kb_res:
        print("\nScore:", score)
        print(item["medicalKB"][:350])

    print("\n Retrieved Patient Cases")
    for item, score in case_res:
        print("\nScore:", score)
        txt = " ".join(item["patient_case"]["inputs"].values())
        print(txt[:350])


# Model Inference

In [15]:
patient_case = {
    "input_text": "A 65-year-old male with aggressive behavior and recent falls..."
}

answer, kb_res, case_res, prompt = run_rag(patient_case)

print("\nRAG System final response \n")
print(answer)

display_retrieved(kb_res, case_res)


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



RAG System final response 


You are a medical expert.
Use the given retrieved information to answer.

Relevant Medical Knowledge:
- knowledge/Suspected Alzheimer/Symptoms: Cognitive decline, including memory loss and deterioration of other functions; impairment in at least two cognitive areas; progressive decline in memory and cognition without loss of consciousness; typically occurs between ages 40 and 90, with higher frequency... (score 0.32)
- knowledge/Suspected Stroke/Signs: facial drooping, arm weakness, speech difficulties, vision problems, severe headache, dizziness or loss of balance, confusion, difficulty walking, numbness or paralysis on one side of the body, sudden behavioral change, and loss of consciousness... (score 0.30)
- diagnostic/Suspected COPD/COPD/Moderate COPD: []... (score 0.26)

Relevant Patient Cases:
- Dementia with aggresive behavior Man with CAD, prostate cancer, meningioma, and gastritis, and two recent ER visits for aggressive behavior, sent from his nu

# Using API Models

In [None]:
!pip install google-generativeai python-dotenv --quiet

import os
from dotenv import load_dotenv
import google.generativeai as genai

# Get API key
api_key = "api_key"


genai.configure(api_key=api_key)

# Example prompt
prompt = """
First explain what it diesase could be then give all the reasons
Explain the symptoms of a 60-year-old male with sudden chest pain radiating to the back.
Use headings or bullet points where needed.
Give response in plain text without * Behave like medical assistant
"""

# Generate response
model = genai.GenerativeModel("gemini-2.0-flash")
response = model.generate_content(prompt)

print(response.text)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.9/319.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
google-cloud-translate 3.12.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 5.29.5 which is incompatible.
ray 2.51.1 requires click!=8.3.0,>=7.0, but you have click 8.3.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.
pydrive2 1.21.3 requires cryptography<44, but you have cryptography 46.0.3 which is incompatible.
pydrive2 1.21.3 requires pyOpenSSL<=24.2.1,>=19.1.0, but you have pyopenssl 25.3.0 which is incompatible.
gcsfs 2025.3.