<a href="https://colab.research.google.com/github/hemhalatha/ML_projects/blob/main/rag_medical_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

!pip install faiss-cpu sentence-transformers beautifulsoup4 requests tqdm


Mounted at /content/drive
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.0


In [4]:
import requests
from bs4 import BeautifulSoup

def scrape_medlineplus(article_id):
    url = f"https://medlineplus.gov/ency/article/{article_id}.htm"
    response = requests.get(url)

    if response.status_code != 200:
        print("❌ Failed:", article_id)
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract the main text
    main_sections = soup.find_all("div", {"id": ["ency_summary", "ency_content"]})

    full_text = ""
    for section in main_sections:
        full_text += section.get_text(separator=" ", strip=True) + "\n\n"

    return full_text


In [6]:
article_ids = [
    "002070", "000085", "000201", "000162", "003025", "002311",
    "000166", "000195", "000133", "000150", "001146", "000214",
    "000151", "000252", "000754", "000422", "000244", "000436",
    "002305", "000790", "001090", "000707", "001241", "000231",
    "001202", "000951", "001111", "000060", "000073", "000202",
    "000243", "001106", "000271", "001417", "000432", "002140",
    "002237", "000048", "004012", "002341", "002550", "002275",
    "000437", "001114", "003023", "001116", "001326", "001213",
    "001179", "000817", "003025", "000178", "000188", "000809",
    # add more if you want 200, 300, etc.
]


In [7]:
from tqdm import tqdm

dataset = []

for aid in tqdm(article_ids):
    text = scrape_medlineplus(aid)
    if text:
        dataset.append({"id": aid, "text": text})

len(dataset)


 22%|██▏       | 12/54 [00:02<00:11,  3.79it/s]

❌ Failed: 001146


 63%|██████▎   | 34/54 [00:07<00:06,  3.31it/s]

❌ Failed: 000271


 67%|██████▋   | 36/54 [00:08<00:06,  2.58it/s]

❌ Failed: 002140


 76%|███████▌  | 41/54 [00:09<00:03,  3.41it/s]

❌ Failed: 002550


 78%|███████▊  | 42/54 [00:10<00:04,  2.43it/s]

❌ Failed: 002275


100%|██████████| 54/54 [00:11<00:00,  4.53it/s]


49

In [8]:
def chunk_text(text, chunk_size=300):
    words = text.split()
    chunks = []

    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk)

    return chunks

documents = []

for item in dataset:
    chunks = chunk_text(item["text"])
    for chunk in chunks:
        documents.append({"id": item["id"], "chunk": chunk})

len(documents)


53

In [9]:
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("all-MiniLM-L6-v2")

texts = [d["chunk"] for d in documents]
embeddings = embed_model.encode(texts, convert_to_numpy=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
import faiss
import numpy as np
import json
import os

# Convert to float32
embeddings = embeddings.astype("float32")

dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)


In [11]:
SAVE_DIR = "/content/drive/MyDrive/medical_rag_db"
os.makedirs(SAVE_DIR, exist_ok=True)

# Save FAISS index
faiss.write_index(index, f"{SAVE_DIR}/faiss_index.bin")

# Save metadata
with open(f"{SAVE_DIR}/metadata.json", "w") as f:
    json.dump(documents, f)

print("Saved successfully!")


Saved successfully!


In [12]:
# Load FAISS index from Drive
index = faiss.read_index("/content/drive/MyDrive/medical_rag_db/faiss_index.bin")

# Load metadata
import json
with open("/content/drive/MyDrive/medical_rag_db/metadata.json") as f:
    metadata = json.load(f)


In [13]:
query = "What are the symptoms of cold?"
q_emb = embed_model.encode([query]).astype("float32")

D, I = index.search(q_emb, 3)

for idx in I[0]:
    print(metadata[idx]["chunk"])
    print("\n---\n")


Antibiotics DO NOT cure colds and flus.

---

Vital signs reflect essential body functions, including your heartbeat rate, breathing rate, temperature, and blood pressure. Your health care provider may measure, or monitor your vital signs to check your level of physical functioning. Normal vital signs change with age, sex, weight, exercise capability, and overall health. Normal vital sign ranges for the average healthy adult while resting are: Blood pressure: between 90/60 mmHg and 120/80 mmHg Breathing: 12 to 18 breaths per minute Pulse: 60 to 100 beats per minute Temperature: 97.7Â°F to 99.1Â°F (36.5Â°C to 37.3Â°C); average 98.6Â°F (37Â°C)

---

Acute mountain sickness is an illness that can affect mountain climbers, hikers, skiers, or travelers at high altitudes, usually above 8000 feet (2400 meters).

---

