In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL template for a Mayo Clinic supplement article; "{}" is replaced with a
# slug of the form "<name>/art-<id>".
BASE_URL = "https://www.mayoclinic.org/drugs-supplements-{}"

# Headers sent with every page request: a desktop Chrome user agent string
# and an English language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

def fetch_page(slug):
    """Download one supplement page and return it as parsed HTML.

    Parameters
    ----------
    slug : str
        Value substituted into ``BASE_URL`` to build the page address.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in "html.parser".

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status()`` when the server answers with
        an error status.
    """
    response = requests.get(
        BASE_URL.format(slug),
        headers=HEADERS,
        timeout=10,  # seconds; passed straight to requests.get
    )
    response.raise_for_status()
    page_html = response.text
    return BeautifulSoup(page_html, "html.parser")


In [13]:
# Slugs of the supplement articles to scrape, one per line.
# Order matters: the cell that enumerates this list derives each SUP## id
# from the slug's position.
supplements_links = [
    "acidophilus/art-20361967",
    "aloe/art-20362267",
    "coenzyme-q10/art-20362602",
    "creatine/art-20347591",
    "dhea/art-20364199",
    "primrose/art-20364500",
    "fish-oil/art-20364810",
    "flaxseed-and-flaxseed-oil/art-20366457",
    "folate/art-20364625",
    "ginkgo/art-20362032",
    "glucosamine/art-20362874",
    "honey/art-20363819",
    "l-arginine/art-20364681",
    "marijuana/art-20364974",
    "melatonin/art-20363071",
    "milk-thistle/art-20362885",
    "niacin/art-20364984",
    "red-yeast-rice/art-20363074",
    "same/art-20364924",
    "st-johns-wort/art-20362212",
    "tea-tree-oil/art-20364246",
    "vitamin-a/art-20365945",
    "vitamin-b6/art-20363468",
    "vitamin-b12/art-20363663",
    "vitamin-c/art-20363932",
    "vitamin-d/art-20363792",
    "vitamin-e/art-20364144",
    "zinc/art-20366112",
]


In [14]:
def extract_sections(soup):
    """Group a page's paragraph and list-item text under its headings.

    Every h2/h3/h4/p/li element is visited in the order ``find_all`` returns
    them. A heading starts (or resumes) a section keyed by its stripped text;
    each following p/li contributes its stripped text to that section.
    Text that appears before the first heading is ignored.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find_all(list_of_tag_names)``; each returned
        element must provide ``.name`` and ``.get_text(strip=True)``.

    Returns
    -------
    dict[str, list[str]]
        Heading text -> list of text fragments collected under it.
    """
    sections = {}
    current_heading = None

    for el in soup.find_all(["h2", "h3", "h4", "p", "li"]):
        text = el.get_text(strip=True)
        if el.name in ("h2", "h3", "h4"):
            current_heading = text
            # setdefault instead of plain assignment: when the same heading
            # text shows up again, keep what was already collected under it
            # rather than resetting the list.
            sections.setdefault(current_heading, [])
        elif current_heading:
            sections[current_heading].append(text)
    return sections


In [15]:
import re
def normalize_data(slug, sections):
    """
    Normalize Mayo Clinic supplement data into a flat, CSV-safe structure.

    Parameters
    ----------
    slug : str
        Path fragment such as "zinc/art-20366112". The part before the first
        "/" is title-cased to form the name.
    sections : dict[str, list[str]]
        Heading -> list of text lines. Only the "Overview",
        "Safety and side effects" and "Interactions" keys are read; a missing
        key is treated as an empty list.

    Returns
    -------
    dict[str, str]
        Keys "name", "category", "safety_info", "drug_interactions" and
        "contraindications". Multi-valued fields are joined with " | ".
    """

    # -------------------------
    # NAME
    # -------------------------
    name = slug.split("/")[0].title()

    # -------------------------
    # CATEGORY (first matching keyword in the overview text wins)
    # -------------------------
    overview_text = " ".join(sections.get("Overview", [])).lower()

    if "vitamin" in overview_text:
        category = "Vitamin"
    elif "mineral" in overview_text:
        category = "Mineral"
    elif any(w in overview_text for w in ["plant", "herb", "extract", "gel", "latex"]):
        category = "Herb / Plant-based"
    else:
        category = "Supplement"

    # -------------------------
    # SAFETY INFO
    # -------------------------
    safety_lines = sections.get("Safety and side effects", [])
    safety_info = " ".join(safety_lines)

    # -------------------------
    # CONTRAINDICATIONS: safety lines mentioning any of these keywords
    # -------------------------
    contraindication_keywords = [
        "should not",
        "do not take",
        "do not use",
        "avoid",
        "unsafe",
        "pregnant",
        "breastfeeding",
        "children",
        "surgery",
        "kidney",
        "liver"
    ]

    contraindications = [
        line.strip()
        for line in safety_lines
        if any(k in line.lower() for k in contraindication_keywords)
    ]
    contraindications_str = " | ".join(contraindications)

    # -------------------------
    # DRUG INTERACTIONS
    # -------------------------
    # Form-error notice that can be picked up along with the real content.
    unwanted_text = "There is a problem with information submitted for this request"

    interaction_entries = []
    for line in sections.get("Interactions", []):
        # Compare against a whitespace-collapsed copy: the notice can arrive
        # wrapped over several indented lines, and a plain substring test on
        # the raw line would then miss it. Only the comparison is collapsed;
        # the line itself is emitted unchanged.
        if unwanted_text in " ".join(line.split()):
            continue

        # Extract drug name (best-effort): text before the first "(" if the
        # line starts that way, otherwise everything before the first ".".
        match = re.match(r"^([A-Za-z0-9 ,\-]+)\s*\(", line)
        drug = match.group(1).strip() if match else line.split(".")[0]

        # Severity heuristic based on keywords in the line.
        lower = line.lower()
        if any(w in lower for w in ["fatal", "kidney failure", "cancer"]):
            severity = "Severe"
        elif any(w in lower for w in ["bleeding", "hypoglycemia", "electrolyte"]):
            severity = "Moderate"
        else:
            severity = "Mild"

        interaction_entries.append(f"{drug} ({severity}): {line.strip()}")

    drug_interactions_str = " | ".join(interaction_entries)

    # Collapse any doubled separators and trim the ends.
    drug_interactions_str = re.sub(r'\s*\|\s*\|\s*', ' | ', drug_interactions_str).strip()

    return {
        "name": name,
        "category": category,
        "safety_info": safety_info,
        "drug_interactions": drug_interactions_str,
        "contraindications": contraindications_str
    }


In [53]:
# Fetch, section and normalize every supplement page: one record per slug.
all_data = [
    normalize_data(slug, extract_sections(fetch_page(slug)))
    for slug in supplements_links
]

# Convert to DataFrame
df = pd.DataFrame(all_data)


In [54]:
# Save the raw scrape as CSV.
# Written to "raw_mayo_data.csv" because that is the file the
# dataset-building cell reads; that cell is the one that writes
# "supplements_data.csv", so saving the raw scrape under that name would
# only be overwritten and leave the reader without its input.
df.to_csv("raw_mayo_data.csv", index=False)


In [None]:
# Generate Supplements Dataset
import pandas as pd

# Load the raw scrape without modifying it afterwards.
raw_df = pd.read_csv("raw_mayo_data.csv")

# Sequential ids in row order, zero-padded to two digits: SUP01, SUP02, ...
sup_ids = [f"SUP{i:02d}" for i in range(1, len(raw_df) + 1)]

# Attach the ids and keep only the id and name columns.
df = raw_df.assign(sup_id=sup_ids)[["sup_id", "name"]]

df.to_csv("supplements_data.csv", index=False)


In [18]:
import re


# A sentence is considered only if it contains one of these cue words.
# Word boundaries keep "because" from counting as "cause".
_SYMPTOM_TRIGGER_RE = re.compile(
    r"\b(?:caus(?:e|es|ed|ing)|side effects?|(?:can|may) lead to|(?:can|may) result in)\b",
    re.IGNORECASE,
)

# Tried in order; the first pattern that matches supplies the text after the
# cue (named group "rest"). Built once instead of on every sentence.
_SYMPTOM_EXTRACT_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\bside effects? (?:include|may include|might include) (?P<rest>.+)",
        r"\b(?:can|may|might) cause (?P<rest>.+)",
        r"\bcaus(?:e|es|ing) (?P<rest>.+)",
        r"\bcan lead to (?P<rest>.+)",
        r"\bmay lead to (?P<rest>.+)",
        r"\bcan result in (?P<rest>.+)",
        r"\bmay result in (?P<rest>.+)",
    )
)


def extract_symptom_phrases(text: str):
    """Heuristically extract short symptom-like phrases from safety text.

    Pure-Python, no external NLP models.

    The text is split into rough sentences on ".", "!" and "?". Sentences
    without a cue word ("cause", "side effects", "can lead to", ...) are
    skipped. For the rest, the text following the cue (or, if no extraction
    pattern fits, the text after the first comma, or the whole sentence when
    there is no comma) is split on commas, " and " and " or ". Pieces that
    contain a digit or are not 1-4 words long are dropped.

    Parameters
    ----------
    text : str
        Free text. Falsy values (``None``, ``""``) give an empty result;
        anything else is converted with ``str()``.

    Returns
    -------
    list[str]
        Lower-cased phrases, de-duplicated, in order of first appearance.
    """
    if not text:
        return []

    text = str(text).replace("\n", " ")

    phrases = []

    # Rough sentence split
    for sent in re.split(r"[.!?]", text):
        s = sent.strip()
        if not s or not _SYMPTOM_TRIGGER_RE.search(s):
            continue

        # Match case-insensitively on the sentence itself so the captured
        # text never depends on offsets taken from a lower-cased copy.
        extracted = None
        for pat in _SYMPTOM_EXTRACT_RES:
            m = pat.search(s)
            if m:
                extracted = m.group("rest")
                break

        if not extracted:
            # Fallback: use entire sentence body after first comma
            extracted = s.split(",", 1)[-1]

        # Now split the extracted span into candidate phrases
        for chunk in re.split(r",| and | or ", extracted):
            phrase = chunk.strip().strip("'\"()[] ")
            phrase = re.sub(r"^(such as|including|like) ", "", phrase, flags=re.IGNORECASE)
            if not phrase:
                continue

            # Filter: no digits, reasonable length
            if any(ch.isdigit() for ch in phrase):
                continue
            if not (1 <= len(phrase.split()) <= 4):
                continue

            phrases.append(phrase.lower())

    # Deduplicate while preserving order
    return list(dict.fromkeys(phrases))



In [None]:
# Web-scrape Mayo Clinic again to build symptom vocabulary and mappings

all_symptom_phrases = set()
supp_symptom_rows = []

for idx, slug in enumerate(supplements_links):
    # Ids follow list position: first slug -> SUP01, second -> SUP02, ...
    sup_id = f"SUP{idx+1:02d}"

    soup = fetch_page(slug)
    sections = extract_sections(soup)

    # Use only the safety section text
    safety_text = " ".join(sections.get("Safety and side effects", []))
    phrases = extract_symptom_phrases(safety_text)

    # Unique per supplement. dict.fromkeys keeps first-seen order, so the
    # mapping rows come out in the same order on every run; iterating a set
    # of strings does not guarantee that.
    for p in dict.fromkeys(phrases):
        all_symptom_phrases.add(p)
        supp_symptom_rows.append({"sup_id": sup_id, "symptom": p})

# Build global symptom vocabulary with IDs (alphabetical, SMP001, SMP002, ...)
all_symptom_phrases = sorted(all_symptom_phrases)

symptoms_df = pd.DataFrame({
    "symptom_id": [f"SMP{i:03d}" for i in range(1, len(all_symptom_phrases) + 1)],
    "symptom": all_symptom_phrases,
})

symptoms_df.to_csv("symptoms_data.csv", index=False)

