In [52]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_URL = "https://www.mayoclinic.org/drugs-supplements-{}"

"""def fetch_page(slug):
    url = BASE_URL.format(slug)
    html = requests.get(url).text
    return BeautifulSoup(html, "html.parser")"""

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

def fetch_page(slug):
    url = BASE_URL.format(slug)
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


In [5]:
supplements_links = [
    "acidophilus/art-20361967","aloe/art-20362267","coenzyme-q10/art-20362602","creatine/art-20347591","dhea/art-20364199",
    "primrose/art-20364500","fish-oil/art-20364810","flaxseed-and-flaxseed-oil/art-20366457",
    "folate/art-20364625","ginkgo/art-20362032","glucosamine/art-20362874","honey/art-20363819","l-arginine/art-20364681",
    "marijuana/art-20364974","melatonin/art-20363071","milk-thistle/art-20362885","niacin/art-20364984","red-yeast-rice/art-20363074",
    "same/art-20364924","st-johns-wort/art-20362212","tea-tree-oil/art-20364246","vitamin-a/art-20365945","vitamin-b6/art-20363468",
    "vitamin-b12/art-20363663","vitamin-c/art-20363932","vitamin-d/art-20363792","vitamin-e/art-20364144","zinc/art-20366112"
]


In [23]:
def extract_sections(soup):
    sections = {}
    current_heading = None

    for el in soup.find_all(["h2","h3","h4","p","li"]):
        text = el.get_text(strip=True)
        if el.name in ["h2","h3","h4"]:
            current_heading = text
            sections[current_heading] = []
        elif current_heading:
            sections[current_heading].append(text)
    return sections


In [45]:
import re
def normalize_data(slug, sections):
    """
    Normalize Mayo Clinic supplement data into a flat, CSV-safe structure.
    """

    # -------------------------
    # NAME
    # -------------------------
    name = slug.split("/")[0].title()
    # -------------------------
    # CATEGORY
    # -------------------------
    overview_text = " ".join(sections.get("Overview", [])).lower()

    if "vitamin" in overview_text:
        category = "Vitamin"
    elif "mineral" in overview_text:
        category = "Mineral"
    elif any(w in overview_text for w in ["plant", "herb", "extract", "gel", "latex"]):
        category = "Herb / Plant-based"
    else:
        category = "Supplement"

    # -------------------------
    # SAFETY INFO
    # -------------------------
    safety_lines = sections.get("Safety and side effects", [])
    safety_info = " ".join(safety_lines) if safety_lines else ""

    # -------------------------
    # CONTRAINDICATIONS
    # -------------------------
    contraindications = []

    contraindication_keywords = [
        "should not",
        "do not take",
        "do not use",
        "avoid",
        "unsafe",
        "pregnant",
        "breastfeeding",
        "children",
        "surgery",
        "kidney",
        "liver"
    ]

    for line in safety_lines:
        if any(k in line.lower() for k in contraindication_keywords):
            contraindications.append(line.strip())

    contraindications_str = " | ".join(contraindications)

    # -------------------------
    # DRUG INTERACTIONS
    # -------------------------
    interaction_lines = sections.get("Interactions", [])
    interaction_entries = []
    
    # Unwanted string to filter out
    unwanted_text = "There is a problem with information submitted for this request"

    for line in interaction_lines:
        # Skip lines containing the unwanted text
        if unwanted_text in line:
            continue
            
        # Extract drug name (best-effort)
        match = re.match(r"^([A-Za-z0-9 ,\-]+)\s*\(", line)
        drug = match.group(1).strip() if match else line.split(".")[0]

        # Severity heuristic
        lower = line.lower()
        if any(w in lower for w in ["fatal", "kidney failure", "cancer"]):
            severity = "Severe"
        elif any(w in lower for w in ["bleeding", "hypoglycemia", "electrolyte"]):
            severity = "Moderate"
        else:
            severity = "Mild"

        interaction_entries.append(
            f"{drug} ({severity}): {line.strip()}"
        )

    drug_interactions_str = " | ".join(interaction_entries)
    
    # Also remove the unwanted string from the final result if it somehow got through
    # Handle both escaped newlines and actual newlines
    unwanted_patterns = [
        " | There is a problem with\\n                                information submitted for this request (Mild): There is a problem with\\n                                information submitted for this request. Review/update the\\n                                information highlighted below and resubmit the form.",
        " | There is a problem with\n                                information submitted for this request (Mild): There is a problem with\n                                information submitted for this request. Review/update the\n                                information highlighted below and resubmit the form.",
        "There is a problem with information submitted for this request"
    ]
    for pattern in unwanted_patterns:
        drug_interactions_str = drug_interactions_str.replace(pattern, "").strip()
    
    # Clean up any double separators that might result
    drug_interactions_str = re.sub(r'\s*\|\s*\|\s*', ' | ', drug_interactions_str).strip()



    return {
        "name": name,
        "category": category,
        "safety_info": safety_info,
        "drug_interactions": drug_interactions_str,
        "contraindications": contraindications_str
    }


In [24]:

soup = fetch_page("aloe/art-20362267")
soup


<!DOCTYPE html>

<html dir="ltr" lang="en">
<head><title>
	Aloe - Mayo Clinic
</title><meta content="Â " name="application-name"/>
<link href="/-/media/web/gbs/shared/images/apple-touch-icon-152x152.svg" rel="apple-touch-icon"/>
<link href="/-/media/web/gbs/shared/images/favicon.png" rel="icon"/>
<meta content="#FFFFFF" name="msapplication-TileColor"/>
<meta content="/-/media/web/gbs/shared/images/mstile-144x144.png" name="msapplication-TileImage"/>
<meta content="initial-scale=1.0" name="viewport"/><meta content="telephone=no" name="format-detection"/><meta content="ART-20362267" name="PocID"/>
<meta content="Aloe (Aloe vera) " name="Subject"/>
<meta content="Health information library" name="Content Package"/>
<meta content="Mayo Clinic" property="og:site_name"/>
<meta content="@mayoclinic" name="twitter:site"/>
<meta content="summary" name="twitter:card"/>
<meta content="@mayoclinic" name="twitter:creator"/>
<meta content="Aloe" name="twitter:title"/>
<meta content="Aloe" property="

In [34]:
sections = extract_sections(soup)
normalize_data("aloe/art-20362267", sections)


Aloe
aloe is a plant that makes two substances used in healthcare products. they are clear gel and yellow latex. people mainly put aloe's clear gel on skin to treat burns, the skin condition psoriasis and even acne. some also take the gel by mouth to treat certain conditions. some people take aloe latex, a laxative, by mouth to treat trouble passing stool, called constipation. aloe gel is generally safe when used as suggested. but taking aloe latex by mouth may not be safe. in fact, taking 1 gram a day of aloe latex for a few days can cause kidney damage and might be fatal.


{'name': 'Aloe',
 'category': 'Herb / Plant-based',
 'safety_info': "Experts believe aloe gel is safe when put on the skin using directions. It might be safe to take small doses by mouth for a short time. Taking aloe latex or whole-leaf extract by mouth may be unsafe. It's likely unsafe in high doses. Taking 1 gram a day of aloe latex for a few days can cause short-term kidney failure. It can be fatal. Aloe latex also might cause cancer. Other side effects include stomach cramps and loose stools. Children younger than age 12 should not take aloe latex and whole-leaf extract by mouth. People who are pregnant or breastfeeding should not use aloe in either form.",
 'drug_interactions': "Mixing aloe with the following medicines may cause harm: (Mild): Mixing aloe with the following medicines may cause harm: | Anticoagulants and antiplatelet medicines, herbs and supplements (Moderate): Anticoagulants and antiplatelet medicines, herbs and supplements.These types of medicines, herbs and suppl

In [53]:
all_data = []
for sup in supplements_links:
    soup = fetch_page(sup)
    sections = extract_sections(soup)
    data = normalize_data(sup, sections)
    all_data.append(data)

# Convert to DataFrame
df = pd.DataFrame(all_data)


In [54]:
# Generate CSV from DataFrame
df.to_csv("supplements_data.csv", index=False)
