In [8]:
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.mayoclinic.org/drugs-supplements-{}"

"""def fetch_page(slug):
    url = BASE_URL.format(slug)
    html = requests.get(url).text
    return BeautifulSoup(html, "html.parser")"""

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

def fetch_page(slug):
    url = BASE_URL.format(slug)
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


In [5]:
supplements_links = [
    "acidophilus/art-20361967","aloe/art-20362267","coenzyme-q10/art-20362602","creatine/art-20347591","dhea/art-20364199",
    "primrose/art-20364500","fish-oil/art-20364810","flaxseed-and-flaxseed-oil/art-20366457",
    "folate/art-20364625","ginkgo/art-20362032","glucosamine/art-20362874","honey/art-20363819","l-arginine/art-20364681",
    "marijuana/art-20364974","melatonin/art-20363071","milk-thistle/art-20362885","niacin/art-20364984","red-yeast-rice/art-20363074",
    "same/art-20364924","st-johns-wort/art-20362212","tea-tree-oil/art-20364246","vitamin-a/art-20365945","vitamin-b6/art-20363468",
    "vitamin-b12/art-20363663","vitamin-c/art-20363932","vitamin-d/art-20363792","vitamin-e/art-20364144","zinc/art-20366112"
]


In [12]:
def extract_sections(soup):
    sections = {}
    current_heading = None

    for el in soup.find_all(["h2","p","li"]):
        text = el.get_text(strip=True)
        if el.name == "h2":
            current_heading = text
            sections[current_heading] = []
        elif current_heading:
            sections[current_heading].append(text)
    return sections


In [14]:
"""def normalize_data(slug, sections):
    data = {
        "slug": slug,
        "name": None,
        "alt_names": [],
        "category": None,
        "safety_info": None,
        "drug_interactions": [],
        "contraindications": [],
        "evidence_summary": []
    }

    # NAME (page title)
    title = sections.get("Overview", [])
    data["name"] = slug.replace("-", " ").title()

    # CATEGORY (heuristic â€” later refine)
    if "vitamin" in slug:
        data["category"] = "Vitamin"
    elif "oil" in slug:
        data["category"] = "Herb/Plant Oil"
    else:
        data["category"] = "Supplement"

    # SAFETY
    safe = sections.get("Safety and side effects", [])
    if safe:
        data["safety_info"] = " ".join(safe)

    # DRUG INTERACTIONS
    interactions = sections.get("Interactions", [])
    for line in interactions:
        # The simplest heuristic match for medicines
        if any(word.isupper() for word in line.split()):
            data["drug_interactions"].append(line)

    # CONTRAINDICATIONS
    for line in safe:
        if "should not" in line.lower():
            data["contraindications"].append(line)

    # EVIDENCE (What the research says)
    research = sections.get("What the research says", [])
    for line in research:
        data["evidence_summary"].append(line)

    return data"""

import re
def normalize_data(slug, sections):
    """
    Normalize Mayo Clinic supplement data into a flat, CSV-safe structure.
    """

    # -------------------------
    # NAME
    # -------------------------
    name = slug.split("/")[0].title()
    print(name)
    # -------------------------
    # CATEGORY (text heuristic)
    # -------------------------
    overview_text = " ".join(sections.get("Overview", [])).lower()
    print(overview_text)

    if "vitamin" in overview_text:
        category = "Vitamin"
    elif "mineral" in overview_text:
        category = "Mineral"
    elif any(w in overview_text for w in ["plant", "herb", "extract", "gel", "latex"]):
        category = "Herb / Plant-based"
    else:
        category = "Supplement"

    # -------------------------
    # SAFETY INFO
    # -------------------------
    safety_lines = sections.get("Safety and side effects", [])
    safety_info = " ".join(safety_lines) if safety_lines else ""

    # -------------------------
    # CONTRAINDICATIONS
    # -------------------------
    contraindications = []

    contraindication_keywords = [
        "should not",
        "do not take",
        "do not use",
        "avoid",
        "unsafe",
        "pregnant",
        "breastfeeding",
        "children",
        "surgery",
        "kidney",
        "liver"
    ]

    for line in safety_lines:
        if any(k in line.lower() for k in contraindication_keywords):
            contraindications.append(line.strip())

    contraindications_str = " | ".join(contraindications)

    # -------------------------
    # DRUG INTERACTIONS
    # -------------------------
    interaction_lines = sections.get("Interactions", [])
    interaction_entries = []

    for line in interaction_lines:
        # Extract drug name (best-effort)
        match = re.match(r"^([A-Za-z0-9 ,\-]+)\s*\(", line)
        drug = match.group(1).strip() if match else line.split(".")[0]

        # Severity heuristic
        lower = line.lower()
        if any(w in lower for w in ["fatal", "kidney failure", "cancer"]):
            severity = "Severe"
        elif any(w in lower for w in ["bleeding", "hypoglycemia", "electrolyte"]):
            severity = "Moderate"
        else:
            severity = "Mild"

        interaction_entries.append(
            f"{drug} ({severity}): {line.strip()}"
        )

    drug_interactions_str = " | ".join(interaction_entries)

    # -------------------------
    # ALT NAMES (not explicitly listed by Mayo)
    # -------------------------
    alt_names = ""

    # -------------------------
    # FINAL FLAT RECORD
    # -------------------------
    return {
        "slug": slug,
        "name": name,
        "alt_names": alt_names,
        "category": category,
        "safety_info": safety_info,
        "drug_interactions": drug_interactions_str,
        "contraindications": contraindications_str
    }


In [10]:

soup = fetch_page("aloe/art-20362267")
print(soup.title.text)



	Aloe - Mayo Clinic



In [16]:
sections = extract_sections(soup)
data = normalize_data("aloe/art-20362267", sections)
data

Aloe



{'slug': 'aloe/art-20362267',
 'name': 'Aloe',
 'alt_names': '',
 'category': 'Supplement',
 'safety_info': '',
 'drug_interactions': '',
 'contraindications': ''}

In [None]:
all_data = []
for sup in supplements_links:
    soup = fetch_page(sup)
    sections = extract_sections(soup)
    data = normalize_data(sup, sections)
    all_data.append(data)


In [10]:
import csv

with open("supplements_data.csv","w",newline="") as out:
    writer = csv.writer(out)
    writer.writerow([
        "slug","name","alt_names","category","safety_info",
        "drug_interactions","contraindications","evidence_summary"
    ])
    for item in all_data:
        writer.writerow([
            item["slug"],
            item["name"],
            ";".join(item["alt_names"]),
            item["category"],
            item["safety_info"],
            ";".join(item["drug_interactions"]),
            ";".join(item["contraindications"]),
            ";".join(item["evidence_summary"])
        ])
