In [21]:
import time
import os
import csv
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm


In [22]:

def data_extractor(base_url, retries=3, delay=5):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }

    diagnosis_treatment_link = ""
    doctors_departments_link = ""

    for attempt in range(retries):
        try:
            response = requests.get(base_url, headers=headers, timeout=20)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract Diagnosis & Treatment link
            content1 = soup.find('a', id="et_genericNavigation_diagnosis-treatment")
            if not content1:
                # fallback: search by link text containing both words
                for a in soup.find_all('a'):
                    link_text = a.get_text(separator=' ').strip().lower()
                    if "diagnosis" in link_text and "treatment" in link_text:
                        content1 = a
                        break
            if content1:
                href1 = content1.get('href')
                diagnosis_treatment_link = f"https://www.mayoclinic.org{href1}" if href1 and href1.startswith("/") else href1

            # Extract Doctors & Departments link
            content2 = soup.find('a', id="et_genericNavigation_doctors-departments")
            if not content2:
                # fallback: search by link text containing both words
                for a in soup.find_all('a'):
                    link_text = a.get_text(separator=' ').strip().lower()
                    if "doctors" in link_text and "departments" in link_text:
                        content2 = a
                        break
            if content2:
                href2 = content2.get('href')
                doctors_departments_link = f"https://www.mayoclinic.org{href2}" if href2 and href2.startswith("/") else href2

            break  # success, exit retry loop

        except requests.exceptions.RequestException as e:
            print(f"[Attempt {attempt + 1}] Error fetching {base_url}: {e}")
            if attempt < retries - 1:
                time.sleep(delay)

    return diagnosis_treatment_link, doctors_departments_link

def web_scraping(base_url):
    # Define the expected headers in order
    expected_headers = ["disease", "main_link", "Diagnosis_treatment_link", "Doctors_departments_link"]
    
    # Check if file exists and read existing headers if it does
    file_exists = os.path.isfile("mayo_diseases.csv")
    existing_headers = []
    
    if file_exists:
        with open("mayo_diseases.csv", "r", encoding="utf-8") as file:
            reader = csv.reader(file)
            existing_headers = next(reader, [])
    
    # Determine if we need to write headers
    write_headers = not file_exists or existing_headers != expected_headers
    
    # Get the webpage content
    response = requests.get(base_url)
    if response.status_code != 200:
        print("Failed to retrieve page")
        exit()

    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.select(".cmp-results-with-primary-name__see-link, .cmp-results-with-primary-name a")

    with open("mayo_diseases.csv", "a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        
        # Write headers if needed
        if write_headers:
            writer.writerow(expected_headers)
        
        for item in tqdm(items, desc="Scraping Diseases"):
            disease_name = item.text.strip()
            main_link = f"https://www.mayoclinic.org{item['href']}" if item['href'].startswith("/") else item['href']

            link1, link2 = data_extractor(main_link)
            
            # Create a row with all expected columns
            row_data = {
                "disease": disease_name,
                "main_link": main_link,
                "Diagnosis_treatment_link": link1,
                "Doctors_departments_link": link2
            }
            
            # If appending to existing file with different headers, align data with existing headers
            if file_exists and existing_headers:
                row = [row_data.get(header, "") for header in existing_headers]
            else:
                row = [row_data[header] for header in expected_headers]
            
            writer.writerow(row)

    print("Scraping Completed! Data Saved")

# Example usage:
# web_scraping("https://www.mayoclinic.org/diseases-conditions")

In [23]:
base_url = f"https://www.mayoclinic.org/diseases-conditions/index?letter=A"
web_scraping(base_url) 

Scraping Diseases: 100%|██████████| 132/132 [00:44<00:00,  2.98it/s]

Scraping Completed! Data Saved





In [20]:
from bs4 import BeautifulSoup


base_url="https://www.mayoclinic.org/diseases-conditions/egg-allergy/symptoms-causes/syc-20372115"
headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }
response = requests.get(base_url, headers=headers, timeout=20)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
extraction_list = ["overview","symptoms","when-to-see-a-doctor","causes","complications","prevention","risk-factors"]

for i in extraction_list:
# Find the overview section by its aria-labelledby attribute
   overview_section = soup.find('section', {'aria-labelledby': i})

# Extract all paragraph text from the overview content
   overview_content = overview_section.find('div', class_='cmp-text__rich-content')
   overview_paragraphs = [p.get_text() for p in overview_content.find_all('p')]

# Join paragraphs into a single string
   overview_text = '\n\n'.join(overview_paragraphs)
   print(i)
   print(overview_text)

overview
Eggs are one of the most common allergy-causing foods for children.

Egg allergy symptoms usually occur a few minutes to a few hours after eating eggs or foods containing eggs. Signs and symptoms range from mild to severe and can include skin rashes, hives, nasal congestion, and vomiting or other digestive problems. Rarely, egg allergy can cause anaphylaxis — a life-threatening reaction.

Egg allergy can occur as early as infancy. Most children, but not all, outgrow their egg allergy before adolescence.
symptoms
Egg allergy reactions vary from person to person and usually occur soon after exposure to egg. Egg allergy symptoms can include:
when-to-see-a-doctor
See a doctor if you or your child has signs or symptoms of a food allergy shortly after eating eggs or an egg-containing product. If possible, see the doctor when the allergic reaction is occurring. This may help in making a diagnosis.

If you or your child has signs and symptoms of anaphylaxis, seek immediate emergency t

In [37]:
from bs4 import BeautifulSoup

def extract_overview(html_content):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }
    response = requests.get(base_url, headers=headers, timeout=20)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the Overview section - looking for h2 with text "Overview"
    overview_header = soup.find(lambda tag: tag.name == 'h2' and 'when-to-see-a-doctor"' in tag.text)
    
    if not overview_header:
        return "Overview section not found"
    
    # Initialize overview paragraphs
    overview_paragraphs = []
    
    # Get all siblings after the h2 until the next h2
    for sibling in overview_header.find_next_siblings():
        if sibling.name == 'h2':
            break  # Stop when we reach the next section
        if sibling.name == 'p':
            overview_paragraphs.append(sibling.get_text(strip=True))
    
    if not overview_paragraphs:
        return "No overview content found"
    
    return '\n\n'.join(overview_paragraphs)

# Example usage:
base_url="https://www.mayoclinic.org/diseases-conditions/hyperhidrosis/symptoms-causes/syc-20367152"

html_content = """ (your HTML content here) """
overview_text = extract_overview(base_url)
print(overview_text)

Overview section not found


In [43]:
from bs4 import BeautifulSoup
import requests

def extract_mayo_clinic_sections(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        main_content = soup.find('div', class_='content')
        if not main_content:
            return {"error": "Main content not found"}
        
        sections = {
            "Overview": None,
            "Symptoms": None,
            "When to see a doctor": None,
            "Causes": None,
            "Risk factors": None,
            "Complications": None,
            "Prevention": None
        }
        
        # Find all headings (h2 and h3)
        headings = main_content.find_all(['h2', 'h3'])
        
        for heading in headings:
            heading_text = heading.get_text(strip=True)
            
            # Handle the typo "dotor" in the HTML
            if "When to see a dotor" in heading_text:
                heading_text = "When to see a doctor"
                
            if heading_text in sections:
                content = []
                next_node = heading.next_sibling
                
                # Collect content until next heading
                while next_node and next_node.name not in ['h2', 'h3']:
                    if next_node.name == 'p':
                        content.append(next_node.get_text(strip=True))
                    elif next_node.name in ['ul', 'ol']:
                        items = [li.get_text(strip=True) for li in next_node.find_all('li')]
                        content.extend(items)
                    next_node = next_node.next_sibling
                
                sections[heading_text] = '\n'.join(content) if content else "No content found"
        
        return {k: v for k, v in sections.items() if v is not None}
    
    except requests.exceptions.RequestException as e:
        return {"error": f"Failed to fetch page: {str(e)}"}

# Example usage:
url = "https://www.mayoclinic.org/diseases-conditions/hyperhidrosis/symptoms-causes/syc-20367152"
sections = extract_mayo_clinic_sections(url)

for section_name, content in sections.items():
    print(f"=== {section_name.upper()} ===")
    print(content)
    print("\n" + "="*50 + "\n")

=== OVERVIEW ===
Hyperhidrosis (hi-pur-hi-DROE-sis) is excessive sweating that's not always related to heat or exercise. You may sweat so much that it soaks through your clothes or drips off your hands. Heavy sweating can disrupt your day and cause social anxiety and embarrassment.
Hyperhidrosis treatment usually helps. It often begins with antiperspirants. If these don't help, you may need to try different medications and therapies. In severe cases, your health care provider may suggest surgery to remove the sweat glands or to disconnect the nerves related to producing too much sweat.
Sometimes an underlying condition may be found and treated.


=== SYMPTOMS ===
The main symptom of hyperhidrosis is heavy sweating. This goes beyond the sweating from being in a hot environment, exercising, or feeling anxious or stressed. The type of hyperhidrosis that usually affects the hands, feet, underarms or face causes at least one episode a week when you're awake. And the sweating usually happens

In [8]:
import csv
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm

def extract_sections(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }

    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Try different possible main content containers
        main_content = soup.find('div', class_='content') or soup.find('article', id='main-content')
        if not main_content:
            return {}  # Don't return error; just skip

        # Section headings you want
        sections = {
            "Overview": None,
            "Symptoms": None,
            "When to see a doctor": None,
            "Causes": None,
            "Risk factors": None,
            "Complications": None,
            "Prevention": None
        }

        # Find all headings inside main content
        headings = main_content.find_all(['h2', 'h3'])
        for idx, heading in enumerate(headings):
            heading_text = heading.get_text(strip=True)
            # Fix potential typo in heading
            if "When to see a dotor" in heading_text:
                heading_text = "When to see a doctor"
            # If it's a section we care about
            if heading_text in sections:
                # Gather all paragraph and list content until next heading
                content = []
                next_node = heading.find_next_sibling()
                while next_node and next_node.name not in ['h2', 'h3']:
                    if next_node.name == 'p':
                        content.append(next_node.get_text(strip=True))
                    elif next_node.name in ['ul', 'ol']:
                        items = [li.get_text(strip=True) for li in next_node.find_all('li')]
                        content.extend(items)
                    next_node = next_node.find_next_sibling()
                if content:
                    sections[heading_text] = '\n'.join(content)
        # Only return sections with content
        return {k: v for k, v in sections.items() if v}
    except Exception as e:
        # Just return empty dict for error
        return {}

def update_csv_with_sections(csv_file):
    # Read existing data and headers
    rows = []
    existing_headers = []
    if os.path.exists(csv_file):
        with open(csv_file, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            existing_headers = reader.fieldnames
            rows = list(reader)

    # List of desired section headers
    section_headers = [
        'Overview', 'Symptoms', 'When to see a doctor',
        'Causes', 'Risk factors', 'Complications', 'Prevention'
    ]
    # Columns from csv plus any missing section headers
    all_headers = existing_headers.copy() if existing_headers else ['disease', 'main_link']
    for header in section_headers:
        if header not in all_headers:
            all_headers.append(header)

    # Process each row and update with extracted sections
    updated_rows = []
    for row in tqdm(rows):
        # Only process if main_link exists
        if 'main_link' in row and row['main_link']:
            print(f"Processing: {row.get('disease', 'Unknown')}")
            sections = extract_sections(row['main_link'])
            # Update row with whatever we could extract
            for section, content in sections.items():
                row[section] = content
        updated_rows.append(row)

    # Write back to CSV with updated headers and rows
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=all_headers)
        writer.writeheader()
        writer.writerows(updated_rows)
    print(f"CSV file updated successfully with {len(updated_rows)} rows")

# Example usage
if __name__ == "__main__":
    csv_file = "mayo_diseases.csv"
    update_csv_with_sections(csv_file)

  0%|          | 0/132 [00:00<?, ?it/s]

Processing: Atrial fibrillation


  1%|          | 1/132 [00:00<00:48,  2.70it/s]

Processing: Hyperhidrosis


  2%|▏         | 2/132 [00:00<00:49,  2.62it/s]

Processing: Bartholin's cyst


  2%|▏         | 3/132 [00:01<00:47,  2.70it/s]

Processing: Infant reflux


  3%|▎         | 4/132 [00:01<00:44,  2.89it/s]

Processing: Hidradenitis suppurativa


  4%|▍         | 5/132 [00:01<00:42,  3.01it/s]

Processing: HIV/AIDS


  5%|▍         | 6/132 [00:02<00:43,  2.91it/s]

Processing: Acute myelogenous leukemia


  5%|▌         | 7/132 [00:02<00:41,  2.99it/s]

Processing: Guillain-Barre syndrome


  6%|▌         | 8/132 [00:02<00:39,  3.13it/s]

Processing: Acute kidney injury


  7%|▋         | 9/132 [00:02<00:37,  3.29it/s]

Processing: Acute lymphocytic leukemia


  8%|▊         | 10/132 [00:03<00:37,  3.24it/s]

Processing: Acute lymphocytic leukemia


  8%|▊         | 11/132 [00:03<00:40,  2.98it/s]

Processing: Acute myelogenous leukemia


  9%|▉         | 12/132 [00:03<00:38,  3.10it/s]

Processing: Acute myelogenous leukemia


 10%|▉         | 13/132 [00:04<00:37,  3.14it/s]

Processing: Acute myelogenous leukemia


 11%|█         | 14/132 [00:04<00:36,  3.24it/s]

Processing: Radiation sickness


 11%|█▏        | 15/132 [00:04<00:35,  3.31it/s]

Processing: Radiation sickness


 12%|█▏        | 16/132 [00:05<00:38,  2.98it/s]

Processing: Acute kidney injury


 13%|█▎        | 17/132 [00:05<00:36,  3.13it/s]

Processing: ARDS


 14%|█▎        | 18/132 [00:05<00:35,  3.19it/s]

Processing: Acute sinusitis


 14%|█▍        | 19/132 [00:06<00:36,  3.11it/s]

Processing: Autoimmune epilepsy


 15%|█▌        | 20/132 [00:06<00:38,  2.93it/s]

Processing: Alcohol use disorder


 16%|█▌        | 21/132 [00:06<00:37,  2.94it/s]

Processing: Compulsive gambling


 17%|█▋        | 22/132 [00:07<00:37,  2.93it/s]

Processing: Nicotine dependence


 17%|█▋        | 23/132 [00:07<00:36,  3.01it/s]

Processing: Mesenteric lymphadenitis


 18%|█▊        | 24/132 [00:07<00:36,  2.98it/s]

Processing: Attention-deficit/hyperactivity disorder (ADHD) in children


 19%|█▉        | 25/132 [00:08<00:37,  2.86it/s]

Processing: Adult attention-deficit/hyperactivity disorder (ADHD)


 20%|█▉        | 26/132 [00:08<00:36,  2.91it/s]

Processing: Frozen shoulder


 20%|██        | 27/132 [00:08<00:34,  3.03it/s]

Processing: Childhood schizophrenia


 21%|██        | 28/132 [00:09<00:32,  3.19it/s]

Processing: Benign adrenal tumors


 22%|██▏       | 29/132 [00:09<00:32,  3.21it/s]

Processing: Autoimmune encephalitis


 23%|██▎       | 30/132 [00:09<00:33,  3.01it/s]

Processing: Acute flaccid myelitis (AFM)


 23%|██▎       | 31/132 [00:10<00:33,  2.98it/s]

Processing: Dry macular degeneration


 24%|██▍       | 32/132 [00:10<00:35,  2.84it/s]

Processing: Macular degeneration, wet


 25%|██▌       | 33/132 [00:10<00:33,  2.91it/s]

Processing: Myelofibrosis


 26%|██▌       | 34/132 [00:11<00:34,  2.83it/s]

Processing: Guillain-Barre syndrome


 27%|██▋       | 35/132 [00:11<00:32,  2.98it/s]

Processing: HIV/AIDS


 27%|██▋       | 36/132 [00:11<00:31,  3.02it/s]

Processing: Alcohol use disorder


 28%|██▊       | 37/132 [00:12<00:31,  3.00it/s]

Processing: Alcoholic hepatitis


 29%|██▉       | 38/132 [00:12<00:30,  3.08it/s]

Processing: Churg-Strauss syndrome


 30%|██▉       | 39/132 [00:12<00:32,  2.89it/s]

Processing: Churg-Strauss syndrome


 30%|███       | 40/132 [00:13<00:32,  2.85it/s]

Processing: Hay fever


 31%|███       | 41/132 [00:13<00:30,  2.96it/s]

Processing: Dust mite allergy


 32%|███▏      | 42/132 [00:14<00:30,  2.92it/s]

Processing: Egg allergy


 33%|███▎      | 43/132 [00:14<00:32,  2.78it/s]

Processing: Food allergy


 33%|███▎      | 44/132 [00:14<00:30,  2.92it/s]

Processing: Latex allergy


 34%|███▍      | 45/132 [00:14<00:27,  3.14it/s]

Processing: Milk allergy


 35%|███▍      | 46/132 [00:15<00:28,  3.07it/s]

Processing: Mold allergy


 36%|███▌      | 47/132 [00:15<00:30,  2.78it/s]

Processing: Nickel allergy


 36%|███▋      | 48/132 [00:16<00:31,  2.65it/s]

Processing: Peanut allergy


 37%|███▋      | 49/132 [00:16<00:29,  2.84it/s]

Processing: Penicillin allergy


 38%|███▊      | 50/132 [00:16<00:29,  2.83it/s]

Processing: Pet allergy


 39%|███▊      | 51/132 [00:17<00:30,  2.69it/s]

Processing: Shellfish allergy


 39%|███▉      | 52/132 [00:17<00:28,  2.77it/s]

Processing: Wheat allergy


 40%|████      | 53/132 [00:17<00:27,  2.82it/s]

Processing: Hair loss


 41%|████      | 54/132 [00:18<00:27,  2.87it/s]

Processing: Amyotrophic lateral sclerosis (ALS)


 42%|████▏     | 55/132 [00:18<00:25,  2.99it/s]

Processing: Dry socket


 42%|████▏     | 56/132 [00:18<00:27,  2.80it/s]

Processing: Atypical genitalia


 43%|████▎     | 57/132 [00:19<00:26,  2.84it/s]

Processing: Lazy eye (amblyopia)


 44%|████▍     | 58/132 [00:19<00:25,  2.85it/s]

Processing: Chagas disease


 45%|████▍     | 59/132 [00:19<00:25,  2.85it/s]

Processing: Transient global amnesia


 45%|████▌     | 60/132 [00:20<00:28,  2.55it/s]

Processing: Amnesia


 46%|████▌     | 61/132 [00:20<00:25,  2.81it/s]

Processing: Amyloidosis


 47%|████▋     | 62/132 [00:21<00:24,  2.81it/s]

Processing: Aplastic anemia


 48%|████▊     | 63/132 [00:21<00:24,  2.80it/s]

Processing: Thalassemia


 48%|████▊     | 64/132 [00:21<00:23,  2.88it/s]

Processing: Iron deficiency anemia


 49%|████▉     | 65/132 [00:22<00:25,  2.63it/s]

Processing: Thalassemia


 50%|█████     | 66/132 [00:22<00:24,  2.67it/s]

Processing: Sickle cell anemia


 51%|█████     | 67/132 [00:22<00:24,  2.70it/s]

Processing: Vitamin deficiency anemia


 52%|█████▏    | 68/132 [00:23<00:27,  2.36it/s]

Processing: Abdominal aortic aneurysm


 52%|█████▏    | 69/132 [00:23<00:26,  2.38it/s]

Processing: Aortic aneurysm


 53%|█████▎    | 70/132 [00:24<00:23,  2.60it/s]

Processing: Brain aneurysm


 54%|█████▍    | 71/132 [00:24<00:22,  2.75it/s]

Processing: Popliteal artery aneurysm


 55%|█████▍    | 72/132 [00:24<00:20,  2.88it/s]

Processing: Thoracic aortic aneurysm


 55%|█████▌    | 73/132 [00:25<00:21,  2.70it/s]

Processing: Vasculitis


 56%|█████▌    | 74/132 [00:25<00:21,  2.65it/s]

Processing: Angina


 57%|█████▋    | 75/132 [00:26<00:20,  2.74it/s]

Processing: Hives and angioedema


 58%|█████▊    | 76/132 [00:26<00:19,  2.90it/s]

Processing: Castleman disease


 58%|█████▊    | 77/132 [00:26<00:17,  3.06it/s]

Processing: Broken ankle


 59%|█████▉    | 78/132 [00:27<00:19,  2.71it/s]

Processing: Sprained ankle


 60%|█████▉    | 79/132 [00:27<00:19,  2.74it/s]

Processing: Tongue-tie (ankyloglossia)


 61%|██████    | 80/132 [00:27<00:17,  2.92it/s]

Processing: Anorexia nervosa


 61%|██████▏   | 81/132 [00:27<00:16,  3.08it/s]

Processing: ACL injury


 62%|██████▏   | 82/132 [00:28<00:17,  2.84it/s]

Processing: Anterior vaginal prolapse (cystocele)


 63%|██████▎   | 83/132 [00:28<00:17,  2.86it/s]

Processing: Pseudomembranous colitis


 64%|██████▎   | 84/132 [00:29<00:16,  2.92it/s]

Processing: Generalized anxiety disorder


 64%|██████▍   | 85/132 [00:29<00:15,  2.99it/s]

Processing: Social anxiety disorder (social phobia)


 65%|██████▌   | 86/132 [00:29<00:15,  2.95it/s]

Processing: Abdominal aortic aneurysm


 66%|██████▌   | 87/132 [00:30<00:16,  2.74it/s]

Processing: Coarctation of the aorta


 67%|██████▋   | 88/132 [00:30<00:15,  2.89it/s]

Processing: Aortic valve regurgitation


 67%|██████▋   | 89/132 [00:30<00:14,  2.94it/s]

Processing: Aortic valve regurgitation


 68%|██████▊   | 90/132 [00:31<00:14,  2.99it/s]

Processing: Auditory processing disorder


 69%|██████▉   | 91/132 [00:31<00:14,  2.91it/s]

Processing: Primary progressive aphasia


 70%|██████▉   | 92/132 [00:31<00:13,  3.06it/s]

Processing: Canker sore


 70%|███████   | 93/132 [00:32<00:12,  3.09it/s]

Processing: Canker sore


 71%|███████   | 94/132 [00:32<00:11,  3.24it/s]

Processing: Broken heart syndrome


 72%|███████▏  | 95/132 [00:32<00:11,  3.30it/s]

Processing: Antiphospholipid syndrome


 73%|███████▎  | 96/132 [00:33<00:12,  2.96it/s]

Processing: Viral hemorrhagic fevers


 73%|███████▎  | 97/132 [00:33<00:11,  3.09it/s]

Processing: Viral hemorrhagic fevers


 74%|███████▍  | 98/132 [00:33<00:10,  3.15it/s]

Processing: Broken arm


 75%|███████▌  | 99/132 [00:34<00:11,  2.95it/s]

Processing: Chiari malformation


 76%|███████▌  | 100/132 [00:34<00:10,  3.04it/s]

Processing: Heart arrhythmia


 77%|███████▋  | 101/132 [00:34<00:10,  2.87it/s]

Processing: Giant cell arteritis


 77%|███████▋  | 102/132 [00:35<00:10,  2.98it/s]

Processing: Takayasu's arteritis


 78%|███████▊  | 103/132 [00:35<00:09,  2.96it/s]

Processing: Thumb arthritis


 79%|███████▉  | 104/132 [00:35<00:09,  2.96it/s]

Processing: Osteoarthritis


 80%|███████▉  | 105/132 [00:36<00:09,  2.80it/s]

Processing: Gout


 80%|████████  | 106/132 [00:36<00:09,  2.84it/s]

Processing: Septic arthritis


 81%|████████  | 107/132 [00:36<00:08,  2.81it/s]

Processing: Juvenile idiopathic arthritis


 82%|████████▏ | 108/132 [00:37<00:08,  2.96it/s]

Processing: Osteoarthritis


 83%|████████▎ | 109/132 [00:37<00:08,  2.70it/s]

Processing: Psoriatic arthritis


 83%|████████▎ | 110/132 [00:37<00:07,  2.78it/s]

Processing: Reactive arthritis


 84%|████████▍ | 111/132 [00:38<00:07,  2.76it/s]

Processing: Rheumatoid arthritis


 85%|████████▍ | 112/132 [00:38<00:06,  2.88it/s]

Processing: Septic arthritis


 86%|████████▌ | 113/132 [00:39<00:07,  2.61it/s]

Processing: Thumb arthritis


 86%|████████▋ | 114/132 [00:39<00:06,  2.84it/s]

Processing: Atrial septal defect (ASD)


 87%|████████▋ | 115/132 [00:39<00:05,  2.97it/s]

Processing: Autism spectrum disorder


 88%|████████▊ | 116/132 [00:39<00:05,  3.14it/s]

Processing: Avascular necrosis (osteonecrosis)


 89%|████████▊ | 117/132 [00:40<00:05,  2.94it/s]

Processing: Childhood asthma


 89%|████████▉ | 118/132 [00:40<00:05,  2.70it/s]

Processing: Exercise-induced asthma


 90%|█████████ | 119/132 [00:41<00:04,  2.72it/s]

Processing: Occupational asthma


 91%|█████████ | 120/132 [00:41<00:04,  2.74it/s]

Processing: Atrial tachycardia


 92%|█████████▏| 121/132 [00:41<00:03,  2.90it/s]

Processing: Arteriosclerosis / atherosclerosis


 92%|█████████▏| 122/132 [00:42<00:03,  2.86it/s]

Processing: Atrioventricular canal defect


 93%|█████████▎| 123/132 [00:42<00:03,  2.63it/s]

Processing: Vaginal atrophy


 94%|█████████▍| 124/132 [00:42<00:02,  2.72it/s]

Processing: Reactive attachment disorder


 95%|█████████▍| 125/132 [00:43<00:02,  2.74it/s]

Processing: Adult attention-deficit/hyperactivity disorder (ADHD)


 95%|█████████▌| 126/132 [00:43<00:02,  2.78it/s]

Processing: Auditory processing disorder


 96%|█████████▌| 127/132 [00:43<00:01,  2.82it/s]

Processing: Autoimmune epilepsy


 97%|█████████▋| 128/132 [00:44<00:01,  2.92it/s]

Processing: Autoimmune encephalitis


 98%|█████████▊| 129/132 [00:44<00:00,  3.15it/s]

Processing: Bird flu (avian influenza)


 98%|█████████▊| 130/132 [00:44<00:00,  3.09it/s]

Processing: Avascular necrosis (osteonecrosis)


 99%|█████████▉| 131/132 [00:45<00:00,  2.85it/s]

Processing: Atrioventricular nodal reentry tachycardia (AVNRT)


100%|██████████| 132/132 [00:45<00:00,  2.90it/s]

CSV file updated successfully with 132 rows





In [24]:
import csv
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm

# Section slugs for aria-labelledby (new design) and pretty names
SECTION_SLUGS = [
    ("overview", "Overview"),
    ("symptoms", "Symptoms"),
    ("when-to-see-a-doctor", "When to see a doctor"),
    ("causes", "Causes"),
    ("risk-factors", "Risk factors"),
    ("complications", "Complications"),
    ("prevention", "Prevention"),
]

def extract_sections(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }

    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        result_sections = {}

        # Try method 1: Old layout (div.content or article#main-content)
        main_content = soup.find('div', class_='content') or soup.find('article', id='main-content')
        if main_content:
            headings = main_content.find_all(['h2', 'h3'])
            for heading in headings:
                heading_text = heading.get_text(strip=True)
                # Fix typo
                if "When to see a dotor" in heading_text:
                    heading_text = "When to see a doctor"
                # If it's a section we care about
                for _, section_name in SECTION_SLUGS:
                    if heading_text == section_name:
                        content = []
                        next_node = heading.find_next_sibling()
                        while next_node and next_node.name not in ['h2', 'h3']:
                            if next_node.name == 'p':
                                content.append(next_node.get_text(strip=True))
                            elif next_node.name in ['ul', 'ol']:
                                items = [li.get_text(strip=True) for li in next_node.find_all('li')]
                                content.extend(items)
                            next_node = next_node.find_next_sibling()
                        if content:
                            result_sections[section_name] = '\n'.join(content)
        # Try method 2: New layout (section[aria-labelledby] + cmp-text__rich-content)
        for slug, section_name in SECTION_SLUGS:
            if section_name in result_sections:
                continue  # Already found by old method
            # Find aria-labelledby section
            section = soup.find('section', {'aria-labelledby': slug})
            if section:
                content_div = section.find('div', class_='cmp-text__rich-content')
                if content_div:
                    paragraphs = [p.get_text(strip=True) for p in content_div.find_all('p')]
                    if paragraphs:
                        result_sections[section_name] = '\n\n'.join(paragraphs)
        return result_sections
    except Exception as e:
        # Skip on error, return empty
        return {}

def update_csv_with_sections(csv_file):
    # Read existing data and headers
    rows = []
    existing_headers = []
    if os.path.exists(csv_file):
        with open(csv_file, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            existing_headers = reader.fieldnames
            rows = list(reader)

    section_headers = [section for slug, section in SECTION_SLUGS]
    all_headers = existing_headers.copy() if existing_headers else ['disease', 'main_link']
    for header in section_headers:
        if header not in all_headers:
            all_headers.append(header)

    # --- SINGLE tqdm bar for ALL rows ---
    updated_rows = []
    with tqdm(total=len(rows), desc="Processing diseases", unit="disease") as pbar:
        for row in rows:
            if 'main_link' in row and row['main_link']:
                # (Optional) tqdm.write() to log messages without disrupting the bar
                tqdm.write(f"Processing: {row.get('disease', 'Unknown')}")
                sections = extract_sections(row['main_link'])
                for section, content in sections.items():
                    row[section] = content
            updated_rows.append(row)
            pbar.update(1)  # Always update once per disease

    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=all_headers)
        writer.writeheader()
        writer.writerows(updated_rows)
    print(f"CSV file updated successfully with {len(updated_rows)} rows")

# Example usage
if __name__ == "__main__":
    csv_file = "mayo_diseases.csv"
    update_csv_with_sections(csv_file)

Processing diseases:   0%|          | 0/132 [00:00<?, ?disease/s]

Processing: Atrial fibrillation


Processing diseases:   1%|          | 1/132 [00:00<00:42,  3.09disease/s]

Processing: Hyperhidrosis


Processing diseases:   2%|▏         | 2/132 [00:00<00:45,  2.83disease/s]

Processing: Bartholin's cyst


Processing diseases:   2%|▏         | 3/132 [00:01<00:43,  2.96disease/s]

Processing: Infant reflux


Processing diseases:   3%|▎         | 4/132 [00:01<00:39,  3.20disease/s]

Processing: Hidradenitis suppurativa


Processing diseases:   4%|▍         | 5/132 [00:01<00:37,  3.37disease/s]

Processing: HIV/AIDS


Processing diseases:   5%|▍         | 6/132 [00:01<00:41,  3.05disease/s]

Processing: Acute myelogenous leukemia


Processing diseases:   5%|▌         | 7/132 [00:02<00:42,  2.91disease/s]

Processing: Guillain-Barre syndrome


Processing diseases:   6%|▌         | 8/132 [00:02<00:41,  2.95disease/s]

Processing: Acute kidney injury


Processing diseases:   7%|▋         | 9/132 [00:02<00:39,  3.12disease/s]

Processing: Acute lymphocytic leukemia


Processing diseases:   8%|▊         | 10/132 [00:03<00:37,  3.25disease/s]

Processing: Acute lymphocytic leukemia


Processing diseases:   8%|▊         | 11/132 [00:03<00:40,  3.02disease/s]

Processing: Acute myelogenous leukemia


Processing diseases:   9%|▉         | 12/132 [00:03<00:38,  3.12disease/s]

Processing: Acute myelogenous leukemia


Processing diseases:  10%|▉         | 13/132 [00:04<00:40,  2.95disease/s]

Processing: Acute myelogenous leukemia


Processing diseases:  11%|█         | 14/132 [00:04<00:40,  2.91disease/s]

Processing: Radiation sickness


Processing diseases:  11%|█▏        | 15/132 [00:05<00:41,  2.79disease/s]

Processing: Radiation sickness


Processing diseases:  12%|█▏        | 16/132 [00:05<00:39,  2.95disease/s]

Processing: Acute kidney injury


Processing diseases:  13%|█▎        | 17/132 [00:05<00:38,  3.01disease/s]

Processing: ARDS


Processing diseases:  14%|█▎        | 18/132 [00:05<00:36,  3.09disease/s]

Processing: Acute sinusitis


Processing diseases:  14%|█▍        | 19/132 [00:06<00:35,  3.20disease/s]

Processing: Autoimmune epilepsy


Processing diseases:  15%|█▌        | 20/132 [00:06<00:39,  2.84disease/s]

Processing: Alcohol use disorder


Processing diseases:  16%|█▌        | 21/132 [00:07<00:38,  2.89disease/s]

Processing: Compulsive gambling


Processing diseases:  17%|█▋        | 22/132 [00:07<00:38,  2.84disease/s]

Processing: Nicotine dependence


Processing diseases:  17%|█▋        | 23/132 [00:07<00:36,  2.99disease/s]

Processing: Mesenteric lymphadenitis


Processing diseases:  18%|█▊        | 24/132 [00:07<00:34,  3.13disease/s]

Processing: Attention-deficit/hyperactivity disorder (ADHD) in children


Processing diseases:  19%|█▉        | 25/132 [00:08<00:36,  2.93disease/s]

Processing: Adult attention-deficit/hyperactivity disorder (ADHD)


Processing diseases:  20%|█▉        | 26/132 [00:08<00:36,  2.90disease/s]

Processing: Frozen shoulder


Processing diseases:  20%|██        | 27/132 [00:09<00:35,  2.97disease/s]

Processing: Childhood schizophrenia


Processing diseases:  21%|██        | 28/132 [00:09<00:37,  2.80disease/s]

Processing: Benign adrenal tumors


Processing diseases:  22%|██▏       | 29/132 [00:09<00:37,  2.73disease/s]

Processing: Autoimmune encephalitis


Processing diseases:  23%|██▎       | 30/132 [00:10<00:34,  2.92disease/s]

Processing: Acute flaccid myelitis (AFM)


Processing diseases:  23%|██▎       | 31/132 [00:10<00:34,  2.92disease/s]

Processing: Dry macular degeneration


Processing diseases:  24%|██▍       | 32/132 [00:10<00:32,  3.09disease/s]

Processing: Macular degeneration, wet


Processing diseases:  25%|██▌       | 33/132 [00:11<00:31,  3.16disease/s]

Processing: Myelofibrosis


Processing diseases:  26%|██▌       | 34/132 [00:11<00:33,  2.95disease/s]

Processing: Guillain-Barre syndrome


Processing diseases:  27%|██▋       | 35/132 [00:11<00:32,  2.94disease/s]

Processing: HIV/AIDS


Processing diseases:  27%|██▋       | 36/132 [00:12<00:31,  3.03disease/s]

Processing: Alcohol use disorder


Processing diseases:  28%|██▊       | 37/132 [00:12<00:30,  3.08disease/s]

Processing: Alcoholic hepatitis


Processing diseases:  29%|██▉       | 38/132 [00:12<00:32,  2.90disease/s]

Processing: Churg-Strauss syndrome


Processing diseases:  30%|██▉       | 39/132 [00:13<00:31,  2.94disease/s]

Processing: Churg-Strauss syndrome


Processing diseases:  30%|███       | 40/132 [00:13<00:31,  2.93disease/s]

Processing: Hay fever


Processing diseases:  31%|███       | 41/132 [00:13<00:31,  2.92disease/s]

Processing: Dust mite allergy


Processing diseases:  32%|███▏      | 42/132 [00:14<00:30,  2.96disease/s]

Processing: Egg allergy


Processing diseases:  33%|███▎      | 43/132 [00:14<00:32,  2.73disease/s]

Processing: Food allergy


Processing diseases:  33%|███▎      | 44/132 [00:14<00:30,  2.91disease/s]

Processing: Latex allergy


Processing diseases:  34%|███▍      | 45/132 [00:15<00:28,  3.07disease/s]

Processing: Milk allergy


Processing diseases:  35%|███▍      | 46/132 [00:15<00:27,  3.07disease/s]

Processing: Mold allergy


Processing diseases:  36%|███▌      | 47/132 [00:15<00:32,  2.64disease/s]

Processing: Nickel allergy


Processing diseases:  36%|███▋      | 48/132 [00:16<00:31,  2.69disease/s]

Processing: Peanut allergy


Processing diseases:  37%|███▋      | 49/132 [00:16<00:29,  2.83disease/s]

Processing: Penicillin allergy


Processing diseases:  38%|███▊      | 50/132 [00:16<00:29,  2.74disease/s]

Processing: Pet allergy


Processing diseases:  39%|███▊      | 51/132 [00:17<00:31,  2.55disease/s]

Processing: Shellfish allergy


Processing diseases:  39%|███▉      | 52/132 [00:17<00:28,  2.77disease/s]

Processing: Wheat allergy


Processing diseases:  40%|████      | 53/132 [00:18<00:29,  2.72disease/s]

Processing: Hair loss


Processing diseases:  41%|████      | 54/132 [00:18<00:28,  2.76disease/s]

Processing: Amyotrophic lateral sclerosis (ALS)


Processing diseases:  42%|████▏     | 55/132 [00:18<00:26,  2.87disease/s]

Processing: Dry socket


Processing diseases:  42%|████▏     | 56/132 [00:19<00:28,  2.68disease/s]

Processing: Atypical genitalia


Processing diseases:  43%|████▎     | 57/132 [00:19<00:28,  2.65disease/s]

Processing: Lazy eye (amblyopia)


Processing diseases:  44%|████▍     | 58/132 [00:20<00:30,  2.45disease/s]

Processing: Chagas disease


Processing diseases:  45%|████▍     | 59/132 [00:20<00:29,  2.43disease/s]

Processing: Transient global amnesia


Processing diseases:  45%|████▌     | 60/132 [00:20<00:30,  2.33disease/s]

Processing: Amnesia


Processing diseases:  46%|████▌     | 61/132 [00:21<00:28,  2.52disease/s]

Processing: Amyloidosis


Processing diseases:  47%|████▋     | 62/132 [00:21<00:26,  2.68disease/s]

Processing: Aplastic anemia


Processing diseases:  48%|████▊     | 63/132 [00:22<00:26,  2.62disease/s]

Processing: Thalassemia


Processing diseases:  48%|████▊     | 64/132 [00:22<00:26,  2.52disease/s]

Processing: Iron deficiency anemia


Processing diseases:  49%|████▉     | 65/132 [00:22<00:27,  2.40disease/s]

Processing: Thalassemia


Processing diseases:  50%|█████     | 66/132 [00:23<00:26,  2.51disease/s]

Processing: Sickle cell anemia


Processing diseases:  51%|█████     | 67/132 [00:23<00:25,  2.53disease/s]

Processing: Vitamin deficiency anemia


Processing diseases:  52%|█████▏    | 68/132 [00:24<00:26,  2.43disease/s]

Processing: Abdominal aortic aneurysm


Processing diseases:  52%|█████▏    | 69/132 [00:24<00:25,  2.45disease/s]

Processing: Aortic aneurysm


Processing diseases:  53%|█████▎    | 70/132 [00:24<00:23,  2.59disease/s]

Processing: Brain aneurysm


Processing diseases:  54%|█████▍    | 71/132 [00:25<00:21,  2.77disease/s]

Processing: Popliteal artery aneurysm


Processing diseases:  55%|█████▍    | 72/132 [00:25<00:20,  2.91disease/s]

Processing: Thoracic aortic aneurysm


Processing diseases:  55%|█████▌    | 73/132 [00:25<00:22,  2.64disease/s]

Processing: Vasculitis


Processing diseases:  56%|█████▌    | 74/132 [00:26<00:20,  2.83disease/s]

Processing: Angina


Processing diseases:  57%|█████▋    | 75/132 [00:26<00:20,  2.73disease/s]

Processing: Hives and angioedema


Processing diseases:  58%|█████▊    | 76/132 [00:26<00:20,  2.73disease/s]

Processing: Castleman disease


Processing diseases:  58%|█████▊    | 77/132 [00:27<00:20,  2.66disease/s]

Processing: Broken ankle


Processing diseases:  59%|█████▉    | 78/132 [00:27<00:20,  2.66disease/s]

Processing: Sprained ankle


Processing diseases:  60%|█████▉    | 79/132 [00:28<00:18,  2.80disease/s]

Processing: Tongue-tie (ankyloglossia)


Processing diseases:  61%|██████    | 80/132 [00:28<00:18,  2.87disease/s]

Processing: Anorexia nervosa


Processing diseases:  61%|██████▏   | 81/132 [00:28<00:17,  2.93disease/s]

Processing: ACL injury


Processing diseases:  62%|██████▏   | 82/132 [00:29<00:17,  2.83disease/s]

Processing: Anterior vaginal prolapse (cystocele)


Processing diseases:  63%|██████▎   | 83/132 [00:29<00:16,  2.91disease/s]

Processing: Pseudomembranous colitis


Processing diseases:  64%|██████▎   | 84/132 [00:29<00:16,  2.88disease/s]

Processing: Generalized anxiety disorder


Processing diseases:  64%|██████▍   | 85/132 [00:30<00:15,  3.07disease/s]

Processing: Social anxiety disorder (social phobia)


Processing diseases:  65%|██████▌   | 86/132 [00:30<00:16,  2.85disease/s]

Processing: Abdominal aortic aneurysm


Processing diseases:  66%|██████▌   | 87/132 [00:30<00:15,  2.94disease/s]

Processing: Coarctation of the aorta


Processing diseases:  67%|██████▋   | 88/132 [00:31<00:14,  2.95disease/s]

Processing: Aortic valve regurgitation


Processing diseases:  67%|██████▋   | 89/132 [00:31<00:13,  3.09disease/s]

Processing: Aortic valve regurgitation


Processing diseases:  68%|██████▊   | 90/132 [00:31<00:13,  3.12disease/s]

Processing: Auditory processing disorder


Processing diseases:  69%|██████▉   | 91/132 [00:32<00:13,  2.98disease/s]

Processing: Primary progressive aphasia


Processing diseases:  70%|██████▉   | 92/132 [00:32<00:13,  3.04disease/s]

Processing: Canker sore


Processing diseases:  70%|███████   | 93/132 [00:32<00:12,  3.03disease/s]

Processing: Canker sore


Processing diseases:  71%|███████   | 94/132 [00:33<00:12,  3.02disease/s]

Processing: Broken heart syndrome


Processing diseases:  72%|███████▏  | 95/132 [00:33<00:11,  3.15disease/s]

Processing: Antiphospholipid syndrome


Processing diseases:  73%|███████▎  | 96/132 [00:33<00:12,  2.87disease/s]

Processing: Viral hemorrhagic fevers


Processing diseases:  73%|███████▎  | 97/132 [00:34<00:11,  3.08disease/s]

Processing: Viral hemorrhagic fevers


Processing diseases:  74%|███████▍  | 98/132 [00:34<00:11,  3.07disease/s]

Processing: Broken arm


Processing diseases:  75%|███████▌  | 99/132 [00:34<00:10,  3.04disease/s]

Processing: Chiari malformation


Processing diseases:  76%|███████▌  | 100/132 [00:35<00:11,  2.72disease/s]

Processing: Heart arrhythmia


Processing diseases:  77%|███████▋  | 101/132 [00:35<00:11,  2.76disease/s]

Processing: Giant cell arteritis


Processing diseases:  77%|███████▋  | 102/132 [00:35<00:10,  2.87disease/s]

Processing: Takayasu's arteritis


Processing diseases:  78%|███████▊  | 103/132 [00:36<00:10,  2.79disease/s]

Processing: Thumb arthritis


Processing diseases:  79%|███████▉  | 104/132 [00:36<00:09,  2.85disease/s]

Processing: Osteoarthritis


Processing diseases:  80%|███████▉  | 105/132 [00:37<00:11,  2.29disease/s]

Processing: Gout


Processing diseases:  80%|████████  | 106/132 [00:37<00:10,  2.45disease/s]

Processing: Septic arthritis


Processing diseases:  81%|████████  | 107/132 [00:37<00:09,  2.50disease/s]

Processing: Juvenile idiopathic arthritis


Processing diseases:  82%|████████▏ | 108/132 [00:38<00:09,  2.55disease/s]

Processing: Osteoarthritis


Processing diseases:  83%|████████▎ | 109/132 [00:38<00:09,  2.33disease/s]

Processing: Psoriatic arthritis


Processing diseases:  83%|████████▎ | 110/132 [00:39<00:09,  2.30disease/s]

Processing: Reactive arthritis


Processing diseases:  84%|████████▍ | 111/132 [00:39<00:08,  2.41disease/s]

Processing: Rheumatoid arthritis


Processing diseases:  85%|████████▍ | 112/132 [00:39<00:07,  2.55disease/s]

Processing: Septic arthritis


Processing diseases:  86%|████████▌ | 113/132 [00:40<00:07,  2.50disease/s]

Processing: Thumb arthritis


Processing diseases:  86%|████████▋ | 114/132 [00:40<00:06,  2.67disease/s]

Processing: Atrial septal defect (ASD)


Processing diseases:  87%|████████▋ | 115/132 [00:40<00:06,  2.76disease/s]

Processing: Autism spectrum disorder


Processing diseases:  88%|████████▊ | 116/132 [00:41<00:05,  2.87disease/s]

Processing: Avascular necrosis (osteonecrosis)


Processing diseases:  89%|████████▊ | 117/132 [00:41<00:05,  2.61disease/s]

Processing: Childhood asthma


Processing diseases:  89%|████████▉ | 118/132 [00:42<00:05,  2.71disease/s]

Processing: Exercise-induced asthma


Processing diseases:  90%|█████████ | 119/132 [00:42<00:04,  2.83disease/s]

Processing: Occupational asthma


Processing diseases:  91%|█████████ | 120/132 [00:42<00:04,  2.99disease/s]

Processing: Atrial tachycardia


Processing diseases:  92%|█████████▏| 121/132 [00:42<00:03,  3.14disease/s]

Processing: Arteriosclerosis / atherosclerosis


Processing diseases:  92%|█████████▏| 122/132 [00:43<00:03,  2.87disease/s]

Processing: Atrioventricular canal defect


Processing diseases:  93%|█████████▎| 123/132 [00:43<00:03,  2.75disease/s]

Processing: Vaginal atrophy


Processing diseases:  94%|█████████▍| 124/132 [00:44<00:02,  2.81disease/s]

Processing: Reactive attachment disorder


Processing diseases:  95%|█████████▍| 125/132 [00:44<00:02,  2.82disease/s]

Processing: Adult attention-deficit/hyperactivity disorder (ADHD)


Processing diseases:  95%|█████████▌| 126/132 [00:44<00:02,  2.73disease/s]

Processing: Auditory processing disorder


Processing diseases:  96%|█████████▌| 127/132 [00:45<00:01,  2.71disease/s]

Processing: Autoimmune epilepsy


Processing diseases:  97%|█████████▋| 128/132 [00:45<00:01,  2.93disease/s]

Processing: Autoimmune encephalitis


Processing diseases:  98%|█████████▊| 129/132 [00:45<00:00,  3.18disease/s]

Processing: Bird flu (avian influenza)


Processing diseases:  98%|█████████▊| 130/132 [00:46<00:00,  3.39disease/s]

Processing: Avascular necrosis (osteonecrosis)


Processing diseases:  99%|█████████▉| 131/132 [00:46<00:00,  3.01disease/s]

Processing: Atrioventricular nodal reentry tachycardia (AVNRT)


Processing diseases: 100%|██████████| 132/132 [00:46<00:00,  2.82disease/s]

CSV file updated successfully with 132 rows





In [None]:
import csv
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm

# Section slugs for aria-labelledby (new design) and pretty names
SECTION_SLUGS = [
    ("diagnosis", "Diagnosis"),
    ("treatment", "Treatment"),
    ("coping-and-support", "Coping and support"),
    ("preparing-for-your-appointment", "Preparing for your appointment"),
    ("lifestyle-and-home-remedies", "Lifestyle and home remedies")
]

def extract_sections(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }

    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        result_sections = {}

        # Try method 1: Old layout (div.content or article#main-content)
        main_content = soup.find('div', class_='content') or soup.find('article', id='main-content')
        if main_content:
            headings = main_content.find_all(['h2', 'h3'])
            for heading in headings:
                heading_text = heading.get_text(strip=True)
                # Fix typo
                if "When to see a dotor" in heading_text:
                    heading_text = "When to see a doctor"
                # If it's a section we care about
                for _, section_name in SECTION_SLUGS:
                    if heading_text == section_name:
                        content = []
                        next_node = heading.find_next_sibling()
                        while next_node and next_node.name not in ['h2', 'h3']:
                            if next_node.name == 'p':
                                content.append(next_node.get_text(strip=True))
                            elif next_node.name in ['ul', 'ol']:
                                items = [li.get_text(strip=True) for li in next_node.find_all('li')]
                                content.extend(items)
                            next_node = next_node.find_next_sibling()
                        if content:
                            result_sections[section_name] = '\n'.join(content)
        # Try method 2: New layout (section[aria-labelledby] + cmp-text__rich-content)
        for slug, section_name in SECTION_SLUGS:
            if section_name in result_sections:
                continue  # Already found by old method
            # Find aria-labelledby section
            section = soup.find('section', {'aria-labelledby': slug})
            if section:
                content_div = section.find('div', class_='cmp-text__rich-content')
                if content_div:
                    paragraphs = [p.get_text(strip=True) for p in content_div.find_all('p')]
                    if paragraphs:
                        result_sections[section_name] = '\n\n'.join(paragraphs)
        return result_sections
    except Exception as e:
        # Skip on error, return empty
        return {}

def update_csv_with_sections(csv_file):
    # Read existing data and headers
    rows = []
    existing_headers = []
    if os.path.exists(csv_file):
        with open(csv_file, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            existing_headers = reader.fieldnames
            rows = list(reader)

    section_headers = [section for slug, section in SECTION_SLUGS]
    all_headers = existing_headers.copy() if existing_headers else ['disease', 'main_link']
    for header in section_headers:
        if header not in all_headers:
            all_headers.append(header)

    # --- SINGLE tqdm bar for ALL rows ---
    updated_rows = []
    with tqdm(total=len(rows), desc="Processing diseases", unit="disease") as pbar:
        for row in rows:
            if 'main_link' in row and row['Diagnosis_treatment_link']:
                # (Optional) tqdm.write() to log messages without disrupting the bar
                tqdm.write(f"Processing: {row.get('disease', 'Unknown')}")
                sections = extract_sections(row['Diagnosis_treatment_link'])
                for section, content in sections.items():
                    row[section] = content
            updated_rows.append(row)
            pbar.update(1)  # Always update once per disease

    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=all_headers)
        writer.writeheader()
        writer.writerows(updated_rows)
    print(f"CSV file updated successfully with {len(updated_rows)} rows")

# Example usage
if __name__ == "__main__":
    csv_file = "mayo_diseases.csv"
    update_csv_with_sections(csv_file)

Processing diseases:   0%|          | 0/132 [00:00<?, ?disease/s]

Processing: Atrial fibrillation


Processing diseases:   1%|          | 1/132 [00:00<00:54,  2.39disease/s]

Processing: Hyperhidrosis


Processing diseases:   2%|▏         | 2/132 [00:01<01:08,  1.90disease/s]

Processing: Bartholin's cyst


Processing diseases:   2%|▏         | 3/132 [00:01<01:04,  2.01disease/s]

Processing: Infant reflux


Processing diseases:   3%|▎         | 4/132 [00:02<01:20,  1.60disease/s]

Processing: Hidradenitis suppurativa


Processing diseases:   4%|▍         | 5/132 [00:02<01:15,  1.69disease/s]

Processing: HIV/AIDS


Processing diseases:   5%|▍         | 6/132 [00:03<01:15,  1.67disease/s]

Processing: Acute myelogenous leukemia


Processing diseases:   5%|▌         | 7/132 [00:04<01:51,  1.12disease/s]

Processing: Guillain-Barre syndrome


Processing diseases:   6%|▌         | 8/132 [00:05<01:43,  1.20disease/s]

Processing: Acute kidney injury


Processing diseases:   7%|▋         | 9/132 [00:06<01:32,  1.32disease/s]

Processing: Acute lymphocytic leukemia


Processing diseases:   8%|▊         | 10/132 [00:06<01:23,  1.46disease/s]

Processing: Acute lymphocytic leukemia


Processing diseases:   8%|▊         | 11/132 [00:07<01:07,  1.78disease/s]

Processing: Acute myelogenous leukemia


Processing diseases:   9%|▉         | 12/132 [00:07<00:59,  2.00disease/s]

Processing: Acute myelogenous leukemia


Processing diseases:  10%|▉         | 13/132 [00:07<00:53,  2.23disease/s]

Processing: Acute myelogenous leukemia


Processing diseases:  11%|█         | 14/132 [00:08<00:49,  2.40disease/s]

Processing: Radiation sickness


Processing diseases:  11%|█▏        | 15/132 [00:08<00:55,  2.10disease/s]

Processing: Radiation sickness


Processing diseases:  12%|█▏        | 16/132 [00:09<00:56,  2.04disease/s]

Processing: Acute kidney injury


Processing diseases:  13%|█▎        | 17/132 [00:10<01:21,  1.40disease/s]

Processing: ARDS


Processing diseases:  14%|█▎        | 18/132 [00:11<01:19,  1.44disease/s]

Processing: Acute sinusitis


Processing diseases:  14%|█▍        | 19/132 [00:12<01:35,  1.18disease/s]

Processing: Autoimmune epilepsy


Processing diseases:  15%|█▌        | 20/132 [00:13<01:31,  1.22disease/s]

Processing: Alcohol use disorder


Processing diseases:  16%|█▌        | 21/132 [00:13<01:33,  1.18disease/s]

Processing: Compulsive gambling


Processing diseases:  17%|█▋        | 22/132 [00:14<01:23,  1.32disease/s]

Processing: Nicotine dependence


Processing diseases:  17%|█▋        | 23/132 [00:15<01:16,  1.43disease/s]

Processing: Mesenteric lymphadenitis


Processing diseases:  18%|█▊        | 24/132 [00:15<01:02,  1.73disease/s]

Processing: Attention-deficit/hyperactivity disorder (ADHD) in children


Processing diseases:  19%|█▉        | 25/132 [00:15<01:00,  1.78disease/s]

Processing: Adult attention-deficit/hyperactivity disorder (ADHD)


Processing diseases:  20%|█▉        | 26/132 [00:16<01:02,  1.69disease/s]

Processing: Frozen shoulder


Processing diseases:  20%|██        | 27/132 [00:16<00:57,  1.84disease/s]

Processing: Childhood schizophrenia


Processing diseases:  21%|██        | 28/132 [00:17<01:00,  1.72disease/s]

Processing: Benign adrenal tumors


Processing diseases:  22%|██▏       | 29/132 [00:18<01:01,  1.69disease/s]

Processing: Autoimmune encephalitis


Processing diseases:  23%|██▎       | 30/132 [00:20<01:37,  1.05disease/s]

Processing: Acute flaccid myelitis (AFM)


Processing diseases:  23%|██▎       | 31/132 [00:22<02:14,  1.33s/disease]

Processing: Dry macular degeneration


Processing diseases:  24%|██▍       | 32/132 [00:22<01:46,  1.07s/disease]

Processing: Macular degeneration, wet


Processing diseases:  25%|██▌       | 33/132 [00:23<01:27,  1.13disease/s]

Processing: Myelofibrosis


Processing diseases:  26%|██▌       | 34/132 [00:23<01:13,  1.32disease/s]

Processing: Guillain-Barre syndrome


Processing diseases:  27%|██▋       | 35/132 [00:24<01:06,  1.45disease/s]

Processing: HIV/AIDS


Processing diseases:  27%|██▋       | 36/132 [00:24<01:00,  1.59disease/s]

Processing: Alcohol use disorder


Processing diseases:  28%|██▊       | 37/132 [00:25<00:54,  1.75disease/s]

Processing: Alcoholic hepatitis


Processing diseases:  29%|██▉       | 38/132 [00:27<01:31,  1.03disease/s]

Processing: Churg-Strauss syndrome


Processing diseases:  30%|██▉       | 39/132 [00:27<01:19,  1.17disease/s]

Processing: Churg-Strauss syndrome


Processing diseases:  30%|███       | 40/132 [00:27<01:02,  1.47disease/s]

Processing: Hay fever


Processing diseases:  31%|███       | 41/132 [00:28<00:55,  1.63disease/s]

Processing: Dust mite allergy


Processing diseases:  32%|███▏      | 42/132 [00:28<00:51,  1.75disease/s]

Processing: Egg allergy


Processing diseases:  33%|███▎      | 43/132 [00:29<00:48,  1.84disease/s]

Processing: Food allergy


Processing diseases:  33%|███▎      | 44/132 [00:29<00:47,  1.85disease/s]

Processing: Latex allergy


Processing diseases:  34%|███▍      | 45/132 [00:30<00:45,  1.93disease/s]

Processing: Milk allergy


Processing diseases:  35%|███▍      | 46/132 [00:30<00:44,  1.93disease/s]

Processing: Mold allergy


Processing diseases:  36%|███▌      | 47/132 [00:31<00:44,  1.93disease/s]

Processing: Nickel allergy


Processing diseases:  36%|███▋      | 48/132 [00:32<00:57,  1.46disease/s]

Processing: Peanut allergy


Processing diseases:  37%|███▋      | 49/132 [00:32<00:51,  1.61disease/s]

Processing: Penicillin allergy


Processing diseases:  38%|███▊      | 50/132 [00:33<00:47,  1.73disease/s]

Processing: Pet allergy


Processing diseases:  39%|███▊      | 51/132 [00:33<00:46,  1.75disease/s]

Processing: Shellfish allergy


Processing diseases:  39%|███▉      | 52/132 [00:34<00:43,  1.84disease/s]

Processing: Wheat allergy


Processing diseases:  40%|████      | 53/132 [00:36<01:19,  1.00s/disease]

Processing: Hair loss


Processing diseases:  41%|████      | 54/132 [00:37<01:08,  1.15disease/s]

Processing: Amyotrophic lateral sclerosis (ALS)


Processing diseases:  42%|████▏     | 55/132 [00:37<00:57,  1.34disease/s]

Processing: Dry socket


Processing diseases:  42%|████▏     | 56/132 [00:37<00:50,  1.50disease/s]

Processing: Atypical genitalia


Processing diseases:  43%|████▎     | 57/132 [00:38<00:47,  1.57disease/s]

Processing: Lazy eye (amblyopia)


Processing diseases:  44%|████▍     | 58/132 [00:39<00:45,  1.64disease/s]

Processing: Chagas disease


Processing diseases:  45%|████▍     | 59/132 [00:39<00:41,  1.74disease/s]

Processing: Transient global amnesia


Processing diseases:  45%|████▌     | 60/132 [00:40<00:58,  1.23disease/s]

Processing: Amnesia


Processing diseases:  46%|████▌     | 61/132 [00:41<00:51,  1.38disease/s]

Processing: Amyloidosis


Processing diseases:  47%|████▋     | 62/132 [00:42<01:07,  1.04disease/s]

Processing: Aplastic anemia


Processing diseases:  48%|████▊     | 63/132 [00:43<01:00,  1.14disease/s]

Processing: Thalassemia


Processing diseases:  48%|████▊     | 64/132 [00:44<00:53,  1.27disease/s]

Processing: Iron deficiency anemia


Processing diseases:  49%|████▉     | 65/132 [00:44<00:50,  1.34disease/s]

Processing: Thalassemia


Processing diseases:  50%|█████     | 66/132 [00:45<00:43,  1.51disease/s]

Processing: Sickle cell anemia


Processing diseases:  51%|█████     | 67/132 [00:46<00:45,  1.43disease/s]

Processing: Vitamin deficiency anemia


Processing diseases:  52%|█████▏    | 68/132 [00:47<00:48,  1.31disease/s]

Processing: Abdominal aortic aneurysm


Processing diseases:  53%|█████▎    | 70/132 [00:47<00:46,  1.33disease/s]

Processing: Brain aneurysm


Processing diseases:  54%|█████▍    | 71/132 [00:48<00:35,  1.72disease/s]

Processing: Popliteal artery aneurysm


Processing diseases:  55%|█████▍    | 72/132 [00:49<00:34,  1.75disease/s]

Processing: Thoracic aortic aneurysm


Processing diseases:  55%|█████▌    | 73/132 [00:49<00:32,  1.80disease/s]

Processing: Vasculitis


Processing diseases:  56%|█████▌    | 74/132 [00:50<00:34,  1.68disease/s]

Processing: Angina


Processing diseases:  57%|█████▋    | 75/132 [00:50<00:33,  1.71disease/s]

Processing: Hives and angioedema


Processing diseases:  58%|█████▊    | 76/132 [00:51<00:31,  1.78disease/s]

Processing: Castleman disease


Processing diseases:  58%|█████▊    | 77/132 [00:51<00:29,  1.84disease/s]

Processing: Broken ankle


Processing diseases:  59%|█████▉    | 78/132 [00:52<00:34,  1.55disease/s]

Processing: Sprained ankle


Processing diseases:  60%|█████▉    | 79/132 [00:53<00:31,  1.67disease/s]

Processing: Tongue-tie (ankyloglossia)


Processing diseases:  61%|██████    | 80/132 [00:53<00:30,  1.68disease/s]

Processing: Anorexia nervosa


Processing diseases:  61%|██████▏   | 81/132 [00:54<00:32,  1.55disease/s]

Processing: ACL injury


Processing diseases:  62%|██████▏   | 82/132 [00:55<00:29,  1.69disease/s]

Processing: Anterior vaginal prolapse (cystocele)


Processing diseases:  63%|██████▎   | 83/132 [00:57<00:51,  1.06s/disease]

Processing: Pseudomembranous colitis


Processing diseases:  64%|██████▎   | 84/132 [00:57<00:42,  1.12disease/s]

Processing: Generalized anxiety disorder


Processing diseases:  64%|██████▍   | 85/132 [00:58<00:37,  1.25disease/s]

Processing: Social anxiety disorder (social phobia)


Processing diseases:  65%|██████▌   | 86/132 [00:58<00:32,  1.41disease/s]

Processing: Abdominal aortic aneurysm


Processing diseases:  66%|██████▌   | 87/132 [00:59<00:26,  1.67disease/s]

Processing: Coarctation of the aorta


Processing diseases:  67%|██████▋   | 88/132 [00:59<00:26,  1.66disease/s]

Processing: Aortic valve regurgitation


Processing diseases:  67%|██████▋   | 89/132 [01:00<00:25,  1.71disease/s]

Processing: Aortic valve regurgitation


Processing diseases:  68%|██████▊   | 90/132 [01:00<00:22,  1.84disease/s]

Processing: Auditory processing disorder


Processing diseases:  69%|██████▉   | 91/132 [01:01<00:21,  1.92disease/s]

Processing: Primary progressive aphasia


Processing diseases:  70%|██████▉   | 92/132 [01:02<00:27,  1.44disease/s]

Processing: Canker sore


Processing diseases:  70%|███████   | 93/132 [01:02<00:24,  1.59disease/s]

Processing: Canker sore


Processing diseases:  71%|███████   | 94/132 [01:03<00:20,  1.81disease/s]

Processing: Broken heart syndrome


Processing diseases:  72%|███████▏  | 95/132 [01:04<00:33,  1.10disease/s]

Processing: Antiphospholipid syndrome


Processing diseases:  73%|███████▎  | 96/132 [01:05<00:28,  1.27disease/s]

Processing: Viral hemorrhagic fevers


Processing diseases:  73%|███████▎  | 97/132 [01:07<00:39,  1.12s/disease]

Processing: Viral hemorrhagic fevers


Processing diseases:  74%|███████▍  | 98/132 [01:07<00:30,  1.13disease/s]

Processing: Broken arm


Processing diseases:  75%|███████▌  | 99/132 [01:08<00:25,  1.27disease/s]

Processing: Chiari malformation


Processing diseases:  76%|███████▌  | 100/132 [01:08<00:24,  1.32disease/s]

Processing: Heart arrhythmia


Processing diseases:  77%|███████▋  | 101/132 [01:09<00:20,  1.48disease/s]

Processing: Giant cell arteritis


Processing diseases:  77%|███████▋  | 102/132 [01:10<00:21,  1.37disease/s]

Processing: Takayasu's arteritis


Processing diseases:  78%|███████▊  | 103/132 [01:10<00:19,  1.45disease/s]

Processing: Thumb arthritis


Processing diseases:  79%|███████▉  | 104/132 [01:11<00:18,  1.49disease/s]

Processing: Osteoarthritis


Processing diseases:  80%|███████▉  | 105/132 [01:12<00:18,  1.42disease/s]

Processing: Gout


Processing diseases:  80%|████████  | 106/132 [01:12<00:16,  1.58disease/s]

Processing: Septic arthritis


Processing diseases:  81%|████████  | 107/132 [01:13<00:15,  1.58disease/s]

Processing: Juvenile idiopathic arthritis


Processing diseases:  82%|████████▏ | 108/132 [01:13<00:14,  1.65disease/s]

Processing: Osteoarthritis


Processing diseases:  83%|████████▎ | 109/132 [01:14<00:12,  1.88disease/s]

Processing: Psoriatic arthritis


Processing diseases:  83%|████████▎ | 110/132 [01:14<00:11,  1.91disease/s]

Processing: Reactive arthritis


Processing diseases:  84%|████████▍ | 111/132 [01:16<00:16,  1.28disease/s]

Processing: Rheumatoid arthritis


Processing diseases:  85%|████████▍ | 112/132 [01:16<00:14,  1.42disease/s]

Processing: Septic arthritis


Processing diseases:  86%|████████▌ | 113/132 [01:17<00:11,  1.58disease/s]

Processing: Thumb arthritis


Processing diseases:  86%|████████▋ | 114/132 [01:17<00:09,  1.84disease/s]

Processing: Atrial septal defect (ASD)


Processing diseases:  87%|████████▋ | 115/132 [01:18<00:09,  1.79disease/s]

Processing: Autism spectrum disorder


Processing diseases:  88%|████████▊ | 116/132 [01:18<00:10,  1.54disease/s]

Processing: Avascular necrosis (osteonecrosis)


Processing diseases:  89%|████████▊ | 117/132 [01:19<00:09,  1.59disease/s]

Processing: Childhood asthma


Processing diseases:  89%|████████▉ | 118/132 [01:20<00:08,  1.58disease/s]

Processing: Exercise-induced asthma


Processing diseases:  90%|█████████ | 119/132 [01:21<00:10,  1.20disease/s]

Processing: Occupational asthma


Processing diseases:  91%|█████████ | 120/132 [01:22<00:09,  1.26disease/s]

Processing: Atrial tachycardia


Processing diseases:  92%|█████████▏| 121/132 [01:25<00:16,  1.54s/disease]

Processing: Arteriosclerosis / atherosclerosis


Processing diseases:  92%|█████████▏| 122/132 [01:25<00:12,  1.24s/disease]

Processing: Atrioventricular canal defect


Processing diseases:  93%|█████████▎| 123/132 [01:26<00:09,  1.05s/disease]

Processing: Vaginal atrophy


Processing diseases:  94%|█████████▍| 124/132 [01:27<00:07,  1.07disease/s]

Processing: Reactive attachment disorder


Processing diseases:  95%|█████████▍| 125/132 [01:28<00:07,  1.05s/disease]

Processing: Adult attention-deficit/hyperactivity disorder (ADHD)


Processing diseases:  95%|█████████▌| 126/132 [01:29<00:05,  1.03disease/s]

Processing: Auditory processing disorder


Processing diseases:  96%|█████████▌| 127/132 [01:29<00:04,  1.21disease/s]

Processing: Autoimmune epilepsy


Processing diseases:  97%|█████████▋| 128/132 [01:30<00:02,  1.39disease/s]

Processing: Autoimmune encephalitis


Processing diseases:  98%|█████████▊| 129/132 [01:30<00:01,  1.52disease/s]

Processing: Bird flu (avian influenza)


Processing diseases:  98%|█████████▊| 130/132 [01:31<00:01,  1.66disease/s]

Processing: Avascular necrosis (osteonecrosis)


Processing diseases:  99%|█████████▉| 131/132 [01:31<00:00,  1.92disease/s]

Processing: Atrioventricular nodal reentry tachycardia (AVNRT)


Processing diseases: 100%|██████████| 132/132 [01:34<00:00,  1.39disease/s]

CSV file updated successfully with 132 rows





In [31]:
extraction_list = ["overview","symptoms","when-to-see-a-doctor","causes","complications","prevention","risk-factors"]

In [40]:
import pandas as pd

In [41]:
df = pd.read_csv("mayo_diseases.csv")

In [42]:
df.isnull().sum()

disease                            0
main_link                          0
Diagnosis_treatment_link           1
Doctors_departments_link          25
Diagnosis                          1
Treatment                          1
Coping and support                76
Preparing for your appointment     6
dtype: int64

In [44]:
df.head()

Unnamed: 0,disease,main_link,Diagnosis_treatment_link,Doctors_departments_link,Diagnosis,Treatment,Coping and support,Preparing for your appointment
0,Atrial fibrillation,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,You may not know you have atrial fibrillation ...,The goals of atrial fibrillation treatment are...,,If you have an irregular or pounding heartbeat...
1,Hyperhidrosis,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Diagnosing hyperhidrosis may start with your h...,Treating hyperhidrosis may start with treating...,Hyperhidrosis can be the cause of discomfort a...,You may start by seeing your primary care prov...
2,Bartholin's cyst,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,"To diagnose a Bartholin's cyst, your doctor ma...",Often a Bartholin's cyst requires no treatment...,,Your first appointment will likely be with eit...
3,Infant reflux,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,,"To diagnose infant reflux, a healthcare profes...","For most babies, making some changes to feedin...",,You may start by seeing your baby's primary he...
4,Hidradenitis suppurativa,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Hidradenitis suppurativa can be mistaken for p...,"Treatment with medicines, surgery or both can ...",Hidradenitis suppurativa can be a challenge to...,You'll likely first see your primary care prov...


In [46]:
import pandas as pd

# Load your CSV
df = pd.read_csv('mayo_diseases.csv')

# List of columns to check for all-null per disease
cols_to_check = [
'Preparing for your appointment'
]

# Find rows where all these columns are null (or empty string, or only whitespace)
mask = df[cols_to_check].isnull() | (df[cols_to_check].applymap(lambda x: str(x).strip() == ""))

# Disease rows where ALL of the specified columns are null/empty
fully_null = df[mask.all(axis=1)]

print(f"Diseases with all info columns missing ({len(fully_null)} found):")


Diseases with all info columns missing (6 found):


  mask = df[cols_to_check].isnull() | (df[cols_to_check].applymap(lambda x: str(x).strip() == ""))


In [47]:
fully_null.head()

Unnamed: 0,disease,main_link,Diagnosis_treatment_link,Doctors_departments_link,Diagnosis,Treatment,Coping and support,Preparing for your appointment
14,Radiation sickness,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,,When a person has experienced known or probabl...,The treatment goals for radiation sickness are...,,
15,Radiation sickness,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,,When a person has experienced known or probabl...,The treatment goals for radiation sickness are...,,
17,ARDS,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,There's no specific test for ARDS. Healthcare ...,The first goal in treating ARDS is to improve ...,"Recovery from ARDS can take time, and you're l...",
69,Aortic aneurysm,https://www.mayoclinic.org/diseases-conditions...,,https://www.mayoclinic.org/diseases-conditions...,,,,
71,Popliteal artery aneurysm,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,"To diagnose popliteal artery aneurysm, a healt...",Treatment of popliteal artery aneurysm depends...,,
