In [2]:
import pandas

In [4]:
pip install requests beautifulsoup4 scrapy pandas lxml


Note: you may need to restart the kernel to use updated packages.


In [9]:
import requests
import xml.etree.ElementTree as ET

API_URL = "https://wsearch.nlm.nih.gov/ws/query?db=healthTopics&term=symptoms"

try:
    response = requests.get(API_URL)
    response.raise_for_status()

    root = ET.fromstring(response.content)

    count = root.find('count').text
    print(f"Number of results: {count}")

    for doc in root.findall('.//document'):  # Find all 'document' elements
        title_element = doc.find('content[@name="title"]')
        title = title_element.text if title_element is not None else "No Title"

        # Get the URL from the document element itself (not its children)
        url = doc.get('url', "No URL")  # Correct way to get URL

        print(f"Title: {title}")
        print(f"URL: {url}")
        print("-" * 20)

    with open("medlineplus_data.xml", "wb") as file:
        file.write(response.content)

    print("✅ MedlinePlus Data Collected Successfully!")

except requests.exceptions.RequestException as e:
    print(f"❌ Error fetching data: {e}")
except ET.ParseError as e:
    print(f"❌ Error parsing XML: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

Number of results: 733
Title: Dual <span class="qt1">Diagnosis</span>
URL: https://medlineplus.gov/dualdiagnosis.html
--------------------
Title: Asthma
URL: https://medlineplus.gov/asthma.html
--------------------
Title: Asthma in Children
URL: https://medlineplus.gov/asthmainchildren.html
--------------------
Title: Atrial Fibrillation
URL: https://medlineplus.gov/atrialfibrillation.html
--------------------
Title: Attention Deficit Hyperactivity Disorder
URL: https://medlineplus.gov/attentiondeficithyperactivitydisorder.html
--------------------
Title: Autoimmune Diseases
URL: https://medlineplus.gov/autoimmunediseases.html
--------------------
Title: Bipolar Disorder
URL: https://medlineplus.gov/bipolardisorder.html
--------------------
Title: Carpal Tunnel Syndrome
URL: https://medlineplus.gov/carpaltunnelsyndrome.html
--------------------
Title: Celiac Disease
URL: https://medlineplus.gov/celiacdisease.html
--------------------
Title: Chronic Bronchitis
URL: https://medlineplus.g

In [33]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

API_URL = "https://wsearch.nlm.nih.gov/ws/query?db=healthTopics&term=symptoms"

try:
    response = requests.get(API_URL)
    response.raise_for_status()

    root = ET.fromstring(response.content)

    # Extract data for the DataFrame
    data = []
    for doc in root.findall('.//document'):
        title_element = doc.find('content[@name="title"]')
        title = title_element.text if title_element is not None else "No Title"
        url = doc.get('url', "No URL")

        # Extract other content elements
        content_elements = doc.findall('content')
        content_dict = {}
        for content in content_elements:
            name = content.get('name')
            text = content.text
            content_dict[name] = text

        # Combine extracted data
        row_data = {'title': title, 'url': url, **content_dict}  # Merge title, url, and other content
        data.append(row_data)

    df = pd.DataFrame(data)

    df.to_csv("medlineplus_data.csv", index=False, encoding="utf-8")  # Save to CSV
    print("✅ Data saved as CSV for further processing!")

except requests.exceptions.RequestException as e:
    print(f"❌ Error fetching data: {e}")
except ET.ParseError as e:
    print(f"❌ Error parsing XML: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

✅ Data saved as CSV for further processing!


In [34]:
df.head(5)

Unnamed: 0,title,url,organizationName,FullSummary,mesh,groupName,snippet,altTitle
0,"Dual <span class=""qt1"">Diagnosis</span>",https://medlineplus.gov/dualdiagnosis.html,National Library of Medicine,"What is dual <span class=""qt1"">diagnosis</span...","<span class=""qt1"">Diagnosis</span>, Dual (Psyc...",Substance Use and Disorders,"What is dual <span class=""qt1"">diagnosis</span...",
1,Asthma,https://medlineplus.gov/asthma.html,National Library of Medicine,What is asthma?<p>Asthma is a chronic (long-te...,Asthma,Immune System,What is asthma? Asthma is a chronic (long-term...,Bronchial Asthma
2,Asthma in Children,https://medlineplus.gov/asthmainchildren.html,National Library of Medicine,What is asthma? <p>Asthma is a chronic (long-t...,Asthma,Immune System,What is asthma? Asthma is a chronic (long-ter...,Pediatric asthma
3,Atrial Fibrillation,https://medlineplus.gov/atrialfibrillation.html,National Library of Medicine,What is atrial fibrillation (AFib)?<p>Atrial f...,Atrial Fibrillation,"Blood, Heart and Circulation",What is atrial fibrillation (AFib)? Atrial fib...,AFib
4,Attention Deficit Hyperactivity Disorder,https://medlineplus.gov/attentiondeficithypera...,National Library of Medicine,What is attention deficit hyperactivity disord...,Attention Deficit Disorder with Hyperactivity,Brain and Nerves,What is attention deficit hyperactivity disord...,Hyperactivity


In [37]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re  # For regular expressions

URL = "https://www.webmd.com/cold-and-flu/default.htm"  # Or any other WebMD symptoms page

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

try:
    response = requests.get(URL, headers=HEADERS)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    symptom_list = []

    # Strategy 1: Look for common symptom list structures (adapt as needed)
    for element in soup.find_all(["li", "p", "div"], class_=re.compile(r".*\bsymptoms?\b.*", re.IGNORECASE)): #find elements with class names that contain "symptom" or "symptoms"
        text = element.get_text(strip=True)
        if text:
            symptom_list.append(text)

    # Strategy 2: Look for common symptom text within elements (more general)
    if not symptom_list: #if strategy 1 didn't work
        for element in soup.find_all(["li", "p", "div"]): #find all common elements
            text = element.get_text(strip=True)
            if re.search(r".*\bsymptoms?\b.*", text, re.IGNORECASE): #check if the text contains "symptom" or "symptoms"
                symptom_list.append(text)

    # Strategy 3: Look for any list-like elements (least specific)
    if not symptom_list: #if strategy 2 didn't work
      for element in soup.find_all("li"):
        text = element.get_text(strip=True)
        if text:
          symptom_list.append(text)

    if symptom_list:
        df = pd.DataFrame(symptom_list, columns=["Symptoms"])
        df.to_csv("webmd_symptoms.csv", index=False, encoding="utf-8")
        print("✅ Successfully Scraped WebMD Symptoms Data!")
    else:
        print("❌ No symptoms found on the page. Check the selectors.")

except requests.exceptions.RequestException as e:
    print(f"❌ Error fetching WebMD data: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

✅ Successfully Scraped WebMD Symptoms Data!


In [53]:
df.Symptoms[0]

"Skip to main contentHomeConditionsBackConditionsView AllADD/ADHDAllergiesArthritisAtrial fibrillationBreast CancerCancerCrohn's DiseaseDepressionDiabetesDVTEczemaEye HealthHeart DiseaseHIV & AIDSLung DiseaseLupusMental HealthMultiple SclerosisMigrainePain ManagementPsoriasisPsoriatic ArthritisRheumatoid ArthritisSexual ConditionsSkin ProblemsSleep DisordersUlcerative ColitisView AllDrugs & SupplementsBackDrugs & SupplementsDrugsSupplementsPill IdentifierInteraction CheckerWell-BeingBackWell-BeingAging WellBabyBirth ControlChildren's HealthDiet & Weight ManagementFitness & ExerciseFood & RecipesHealth & BalanceHealthy BeautyMen's HealthParentingPet HealthPregnancySex & RelationshipsTeen HealthWomen's HealthSymptom CheckerFind a DoctorMoreBackMoreNewsBlogsPodcastsWebinarsNewslettersWebMD MagazineBest HospitalsSupport GroupsPrivacy & MoreSubscribeLog InSearchSubscribeCold & Flu Resource CenterTools and resources to help you understand and treat cold and flu symptoms.Ear InfectionCold and

In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = "https://www.mayoclinic.org/diseases-conditions/index"  # Or a more specific index page
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

try:
    response = requests.get(URL, headers=HEADERS)
    response.raise_for_status()  # Check for HTTP errors

    soup = BeautifulSoup(response.content, "html.parser") #response.content for web scraping

    disease_list = []

    # More robust way to find disease links (several strategies)
    diseases = soup.find_all("a", class_=lambda x: x and "index-list__item" in x) #find all a tags with class names that contain "index-list__item"
    if diseases:
      for disease in diseases:
        disease_name = disease.get_text(strip=True)
        disease_list.append(disease_name)
    else: #if the previous strategy didn't work
      diseases = soup.find_all("a", href=lambda href: href and "/diseases-conditions/" in href) #find all a tags with hrefs that contain "/diseases-conditions/"
      if diseases:
        for disease in diseases:
          disease_name = disease.get_text(strip=True)
          disease_list.append(disease_name)
      else: #if the previous strategy didn't work
        diseases = soup.find_all("a") #find all a tags
        if diseases:
          for disease in diseases:
            disease_name = disease.get_text(strip=True)
            if disease_name and any(char.isalpha() for char in disease_name): #check if the text is not empty and contains at least one alphabet character
              disease_list.append(disease_name)

    if disease_list:
        df = pd.DataFrame(disease_list, columns=["Diseases"])
        df.to_csv("mayo_clinic_diseases.csv", index=False, encoding="utf-8")
        print("✅ Successfully Scraped Mayo Clinic Diseases Data!")
    else:
        print("❌ No diseases found on the page. Check the selectors.")

except requests.exceptions.RequestException as e:
    print(f"❌ Error fetching Mayo Clinic data: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

✅ Successfully Scraped Mayo Clinic Diseases Data!


In [55]:
df.Symptoms[0]

"Skip to main contentHomeConditionsBackConditionsView AllADD/ADHDAllergiesArthritisAtrial fibrillationBreast CancerCancerCrohn's DiseaseDepressionDiabetesDVTEczemaEye HealthHeart DiseaseHIV & AIDSLung DiseaseLupusMental HealthMultiple SclerosisMigrainePain ManagementPsoriasisPsoriatic ArthritisRheumatoid ArthritisSexual ConditionsSkin ProblemsSleep DisordersUlcerative ColitisView AllDrugs & SupplementsBackDrugs & SupplementsDrugsSupplementsPill IdentifierInteraction CheckerWell-BeingBackWell-BeingAging WellBabyBirth ControlChildren's HealthDiet & Weight ManagementFitness & ExerciseFood & RecipesHealth & BalanceHealthy BeautyMen's HealthParentingPet HealthPregnancySex & RelationshipsTeen HealthWomen's HealthSymptom CheckerFind a DoctorMoreBackMoreNewsBlogsPodcastsWebinarsNewslettersWebMD MagazineBest HospitalsSupport GroupsPrivacy & MoreSubscribeLog InSearchSubscribeCold & Flu Resource CenterTools and resources to help you understand and treat cold and flu symptoms.Ear InfectionCold and

In [23]:
import pandas as pd

# Load CSV files
medlineplus_df = pd.read_csv("medlineplus_data.csv")
webmd_df = pd.read_csv("webmd_symptoms.csv")
mayo_df = pd.read_csv("mayo_clinic_diseases.csv")

# 1. Clean and prepare the data for merging (crucial!)

# MedlinePlus: Extract disease name (adapt if needed)
medlineplus_df['DiseaseName'] = medlineplus_df['title'].str.replace(r'<.*?>', '', regex=True).str.strip()  # Remove HTML tags, strip whitespace
medlineplus_df = medlineplus_df[['DiseaseName', 'url', 'FullSummary', 'snippet']] #select only the needed columns

# WebMD: Extract disease name (adapt if needed)
webmd_df['DiseaseName'] = webmd_df['Symptoms'].str.extract(r'(.+?)(?:symptoms?|signs?|causes?|treatments?|complications?|risk factors?)?\s*$', expand=False).str.strip() #extract the disease name from the symptoms column
webmd_df = webmd_df[['DiseaseName', 'Symptoms']] #select only the needed columns

# Mayo Clinic: Clean disease names (adapt if needed)
mayo_df['DiseaseName'] = mayo_df['Diseases'].str.strip()
mayo_df = mayo_df[['DiseaseName', 'Diseases']] #select only the needed columns

# 2. Merge DataFrames (using 'DiseaseName' as the key)

# Merge MedlinePlus and WebMD
merged_df = pd.merge(medlineplus_df, webmd_df, on='DiseaseName', how='outer')

# Merge with Mayo Clinic data
merged_df = pd.merge(merged_df, mayo_df, on='DiseaseName', how='outer')

# 3. Clean up and save

# Remove duplicate rows (if any)
merged_df = merged_df.drop_duplicates()

# Save the final dataset
merged_df.to_csv("final_medical_data.csv", index=False, encoding="utf-8")
print("✅ Final Medical Data Combined and Saved!")

✅ Final Medical Data Combined and Saved!


In [29]:
merged_df

Unnamed: 0,DiseaseName,url,FullSummary,snippet,Symptoms,Diseases
0,#,,,,,#
1,1 of 2 /Common ColdView AllWhat Is a Cold?A co...,,,,1 of 2 /Common ColdView AllWhat Is a Cold?A co...,
5,4 Ways You’re Making Your Cold WorseIf you can...,,,,4 Ways You’re Making Your Cold WorseIf you can...,
7,A,,,,,A
8,A doctor can usually tell if you have the flu ...,,,,A doctor can usually tell if you have the flu ...,
...,...,...,...,...,...,...
304,X,,,,,X
305,Y,,,,,Y
306,Z,,,,,Z
307,العربية,,,,,العربية


In [69]:
merged_df.Diseases

0            #
1          NaN
5          NaN
7            A
8          NaN
        ...   
304          X
305          Y
306          Z
307    العربية
309       简体中文
Name: Diseases, Length: 237, dtype: object

In [73]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.mayoclinic.org/diseases-conditions/common-cold/symptoms-causes/syc-20351605" # Example URL
try:
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the specific section (this is where you'll need to inspect!)
    symptoms_section = soup.find("div", class_="main-content") #find the div with class "main-content"
    if symptoms_section:
      symptoms_list = symptoms_section.find_all("li") #find all li tags within the selected section
      symptoms = [li.get_text(strip=True) for li in symptoms_list]
      if symptoms:
        df = pd.DataFrame(symptoms, columns=["text"])
        df.to_csv("mayoclinic_cold.csv", index=False, encoding="utf-8")
        print("✅ Successfully Scraped Mayo Clinic Cold Data!")
      else:
        print("❌ No symptoms found on the page. Check the selectors.")
    else:
      print("❌ No symptoms section found on the page. Check the selectors.")

except requests.exceptions.RequestException as e:
    print(f"❌ Error: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

❌ No symptoms section found on the page. Check the selectors.
