In [3]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import csv
import time

# --- CONFIG ---
SEARCH_TERM = "Paracetamol"  # Change this to any drug
NUM_RESULTS = 3  # Number of results to scrape
CSV_FILENAME = "drugsdotcom_side_effects.csv"

# --- SETUP SELENIUM ---
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.drugs.com/")

# --- SEARCH DRUG ---
search_box = driver.find_element(By.ID, "livesearch-main")
search_box.send_keys(SEARCH_TERM)
driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
time.sleep(2)

# Determine if it’s a direct match or list page
current_url = driver.current_url
drug_links = []

if "search.php" in current_url:
    results = driver.find_elements(By.CSS_SELECTOR, ".ddc-media-list a")[:NUM_RESULTS]
    drug_links = [(r.text.strip(), r.get_attribute("href")) for r in results]
else:
    name = driver.title.split(" -")[0]
    drug_links = [(name, current_url)]

# --- FUNCTION TO SCRAPE SIDE EFFECTS ---
def scrape_side_effects(base_url):
    try:
        driver.get(base_url + "/side-effects.html")
        time.sleep(2)
        paragraphs = driver.find_elements(By.CSS_SELECTOR, ".contentBox p")
        return "\n".join([p.text.strip() for p in paragraphs if p.text.strip()])
    except:
        return "Side effects section not found"

# --- SCRAPE LOOP ---
data = []
for name, link in drug_links:
    print(f"\nScraping side effects for: {name}")
    base_url = link.rstrip(".html")
    side_effects = scrape_side_effects(base_url)
    data.append([name, link, side_effects])
    print("✅ Done")

driver.quit()

# --- SAVE TO CSV ---
with open(CSV_FILENAME, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["Drug Name", "URL", "Side Effects"])
    writer.writerows(data)

print(f"\n📁 Side effects data saved to {CSV_FILENAME}")


TypeError: argument of type 'NoneType' is not iterable

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import csv
import time

# --- CONFIG ---
SEARCH_TERM = "Paracetamol"  # Change to any drug
NUM_RESULTS = 3  # Limit to avoid rate limiting
CSV_FILENAME = "drugsdotcom_full_data.csv"

# --- SETUP SELENIUM ---
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.drugs.com/")

# --- SEARCH DRUG ---
search_box = driver.find_element(By.ID, "livesearch-main")
search_box.send_keys(SEARCH_TERM)
driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
time.sleep(2)

# Determine if it's a direct hit (redirected to drug page) or list of results
current_url = driver.current_url
drug_links = []

if "search.php" in current_url:
    # It's a result list page
    results = driver.find_elements(By.CSS_SELECTOR, ".ddc-media-list a")[:NUM_RESULTS]
    drug_links = [(r.text.strip(), r.get_attribute("href")) for r in results]
else:
    # Direct drug page
    name = driver.title.split(" -")[0]
    drug_links = [(name, current_url)]

# --- HELPER FUNCTION TO FETCH SECTION FROM URL ---
def scrape_section(url_suffix):
    try:
        driver.get(url_suffix)
        time.sleep(2)
        paragraphs = driver.find_elements(By.CSS_SELECTOR, ".contentBox p")
        return "\n".join([p.text.strip() for p in paragraphs if p.text.strip()])
    except:
        return "Section not found"

# --- SCRAPE DETAILS ---
data = []
for name, link in drug_links:
    print(f"\nScraping: {name}\nURL: {link}")
    driver.get(link)
    time.sleep(2)

    # Description
    try:
        description = driver.find_element(By.CSS_SELECTOR, ".contentBox p").text.strip()
    except:
        description = "No description found"

    # Generate section links if base format is consistent
    base_url = link.rstrip(".html")
    side_effects = scrape_section(base_url + "/side-effects.html")
    interactions = scrape_section(base_url + "/drug-interactions.html")
    dosage = scrape_section(base_url + "/dosage.html")

    data.append([name, link, description, side_effects, interactions, dosage])
    print(f"✅ Done: {name}")

driver.quit()

# --- SAVE TO CSV ---
with open(CSV_FILENAME, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["Drug Name", "URL", "Description", "Side Effects", "Interactions", "Dosage"])
    writer.writerows(data)

print(f"\n📁 Data saved to {CSV_FILENAME}")



Scraping: Paracetamol
Paracetamol (Panadol, Calpol, Alvedon) is an analgesic and antipyretic drug that is used to temporarily relieve mild-to-moderate pain and fever. It is commonly included as an ingredient in cold and flu medications and is also used on its own. Paracetamol is exactly the same drug as acetaminophen (Tylenol)....
URL: https://www.drugs.com/paracetamol.html
✅ Done: Paracetamol
Paracetamol (Panadol, Calpol, Alvedon) is an analgesic and antipyretic drug that is used to temporarily relieve mild-to-moderate pain and fever. It is commonly included as an ingredient in cold and flu medications and is also used on its own. Paracetamol is exactly the same drug as acetaminophen (Tylenol)....

Scraping: Side Effects
URL: https://www.drugs.com/sfx/paracetamol-side-effects.html
✅ Done: Side Effects

Scraping: Pregnancy / Breastfeeding
URL: https://www.drugs.com/pregnancy/acetaminophen.html
✅ Done: Pregnancy / Breastfeeding

📁 Data saved to drugsdotcom_full_data.csv


In [15]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import pandas as pd

# Setup headless browser
options = Options()
# options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# Main index URL
index_url = "https://www.drugs.com/drug_information.html"
driver.get(index_url)
time.sleep(2)

# Get all A-Z letter links
letter_links = driver.find_elements(By.CSS_SELECTOR, ".ddc-list-column-2 li a")
letter_urls = [link.get_attribute("href") for link in letter_links]

# Store results
drug_data = []

# Loop over each letter's drug list
for letter_url in letter_urls:
    driver.get(letter_url)
    time.sleep(2)

    # Scroll to load all content (if needed)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)

    # Get all drug links on the page
    drug_links = driver.find_elements(By.CSS_SELECTOR, ".ddc-list-column-2 li a")
    for link in drug_links:
        name = link.text.strip()
        url = link.get_attribute("href")
        drug_data.append({"Drug Name": name, "URL": url})

    print(f"Scraped {len(drug_links)} drugs from {letter_url}")

# Close browser
driver.quit()

# Save to CSV
df = pd.DataFrame(drug_data)
df.to_csv("drugs_index.csv", index=False)
print("Saved to drugs_index.csv")


Saved to drugs_index.csv


In [32]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

# Setup browser
options = Options()
options.add_argument("--headless")  # Uncomment for headless mode
driver = webdriver.Chrome(options=options)

# Target page
drug_url = "https://www.drugs.com/aspirin.html"
driver.get(drug_url)
time.sleep(2)  # Wait for full page load

# Extract drug name
try:
    drug_div = driver.find_element(By.CLASS_NAME, "ddc-pronounce-title")
    drug_name = drug_div.find_element(By.TAG_NAME, "h1").text
    print("Drug Name:", drug_name)
except Exception as e:
    print("Error extracting drug name:", e)

# Extract Warnings section (full text under #warnings)
try:
    warnings_section = driver.find_element(By.ID, "warnings")
    # Some pages use <p> tags, others might use <div> or plain text, so get all text
    warning_text = warnings_section.text
    print("\nWarnings:\n", warning_text)
except Exception as e:
    print("\nError extracting warnings section:", e)

driver.quit()


Drug Name: Aspirin

