In [31]:
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv #
from pathlib import Path

In [16]:
# Section landing page that the CNN scraping cell navigates to with `page.goto(URL, ...)`.
URL = "https://edition.cnn.com/health"
# User-Agent string applied to the Playwright browser context via `new_context(user_agent=USERAGENT)`.
# NOTE(review): this string identifies as Edge/Chromium on Windows, while the CNN scraping
# cell launches Firefox — confirm the mismatch is intentional.
USERAGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0'


In [None]:
def save_to_csv(headlines, filename="../data/politics.csv"):
    """
    Save headlines to a one-column CSV file with a ``Headline`` header row.

    The parent directory of ``filename`` is created if it does not already
    exist, and an existing file at that path is overwritten.

    Args:
        headlines: Any iterable of headline values (list, tuple, generator, ...).
            Each item becomes one row of the file.
        filename: Destination path of the CSV file.

    Returns:
        None. The outcome is reported with ``print``. Errors raised while
        reading ``headlines`` or writing the file are caught and printed
        instead of being propagated; a failure to create the parent directory
        still raises.
    """
    filepath = Path(filename)

    # Make sure the destination directory exists before opening the file.
    filepath.parent.mkdir(parents=True, exist_ok=True)

    try:
        # Materialise the rows up front: this lets unsized iterables such as
        # generators be counted for the success message, and means a broken
        # iterable fails before the target file is truncated.
        rows = [[headline] for headline in headlines]

        with open(filepath, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Headline'])  # Header row
            writer.writerows(rows)         # One headline per row

        print(f"Successfully saved {len(rows)} headlines to {filepath}")
    except Exception as e:
        # Deliberately best-effort: a failed save is reported but must not
        # abort the surrounding scraping cell.
        print(f"An error occurred while saving the file: {e}")


In [18]:
async with async_playwright() as p:
        try:
            browser = await p.firefox.launch(headless=False)
            context = await browser.new_context(user_agent=USERAGENT)
            page = await context.new_page()

            print(f"Navigating to: {URL}")
            await page.goto(URL, wait_until="domcontentloaded")
            await page.wait_for_selector('.container__headline', state='visible', timeout=60000)

            html_content = await page.content()
            doc = BeautifulSoup(html_content, "html.parser")

            headlines = []
            # Find all elements that contain a headline
            headline_elements = doc.select('span.container__headline-text')

            # Extract headlines until we have 50 or run out of elements
            for element in headline_elements:
                headline_text = element.get_text(strip=True)
                if headline_text:
                    headlines.append(headline_text)
                if len(headlines) >= 250:
                    break

            for i, headline in enumerate(headlines):
                print(f"{i+1}. {headline}")

            print(f"\nSuccessfully scraped {len(headlines)} headlines.")
            
            save_to_csv(headlines)

        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            if 'browser' in locals():
                await browser.close()

Navigating to: https://edition.cnn.com/health
1. Flesh-eating bacteria cases are rising. Climate change is to blame, say scientists
2. Trump administration threatens to pull federal funding from sex ed programs that mention transgender people
3. Pain relief comes from more than meds. Dr. Sanjay Gupta explains your body’s natural painkillers
4. Blue states that sued kept most CDC grants, while red states feel brunt of Trump clawbacks
5. New GLP-1 pill for weight loss may be closer to reality as drugmaker Eli Lilly says it will file for regulatory approval
6. Rare human case of flesh-eating parasite New World screwworm identified in US
7. Thinking about trying out a weighted vest? Watch this first
8. Your AI therapist might be illegal soon. Here’s why
9. Good Chop’s meat and seafood delivery service is the missing piece to your meal prep routine
10. 5 easy ways college students — and everyone else — can practice sustainability
11. Type 2 diabetes prevented with adherence to Mediterranean

In [39]:
import asyncio
from playwright.async_api import async_playwright

"""
Collects unique headlines from the paginated business section of Time magazine
(target: 250) using Playwright's async API.
"""
base_url = "https://time.com/section/business/"
desired_headlines = 250
headlines = []
page_number = 1

# Selector for the headline on the page
HEADLINE_SELECTOR = 'article h2'

async with async_playwright() as p:
    # Launch a headless browser
    browser = await p.firefox.launch(headless=False)
    page = await browser.new_page()
    
    print("Starting to fetch headlines...")

    while len(headlines) < desired_headlines:
        # Construct the URL for the current page using the correct query parameter
        url = f"{base_url}?page={page_number}"
        
        print(f"Navigating to {url}...")
        
        try:
            await page.goto(url)
            
            # Wait for the headlines to be present on the page
            await page.wait_for_selector(HEADLINE_SELECTOR)
            
            # Scrape all currently visible headlines
            elements = await page.query_selector_all(HEADLINE_SELECTOR)
            
            for element in elements:
                headline_text = await element.text_content()
                if headline_text and headline_text.strip() not in headlines:
                    headlines.append(headline_text.strip())
            
            print(f"Collected {len(elements)} headlines from page {page_number}.")
            print(f"Total headlines collected so far: {len(headlines)}")
            
            # Check if we have collected enough headlines
            if len(headlines) >= desired_headlines:
                break
            
            page_number += 1

        except Exception as e:
            print(f"An error occurred on page {page_number}: {e}")
            print("Stopping the scraping process.")
            break

    await browser.close()

print("\n------------------------------------------------\n")
print(f"Successfully collected {len(headlines)} headlines.")

# Print the collected headlines
for i, title in enumerate(headlines[:desired_headlines]):
    print(f"{i+1}. {title}")



Starting to fetch headlines...
Navigating to https://time.com/section/business/?page=1...
Collected 15 headlines from page 1.
Total headlines collected so far: 15
Navigating to https://time.com/section/business/?page=2...
Collected 15 headlines from page 2.
Total headlines collected so far: 30
Navigating to https://time.com/section/business/?page=3...
Collected 15 headlines from page 3.
Total headlines collected so far: 45
Navigating to https://time.com/section/business/?page=4...
Collected 15 headlines from page 4.
Total headlines collected so far: 60
Navigating to https://time.com/section/business/?page=5...
Collected 15 headlines from page 5.
Total headlines collected so far: 75
Navigating to https://time.com/section/business/?page=6...
Collected 15 headlines from page 6.
Total headlines collected so far: 90
Navigating to https://time.com/section/business/?page=7...
Collected 15 headlines from page 7.
Total headlines collected so far: 105
Navigating to https://time.com/section/busin

In [None]:
# Write the collected headlines to the business CSV.
# NOTE(review): `headlines` is a notebook-global assigned by both scraping cells, so this
# saves whichever one ran most recently — run it directly after the Time business cell.
save_to_csv(headlines, filename="../data/business.csv")

Successfully saved 255 headlines to ../data1/business.csv
