# In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import os
import schedule
import time

# Base URL of the Yahoo Finance news listing; paginated page URLs are built
# from it by appending "/?page=<n>".
BASE_URL = "https://finance.yahoo.com/news"

# Scraping function (named for Reuters, but it fetches the pages under BASE_URL, i.e. Yahoo Finance news)
def _parse_article_date(date_str):
    """
    Parses an ISO-8601 timestamp into a naive UTC datetime.

    Returns None when the string cannot be parsed, so the caller can pick a
    fallback instead of aborting the whole scrape.
    """
    # datetime.fromisoformat() rejects a trailing "Z" before Python 3.11.
    if date_str.endswith(("Z", "z")):
        date_str = date_str[:-1] + "+00:00"
    try:
        parsed = datetime.datetime.fromisoformat(date_str)
    except ValueError:
        return None
    # Normalise aware values to naive UTC so that every stored date is
    # mutually comparable (mixing aware and naive datetimes raises TypeError).
    if parsed.tzinfo is not None:
        parsed = parsed.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return parsed


def scrape_reuters_news(max_pages=50):
    """
    Scrapes financial news articles from the paginated listing at BASE_URL.

    Despite the function name, the pages fetched are the ones under BASE_URL;
    the "Source" field is still recorded as "Reuters" for compatibility with
    previously saved data.

    Args:
        max_pages: Upper bound on the number of listing pages to request.

    Returns:
        A list of dicts with the keys "Headline", "Link", "Date" (naive UTC
        datetime), "Source" and "Category". The list is empty when nothing
        could be scraped.
    """
    from urllib.parse import urljoin

    news_data = []
    seen_links = set()

    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")
        url = f"{BASE_URL}/?page={page}"  # Construct URL for pagination
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise exception for bad responses (4xx or 5xx)
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            break  # Exit loop on request failure

        print(f"Status code for page {page}: {response.status_code}")

        if response.status_code != 200:
            print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
            break  # Stop if a page fails to load

        soup = BeautifulSoup(response.content, 'html.parser')
        # Each article is expected in a <div> carrying exactly this class string.
        # NOTE(review): the "yf-..." suffixes look build-generated; if this
        # returns nothing on page 1, the selectors probably need refreshing.
        articles = soup.find_all('div', class_='content yf-1qcp8cc')

        if not articles:  # If no articles are found, we can stop
            print("No more articles found. Ending scraping.")
            break

        print(f"Found {len(articles)} articles on page {page}.")

        new_on_page = 0
        for article in articles:
            # Extract headline
            headline_tag = article.find('h3', class_='clamp tw-line-clamp-3 sm:tw-line-clamp-2 yf-1qcp8cc')
            if not headline_tag:
                print("No headline found. Skipping article.")
                continue
            headline = headline_tag.text.strip()

            # Extract link
            link_tag = article.find('a', class_='subtle-link fin-size-small titles noUnderline yf-1e4diqp')
            href = link_tag.get('href') if link_tag else None
            if not href:
                print("No link found. Skipping article.")
                continue
            # Resolve against the page that was fetched: absolute hrefs are
            # kept as-is and relative ones get the correct host.
            link = urljoin(url, href)

            # Skip articles already collected from an earlier page.
            if link in seen_links:
                continue
            seen_links.add(link)

            # Extract datetime (published date)
            date = None
            date_tag = article.find('time')
            if date_tag and 'datetime' in date_tag.attrs:
                date = _parse_article_date(date_tag['datetime'])
            if date is None:
                # Fallback to the current time (naive UTC, like parsed dates)
                date = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

            # Append the data to the news list
            news_data.append({
                "Headline": headline,
                "Link": link,
                "Date": date,
                "Source": "Reuters",  # NOTE(review): data comes from BASE_URL; confirm label
                "Category": "Financial"
            })
            new_on_page += 1

        # If the server ignores the "page" parameter, every request returns the
        # same articles; stop instead of looping until max_pages.
        if new_on_page == 0:
            print("No new articles on this page. Ending scraping.")
            break
    else:
        print(f"Reached the page limit ({max_pages}). Ending scraping.")

    print(f"Scraping completed. Total articles scraped: {len(news_data)}")
    return news_data

# Filter news by the last 2 years (if needed)
def filter_news_by_date(news_data):
    """
    Filters the news data to include only articles from the last 2 years.

    Args:
        news_data: Records accepted by ``pd.DataFrame`` (for example a list of
            dicts) that carry a "Date" field. Dates may be timezone-aware or
            naive; naive values are interpreted as UTC.

    Returns:
        A DataFrame holding only the rows whose "Date" is within the last
        730 days. The "Date" column is returned as naive UTC timestamps. An
        empty input yields an empty DataFrame.
    """
    df = pd.DataFrame(news_data)

    # An empty input produces a frame without a 'Date' column; nothing to filter.
    if df.empty and 'Date' not in df.columns:
        return df

    # Convert everything to UTC first so aware and naive values can coexist,
    # then drop the timezone to keep a plain datetime column.
    df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.tz_convert(None)

    # Define the cutoff date (2 years ago from now), also as naive UTC
    two_years_ago = pd.Timestamp.now(tz='UTC').tz_localize(None) - pd.Timedelta(days=365 * 2)
    df = df[df['Date'] >= two_years_ago]

    return df

# Save news to CSV
def save_to_csv(news_data, filename='reuters_news_data.csv'):
    """
    Saves the news data to a CSV file. Appends to the file if it exists.

    The header row is written only when the file is missing or empty, so
    repeated calls build one table with a single header.

    Args:
        news_data: Records accepted by ``pd.DataFrame`` (a list of dicts or a
            DataFrame).
        filename: Path of the CSV file to create or append to.
    """
    df = pd.DataFrame(news_data)

    # A frame without columns has nothing to write; writing it anyway would
    # create a headerless file that later appends could never repair.
    if len(df.columns) == 0:
        return

    # Treat an existing zero-byte file like a missing one so it gets a header.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    # Append mode creates the file when it does not exist yet.
    df.to_csv(filename, mode='a', header=needs_header, index=False)
# Scheduler for automated scraping
def schedule_scraping(filename='reuters_news_data.csv'):
    """
    Runs one scrape -> filter -> save cycle and writes the result to `filename`.

    When the scraper returns nothing, no filtering or saving takes place.
    """
    print("Starting scheduled scraping...")

    scraped = scrape_reuters_news()
    if not scraped:
        return

    recent_only = filter_news_by_date(scraped)
    save_to_csv(recent_only, filename)
    print(f"Data saved to {filename}.")

# Main function for automation
def automate_scraping():
    """
    Starts a single scraping run right away, saving to the default CSV file.
    """
    output_file = 'reuters_news_data.csv'
    print("Starting scraping immediately...")
    schedule_scraping(filename=output_file)

# Run the automation only when this file is executed directly, not when it is imported
if __name__ == "__main__":
    automate_scraping()


# Starting scraping immediately...
# Starting scheduled scraping...
# Scraping page 1...
# Status code for page 1: 200
# No more articles found. Ending scraping.
# Scraping completed. Total articles scraped: 0
