In [4]:
import requests
from bs4 import BeautifulSoup
import json
import time

class NewsScraper:
    """Scrape headline links from a listing page and the text of each linked article.

    Selectors are plain dicts of keyword arguments that are unpacked into
    BeautifulSoup's ``find_all`` / ``find``
    (for example ``{"name": "a", "class_": "headline"}``).
    """

    def __init__(self, timeout=10):
        """
        Args:
            timeout (float): Seconds to wait for each HTTP response.
        """
        self.headers = {
            "User-Agent": "AsmaNewsScraper/1.0 (+mailto:asma@example.com)"
        }
        # requests.get has no default timeout; without one a stalled server
        # would block the notebook cell forever.
        self.timeout = timeout

    @staticmethod
    def _resolve_link(base_url, href):
        """Return ``href`` as an absolute URL resolved against ``base_url``."""
        # Local import keeps this class self-contained within its cell.
        from urllib.parse import urljoin

        return urljoin(base_url, href)

    def fetch_articles(self, url, title_selector, content_selector=None, max_articles=5):
        """
        Fetches articles from the specified website.

        Args:
            url (str): The URL of the listing page to scrape.
            title_selector (dict): Keyword arguments for ``soup.find_all`` that
                match the anchor tags carrying article titles and links.
            content_selector (dict): Keyword arguments for ``soup.find`` that
                match the article content element on each article page.
            max_articles (int): Maximum number of articles to fetch
                (``None`` means no limit).

        Returns:
            list: A list of dicts with 'title', 'link', and 'content' keys.
            An empty list is returned if the listing page cannot be fetched.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching articles from {url}: {e}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        article_list = []

        for anchor in soup.find_all(**title_selector):
            if max_articles is not None and len(article_list) >= max_articles:
                break

            href = anchor.get('href')
            if not href:
                # A matched tag without a link cannot be followed; skip it
                # instead of raising KeyError and losing every article.
                continue

            title = anchor.get_text(strip=True)
            # Listing pages often use relative links; resolve against the page URL.
            link = self._resolve_link(url, href)
            content = self.fetch_article_content(link, content_selector)
            article_list.append({'title': title, 'link': link, 'content': content})
            time.sleep(2)  # Rate-limiting to avoid overwhelming the server

        return article_list

    def fetch_article_content(self, url, content_selector):
        """
        Fetches the content of an article.

        Args:
            url (str): The article URL.
            content_selector (dict): Keyword arguments for ``soup.find`` that
                match the content element; may be ``None``.

        Returns:
            str: The text of the first matching element, or a placeholder
            message when nothing matches or the request fails.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching content from {url}: {e}")
            return "Error fetching content."

        soup = BeautifulSoup(response.text, 'html.parser')
        if content_selector:
            content_div = soup.find(**content_selector)
            if content_div:
                return content_div.get_text(strip=True)
        return "Content not found or format not supported."

    def save_to_json(self, articles, filename):
        """
        Saves articles to a JSON file.

        Args:
            articles (list | dict): JSON-serializable articles to save, e.g. a
                list of article dicts or a dict mapping source name to a list.
            filename (str): Name of the JSON file.
        """
        # json.dump escapes non-ASCII by default, so the explicit encoding
        # only makes the file independent of the platform default.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=4)
        print(f"Articles saved to {filename}")

# Example Usage
if __name__ == "__main__":
    scraper = NewsScraper()

    # TechCrunch: headline anchors, then the summary paragraph of each story.
    techcrunch_titles = dict(name="a", class_="loop-card__title-link")
    techcrunch_body = dict(name="p", id="speakable-summary")
    techcrunch_articles = scraper.fetch_articles(
        url="https://techcrunch.com/",
        title_selector=techcrunch_titles,
        content_selector=techcrunch_body,
    )

    # The Verge: compact entry-box titles, then the article body component.
    verge_titles = dict(name="a", class_="c-entry-box--compact__title")
    verge_body = dict(name="div", class_="duet--article--article-body-component")
    verge_articles = scraper.fetch_articles(
        url="https://www.theverge.com/tech",
        title_selector=verge_titles,
        content_selector=verge_body,
    )

    # Group the results by source and write them out as one JSON document.
    all_articles = dict(techcrunch=techcrunch_articles, theverge=verge_articles)
    scraper.save_to_json(all_articles, "news_articles.json")


Articles saved to news_articles.json


In [11]:
import requests
from bs4 import BeautifulSoup
import json
import time

class NewsScraper:
    """Scrape headline links from a listing page and the text of each linked article.

    Selectors are plain dicts of keyword arguments that are unpacked into
    BeautifulSoup's ``find_all`` / ``find``
    (for example ``{"name": "a", "class_": "headline"}``).
    """

    def __init__(self, timeout=10):
        """
        Args:
            timeout (float): Seconds to wait for each HTTP response.
        """
        self.headers = {
            "User-Agent": "AsmaNewsScraper/1.0 (+mailto:asma@example.com)"
        }
        # requests.get has no default timeout; without one a stalled server
        # would block the notebook cell forever.
        self.timeout = timeout

    @staticmethod
    def _resolve_link(base_url, href):
        """Return ``href`` as an absolute URL resolved against ``base_url``."""
        # Local import keeps this class self-contained within its cell.
        from urllib.parse import urljoin

        return urljoin(base_url, href)

    def fetch_articles(self, url, title_selector, content_selector=None, max_articles=5):
        """
        Fetches articles from the specified website.

        Args:
            url (str): The URL of the listing page to scrape.
            title_selector (dict): Keyword arguments for ``soup.find_all`` that
                match the anchor tags carrying article titles and links.
            content_selector (dict): Keyword arguments for ``soup.find`` that
                match the article content element on each article page.
            max_articles (int): Maximum number of articles to fetch
                (``None`` means no limit).

        Returns:
            list: A list of dicts with 'title', 'link', and 'content' keys.
            An empty list is returned if the listing page cannot be fetched.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching articles from {url}: {e}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        article_list = []

        for anchor in soup.find_all(**title_selector):
            if max_articles is not None and len(article_list) >= max_articles:
                break

            href = anchor.get('href')
            if not href:
                # A matched tag without a link cannot be followed; skip it
                # instead of raising KeyError and losing every article.
                continue

            title = anchor.get_text(strip=True)
            # Resolve relative links against the page being scraped rather
            # than a fixed site, so the same scraper works for every source.
            link = self._resolve_link(url, href)
            content = self.fetch_article_content(link, content_selector)
            article_list.append({'title': title, 'link': link, 'content': content})
            time.sleep(2)  # Rate-limiting to avoid overwhelming the server

        return article_list

    def fetch_article_content(self, url, content_selector):
        """
        Fetches the content of an article.

        Args:
            url (str): The article URL.
            content_selector (dict): Keyword arguments for ``soup.find`` that
                match the content element; may be ``None``.

        Returns:
            str: The text of the first matching element, or a placeholder
            message when nothing matches or the request fails.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching content from {url}: {e}")
            return "Error fetching content."

        soup = BeautifulSoup(response.text, 'html.parser')
        if content_selector:
            content_div = soup.find(**content_selector)
            if content_div:
                return content_div.get_text(strip=True)
        return "Content not found or format not supported."

    def save_to_json(self, articles, filename):
        """
        Saves articles to a JSON file.

        Args:
            articles (list | dict): JSON-serializable articles to save, e.g. a
                list of article dicts or a dict mapping source name to a list.
            filename (str): Name of the JSON file.
        """
        # json.dump escapes non-ASCII by default, so the explicit encoding
        # only makes the file independent of the platform default.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=4)
        print(f"Articles saved to {filename}")

# Example Usage
if __name__ == "__main__":
    scraper = NewsScraper()

    # TechCrunch: headline anchors, then the summary paragraph of each story.
    techcrunch_titles = dict(name="a", class_="loop-card__title-link")
    techcrunch_body = dict(name="p", id="speakable-summary")
    techcrunch_articles = scraper.fetch_articles(
        url="https://techcrunch.com/",
        title_selector=techcrunch_titles,
        content_selector=techcrunch_body,
    )

    # The Verge: the anchors are matched on the full class attribute string.
    verge_titles = dict(
        name="a",
        class_="after:absolute after:inset-0 group-hover:shadow-highlight-franklin dark:group-hover:shadow-highlight-blurple",
    )
    verge_body = dict(name="div", class_="duet--article--article-body-component")
    verge_articles = scraper.fetch_articles(
        url="https://www.theverge.com/tech",
        title_selector=verge_titles,
        content_selector=verge_body,
    )

    # Group the results by source and write them out as one JSON document.
    all_articles = dict(techcrunch=techcrunch_articles, theverge=verge_articles)
    scraper.save_to_json(all_articles, "news_articles.json")


Articles saved to news_articles.json


In [14]:
import requests
from bs4 import BeautifulSoup
import json
import time

class NewsScraper:
    """Scrape headline links from a listing page and the text of each linked article.

    Selectors are plain dicts of keyword arguments that are unpacked into
    BeautifulSoup's ``find_all`` / ``find``
    (for example ``{"name": "a", "attrs": {"data-testid": "headline"}}``).
    """

    def __init__(self, timeout=10):
        """
        Args:
            timeout (float): Seconds to wait for each HTTP response.
        """
        self.headers = {
            "User-Agent": "AsmaNewsScraper/1.0 (+mailto:asma@example.com)"
        }
        # requests.get has no default timeout; without one a stalled server
        # would block the notebook cell forever.
        self.timeout = timeout

    @staticmethod
    def _resolve_link(base_url, href):
        """Return ``href`` as an absolute URL resolved against ``base_url``."""
        # Local import keeps this class self-contained within its cell.
        from urllib.parse import urljoin

        return urljoin(base_url, href)

    def fetch_articles(self, url, title_selector, content_selector=None, max_articles=5):
        """
        Fetches articles from the specified website.

        Args:
            url (str): The URL of the listing page to scrape.
            title_selector (dict): Keyword arguments for ``soup.find_all`` that
                match the anchor tags carrying article titles and links.
            content_selector (dict): Keyword arguments for ``soup.find`` that
                match the article content element on each article page.
            max_articles (int): Maximum number of articles to fetch
                (``None`` means no limit).

        Returns:
            list: A list of dicts with 'title', 'link', and 'content' keys.
            An empty list is returned if the listing page cannot be fetched.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching articles from {url}: {e}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        article_list = []

        for anchor in soup.find_all(**title_selector):
            if max_articles is not None and len(article_list) >= max_articles:
                break

            href = anchor.get('href')
            if not href:
                # A matched tag without a link cannot be followed; skip it
                # instead of raising KeyError and losing every article.
                continue

            title = anchor.get_text(strip=True)
            # Resolve relative links against the page being scraped rather
            # than a fixed site, so the same scraper works for every source.
            link = self._resolve_link(url, href)
            content = self.fetch_article_content(link, content_selector)
            article_list.append({'title': title, 'link': link, 'content': content})
            time.sleep(2)  # Rate-limiting to avoid overwhelming the server

        return article_list

    def fetch_article_content(self, url, content_selector):
        """
        Fetches the content of an article.

        Args:
            url (str): The article URL.
            content_selector (dict): Keyword arguments for ``soup.find`` that
                match the content element; may be ``None``.

        Returns:
            str: The text of the first matching element, or a placeholder
            message when nothing matches or the request fails.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching content from {url}: {e}")
            return "Error fetching content."

        soup = BeautifulSoup(response.text, 'html.parser')
        if content_selector:
            content_div = soup.find(**content_selector)
            if content_div:
                return content_div.get_text(strip=True)
        return "Content not found or format not supported."

    def save_to_json(self, articles, filename):
        """
        Saves articles to a JSON file.

        Args:
            articles (list | dict): JSON-serializable articles to save, e.g. a
                list of article dicts or a dict mapping source name to a list.
            filename (str): Name of the JSON file.
        """
        # json.dump escapes non-ASCII by default, so the explicit encoding
        # only makes the file independent of the platform default.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=4)
        print(f"Articles saved to {filename}")

# Example Usage
if __name__ == "__main__":
    scraper = NewsScraper()

    # TechCrunch: headline anchors, then the summary paragraph of each story.
    techcrunch_titles = dict(name="a", class_="loop-card__title-link")
    techcrunch_body = dict(name="p", id="speakable-summary")
    techcrunch_articles = scraper.fetch_articles(
        url="https://techcrunch.com/",
        title_selector=techcrunch_titles,
        content_selector=techcrunch_body,
    )

    # Wired: anchors tagged with a data-testid attribute, then a paywall paragraph.
    wired_titles = dict(name="a", attrs={"data-testid": "ContentHeaderHed"})
    wired_body = dict(name="p", class_="paywall")
    wired_articles = scraper.fetch_articles(
        url="https://www.wired.com/",
        title_selector=wired_titles,
        content_selector=wired_body,
    )

    # Group the results by source and write them out as one JSON document.
    all_articles = dict(techcrunch=techcrunch_articles, wired=wired_articles)
    scraper.save_to_json(all_articles, "news_articles.json")


Articles saved to news_articles.json


In [15]:
import json
import requests
import time
from bs4 import BeautifulSoup

class NewsScraper:
    """Scrape headline links from a listing page and the text of each linked article.

    Selectors are plain dicts of keyword arguments that are unpacked into
    BeautifulSoup's ``find_all`` (for example
    ``{"name": "p", "id": "summary"}`` or ``{"name": "a", "class_": "title"}``).
    """

    def __init__(self, timeout=10):
        """
        Args:
            timeout (float): Seconds to wait for each HTTP response.
        """
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # requests.get has no default timeout; without one a stalled server
        # would block the notebook cell forever.
        self.timeout = timeout

    @staticmethod
    def _resolve_link(base_url, href):
        """Return ``href`` as an absolute URL resolved against ``base_url``."""
        # Local import keeps this class self-contained within its cell.
        from urllib.parse import urljoin

        return urljoin(base_url, href)

    def fetch_articles(self, url, title_selector, content_selector, max_articles=5):
        """
        Fetch up to ``max_articles`` articles linked from a listing page.

        Args:
            url (str): The URL of the listing page to scrape.
            title_selector (dict): Keyword arguments for ``find_all`` that match
                the anchor tags carrying article titles and links.
            content_selector (dict): Keyword arguments for ``find_all`` that
                match the content elements on each article page.
            max_articles (int): Maximum number of articles to fetch
                (``None`` means no limit).

        Returns:
            list: Dicts with "title", "link", and "content" keys. On a listing
            failure the articles collected so far are returned. When an article
            page fails, its "content" holds the error message instead.
        """
        articles_data = []
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Pass the whole selector through so keys such as "id" or "attrs"
            # are honored instead of being silently dropped.
            for anchor in soup.find_all(**title_selector):
                if max_articles is not None and len(articles_data) >= max_articles:
                    break

                href = anchor.get("href")
                if not href:
                    # Skip link-less matches rather than aborting the whole run.
                    continue

                title = anchor.get_text(strip=True)
                # Plain concatenation breaks when the page URL has a path or
                # query string; resolve the href properly instead.
                link = self._resolve_link(url, href)

                # Fetch article content
                try:
                    time.sleep(2)  # Wait for 2 seconds to be ethical
                    article_response = requests.get(
                        link, headers=self.headers, timeout=self.timeout
                    )
                    article_response.raise_for_status()
                    article_soup = BeautifulSoup(article_response.text, "html.parser")
                    content = " ".join(
                        node.get_text(strip=True)
                        for node in article_soup.find_all(**content_selector)
                    )
                except Exception as e:
                    # Best-effort: record the failure and keep going.
                    content = f"Error fetching content: {str(e)}"

                articles_data.append({"title": title, "link": link, "content": content})
        except Exception as e:
            print(f"Error fetching articles from {url}: {str(e)}")

        return articles_data

    def save_to_json(self, data, filename):
        """
        Write ``data`` to ``filename`` as indented JSON.

        Args:
            data (list | dict): JSON-serializable data to save.
            filename (str): Name of the JSON file.

        Failures are printed rather than raised.
        """
        try:
            # json.dump escapes non-ASCII by default, so the explicit encoding
            # only makes the file independent of the platform default.
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=4)
            print(f"Articles saved to {filename}")
        except Exception as e:
            print(f"Error saving to JSON: {str(e)}")

if __name__ == "__main__":
    scraper = NewsScraper()

    # TechCrunch: headline anchors, then the summary paragraph of each story.
    techcrunch_titles = dict(name="a", class_="loop-card__title-link")
    techcrunch_body = dict(name="p", id="speakable-summary")
    techcrunch_articles = scraper.fetch_articles(
        url="https://techcrunch.com/",
        title_selector=techcrunch_titles,
        content_selector=techcrunch_body,
    )

    # Google News (replacing Wired)
    google_news_titles = dict(name="a", class_="DY5T1d RZIKme")
    google_news_body = dict(name="div", class_="xrnccd")
    google_news_articles = scraper.fetch_articles(
        url="https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZ4ZERBU0FtVnVLQUFQAQ?oc=3",
        title_selector=google_news_titles,
        content_selector=google_news_body,
    )

    # Group the results by source and write them out as one JSON document.
    all_articles = dict(techcrunch=techcrunch_articles, google_news=google_news_articles)
    scraper.save_to_json(all_articles, "news_articles.json")


Error fetching articles from https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZ4ZERBU0FtVnVLQUFQAQ?oc=3: 400 Client Error: Bad Request for url: https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFZ4ZERBU0FtVnVLQUFQAQ?oc=3&hl=en-CA&gl=CA&ceid=CA:en
Articles saved to news_articles.json
