In [1]:
# Importing the required libraries

import sys
import pprint
import logging
from datetime import datetime
from typing import List, Dict, Union

import requests
from bs4 import BeautifulSoup

In [2]:
# Return type of articles
Articles = Dict[str, Union[str, datetime]]

In [3]:
class GoogleNewsScraper:
    """
    GoogleNewsScraper scrapes articles from google news rss feeds.
    """

    # Constants
    DATE_TIME_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"
    BASE_URL = "https://news.google.com/rss/search"

    def __init__(self, query: str):
        """
        Constuctor method initializes GoogleNewsScraper object
        to scrape google news rss feeds for the given query.

        Args:
            query (str): Query to scrape.
        """

        self._query = query
        self.url = f"{self.BASE_URL}?q={query}"

        self.setup_logger()

        self.pretty_printer = pprint.PrettyPrinter()

    @property
    def query(self):
        """
        Getter method for _query attribute.
        """

        return self._query

    @query.setter
    def query(self, query_string: str):
        """
        Setter method for _query attribute.

        Args:
            query_string
        """

        query_string_list = query_string.split(" ")
        query_string_list = list(map(lambda x: x.lower(), query_string_list))

        query_string = "+".join(query_string_list)

        self._query = query_string

    def setup_logger(self):
        """
        Method sets up the logger.
        """

        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)

        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)

        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        handler.setFormatter(formatter)

        self.logger.addHandler(handler)

    def parse_string_to_datetime(self, date_time_str: str) -> datetime:
        """
        Method parses string to python datetime object.

        Args:
            date_time_str (str): Datetime string.

        Returns:
            date_time_obj (datetime): Parsed python datetime object.
        """

        date_time_obj = datetime.strptime(date_time_str, self.DATE_TIME_FORMAT)
        return date_time_obj

    def scrape_articles(self) -> List[Articles]:
        """
        Method scrapes google news rss feed articles.

        Returns:
            articles (List[Articles]): List of scraped articles of type Article.
        """

        self.logger.info(f"Started scraping {self.url}...")

        xml_content = requests.get(self.url).content
        soup = BeautifulSoup(xml_content, features="xml")
        items = soup.find_all("item")

        self.logger.info(f"Scraped {len(items)} articles.")

        articles: List[Articles] = []

        for item in items:
            article = {}

            # Articles Info
            article["link"] = item.find("link").text
            article["title"] = item.find("title").text
            article["description"] = item.find("description").text

            # Publisher info
            article["publisher"] = item.find("source").text
            article["published_date"] = self.parse_string_to_datetime(
                item.find("pubDate").text
            )

            articles.append(article)

        return articles

    def print_articles(self, articles: List[Articles]):
        """
        Method pretty prints scraped articles.

        Args:
            articles (List[Articles]): Scraped Articles.
        """

        self.pretty_printer.pprint(articles)

In [4]:
query = "Carbon Net Zero"

In [5]:
google_scaper = GoogleNewsScraper(query)

In [6]:
articles = google_scaper.scrape_articles()

2022-06-06 12:21:47,468 - root - INFO - Started scraping https://news.google.com/rss/search?q=Carbon Net Zero...
2022-06-06 12:21:47,471 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): news.google.com:443
2022-06-06 12:21:47,738 - urllib3.connectionpool - DEBUG - https://news.google.com:443 "GET /rss/search?q=Carbon%20Net%20Zero HTTP/1.1" 302 0
2022-06-06 12:21:48,339 - urllib3.connectionpool - DEBUG - https://news.google.com:443 "GET /rss/search?q=Carbon+Net+Zero&hl=en-IN&gl=IN&ceid=IN:en HTTP/1.1" 200 None
2022-06-06 12:21:48,373 - root - INFO - Scraped 99 articles.


In [7]:
google_scaper.print_articles(articles)

[{'description': '<a '
                 'href="https://www.thehindubusinessline.com/specials/clean-tech/carbon-credits-could-boom-in-the-age-of-net-zero/article65468468.ece" '
                 'target="_blank">Carbon credits could boom in the age of net '
                 'zero</a>&nbsp;&nbsp;<font '
                 'color="#6f6f6f">BusinessLine</font>',
  'link': 'https://www.thehindubusinessline.com/specials/clean-tech/carbon-credits-could-boom-in-the-age-of-net-zero/article65468468.ece',
  'published_date': datetime.datetime(2022, 5, 29, 15, 47, 9),
  'publisher': 'BusinessLine',
  'title': 'Carbon credits could boom in the age of net zero - BusinessLine'},
 {'description': '<a '
                 'href="https://www.weforum.org/agenda/2022/05/carbon-credits-could-help-india-reach-net-zero-2070/" '
                 'target="_blank">Carbon credits could help India reach '
                 'net-zero by 2070</a>&nbsp;&nbsp;<font color="#6f6f6f">World '
                 'Economic Forum</