In [1]:
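# Optional setup: openpyxl is an engine pandas can use to write the .xlsx export at the end.
# Uncomment to install it into the kernel's environment:
# %pip install openpyxl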

In [2]:
import feedparser
import pandas as pd
import numpy as np

from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup, NavigableString

In [3]:
def parse_rss_feeds(feed_links, scrape_article=False):
    """Collect the entries of several RSS feeds into a list of row dicts.

    Parameters
    ----------
    feed_links : iterable of str
        RSS feed URLs to download and parse with ``feedparser``.
    scrape_article : bool, default False
        When True, entries of feeds whose source label is 'DOJ' also get
        their full article text fetched through ``get_rss_article``.

    Returns
    -------
    list of dict
        One dict per feed entry with the keys 'feed_link', 'feed_source',
        'title', 'date_published', 'summary', 'link' and 'article_content'.
        'date_published' is a timezone-naive ``pd.Timestamp`` at midnight, or
        ``pd.NaT`` when the entry has no parseable publication date.
        'article_content' is ``np.nan`` when the article was not scraped.
    """
    feed_data = []

    # Overlapping feeds can list the same article; remember the scraped text
    # per URL so each article page is fetched at most once per call.
    scraped_articles = {}

    for link in feed_links:

        # Get hostname of provided rss link
        link_hostname = urlparse(link).hostname
        # Get source name of link. A host missing from links_dict used to raise
        # KeyError and abort the whole run; fall back to the hostname instead.
        link_source = links_dict.get(link_hostname, link_hostname or link)

        # Parse the rss feed link content
        feed = feedparser.parse(link)

        # Iterate feed entries
        for entry in feed.entries:

            # Reduce the publication time to a yyyy-mm-dd date. A missing or
            # unparseable date becomes NaT rather than crashing on NaT.date().
            date_time = pd.to_datetime(entry.get('published', ''), errors='coerce')
            if pd.isna(date_time):
                date_time = pd.NaT
            else:
                # .date() drops both the time of day and any timezone
                date_time = pd.Timestamp(date_time.date())

            # Article source url
            url = entry.get('link', '')

            # Initialize article content variable
            article_content = np.nan

            # Scrape only when requested, only for DOJ (Department of Justice)
            # feeds, and only when the entry actually carries a link
            if scrape_article and link_source == 'DOJ' and url:
                if url not in scraped_articles:
                    # Function call to parse article content
                    scraped_articles[url] = get_rss_article(url, 'DOJ')
                article_content = scraped_articles[url]

            # Append feed data objects
            feed_data.append({
                'feed_link': link,
                'feed_source': link_source,
                'title': entry.get('title', ''),
                'date_published': date_time,
                'summary': entry.get('summary', ''),
                'link': url,
                'article_content': article_content
            })

    # Return feed data objects
    return feed_data


In [4]:
def get_rss_article(url, rss_source, timeout=30):
    """Download an article page and return its body text as one string.

    Parameters
    ----------
    url : str
        Address of the article page.
    rss_source : str
        Source label of the feed the article came from. Only 'DOJ' pages
        are understood; any other value returns ``np.nan`` without a request.
    timeout : float, default 30
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    str or float
        The stripped text of every ``<p>`` inside the element with class
        'field_body', each followed by a single space. ``np.nan`` is returned
        when the source is unsupported, the request fails, or the page has no
        'field_body' element.
    """
    import warnings

    # Check RSS feed source: previously a non-DOJ source fetched the page and
    # then failed on an unbound variable; bail out before any network I/O.
    if rss_source != 'DOJ':
        return np.nan

    try:
        # A timeout keeps one unresponsive server from hanging a long batch run
        page = requests.get(url, timeout=timeout)
        # Do not scrape HTTP error pages as if they were articles
        page.raise_for_status()
    except requests.RequestException as exc:
        warnings.warn(f"Could not fetch article {url!r}: {exc}", stacklevel=2)
        return np.nan

    soup = BeautifulSoup(page.content, 'html.parser')

    # Scrape the article content via bs4
    content_body = soup.find(class_='field_body')
    if content_body is None:
        # Page layout differs from the expected one; nothing to extract
        return np.nan

    # Variable for an observed ending text string
    text_strip = 'More information can be found at'

    # Initialize content string
    article_content = ""

    # Iterate content paragraph tags (i.e <p></p>)
    for p in content_body.find_all('p'):
        # get_text() already includes the text of nested tags such as
        # <a>some link</a>, so the anchors need no unwrapping first
        p_text = p.get_text()

        # Truncate the paragraph at the ending text string when present;
        # partition returns the whole paragraph when it is absent
        p_text = p_text.partition(text_strip)[0]

        # Strip whitespace and append to content string
        article_content += p_text.strip() + " "

    # Return article content string
    return article_content

In [5]:
# Lookup table from an RSS feed's hostname to the short source label
# that is stored alongside each parsed entry.
links_dict = dict([
    ('www.justice.gov', 'DOJ'),
    ('www.sec.gov', 'SEC'),
    ('www.cftc.gov', 'CFTC'),
    ('www.ftc.gov', 'FTC'),
    ('feeds.finra.org', 'FINRA'),
    ('www.law360.com', 'Law360'),
    ('www.whitecollarbriefly.com', 'White Collar Briefly'),
    ('wp.nyu.edu', 'NYU'),
])

In [6]:
# RSS feeds to collect, in processing order.
rss_feed_links = [
    'https://www.justice.gov/news/rss?m=1',
    'https://www.sec.gov/news/pressreleases.rss',
    'https://www.cftc.gov/RSS/RSSENF/rssenf.xml',
    'https://www.ftc.gov/feeds/press-release.xml',
    'http://feeds.finra.org/FINRANotices',
    'https://www.cftc.gov/RSS/RSSGP/rssgp.xml',
    # justice.gov press-release feeds that differ only in their
    # (groupname, field_component) id pair
    *(
        'https://www.justice.gov/news/rss?type=press_release'
        '&groupname={}&field_component={}'
        '&search_api_language=en&require_all=1'.format(group_id, component_id)
        for group_id, component_id in (
            (441, 1981),
            (236, 1751),
            (431, 1971),
            (291, 1821),
            (201, 1721),
        )
    ),
    'https://www.law360.com/whitecollar/rss',
    'https://www.whitecollarbriefly.com/feed/',
    'https://wp.nyu.edu/compliance_enforcement/feed/',
    # justice.gov search restricted to a fixed date window (05/01/2024 - 05/08/2024)
    'https://www.justice.gov/news/rss?end_date=05/08/2024&search_api_fulltext=&sort_by=field_date&start_date=05/01/2024&type=press_release&groupname=291&field_component=1821&search_api_language=en&require_all=0',
    # Add more feed links as needed
]


In [7]:
%%time

# Parse every configured RSS feed. scrape_article=True additionally downloads
# the full article page for DOJ entries, so this cell is network-bound
# (the recorded wall time is far larger than the CPU time).
feed_data = parse_rss_feeds(rss_feed_links, scrape_article=True)

# Convert the list of per-entry dicts into a DataFrame (one row per feed entry)
df = pd.DataFrame(feed_data)

CPU times: user 12.1 s, sys: 272 ms, total: 12.4 s
Wall time: 3min 31s


In [8]:
# Preview the first five rows of the collected entries (head() defaults to n=5)
df.head()

Unnamed: 0,feed_link,feed_source,title,date_published,summary,link,article_content
0,https://www.justice.gov/news/rss?m=1,DOJ,Kabbage Inc. Agrees to Resolve Allegations Tha...,2024-05-13,The Justice Department announced today that no...,https://www.justice.gov/opa/pr/kabbage-inc-agr...,The Justice Department announced today that no...
1,https://www.justice.gov/news/rss?m=1,DOJ,Attorney General Merrick B. Garland Delivers R...,2024-05-13,Good afternoon. Thank you all for being here –...,https://www.justice.gov/opa/speech/attorney-ge...,Remarks as Delivered Good afternoon. Thank you...
2,https://www.justice.gov/news/rss?m=1,DOJ,Deputy Attorney General Lisa Monaco Delivers R...,2024-05-13,"Good afternoon, everyone. The Department of Ju...",https://www.justice.gov/opa/speech/deputy-atto...,"Remarks as Delivered Good afternoon, everyone...."
3,https://www.justice.gov/news/rss?m=1,DOJ,Ohio Man Sentenced for Disabling Emissions Con...,2024-05-13,"A Hamilton County, Ohio, man was sentenced to ...",https://www.justice.gov/opa/pr/ohio-man-senten...,"A Hamilton County, Ohio, man was sentenced to ..."
4,https://www.justice.gov/news/rss?m=1,DOJ,Leveraging Federal Funding across Government t...,2024-05-13,The White House Legal Aid Interagency Roundtab...,https://www.justice.gov/atj/blog/leveraging-fe...,The White House Legal Aid Interagency Roundtab...


In [9]:
# Write the collected feed entries to an Excel workbook at a path relative to
# the working directory; index=False leaves the DataFrame's row index out.
excel_file = 'rss_feeds.xlsx'
df.to_excel(excel_file, index=False)