In [71]:
import feedparser
import csv
import os
import datetime


In [72]:
# List of RSS feed URLs from major news sites.
# Only the NPR general-news feed is active; every other entry is commented out
# and can be re-enabled by removing its leading "#".
# NOTE(review): the fetch loop visible in this notebook iterates CNN_FEEDS, not
# this list — confirm whether RSS_FEEDS is still needed.
RSS_FEEDS = [
    #"https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
    "https://feeds.npr.org/1001/rss.xml",
    #"https://www.theguardian.com/world/rss",
    #"https://www.reutersagency.com/feed/?best-topics=top-news&post_type=best",
    #"https://rss.cnn.com/rss/edition.rss",
    #"https://feeds.a.dj.com/rss/RSSWorldNews.xml",
    #"https://www.politico.com/rss/politics08.xml",
    #"https://feeds.foxnews.com/foxnews/latest",
    #"https://www.huffpost.com/section/front-page/feed"
]

# NPR topic feeds; the URLs differ only in the numeric id embedded in the path.
NPR_FEEDS = [
    f"https://feeds.npr.org/{topic_id}/rss.xml"
    for topic_id in (
        1001,  # General News
        1003,  # National
        1004,  # World
        1012,  # Politics
        1007,  # Science
        1017,  # Economy
        1025,  # Technology
        1039,  # Health
    )
]

# New York Times section feeds; each URL is the shared prefix plus "<Section>.xml".
NYT_FEEDS = [
    f"https://rss.nytimes.com/services/xml/rss/nyt/{section}.xml"
    for section in (
        "HomePage",
        "World",
        "Politics",
        "Business",
        "Technology",
        "Science",
        "Health",
        "Arts",
        "Sports",
        "Opinion",
    )
]
# Washington Post section feeds; each URL is the shared prefix plus a section slug.
WAPO_FEEDS = [
    f"https://feeds.washingtonpost.com/rss/{section}"
    for section in (
        "world",          # World News
        "politics",       # Politics
        "business",       # Business
        "technology",     # Technology
        "national",       # National News
        "local",          # Local News (DC/MD/VA)
        "sports",         # Sports
        "opinions",       # Opinions
        "entertainment",  # Arts & Entertainment
        "lifestyle",      # Lifestyle
        "health",         # Health
        "environment",    # Environment
        "education",      # Education
        "science",        # Science
    )
]
# Bloomberg section feeds; each URL is "<prefix>/<section path>/news.rss".
BLOOMBERG_FEEDS = [
    f"https://feeds.bloomberg.com/{section}/news.rss"
    for section in (
        "markets",          # Markets
        "business",         # Business
        "politics",         # Politics
        "economics",        # Economics
        "technology",       # Technology
        "wealth",           # Wealth
        "green",            # Climate & Environment
        "pursuits/travel",  # Travel & Lifestyle
    )
]
# CNN feeds; each URL is the shared prefix plus "<feed name>.rss".
CNN_FEEDS = [
    f"http://rss.cnn.com/rss/{feed_name}.rss"
    for feed_name in (
        "edition",                # Top Stories
        "edition_world",          # World News
        "edition_us",             # U.S. News
        "edition_politics",       # Politics
        "edition_business",       # Business
        "edition_technology",     # Technology
        "edition_health",         # Health
        "edition_entertainment",  # Entertainment
        "edition_sport",          # Sports
        "cnn_latest",             # Latest News
    )
]



In [73]:
# Build the dated output location: one CSV per calendar day, named after
# today's ISO date (YYYY-MM-DD), inside the raw RSS headlines folder.
output_dir = "../data/raw/rss_headlines"
today = str(datetime.date.today())  # str() of a date is its ISO-8601 form
output_path = os.path.join(output_dir, "headlines_rss_" + today + ".csv")

# Make sure the target folder exists; an already-existing folder is fine.
os.makedirs(output_dir, exist_ok=True)


In [74]:
# Initialize a set to store unique headlines.
# Using a set means a title that appears in several feeds is kept only once.
# Re-running this cell discards everything collected so far.
headlines = set()


In [75]:
# Loop through each RSS feed and extract headlines.
#
# Each entry title is stripped of surrounding whitespace and kept only when it
# has at least MIN_HEADLINE_WORDS whitespace-separated words; the `headlines`
# set de-duplicates titles across feeds. Per-feed and running totals are
# printed so an unexpectedly empty feed is easy to spot.
MIN_HEADLINE_WORDS = 5  # shorter titles are skipped

for feed_url in CNN_FEEDS:
    print(f"\nFetching: {feed_url}")
    try:
        feed = feedparser.parse(feed_url)

        # feedparser normally reports download/parse problems through the
        # `bozo` flag instead of raising, so without this check a broken feed
        # would just show "Added 0 new headlines". Entries that were still
        # recovered are processed below, so this is a warning, not a skip.
        if feed.get("bozo"):
            print(f"Warning: problem reading {feed_url}: {feed.get('bozo_exception')}")

        initial_count = len(headlines)
        for entry in feed.entries:
            # `or ""` guards against an entry whose title is present but None.
            title = (entry.get("title") or "").strip()
            if title and len(title.split()) >= MIN_HEADLINE_WORDS:
                headlines.add(title)
        new_count = len(headlines)
        print(f"Added {new_count - initial_count} new headlines from this feed.")
        print(f"Total headlines so far: {new_count}")
    except Exception as e:
        # Best-effort: one failing feed must not stop the remaining feeds.
        print(f"Error parsing {feed_url}: {e}")


Fetching: http://rss.cnn.com/rss/edition.rss
Added 50 new headlines from this feed.
Total headlines so far: 50

Fetching: http://rss.cnn.com/rss/edition_world.rss
Added 29 new headlines from this feed.
Total headlines so far: 79

Fetching: http://rss.cnn.com/rss/edition_us.rss
Added 29 new headlines from this feed.
Total headlines so far: 108

Fetching: http://rss.cnn.com/rss/edition_politics.rss
Added 0 new headlines from this feed.
Total headlines so far: 108

Fetching: http://rss.cnn.com/rss/edition_business.rss
Added 20 new headlines from this feed.
Total headlines so far: 128

Fetching: http://rss.cnn.com/rss/edition_technology.rss
Added 11 new headlines from this feed.
Total headlines so far: 139

Fetching: http://rss.cnn.com/rss/edition_health.rss
Added 0 new headlines from this feed.
Total headlines so far: 139

Fetching: http://rss.cnn.com/rss/edition_entertainment.rss
Added 30 new headlines from this feed.
Total headlines so far: 169

Fetching: http://rss.cnn.com/rss/edition

In [76]:
# Dump the de-duplicated headlines, in alphabetical order, to a single-column
# pipe-delimited CSV whose first row is the "headline" header.
ordered_rows = [[text] for text in sorted(headlines)]
with open(output_path, mode="w", newline="", encoding="utf-8") as csv_file:
    out = csv.writer(csv_file, delimiter="|")
    out.writerow(["headline"])
    out.writerows(ordered_rows)

print(f"\nExported {len(headlines)} headlines to: {output_path}")



Exported 218 headlines to: ../data/raw/rss_headlines\headlines_rss_2025-09-22.csv


In [77]:
# Show the ten alphabetically-first headlines as a quick sanity check
print("\nSample Headlines:")
for rank, text in enumerate(sorted(headlines)[:10], start=1):
    print(f"{rank:02d}. {text}")



Sample Headlines:
01. $200,000 worth of supplies donated to PB schools
02. 'A definitive backslide.' Inside fashion's worrying runway trend
03. 'Glass Onion' sharpens the 'Knives Out' formula in a polished Netflix sequel
04. 'Harry & Meghan' series gets release date and new trailer
05. 'I know it happens': Boeing chief admits the company has retaliated against whistleblowers
06. 'I was up to my waist down a hippo's throat.' He survived, and here's his advice
07. 'Lego Batman' producer, then Treasury sec?
08. 'Love Actually' cast to reunite for 20th anniversary TV special
09. 'Love Actually' director feels 'a bit stupid' about movie's lack of diversity
10. 'Scary, cold, hungry and lonely': Volunteer soldier shares experience on front line
