Obtain NPR URLs

In [34]:
#Import necessary libraries
import calendar
import json
import re
import time
import urllib

import lxml.html
import newspaper
import requests
import trafilatura
from bs4 import BeautifulSoup

In [64]:
#NPR

def obtain_page_urls(start="0",date="12-31-2023"):
    """
    Obtain the article URLs listed on one page of the NPR politics archive.

    The archive page is fetched over HTTP and the links are read from the
    first anchor inside each ``h2.title`` heading of the returned HTML.

    Inputs:
    start(str/int): Article offset to start the search from (similar to a page)
    date(str): Date (month-day-year, e.g. "12-31-2023") to start looking for
        articles from, sorted from newest to oldest
    Return:
    None when the page lists no article links; otherwise a tuple of
    url_set(set): Set of article URLs found on the page
    month(int): Month read from the last listed URL that contains a
        YYYY/MM date, or None when no listed URL contains one
    """
    url = "https://www.npr.org/sections/politics/archive?start={}&date={}".format(start,date)
    # A timeout keeps a stalled connection from blocking the crawl forever.
    resp = requests.get(url, timeout=30)
    root = lxml.html.fromstring(resp.text)

    links_list = []
    for element in root.cssselect("h2.title"):
        # Only the first anchor of a heading is used; headings without an
        # anchor (or without an href) are skipped instead of raising.
        for anchor in element.cssselect("a")[:1]:
            href = anchor.get("href")
            if href:
                links_list.append(href)
    if not links_list:
        return None

    # Walk backwards so the month comes from the last link that carries a
    # date; a trailing undated link no longer raises AttributeError.
    month = None
    for href in reversed(links_list):
        found = re.search(r"(?<=\d{4}/)\d{2}", href)
        if found:
            month = int(found.group())
            break

    return set(links_list), month

# Fetch one archive page using the default start ("0") and date ("12-31-2023").
obtain_page_urls()

https://www.npr.org/sections/politics/archive?start=0&date=12-31-2023


({'https://www.npr.org/2023/06/26/1184305185/is-home-still-home-after-30-years-away',
  'https://www.npr.org/2023/07/11/1185499996/congress-tackles-food-stamp-changes-in-the-farm-bill',
  'https://www.npr.org/2023/07/11/1186847204/xylazine-fentanyl-biden-plan',
  'https://www.npr.org/2023/07/11/1186972147/pga-liv-golf-merger',
  'https://www.npr.org/2023/07/11/1186986556/marine-corps-tuberville-nominations-abortion',
  'https://www.npr.org/2023/07/11/1186991717/doug-burgum-gift-cards-gop-republican-primary-debate',
  'https://www.npr.org/2023/07/11/1187077331/whats-happening-at-the-nato-summit',
  'https://www.npr.org/2023/07/11/1187077373/israels-protests-start-again-after-prime-minister-tries-to-weaken-judiciary',
  'https://www.npr.org/2023/07/11/1187077387/senate-hearing-aimed-to-shed-light-on-the-planned-pga-tour-liv-golf-deal',
  'https://www.npr.org/2023/07/11/1187077453/swedens-deal-with-turkey-to-enter-nato-stirs-concern-in-kurdish-community',
  'https://www.npr.org/2023/07/11

In [65]:
# Number of days in each month, keyed by month number (1-12).
# February is fixed at 28 days; no leap-year adjustment is made here.
last_day_month = dict(
    zip(range(1, 13), (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31))
)

In [85]:
def obtain_monthly_urls(start=0,month=12,year=2023):
    """
    Obtain the urls from the NPR politics section for a given month.

    The archive is queried starting at the last day of the month, one page
    (15 articles) at a time. Paging stops once a page ends in a different
    month or a page lists no articles at all.

    Inputs:
    - start(int): Article to start search from (Similar to page)
    - month(int): Month to obtain articles from
    - year(int): Year to obtain articles from

    Return:
    month_urls (set): Set of articles of the politics section for the month
    specified. URLs whose /YYYY/MM/ path segment names a different month are
    dropped; URLs without such a segment are kept.
    """
    # calendar.monthrange accounts for leap years, so Feb 29 is not skipped.
    last_day = calendar.monthrange(int(year), month)[1]
    date = "{}-{}-{}".format(month,last_day,year)
    wanted = (int(year), month)

    month_urls = set()
    page = 1
    current_month = month
    while current_month == month:
        # Log before fetching so every message corresponds to a real request.
        print("Obtaining links for ",month,"-",year, ",page:",page)
        page_result = obtain_page_urls(start,date)
        if page_result is None:
            # obtain_page_urls returns None when the page lists no articles.
            break
        page_urls, current_month = page_result
        for page_url in page_urls:
            # A boundary page mixes in articles from the neighbouring month;
            # only keep URLs dated inside the requested month.
            dated = re.search(r"/(\d{4})/(\d{2})/", page_url)
            if dated is None or (int(dated.group(1)), int(dated.group(2))) == wanted:
                month_urls.add(page_url)
        start += 15
        page += 1
        # Small pause between requests to avoid hammering the server.
        time.sleep(.5)

    return month_urls


In [86]:
# Collect the politics URLs for June 2023, starting from article offset 0.
obtain_monthly_urls(0,6,2023)

Obtaining links for  6 - 2023 ,page: 1
https://www.npr.org/sections/politics/archive?start=0&date=6-30-2023
Obtaining links for  6 - 2023 ,page: 2
https://www.npr.org/sections/politics/archive?start=15&date=6-30-2023
Obtaining links for  6 - 2023 ,page: 3
https://www.npr.org/sections/politics/archive?start=30&date=6-30-2023
Obtaining links for  6 - 2023 ,page: 4
https://www.npr.org/sections/politics/archive?start=45&date=6-30-2023


KeyboardInterrupt: 

In [87]:
def crawl_npr(min_year, max_year=2023):
    """
    Crawl the NPR politics section month by month.

    Every month (1-12) of every year from min_year through max_year is
    requested via obtain_monthly_urls, starting at article offset 0, and the
    results are merged into a single set.

    Inputs:
    - min_year(int): Oldest year to get results from
    - max_year(int): Newest year to get results from (inclusive); defaults
      to 2023, the bound that was previously hard-coded
    Return:
    - npr_urls(set): Set of all the NPR politics section urls collected for
      the requested years (empty when min_year > max_year)
    """
    npr_urls = set()
    for year in range(min_year, max_year + 1):
        for month in range(1,13):
            npr_urls.update(obtain_monthly_urls(0,month,year))

    return npr_urls

In [88]:
# Crawl the politics archive with 2022 as the oldest year.
crawl_npr(2022)

Obtaining links for  1 - 2022 ,page: 1
https://www.npr.org/sections/politics/archive?start=0&date=1-31-2022
Obtaining links for  1 - 2022 ,page: 2
https://www.npr.org/sections/politics/archive?start=15&date=1-31-2022
Obtaining links for  1 - 2022 ,page: 3
https://www.npr.org/sections/politics/archive?start=30&date=1-31-2022


KeyboardInterrupt: 