Obtain NPR URLs

In [1]:
#Import necessary libraries
import calendar
import json
import re
import time
import urllib

import lxml.html
import newspaper
import requests
import trafilatura
from bs4 import BeautifulSoup

In [6]:
#NPR

def obtain_page_urls(start="0",date="12-31-2023",timeout=30):
    """
    Obtain the article URLs listed on one page of the NPR politics archive.

    The archive HTML page is downloaded and the first link inside every
    ``h2.title`` element is collected.

    Inputs:
    start(str/int): Article to start search from (Similar to page)
    date(str): Date to start looking articles from, sorted from newest to
        oldest (sent as the ``date`` query parameter, e.g. "12-31-2023")
    timeout(int/float): Seconds to wait for the server before giving up
    Return:
    None when the page lists no article links; otherwise the tuple
    url_set(set): Set of article URLs found on the page
    month(int/None): Month read from the last link on the page that carries
        a ``YYYY/MM`` segment, or None when no link carries one
    Raises:
    requests.HTTPError: If the server answers with an error status

    """
    # Without a timeout a stalled connection would block the crawl forever.
    resp = requests.get(
        "https://www.npr.org/sections/politics/archive",
        params={"start": start, "date": date},
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of mistaking an error page for an
    # empty archive page.
    resp.raise_for_status()
    root = lxml.html.fromstring(resp.text)

    links_list = []
    for element in root.cssselect("h2.title"):
        anchors = element.cssselect("a")
        # Titles without an anchor (or with an anchor lacking href) are skipped.
        href = anchors[0].get("href") if anchors else None
        if href:
            links_list.append(href)
    if not links_list:
        return None

    # Walk backwards so the month comes from the oldest dated link on the page.
    month = None
    for href in reversed(links_list):
        found = re.search(r"(?<=\d{4}/)\d{2}", href)
        if found:
            month = int(found.group())
            break

    return set(links_list), month

# Quick check: fetch one archive page using the default start and date.
obtain_page_urls()

({'https://www.npr.org/2023/07/19/1186746963/alabama-redistricting-map-gerrymandering-purcell-principle',
  'https://www.npr.org/2023/07/19/1188353064/ftc-doj-merger-competition-guidelines',
  'https://www.npr.org/2023/07/19/1188405402/qanon-supporters-are-promoting-sound-of-freedom-heres-why',
  'https://www.npr.org/2023/07/19/1188438846/illegal-border-crossings-are-down-one-big-reason-why-is-now-part-of-a-court-figh',
  'https://www.npr.org/2023/07/19/1188441351/israel-jayapal-herzog-house-progressives',
  'https://www.npr.org/2023/07/19/1188528155/young-conservatives-trump-desantis-voting-2024',
  'https://www.npr.org/2023/07/19/1188543449/what-does-the-word-woke-really-mean-and-where-does-it-come-from',
  'https://www.npr.org/2023/07/19/1188543463/attorneys-for-trump-attend-first-pretrial-hearing-in-classified-documents-case',
  'https://www.npr.org/2023/07/19/1188543484/alabama-supports-sen-tuberville-for-halting-military-promotions-over-abortion-is',
  'https://www.npr.org/2023/0

In [7]:
# Last calendar day of each month, keyed by month number (1-12).
# February is fixed at 28, so leap years are not accounted for here.
last_day_month = dict(
    zip(range(1, 13), (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31))
)

In [8]:
def obtain_monthly_urls(start=0,month=12,year=2023):
    """
    Obtain the urls from the NPR politics section for a given month.

    Archive pages are requested from the last day of the month backwards.
    Paging stops when a page ends in a different month, when the archive
    has no more articles, or when a page brings no URL that was not
    already seen.

    Inputs:
    - start(int): Article to start search from (Similar to page)
    - month(int): Month to obtain articles from (1-12)
    - year(int): Year to obtain articles from

    Return:
    month_urls (set): Set of articles of the politics section the month specified
    """
    # monthrange knows about leap years, so February 29 is not skipped.
    last_day = calendar.monthrange(year, month)[1]
    date = "{}-{}-{}".format(month, last_day, year)

    month_urls = set()
    seen_urls = set()
    page = 0
    current_month = month
    while current_month == month:
        page += 1
        print("Obtaining links for ",month,"-",year, ",page:",page)
        result = obtain_page_urls(start, date)
        # obtain_page_urls returns None once the archive has nothing left.
        if not result:
            break
        page_urls, current_month = result
        # A page with nothing new means no progress; stop instead of looping forever.
        if page_urls <= seen_urls:
            break
        seen_urls.update(page_urls)
        for url in page_urls:
            if not url:
                continue
            # The page that crosses the month boundary also lists older
            # articles; drop links whose embedded date is another month.
            # Links without a date segment are kept.
            stamp = re.search(r"/(\d{4})/(\d{2})/", url)
            if stamp is None or (int(stamp.group(1)), int(stamp.group(2))) == (year, month):
                month_urls.add(url)
        # Assumes each archive page lists 15 articles - TODO confirm.
        start += 15
        # Short pause between requests to avoid hammering the server.
        time.sleep(.5)

    return month_urls


In [10]:
# Collect the links for January 2022, starting from the first article.
obtain_monthly_urls(0,1,2022)

Obtaining links for  1 - 2022 ,page: 1
Obtaining links for  1 - 2022 ,page: 2
Obtaining links for  1 - 2022 ,page: 3
Obtaining links for  1 - 2022 ,page: 4
Obtaining links for  1 - 2022 ,page: 5
Obtaining links for  1 - 2022 ,page: 6


KeyboardInterrupt: 

In [None]:
def crawl_npr(min_year, max_year=2023):
    """
    Crawl the NPR politics section month by month.

    Inputs:
    - min_year(int): Oldest year to get results from
    - max_year(int): Newest year to get results from (inclusive)
    Return:
    - npr_urls(set): Set of all the NPR politics section urls collected for
      every month from January of min_year through December of max_year;
      empty when min_year is greater than max_year
    """
    npr_urls = set()
    for year in range(min_year, max_year + 1):
        for month in range(1, 13):
            # Always start each month from its first article (start=0).
            npr_urls.update(obtain_monthly_urls(0, month, year))

    return npr_urls

In [None]:
# Crawl every month from January 2022 through December 2023.
crawl_npr(2022)