Obtain NPR URLs

In [2]:
#Import necessary libraries
import calendar
import json
import re
import time
import urllib

import lxml.html
import newspaper
import requests
import trafilatura
from bs4 import BeautifulSoup

In [3]:
#NPR

def obtain_page_urls(start="0",date="12-31-2023"):
    """
    Obtain the article URLs listed on one page of the NPR politics archive.

    Inputs:
    start(str/int): Offset of the first article to list (similar to a page cursor)
    date(str): Date to start looking for articles from, formatted like the
        default "12-31-2023"; results are sorted from newest to oldest
    Return:
    None when the page lists no articles. Otherwise a tuple (url_set, month):
    url_set(set): Set of article URLs found on the page
    month(int): Month parsed from the last dated article URL on the page
    Raises:
    ValueError: if articles were found but none of their URLs contains a
        "/YYYY/MM" date segment, so the month cannot be determined
    """
    url = "https://www.npr.org/sections/politics/archive?start={}&date={}".format(start,date)
    # requests has no default timeout; without one a stalled connection
    # blocks the crawl indefinitely.
    resp = requests.get(url, timeout=30)
    root = lxml.html.fromstring(resp.text)
    article_elements = root.cssselect("h2.title")
    if len(article_elements) == 0:
        return None

    links_list = []
    for element in article_elements:
        anchors = element.cssselect("a")
        # Skip titles without a usable link instead of crashing on anchors[0].
        if not anchors:
            continue
        href = anchors[0].get("href")
        if href:
            links_list.append(href)
    if not links_list:
        return None

    # Walk backwards so the month reflects the oldest dated article on the
    # page, even when the very last link carries no date in its path.
    month = None
    for link in reversed(links_list):
        found = re.search(r"(?<=\d{4}/)\d{2}", link)
        if found is not None:
            month = int(found.group())
            break
    if month is None:
        raise ValueError("No dated article URL found on page: {}".format(url))

    return set(links_list), month

# Smoke test: fetch one archive page using the default start and date.
obtain_page_urls()

({'https://www.npr.org/2023/07/21/1189494854/alabama-redistricting-map-black-districts',
  'https://www.npr.org/2023/07/22/1189093540/detroit-bankruptcy-comeback-hurdle',
  'https://www.npr.org/2023/07/22/1189362839/no-labels-americans-elect-third-party',
  'https://www.npr.org/2023/07/22/1189580609/week-in-politics-congress-clash-over-military-promotions-trumps-legal-troubles',
  'https://www.npr.org/2023/07/22/1189580616/gov-spencer-cox-of-utah-wants-americans-to-learn-how-to-disagree',
  'https://www.npr.org/2023/07/22/1189580679/presidential-candidates-are-vying-for-an-endorsement-from-the-united-auto-worker',
  'https://www.npr.org/2023/07/22/1189625686/as-trumps-legal-woes-pile-up-iowa-supporters-are-unfazed',
  'https://www.npr.org/2023/07/23/1188726182/afghanistan-withdrawal-teen-alone-u-s',
  'https://www.npr.org/2023/07/23/1189659854/politics-chat-possible-third-indictment-for-trump-vp-harris-steps-up-on-the-trai',
  'https://www.npr.org/2023/07/23/1189664409/emmett-till-nati

In [7]:
# Last calendar day of each month, keyed by month number.
# NOTE: February is fixed at 28, so leap years are not accounted for.
last_day_month = {
    1: 31,   # January
    2: 28,   # February
    3: 31,   # March
    4: 30,   # April
    5: 31,   # May
    6: 30,   # June
    7: 31,   # July
    8: 31,   # August
    9: 30,   # September
    10: 31,  # October
    11: 30,  # November
    12: 31,  # December
}

In [8]:
def obtain_monthly_urls(start=0,month=12,year=2023):
    """
    Obtain the urls from the NPR politics section for a given month
    Inputs:
    - start(int): Article to start search from (Similar to page)
    - month(int): Month to obtain articles from
    - year(int): Year to obtain articles from

    Return:
    month_urls (set): Set of articles of the politics section for the month
        specified. URLs whose path carries a "/YYYY/MM/" date for a different
        month or year are dropped; URLs without such a date are kept.
    """
    # monthrange gives the real last day, so 29 February is not skipped
    # in leap years.
    last_day = calendar.monthrange(year, month)[1]
    date = "{}-{}-{}".format(month, last_day, year)
    date_in_url = re.compile(r"/(\d{4})/(\d{2})/")

    month_urls = set()
    page = 0
    while True:
        page += 1
        print("Obtaining links for ",month,"-",year, ",page:",page)
        page_result = obtain_page_urls(start, date)
        # obtain_page_urls returns None once the archive has no more articles.
        if page_result is None:
            break
        page_urls, current_month = page_result

        # The page that crosses the month boundary also lists older articles;
        # keep only those that belong to the requested month.
        for link in page_urls:
            found = date_in_url.search(link)
            if found is None or (int(found.group(1)), int(found.group(2))) == (year, month):
                month_urls.add(link)

        if current_month != month:
            break
        # Presumably the archive's page size; kept from the original pagination.
        start += 15
        # Pause between requests to avoid hammering the server.
        time.sleep(.5)

    return month_urls


In [10]:
# Collect the politics URLs for January 2022, starting at archive offset 0.
obtain_monthly_urls(0,1,2022)

Obtaining links for  1 - 2022 ,page: 1
Obtaining links for  1 - 2022 ,page: 2
Obtaining links for  1 - 2022 ,page: 3
Obtaining links for  1 - 2022 ,page: 4
Obtaining links for  1 - 2022 ,page: 5
Obtaining links for  1 - 2022 ,page: 6


KeyboardInterrupt: 

In [None]:
def crawl_npr(min_year, max_year=2023):
    """
    Crawl the NPR politics section
    Inputs:
    - min_year(int): Oldest year to get results from
    - max_year(int): Newest year to get results from (inclusive). Defaults to
        2023, the bound that was previously hard-coded.
    Return:
    - npr_urls(set): Set of all the NPR politics section urls collected for
        every month from min_year through max_year. Empty when
        min_year > max_year.
    """
    npr_urls = set()
    for year in range(min_year, max_year + 1):
        for month in range(1, 13):
            # Each monthly crawl restarts at archive offset 0.
            npr_urls.update(obtain_monthly_urls(0, month, year))

    return npr_urls

In [None]:
# Crawl the politics archive from 2022 onward (crawl_npr stops after 2023).
crawl_npr(2022)

In [25]:
# Experiment: for every article on one archive page, ask trafilatura for the
# article's main image URL and try to locate the matching <img> element.
test_list = obtain_page_urls()

# obtain_page_urls returns None when the page lists no articles.
article_urls = list(test_list[0]) if test_list else []

for article in article_urls:
    print("article:",article)
    downloaded = trafilatura.fetch_url(article)
    if downloaded is None:
        print("Download failed, skipping")
        continue
    content_2 = trafilatura.bare_extraction(downloaded, include_images=True,include_comments=False)
    target_src = content_2["image"] if content_2 else None
    print("Target src:",target_src)
    if not target_src:
        print(None)
        continue
    # Reuse the HTML that was already downloaded instead of fetching it again.
    soup = BeautifulSoup(downloaded, "html.parser")
    # "src_set" is not an HTML attribute, so filtering on it never matched.
    # Match on the real attributes instead: an exact src, or a srcset entry
    # that contains the target URL.
    image_element = soup.find(
        lambda tag: tag.name == "img"
        and (tag.get("src") == target_src or target_src in (tag.get("srcset") or ""))
    )
    print(image_element)

article: https://www.npr.org/2023/07/24/1189719343/vp-harris-to-speak-at-conference-organized-by-group-advocating-for-latino-commun
Target src: https://media.npr.org/include/images/facebook-default-wide-s1400-c100.jpg
None
article: https://www.npr.org/2023/07/23/1189659854/politics-chat-possible-third-indictment-for-trump-vp-harris-steps-up-on-the-trai
Target src: https://media.npr.org/include/images/facebook-default-wide-s1400-c100.jpg
None
article: https://www.npr.org/2023/07/24/1189831506/the-doj-is-taking-legal-action-over-razer-wire-topped-floating-border-wall-in-te
Target src: https://media.npr.org/include/images/facebook-default-wide-s1400-c100.jpg
None
article: https://www.npr.org/sections/money/2023/07/24/1189443223/affirmative-action-for-rich-kids-its-more-than-just-legacy-admissions
Target src: https://media.npr.org/assets/img/2023/07/21/gettyimages-1268846574_wide-b616caddbe37f2ea9797277ae05de50dabf3148a-s1400-c100.jpg
None
article: https://www.npr.org/2023/07/23/1188726182