In [1]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [None]:
def collect_npr_reviews(checkpoint=None, timeout=10):
    """Collect data on reviews from the npr book reviews page.

    Params
    ----------
    checkpoint: str
        Title of the most recent article scraped. Stops collecting articles
        when encountered.
    timeout: float
        Seconds to wait for the server before abandoning the request.
    Returns
    ----------
    List[dict] | None
        A list of dicts in page order, each with the 'title', 'url' and
        'blurb' of a review. Cards without a heading or a link are skipped.
        If the request fails, the status is not 200, or checkpoint is the
        first item, returns None."""
    # NOTE(review): this function is defined again in a later cell of this
    # notebook; whichever cell runs last wins, so keep them in sync or
    # delete one of the two.

    url = 'https://www.npr.org/sections/book-reviews/'

    try:
        # Without a timeout a stalled connection would block the kernel forever.
        res = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network-level failures are treated as "no data"; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return None

    if res.status_code != 200:
        return None

    soup = BeautifulSoup(res.content)
    article_list = []
    reviews = soup.find_all('article', class_='item has-image')

    for review in reviews:
        heading = review.find('h2')
        if heading is None:
            # Not a regular review card; nothing to compare or record.
            continue

        title = heading.get_text(strip=True)
        if title == checkpoint:
            return article_list or None

        link = review.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            continue

        teaser = review.find('p')
        blurb = teaser.get_text(strip=True) if teaser is not None else ''
        # The teaser text is "<date> <bullet> <blurb>". The bullet was seen as
        # '\x95' in this notebook; U+2022 is accepted as well in case it is
        # decoded to its Unicode form. If no bullet is present the whole
        # teaser is kept instead of raising IndexError.
        for bullet in ('\x95', '\u2022'):
            if bullet in blurb:
                blurb = blurb.partition(bullet)[2].strip()
                break

        article_list.append({'title': title, 'url': href, 'blurb': blurb})

    return article_list

---

In [None]:
def collect_nyt_reviews(checkpoint=None, timeout=10):
    """Collect data on reviews from the nyt book reviews page.

    Params
    ----------
    checkpoint: str
        Title of the most recent article scraped. Stops collecting articles
        when encountered.
    timeout: float
        Seconds to wait for the server before abandoning the request.
    Returns
    ----------
    List[dict] | None
        A list of dicts in page order, each with the 'title', 'url' and
        'blurb' of a review. Links without an <h3> title or an href are
        skipped; a missing summary paragraph yields an empty blurb.
        If the request fails, the status is not 200, the 'stream-panel'
        section is missing, or checkpoint is the first item, returns None."""
    # NOTE(review): this function is defined again in a later cell of this
    # notebook; whichever cell runs last wins, so keep them in sync or
    # delete one of the two.

    url = 'https://www.nytimes.com/section/books/review'
    base_url = 'https://www.nytimes.com'

    try:
        # Without a timeout a stalled connection would block the kernel forever.
        res = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network-level failures are treated as "no data"; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return None

    if res.status_code != 200:
        return None

    soup = BeautifulSoup(res.content)
    panel = soup.find('section', id='stream-panel')
    if panel is None:
        # Page layout changed (or a different page was served): treat as an error.
        return None

    article_list = []

    for review in panel.select('li a'):
        header = review.find('h3')
        if header is None:
            # Anchors without a headline are not article links.
            continue

        title = header.get_text(strip=True)
        if title == checkpoint:
            return article_list or None

        href = review.get('href')
        if not href:
            continue

        summary = review.find('p')
        article_list.append({
            'title': title,
            # urljoin resolves site-relative hrefs and leaves absolute ones intact.
            'url': urljoin(base_url, href),
            'blurb': summary.get_text(strip=True) if summary is not None else '',
        })

    return article_list

---

In [3]:
# Fetch the NPR book-reviews landing page and print its visible text for a
# first look at what the page contains.
res = requests.get('https://www.npr.org/sections/book-reviews/')

print(BeautifulSoup(res.content).get_text())













NPR: Book Reviews : NPR












Accessibility links 
Skip to main content
Keyboard shortcuts for audio player








                    Open Navigation Menu
                









NPR Shop







>
                    Close Navigation Menu




Home



News
Expand/collapse submenu for News


National
World
Politics
Business
Health
Science
Climate
Race




Culture
Expand/collapse submenu for Culture


Books
Movies
Television
Pop Culture
Food
Art & Design 
Performing Arts
Life Kit
Gaming




Music
Expand/collapse submenu for Music



        Tiny Desk
    


        #NowPlaying
    


        All Songs Considered
    


        Music Features
    


        Live Sessions
    




Podcasts & Shows
Expand/collapse submenu for Podcasts & Shows


Daily




                                    Morning Edition
                                




                                    Weekend Edition Saturday
                                




                                

In [4]:
# Parse the fetched response once so the following cells can query the tree.
soup = BeautifulSoup(res.content)

In [10]:
# Select every <article class="item has-image"> element; these appear to be
# the individual review cards on the page.
reviews = soup.find_all('article', class_='item has-image')

In [11]:
# Sanity check: how many cards did the selector match?
len(reviews)

19

In [7]:
# Display the parsed document to inspect the raw markup.
soup

<!DOCTYPE html>
<html class="no-js" lang="en"><head><!-- OneTrust Cookies Consent Notice start for npr.org -->
<script src="https://cdn.cookielaw.org/consent/82089dfe-410c-4e1b-a7f9-698174b62a86/OtAutoBlock.js" type="text/javascript"></script>
<script charset="UTF-8" data-domain-script="82089dfe-410c-4e1b-a7f9-698174b62a86" src="https://cdn.cookielaw.org/scripttemplates/otSDKStub.js" type="text/javascript"></script>
<script type="text/javascript">
function OptanonWrapper() {
    NPR_OptanonWrapper = true;
    document.dispatchEvent(new CustomEvent('npr:DataConsentAvailable'));
    
    OneTrust.OnConsentChanged(function() {
        document.dispatchEvent(new CustomEvent('npr:DataConsentChanged'));
    });
 }
</script>
<!-- OneTrust Cookies Consent Notice end for npr.org -->
<script ccpa-opt-out-geo="us" ccpa-opt-out-ids="C0004" ccpa-opt-out-lspa="false" charset="UTF-8" src="https://cdn.cookielaw.org/opt-out/otCCPAiab.js" type="text/javascript"></script><script class="optanon-category-C

In [12]:
# Look at the markup of the first matched card to find where the fields live.
reviews[0]

<article class="item has-image">
<div class="item-image">
<div class="imagewrap has-source-dimensions" data-crop-type="wide" style="
        --source-width: 1684;
        --source-height: 947;
    ">
<a data-metrics='{"action":"Click Featured Story Image 1-3","category":"Aggregation"}' href="https://www.npr.org/2023/06/10/1180103464/book-review-isabel-allende-novel-the-wind-knows-my-name">
<picture>
<source class="feat1_23 lazyOnLoad" data-format="webp" data-original="https://media.npr.org/assets/img/2023/06/09/windknows_wide-90773260c4789ccf71df8a967362e059ab82b2f6.jpg?s=800&amp;c=100&amp;f=webp" data-template="https://media.npr.org/assets/img/2023/06/09/windknows_wide-90773260c4789ccf71df8a967362e059ab82b2f6.jpg?s={width}&amp;c={quality}&amp;f={format}" srcset="https://media.npr.org/assets/img/2023/06/09/windknows_wide-90773260c4789ccf71df8a967362e059ab82b2f6.jpg?s=800&amp;c=15&amp;f=webp" type="image/webp"/>
<source class="feat1_23 lazyOnLoad" data-format="jpeg" data-original="https

In [14]:
# url: the href of the first <a> inside the card
reviews[0].find('a').get('href')

'https://www.npr.org/2023/06/10/1180103464/book-review-isabel-allende-novel-the-wind-knows-my-name'

In [16]:
# Inspect the first <a> inside the card to see what it wraps and which
# attributes it carries.
reviews[0].find('a')

<a data-metrics='{"action":"Click Featured Story Image 1-3","category":"Aggregation"}' href="https://www.npr.org/2023/06/10/1180103464/book-review-isabel-allende-novel-the-wind-knows-my-name">
<picture>
<source class="feat1_23 lazyOnLoad" data-format="webp" data-original="https://media.npr.org/assets/img/2023/06/09/windknows_wide-90773260c4789ccf71df8a967362e059ab82b2f6.jpg?s=800&amp;c=100&amp;f=webp" data-template="https://media.npr.org/assets/img/2023/06/09/windknows_wide-90773260c4789ccf71df8a967362e059ab82b2f6.jpg?s={width}&amp;c={quality}&amp;f={format}" srcset="https://media.npr.org/assets/img/2023/06/09/windknows_wide-90773260c4789ccf71df8a967362e059ab82b2f6.jpg?s=800&amp;c=15&amp;f=webp" type="image/webp"/>
<source class="feat1_23 lazyOnLoad" data-format="jpeg" data-original="https://media.npr.org/assets/img/2023/06/09/windknows_wide-90773260c4789ccf71df8a967362e059ab82b2f6.jpg?s=800&amp;c=100&amp;f=jpeg" data-template="https://media.npr.org/assets/img/2023/06/09/windknows_wide

In [19]:
# title: text of the card's first <h2>, with surrounding whitespace stripped
reviews[0].find('h2').get_text(strip=True)

"'The Wind Knows My Name' is a reference and a refrain in the search for home"

In [26]:
# blurb: text of the card's first <p>, keeping the part after the first
# '\x95' separator (presumably the bullet between the date and the summary).
reviews[2].find('p').get_text(strip=True).split('\x95')[1]

"S.A. Cosby's latest is a dark, wildly entertaining crime novel with religious undertones — and one that tackles timely issues while never losing itself or sounding preachy."

In [28]:
def collect_npr_reviews(checkpoint=None, timeout=10):
    """Collect data on reviews from the npr book reviews page.

    Params
    ----------
    checkpoint: str
        Title of the most recent article scraped. Stops collecting articles
        when encountered.
    timeout: float
        Seconds to wait for the server before abandoning the request.
    Returns
    ----------
    List[dict] | None
        A list of dicts in page order, each with the 'title', 'url' and
        'blurb' of a review. Cards without a heading or a link are skipped.
        If the request fails, the status is not 200, or checkpoint is the
        first item, returns None."""
    # NOTE(review): this function is also defined in an earlier cell of this
    # notebook; whichever cell runs last wins, so keep them in sync or
    # delete one of the two.

    url = 'https://www.npr.org/sections/book-reviews/'

    try:
        # Without a timeout a stalled connection would block the kernel forever.
        res = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network-level failures are treated as "no data"; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return None

    if res.status_code != 200:
        return None

    soup = BeautifulSoup(res.content)
    article_list = []
    reviews = soup.find_all('article', class_='item has-image')

    for review in reviews:
        heading = review.find('h2')
        if heading is None:
            # Not a regular review card; nothing to compare or record.
            continue

        title = heading.get_text(strip=True)
        if title == checkpoint:
            return article_list or None

        link = review.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            continue

        teaser = review.find('p')
        blurb = teaser.get_text(strip=True) if teaser is not None else ''
        # The teaser text is "<date> <bullet> <blurb>". The bullet was seen as
        # '\x95' in this notebook; U+2022 is accepted as well in case it is
        # decoded to its Unicode form. If no bullet is present the whole
        # teaser is kept instead of raising IndexError.
        for bullet in ('\x95', '\u2022'):
            if bullet in blurb:
                blurb = blurb.partition(bullet)[2].strip()
                break

        article_list.append({'title': title, 'url': href, 'blurb': blurb})

    return article_list

In [29]:
# Run the NPR scraper with no checkpoint, so it walks every matched card.
details = collect_npr_reviews()

In [30]:
# Inspect the collected title/url/blurb dicts.
details

[{'title': "'The Wind Knows My Name' is a reference and a refrain in the search for home",
  'url': 'https://www.npr.org/2023/06/10/1180103464/book-review-isabel-allende-novel-the-wind-knows-my-name',
  'blurb': "Isabel Allende's latest is a tale of two child immigrants — a boy who escapes Nazi occupied Vienna in 1938 and a girl who escapes military gangs in El Salvador in 2019 — and their shared experience."},
 {'title': "Relationships are the true heart of 1940s dystopian novel 'Kallocain'",
  'url': 'https://www.npr.org/2023/06/09/1180105212/karin-boye-1940-dystopian-novel-kallocain-book-review',
  'blurb': "Karin Boye's novel is an outlier in that it was authored by a woman and, though narrated by a man, still expresses interest in women's inner life and acknowledges the subtleties of sexism."},
 {'title': "'All the Sinners Bleed' elegantly walks a fine line between horror and crime fiction",
  'url': 'https://www.npr.org/2023/06/09/1181236505/book-review-sa-cosby-all-the-sinners-b

In [31]:
# Re-run with an article title as the checkpoint: collection should stop
# before that title, returning only the reviews listed ahead of it.
details_w_checkpoint = collect_npr_reviews("'Wild Dances' puts consequences of a long-ago, faraway conflict at center")

In [32]:
# Inspect the checkpoint-limited result.
details_w_checkpoint

[{'title': "'The Wind Knows My Name' is a reference and a refrain in the search for home",
  'url': 'https://www.npr.org/2023/06/10/1180103464/book-review-isabel-allende-novel-the-wind-knows-my-name',
  'blurb': "Isabel Allende's latest is a tale of two child immigrants — a boy who escapes Nazi occupied Vienna in 1938 and a girl who escapes military gangs in El Salvador in 2019 — and their shared experience."},
 {'title': "Relationships are the true heart of 1940s dystopian novel 'Kallocain'",
  'url': 'https://www.npr.org/2023/06/09/1180105212/karin-boye-1940-dystopian-novel-kallocain-book-review',
  'blurb': "Karin Boye's novel is an outlier in that it was authored by a woman and, though narrated by a man, still expresses interest in women's inner life and acknowledges the subtleties of sexism."},
 {'title': "'All the Sinners Bleed' elegantly walks a fine line between horror and crime fiction",
  'url': 'https://www.npr.org/2023/06/09/1181236505/book-review-sa-cosby-all-the-sinners-b

---

In [34]:
# Fetch the NYT Book Review section page and print its visible text.
# NOTE: this rebinds `res`, which the earlier cells used for the NPR response.
res = requests.get('https://www.nytimes.com/section/books/review')

print(BeautifulSoup(res.content).get_text())




Book Review - The New York Times
  


















Skip to contentSectionsSEARCHBook Review Today’s PaperAdvertisementContinue reading the main storySupported byContinue reading the main storyArtsBook ReviewWhat to ReadStaff CriticsBest SellersBy the BookBook Review PodcastHighlightsPhotoCredit24 Works of Fiction to Read This SummerA sequel to Colson Whitehead’s “Harlem Shuffle,” new stories from Jamel Brinkley, a debut novel about a teenager who worked for Andy Warhol — and more. By Kate DwyerPhotoCredit14 Nonfiction Books to Read This SummerBiographies of Anna May Wong and Alice Marble, a deep-sea exploration, a history of the race to the North Pole: Here’s what to watch for this season. By Joumana Khatib and Neima JahromiPhotoCreditRebecca ClarkeBy the BookThree Books That Make Tess Gunty Angry“So many come to mind,” says the author, whose novel “The Rabbit Hutch” won a National Book Award last year and will be out in paperback this month. “I guess I’m often furious?” PhotoCre

In [35]:
# Parse the NYT response. NOTE: this rebinds `soup`, replacing the NPR tree.
soup = BeautifulSoup(res.content)

In [41]:
# Narrow the search to the <section id="stream-panel"> element
# (presumably the article stream — confirm against the page markup).
panel = soup.find('section', id='stream-panel')

In [44]:
# Every <a> nested under an <li> inside the panel is a candidate article link.
# NOTE: this rebinds `reviews`, replacing the NPR cards.
reviews = panel.select('li a')

In [47]:
# url: the anchor's own href (site-relative in the recorded output below)
reviews[0].get('href')

'/2023/06/10/books/latin-american-science-fiction.html'

In [51]:
# title: text of the first <h3> inside the anchor
reviews[0].find('h3').get_text(strip=True)

'Science Fiction From Latin America, With Zombie Dissidents and Aliens in the Amazon'

In [52]:
# blurb: text of the first <p> inside the anchor
reviews[0].find('p').get_text(strip=True)

'A new wave of writers is making the genre its own, rooting it in local homelands and histories.'

In [64]:
def collect_nyt_reviews(checkpoint=None, timeout=10):
    """Collect data on reviews from the nyt book reviews page.

    Params
    ----------
    checkpoint: str
        Title of the most recent article scraped. Stops collecting articles
        when encountered.
    timeout: float
        Seconds to wait for the server before abandoning the request.
    Returns
    ----------
    List[dict] | None
        A list of dicts in page order, each with the 'title', 'url' and
        'blurb' of a review. Links without an <h3> title or an href are
        skipped; a missing summary paragraph yields an empty blurb.
        If the request fails, the status is not 200, the 'stream-panel'
        section is missing, or checkpoint is the first item, returns None."""
    # NOTE(review): this function is also defined in an earlier cell of this
    # notebook; whichever cell runs last wins, so keep them in sync or
    # delete one of the two.

    url = 'https://www.nytimes.com/section/books/review'
    base_url = 'https://www.nytimes.com'

    try:
        # Without a timeout a stalled connection would block the kernel forever.
        res = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network-level failures are treated as "no data"; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return None

    if res.status_code != 200:
        return None

    soup = BeautifulSoup(res.content)
    panel = soup.find('section', id='stream-panel')
    if panel is None:
        # Page layout changed (or a different page was served): treat as an error.
        return None

    article_list = []

    for review in panel.select('li a'):
        header = review.find('h3')
        if header is None:
            # Anchors without a headline are not article links.
            continue

        title = header.get_text(strip=True)
        if title == checkpoint:
            return article_list or None

        href = review.get('href')
        if not href:
            continue

        summary = review.find('p')
        article_list.append({
            'title': title,
            # urljoin resolves site-relative hrefs and leaves absolute ones intact.
            'url': urljoin(base_url, href),
            'blurb': summary.get_text(strip=True) if summary is not None else '',
        })

    return article_list
    

In [65]:
# Run the NYT scraper with no checkpoint, so it walks every candidate link.
nyt_deets = collect_nyt_reviews()

In [66]:
# Inspect the collected title/url/blurb dicts.
nyt_deets

[{'title': 'Science Fiction From Latin America, With Zombie Dissidents and Aliens in the Amazon',
  'url': 'https://www.nytimes.com/2023/06/10/books/latin-american-science-fiction.html',
  'blurb': 'A new wave of writers is making the genre its own, rooting it in local homelands and histories.'},
 {'title': 'Masterpieces Galore: When Mozart Met the Enlightenment',
  'url': 'https://www.nytimes.com/2023/06/10/books/review/mozart-in-motion-patrick-mackie.html',
  'blurb': 'In Patrick Mackie’s “Mozart in Motion,” the socially observant composer embraces modernity.'},
 {'title': 'People We Meet on Vacation',
  'url': 'https://www.nytimes.com/2023/06/10/books/review/you-cant-stay-here-forever-katherine-lin.html',
  'blurb': 'In her debut novel, “You Can’t Stay Here Forever,” Katherine Lin follows a young widow and her best friend to the French Riviera.'},
 {'title': 'Summer Book Preview and 9 Thrillers to Read',
  'url': 'https://www.nytimes.com/2023/06/09/books/review/summer-book-preview-a

In [67]:
# Re-run with an article title as the checkpoint: collection should stop
# before that title, returning only the reviews listed ahead of it.
checkpoint = 'Can You Find the Hidden Titles of These 12 Books About Broadway Icons?'
nyt_deets_again = collect_nyt_reviews(checkpoint=checkpoint)

In [68]:
# Inspect the checkpoint-limited result.
nyt_deets_again

[{'title': 'Science Fiction From Latin America, With Zombie Dissidents and Aliens in the Amazon',
  'url': 'https://www.nytimes.com/2023/06/10/books/latin-american-science-fiction.html',
  'blurb': 'A new wave of writers is making the genre its own, rooting it in local homelands and histories.'},
 {'title': 'Masterpieces Galore: When Mozart Met the Enlightenment',
  'url': 'https://www.nytimes.com/2023/06/10/books/review/mozart-in-motion-patrick-mackie.html',
  'blurb': 'In Patrick Mackie’s “Mozart in Motion,” the socially observant composer embraces modernity.'},
 {'title': 'People We Meet on Vacation',
  'url': 'https://www.nytimes.com/2023/06/10/books/review/you-cant-stay-here-forever-katherine-lin.html',
  'blurb': 'In her debut novel, “You Can’t Stay Here Forever,” Katherine Lin follows a young widow and her best friend to the French Riviera.'},
 {'title': 'Summer Book Preview and 9 Thrillers to Read',
  'url': 'https://www.nytimes.com/2023/06/09/books/review/summer-book-preview-a