In [1]:
import requests
from bs4 import BeautifulSoup

In [None]:
# function for collecting info on review articles
def collect_lwl_reviews(checkpoint=None):
    """Collect data on reviews from the littlewhitelies movie blog.
    
    Params
    ----------
    checkpoint: str
        Name of the most recent article scraped. Stops collecting articles
        when encountered. Surrounding whitespace is ignored when comparing.
    Returns
    ----------
    List[dict] | None
        A list of dicts, each containing details of a review: 'title',
        'author', 'url' and 'blurb' (text is whitespace-trimmed; a value is
        None when its element is missing from the card), followed by up to
        three integer scores under 'anticipation', 'enjoyment' and
        'retrospect', assigned in the order the rating icons appear.
        Blocks without an <h3> heading are skipped.
        If an error occurs (request failure or non-200 status) or
        checkpoint is first item, returns None."""

    score_types = ('anticipation', 'enjoyment', 'retrospect')
    rating_prefix = 'icon-rating'

    def clean_text(tag):
        # A card may lack an element; report None instead of raising.
        return None if tag is None else tag.get_text().strip()

    try:
        # Without a timeout a stalled connection would block forever.
        res = requests.get('https://lwlies.com/reviews/', timeout=30)
    except requests.RequestException:
        # Connection failures are reported the same way as a bad status.
        return None

    if res.status_code != 200:
        return None

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    lwl_rev = BeautifulSoup(res.content, 'html.parser')
    wanted = checkpoint.strip() if isinstance(checkpoint, str) else checkpoint
    articles_list = []

    for post in lwl_rev.find_all('div', class_='postBlock'):
        title = clean_text(post.find('h3'))
        if title is None:
            # No heading means there is nothing useful to record.
            continue

        if title == wanted:
            # Everything from the checkpoint onwards was scraped previously.
            return articles_list or None

        link = post.find('a')
        post_dict = {
            'title': title,
            'author': clean_text(post.select_one('p a')),
            'url': None if link is None else link.get('href', None),
            'blurb': clean_text(post.find('p', class_='excerpt')),
        }

        ratings = []
        for span in post.find_all('span'):
            classes = span.get('class', None)
            if not classes or not classes[-1].startswith(rating_prefix):
                continue
            # The score is the numeric suffix, e.g. 'icon-rating4' -> 4.
            value = classes[-1][len(rating_prefix):]
            if value.isdecimal():
                ratings.append(int(value))

        # zip() stops at the shorter input, so extra rating icons cannot
        # index past the three known score names.
        post_dict.update(zip(score_types, ratings))

        articles_list.append(post_dict)

    return articles_list
    

In [None]:
# function for collecting info on feature articles
def collect_lwl_features(checkpoint=None):
    """Collect data on features from the littlewhitelies movie blog.
    
    Params
    ----------
    checkpoint: str
        Name of the most recent article scraped. Stops collecting articles
        when encountered. Surrounding whitespace is ignored when comparing.
    Returns
    ----------
    List[dict] | None
        A list of dicts, each containing details of a feature article:
        'title', 'author', 'url' and 'blurb' (text is whitespace-trimmed; a
        value is None when its element is missing from the card).
        Blocks without an <h3> heading are skipped.
        If an error occurs (request failure or non-200 status) or
        checkpoint is first item, returns None."""

    def clean_text(tag):
        # A card may lack an element; report None instead of raising.
        return None if tag is None else tag.get_text().strip()

    try:
        # Without a timeout a stalled connection would block forever.
        res = requests.get('https://lwlies.com/features/', timeout=30)
    except requests.RequestException:
        # Connection failures are reported the same way as a bad status.
        return None

    if res.status_code != 200:
        return None

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    lwl_rev = BeautifulSoup(res.content, 'html.parser')
    wanted = checkpoint.strip() if isinstance(checkpoint, str) else checkpoint
    articles_list = []

    for post in lwl_rev.find_all('div', class_='postBlock'):
        title = clean_text(post.find('h3'))
        if title is None:
            # No heading means there is nothing useful to record.
            continue

        if title == wanted:
            # Everything from the checkpoint onwards was scraped previously.
            return articles_list or None

        link = post.find('a')
        articles_list.append({
            'title': title,
            'author': clean_text(post.select_one('p a')),
            'url': None if link is None else link.get('href', None),
            'blurb': clean_text(post.find('p', class_='excerpt')),
        })

    return articles_list

___

In [8]:
# Fetch the features index page. The timeout keeps a stalled connection
# from hanging the kernel indefinitely.
res = requests.get('https://lwlies.com/features/', timeout=30)

res.status_code

200

In [11]:
# Parse the features page. Naming the parser keeps the result independent of
# which optional parsers are installed.
lwl_feat = BeautifulSoup(res.content, 'html.parser')

In [12]:
# Dump the parsed features page as indented markup to inspect its structure.
print(lwl_feat.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width,initial-scale=1,maximum-scale=1" name="viewport"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   Features - Little White Lies
  </title>
  <link href="https://lwlies.com/wp-content/themes/littlewhitelies/assets/images/favicons/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
  <link href="https://lwlies.com/wp-content/themes/littlewhitelies/assets/images/favicons/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
  <link href="https://lwlies.com/wp-content/themes/littlewhitelies/assets/images/favicons/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
  <link href="https://lwlies.com/wp-content/themes/littlewhitelies/assets/images/favicons/manifest.json" rel="manifest"/>
  <link color="#000000" href="https://lwlies.com/wp-content/themes/littlewhitelies/assets/images/favicons/safa

In [13]:
# Fetch the reviews index page. The timeout keeps a stalled connection
# from hanging the kernel indefinitely.
res = requests.get('https://lwlies.com/reviews/', timeout=30)

res.status_code

200

In [14]:
# Parse the reviews page. Naming the parser keeps the result independent of
# which optional parsers are installed.
lwl_rev = BeautifulSoup(res.content, 'html.parser')

In [15]:
# Dump the parsed reviews page as indented markup to inspect its structure.
print(lwl_rev.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width,initial-scale=1,maximum-scale=1" name="viewport"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   Reviews Archive - Little White Lies
  </title>
  <link href="https://lwlies.com/wp-content/themes/littlewhitelies/assets/images/favicons/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
  <link href="https://lwlies.com/wp-content/themes/littlewhitelies/assets/images/favicons/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
  <link href="https://lwlies.com/wp-content/themes/littlewhitelies/assets/images/favicons/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
  <link href="https://lwlies.com/wp-content/themes/littlewhitelies/assets/images/favicons/manifest.json" rel="manifest"/>
  <link color="#000000" href="https://lwlies.com/wp-content/themes/littlewhitelies/assets/images/favico

In [71]:
# Grab every <div class="postBlock"> element from the parsed reviews page.
reviews = lwl_rev.find_all('div', class_='postBlock')

In [19]:
# Number of postBlock elements found on the page.
len(reviews)

17

In [53]:
# URL
reviews[0].find('a')['href']

'https://lwlies.com/reviews/war-pony/'

In [45]:
# title
reviews[0].find('h3').get_text()

'War Pony'

In [56]:
# scores: anticipation, enjoyment, and retrospect. Maybe only keep retrospect? 
reviews[0].find_all('span')[0]['class'][-1][-1] # str

'3'

In [76]:
# class list of the fifth span; the rating is the trailing digit of the class name
reviews[0].find_all('span')[4]['class']

['icon-rating4']

In [32]:
# blurb
reviews[0].find('p', class_='excerpt').get_text()

'Gina Gammell and Riley Keough’s debut feature focuses on two Oglala Lakota teenagers as they come of age in South Dakota.'

In [52]:
# author
reviews[0].select_one('p a').get_text()

'Charles Bramesco'

In [43]:
# raw heading markup: the title text sits inside a link within the <h3>
reviews[0].find('h3')

<h3><a href="https://lwlies.com/reviews/war-pony/">War Pony</a></h3>

In [104]:
def collect_lwl_reviews(checkpoint=None):
    """Collect data on reviews from the littlewhitelies movie blog.
    
    Params
    ----------
    checkpoint: str
        Name of the most recent article scraped. Stops collecting articles
        when encountered. Surrounding whitespace is ignored when comparing.
    Returns
    ----------
    List[dict] | None
        A list of dicts, each containing details of a review: 'title',
        'author', 'url' and 'blurb' (text is whitespace-trimmed; a value is
        None when its element is missing from the card), followed by up to
        three integer scores under 'anticipation', 'enjoyment' and
        'retrospect', assigned in the order the rating icons appear.
        Blocks without an <h3> heading are skipped.
        If an error occurs (request failure or non-200 status) or
        checkpoint is first item, returns None."""

    score_types = ('anticipation', 'enjoyment', 'retrospect')
    rating_prefix = 'icon-rating'

    def clean_text(tag):
        # A card may lack an element; report None instead of raising.
        return None if tag is None else tag.get_text().strip()

    try:
        # Without a timeout a stalled connection would block forever.
        res = requests.get('https://lwlies.com/reviews/', timeout=30)
    except requests.RequestException:
        # Connection failures are reported the same way as a bad status.
        return None

    if res.status_code != 200:
        return None

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    lwl_rev = BeautifulSoup(res.content, 'html.parser')
    wanted = checkpoint.strip() if isinstance(checkpoint, str) else checkpoint
    articles_list = []

    for post in lwl_rev.find_all('div', class_='postBlock'):
        title = clean_text(post.find('h3'))
        if title is None:
            # No heading means there is nothing useful to record.
            continue

        if title == wanted:
            # Everything from the checkpoint onwards was scraped previously.
            return articles_list or None

        link = post.find('a')
        post_dict = {
            'title': title,
            'author': clean_text(post.select_one('p a')),
            'url': None if link is None else link.get('href', None),
            'blurb': clean_text(post.find('p', class_='excerpt')),
        }

        ratings = []
        for span in post.find_all('span'):
            classes = span.get('class', None)
            if not classes or not classes[-1].startswith(rating_prefix):
                continue
            # The score is the numeric suffix, e.g. 'icon-rating4' -> 4.
            value = classes[-1][len(rating_prefix):]
            if value.isdecimal():
                ratings.append(int(value))

        # zip() stops at the shorter input, so extra rating icons cannot
        # index past the three known score names.
        post_dict.update(zip(score_types, ratings))

        articles_list.append(post_dict)

    return articles_list
    

In [106]:
# Collect reviews, using 'The Eight Mountains' as the checkpoint title to stop at.
review_list = collect_lwl_reviews('The Eight Mountains')

In [107]:
# Inspect the collected review records.
review_list

[{'title': 'War Pony',
  'author': 'Charles Bramesco',
  'url': 'https://lwlies.com/reviews/war-pony/',
  'blurb': 'Gina Gammell and Riley Keough’s debut feature focuses on two Oglala Lakota teenagers as they come of age in South Dakota.',
  'anticipation': 3,
  'enjoyment': 4,
  'retrospect': 4},
 {'title': 'Medusa Deluxe',
  'author': 'Cheyenne Bunsie',
  'url': 'https://lwlies.com/reviews/medusa-deluxe/',
  'blurb': 'The cutthroat world of hairdressing is the setting for this sparky murder mystery – a debut from Thomas Hardiman.',
  'anticipation': 3,
  'enjoyment': 3,
  'retrospect': 3},
 {'title': 'Mad About the Boy: The Noël Coward Story',
  'author': 'Marina Ashioti',
  'url': 'https://lwlies.com/reviews/mad-about-the-boy-the-noel-coward-story/',
  'blurb': 'Barnaby Thompson celebrates the multifaceted life and work of legendary playwright Noël Coward with a perfunctory profile doc.',
  'anticipation': 3,
  'enjoyment': 2,
  'retrospect': 2},
 {'title': 'Carmen',
  'author': 'Ro

---

In [109]:
# Grab every <div class="postBlock"> element from the parsed features page.
features = lwl_feat.find_all('div', class_='postBlock')

In [116]:
# Title
features[0].find('h3').get_text()

'Emile Mosseri and Joe Talbot: ‘We’re just following the feeling of what we like’'

In [117]:
# link
features[0].find('a').get('href')

'https://lwlies.com/interviews/emile-mosseri-and-joe-talbot/'

In [120]:
# author
features[0].select_one('p a').get_text()

'Kambole Campbell'

In [121]:
# blurb
features[0].find('p', class_='excerpt').get_text()

"To celebrate the release of Emile Mosseri's new album, he sat down with old pal and collaborator Joe Talbot for a chat about Tupac, George Constanza, and swimming in LA lakes."

In [123]:
def collect_lwl_features(checkpoint=None):
    """Collect data on features from the littlewhitelies movie blog.
    
    Params
    ----------
    checkpoint: str
        Name of the most recent article scraped. Stops collecting articles
        when encountered. Surrounding whitespace is ignored when comparing.
    Returns
    ----------
    List[dict] | None
        A list of dicts, each containing details of a feature article:
        'title', 'author', 'url' and 'blurb' (text is whitespace-trimmed; a
        value is None when its element is missing from the card).
        Blocks without an <h3> heading are skipped.
        If an error occurs (request failure or non-200 status) or
        checkpoint is first item, returns None."""

    def clean_text(tag):
        # A card may lack an element; report None instead of raising.
        return None if tag is None else tag.get_text().strip()

    try:
        # Without a timeout a stalled connection would block forever.
        res = requests.get('https://lwlies.com/features/', timeout=30)
    except requests.RequestException:
        # Connection failures are reported the same way as a bad status.
        return None

    if res.status_code != 200:
        return None

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    lwl_rev = BeautifulSoup(res.content, 'html.parser')
    wanted = checkpoint.strip() if isinstance(checkpoint, str) else checkpoint
    articles_list = []

    for post in lwl_rev.find_all('div', class_='postBlock'):
        title = clean_text(post.find('h3'))
        if title is None:
            # No heading means there is nothing useful to record.
            continue

        if title == wanted:
            # Everything from the checkpoint onwards was scraped previously.
            return articles_list or None

        link = post.find('a')
        articles_list.append({
            'title': title,
            'author': clean_text(post.select_one('p a')),
            'url': None if link is None else link.get('href', None),
            'blurb': clean_text(post.find('p', class_='excerpt')),
        })

    return articles_list

In [124]:
# No checkpoint given, so every feature found on the page is collected.
feature_list = collect_lwl_features()

In [125]:
# Inspect the collected feature records.
feature_list

[{'title': 'Emile Mosseri and Joe Talbot: ‘We’re just following the feeling of what we like’',
  'author': 'Kambole Campbell',
  'url': 'https://lwlies.com/interviews/emile-mosseri-and-joe-talbot/',
  'blurb': "To celebrate the release of Emile Mosseri's new album, he sat down with old pal and collaborator Joe Talbot for a chat about Tupac, George Constanza, and swimming in LA lakes."},
 {'title': 'LWLies 99: The Asteroid City issue – Out now!',
  'author': 'Little White Lies ',
  'url': 'https://lwlies.com/articles/lwlies-99-the-asteroid-city-issue-out-now/',
  'blurb': "We blast off to a space-obsessed town in 1955 for Wes Anderson's latest lavish adventure."},
 {'title': 'Parker Posey: ‘I’m nostalgic for how to really meet people. It’s so different now’',
  'author': 'Abbey Bender',
  'url': 'https://lwlies.com/interviews/parker-posey-party-girl/',
  'blurb': 'As seminal 90s cult movie Party Girl receives a 4K restoration, Parker Posey reflects on her first starring role and nostalg