In [1]:
# Inject a CSS rule that sets `.container` elements to 100% width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Blog Scraping Notebook

In [2]:
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from textstat.textstat import textstat

In [3]:
# Request headers sent with every page fetch in this notebook.
# The HTTP header that names the referring page is spelled "Referer"
# (one "r" in the middle); "referrer" is not a standard request header.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referer': 'https://google.com'
}

## Scraping One Article

Demonstrate how to extract the header content (title and author) from a single article page.


In [4]:
# Download one article page to experiment on, sending the `headers` dict
# defined in the earlier cell with the request.
url = 'https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/'
r = requests.get(url, headers=headers)

In [5]:
# Show the HTTP status code of the response (200 means the request succeeded).
r.status_code

200

In [6]:
# Take the response body as text, trim leading/trailing whitespace, and
# print it to inspect the raw markup.
# NOTE(review): this prints the whole page; consider printing a slice instead.
html = r.text.strip()
print(html)

<!DOCTYPE html>

<script>
 var fioPageType = ''; 
var fioPageSlug = ''; 
fioPageType = 'article';
 fioPageSlug = 'womans-experience-cutting-blockbusterrs';
 </script>
<html lang="en-US" prefix="og: http://ogp.me/ns#">
	<head>
		<meta charset="UTF-8">
		<meta name="viewport" content="width=device-width, initial-scale=1">

		<link rel="profile" href="http://gmpg.org/xfn/11">

		

		<title>On Gender and Genre: A Woman’s Experience Cutting Action Blockbusters</title>

<!-- This site is optimized with the Yoast SEO plugin v8.1 - https://yoast.com/wordpress/plugins/seo/ -->
<meta name="description" content="“Mile 22” Editor Melissa Lawson Cheung shares her experience cutting action blockbusters, and what she brings to the cutting room as a woman."/>
<link rel="canonical" href="https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/" />
<meta property="og:locale" content="en_US" />
<meta property="og:type" content="article" />
<meta property="og:title" content="O

In [7]:
# Parse the HTML with BeautifulSoup using the 'lxml' parser, then print the
# type of the resulting object.
soup = BeautifulSoup(html, 'lxml')
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [8]:
# Header Content
# Find the element with class 'entry-header', then the element with class
# 'post-meta-title' inside it, and print that element's markup.
header = soup.find(class_='entry-header')
title_html = header.find(class_='post-meta-title')
print(title_html)

<h1 class="post-meta-title">
								On Gender and Genre: A Woman’s Experience Cutting Action Blockbusters							</h1>


In [9]:
# `.contents` is the list of the tag's children; display it to see what the
# title element holds before cleaning it up.
title_html.contents

['\r\n\t\t\t\t\t\t\t\tOn Gender and Genre: A Woman’s Experience Cutting Action Blockbusters\t\t\t\t\t\t\t']

In [10]:
# Take the first child of the title element and strip the surrounding
# whitespace to get the clean title text.
title_str = title_html.contents[0].strip()
print(title_str)

On Gender and Genre: A Woman’s Experience Cutting Action Blockbusters


In [11]:
# The author name is the first child of the <a> tag inside the element with
# class 'author-name'; strip surrounding whitespace before printing.
author_html = header.find(class_='author-name')
author_str = author_html.find('a').contents[0].strip()
print(author_str)

Lisa McNamara


### Modularize Code

In [12]:
def parse_page(url, timeout=30):
    """Download one article page and collect its metadata and body statistics.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait on the server before ``requests`` gives up.
            Without a timeout a stalled connection could block indefinitely.

    Returns:
        dict with the keys ``reading_time``, ``title``, ``date``, ``month``,
        ``weekday``, ``author``, ``categories``, ``word_count``,
        ``reading_level``, ``link_count`` and ``image_count``.

    Raises:
        requests.exceptions.HTTPError: the server replied with a 4xx/5xx
            status.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail here, with the status in the message, rather than later on a
    # missing element when an error page gets parsed.
    r.raise_for_status()
    html = r.text.strip()
    soup = BeautifulSoup(html, 'lxml')

    # Header Content
    header = soup.find(class_='entry-header')
    read_time = extract_read_time(header)
    title = extract_title(header)

    author = extract_author(header)
    categories = extract_categories(header)

    date = extract_date(header)
    # Parse the date string so the month and weekday names can be derived.
    dt = parser.parse(date)
    month = dt.strftime("%B")
    weekday = dt.strftime("%A")

    # Body Content
    content = soup.find(class_='entry-content')
    body_text = content.text  # extract the text once and reuse it below
    word_count = len(body_text.split())
    reading_level = textstat.flesch_kincaid_grade(body_text)

    link_count = len(content.find_all("a"))
    image_count = len(content.find_all("img"))

    page_data = {
        'reading_time': read_time,
        'title': title,
        'date': date,
        'month': month,
        'weekday': weekday,
        'author': author,
        'categories': categories,
        'word_count': word_count,
        'reading_level': reading_level,
        'link_count': link_count,
        'image_count': image_count
    }

    return page_data
    
def extract_read_time(header):
    """Return the article's reading time as an integer.

    The first child of the element with class ``read-time`` is trimmed,
    lower-cased and split on whitespace; the first token is converted with
    ``int``.
    """
    label = header.find(class_='read-time').contents[0]
    tokens = label.strip().lower().split()
    return int(tokens[0])

def extract_title(header):
    """Return the article title with surrounding whitespace removed.

    The title is read from the first child of the element with class
    ``post-meta-title`` inside ``header``.
    """
    title_node = header.find(class_='post-meta-title')
    raw_title = title_node.contents[0]
    return raw_title.strip()

def extract_date(header):
    """Return the publication date string, stripped of surrounding whitespace.

    The value is the first child of the element with class
    ``single-post-date`` inside ``header``; it is returned as text, not parsed.
    """
    return header.find(class_='single-post-date').contents[0].strip()

def extract_author(header):
    """Return the author's name with surrounding whitespace removed.

    The name is the first child of the ``<a>`` tag found inside the element
    with class ``author-name``.
    """
    author_block = header.find(class_='author-name')
    author_link = author_block.find('a')
    return author_link.contents[0].strip()

def extract_categories(header):
    """Return the article's category names, lower-cased and stripped.

    Every ``<a>`` tag inside the element with class ``single-post-cat`` is
    treated as one category; its first child supplies the name.

    Returns:
        list of str, in document order; empty when there are no links.
    """
    cat_container = header.find(class_='single-post-cat')
    # `find_all` matches the spelling used elsewhere in this notebook;
    # `findAll` is the legacy alias.
    return [
        cat_link.contents[0].strip().lower()
        for cat_link in cat_container.find_all('a')
    ]

In [13]:
# Run the modularized parser on the same article used in the walkthrough above.
url = 'https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/'
wmn_exp = parse_page(url)

In [14]:
# Print the dict of extracted fields returned by parse_page.
print(wmn_exp)

{'reading_time': 13, 'title': 'On Gender and Genre: A Woman’s Experience Cutting Action Blockbusters', 'date': 'October 1, 2018', 'month': 'October', 'weekday': 'Monday', 'author': 'Lisa McNamara', 'categories': ['behind the scenes'], 'word_count': 2639, 'reading_level': 10.3, 'link_count': 6, 'image_count': 15}


## Scraping One Category

In [15]:
# Global list that parse_category() appends each parsed article dict to.
articles_store = []

In [16]:
def parse_category(url):
    """Scrape every article listed under a category, following pagination.

    Each listing page is downloaded, every article card on it is parsed with
    ``parse_page``, and the resulting dict is appended to the global
    ``articles_store`` list. Pages are followed through ``find_next_link``
    until it reports no further page.

    Args:
        url: Address of the first listing page of the category.

    Returns:
        None. Results are delivered through ``articles_store``.

    Raises:
        requests.exceptions.HTTPError: a listing page replied with a 4xx/5xx
            status.
    """
    page_url = url

    # Iterate over listing pages instead of recursing once per page, so the
    # call depth does not grow with the number of pages.
    while True:
        r = requests.get(page_url, headers=headers, timeout=30)
        # Surface HTTP errors instead of silently parsing an error page.
        r.raise_for_status()
        html = r.text.strip()
        soup = BeautifulSoup(html, 'lxml')

        article_cards = soup.find_all(class_='post-content')

        for article in article_cards:
            title = article.find(class_='post-meta-title')
            # The title element's first child is expected to be the <a> tag
            # that links to the article.
            link = title.contents[0]['href']
            print('Parsing URL:', link)
            page = parse_page(link)
            articles_store.append(page)

        next_link = find_next_link(soup)

        if next_link is None:
            break

        print('Next page:', next_link)
        page_url = next_link

    return None

def find_next_link(soup_item):
    """Return the URL of the next listing page, or None when there is none.

    The last ``<a>`` tag inside the element with class ``navigation`` is
    treated as the "next page" link only when its first child equals the
    string ``'Next'``.

    Args:
        soup_item: Parsed listing page to search.

    Returns:
        The link's ``href`` value, or None when the page has no navigation
        element, the navigation holds no links, or the last link is not
        labelled ``'Next'``.
    """
    bottom_nav = soup_item.find(class_='navigation')

    if bottom_nav is None:
        return None

    links = bottom_nav.find_all('a')

    # A navigation element without links has no last link to inspect.
    if not links:
        return None

    next_page = links[-1]

    # Guard against a link with no children before reading its first child.
    if next_page.contents and next_page.contents[0] == 'Next':
        return next_page['href']

    return None

In [17]:
# Scrape the 'behind-the-scenes' category; parsed articles accumulate in
# the global articles_store list.
bts = 'https://blog.frame.io/category/behind-the-scenes/'
parse_category(bts)

Parsing URL: https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/
Parsing URL: https://blog.frame.io/2018/09/10/made-in-frame-searching/
Parsing URL: https://blog.frame.io/2018/07/30/inside-mission-impossible-fallout/
Parsing URL: https://blog.frame.io/2018/07/16/made-in-frame-film-riot/
Parsing URL: https://blog.frame.io/2018/07/02/bts-hotel-artemis/
Parsing URL: https://blog.frame.io/2018/06/25/made-in-frame-hereditary/
Parsing URL: https://blog.frame.io/2018/06/18/making-incredibles-2/
Parsing URL: https://blog.frame.io/2018/05/29/made-in-frame-conan-obrien/
Parsing URL: https://blog.frame.io/2018/05/07/madeinframe-kiwi/
Parsing URL: https://blog.frame.io/2018/03/26/made-in-frame-new-balance-fearlessly-independent/
Parsing URL: https://blog.frame.io/2018/02/26/made-in-frame-get-out/
Parsing URL: https://blog.frame.io/2017/08/21/editor-as-writer/
Next page: https://blog.frame.io/category/behind-the-scenes/page/2/
Parsing URL: https://blog.frame.io/2017/07/31/baby

In [18]:
# Print how many articles were collected, then the first record as a sample.
print(len(articles_store))
print(articles_store[0])

20
{'reading_time': 13, 'title': 'On Gender and Genre: A Woman’s Experience Cutting Action Blockbusters', 'date': 'October 1, 2018', 'month': 'October', 'weekday': 'Monday', 'author': 'Lisa McNamara', 'categories': ['behind the scenes'], 'word_count': 2639, 'reading_level': 10.3, 'link_count': 6, 'image_count': 15}


## Scraping All Categories

In [19]:
# Start from an empty list again so records from the single-category run
# above are not carried over into the full scrape.
articles_store = []

In [20]:
# Category slugs; each one is inserted into the category URL by the scraping loop.
categories = ['post-production', 'color-correction', 'business', 'workflow', 'behind-the-scenes', 'production', 'announcement']

In [21]:
# Scrape each category in turn; parse_category appends every parsed article
# to the global articles_store list.
# NOTE(review): nothing here de-duplicates, so an article listed under
# several categories would be stored once per category — confirm whether
# that is intended before analysing the data.
for category in categories:
    url = ''.join(('https://blog.frame.io/category/', category, '/'))
    print('Parsing category', category)
    parse_category(url)

Parsing category post-production
Parsing URL: https://blog.frame.io/2018/09/24/hevc-format-wars/
Parsing URL: https://blog.frame.io/2018/09/17/fcpx-shortcuts/
Parsing URL: https://blog.frame.io/2018/09/04/creating-video-for-blind-and-deaf/
Parsing URL: https://blog.frame.io/2018/08/27/deep-dive-fcpx-audio-tools/
Parsing URL: https://blog.frame.io/2018/07/23/tips-for-no-coverage/
Parsing URL: https://blog.frame.io/2018/06/11/adr-primer/
Parsing URL: https://blog.frame.io/2018/05/14/premiere-batch-syncing/
Parsing URL: https://blog.frame.io/2018/03/12/studying-6-editing-masterpieces/
Parsing URL: https://blog.frame.io/2018/02/19/avid-media-composer-troubleshooting-tips/
Parsing URL: https://blog.frame.io/2018/02/12/animated-masks-after-effects/
Parsing URL: https://blog.frame.io/2018/02/07/editorial-style/
Parsing URL: https://blog.frame.io/2018/01/31/fcpx-metadata/
Next page: https://blog.frame.io/category/post-production/page/2/
Parsing URL: https://blog.frame.io/2018/01/26/perfect-dia

Parsing URL: https://blog.frame.io/2018/04/02/arri-workflow-premiere/
Parsing URL: https://blog.frame.io/2018/03/05/oscar-2018-workflows/
Parsing URL: https://blog.frame.io/2018/01/17/remote-dailies-workflow/
Parsing URL: https://blog.frame.io/2017/10/04/turbo-charge-fcpx-workflow-davinci-resolve/
Parsing URL: https://blog.frame.io/2017/07/31/baby-driver-workflow/
Parsing URL: https://blog.frame.io/2017/05/15/sync-clips-in-davinci-resolve/
Parsing URL: https://blog.frame.io/2017/03/20/premiere-pro-proxies/
Parsing category behind-the-scenes
Parsing URL: https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/
Parsing URL: https://blog.frame.io/2018/09/10/made-in-frame-searching/
Parsing URL: https://blog.frame.io/2018/07/30/inside-mission-impossible-fallout/
Parsing URL: https://blog.frame.io/2018/07/16/made-in-frame-film-riot/
Parsing URL: https://blog.frame.io/2018/07/02/bts-hotel-artemis/
Parsing URL: https://blog.frame.io/2018/06/25/made-in-frame-hereditary/
Parsin

In [22]:
# Number of article records collected by the scraping loop.
len(articles_store)

192

In [23]:
# Display the first collected record as a sanity check.
articles_store[0]

{'author': 'Seth Goldin',
 'categories': ['codecs', 'workflow'],
 'date': 'September 24, 2018',
 'image_count': 4,
 'link_count': 46,
 'month': 'September',
 'reading_level': 8.4,
 'reading_time': 11,
 'title': 'HEVC (H.265): What is it and Why Should You Care?',
 'weekday': 'Monday',
 'word_count': 2349}

In [24]:
import json
import os

# Persist the scraped article records as JSON.
# Create the output folder first so the write does not fail with
# FileNotFoundError when 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)

with open('data/articles.json', 'w') as f:
    json.dump(articles_store, f)