# MVP - Blog-level analytics

In [100]:
import bonobo
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from textstat.textstat import textstat

In [102]:
# Request headers sent with every page fetch: a desktop Chrome user agent and a
# Google referer (presumably so requests look like ordinary browser traffic;
# the intent is not stated in this notebook).
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    # The HTTP request header is spelled "Referer" (single "r" before "er");
    # a 'referrer' key is not a standard header name.
    'referer': 'https://google.com'
}

In [103]:
# Category slugs to crawl; each one is inserted into the blog's /category/<slug>/ URL.
categories = [
    'post-production',
    'color-correction',
    'business',
    'workflow',
    'behind-the-scenes',
    'production',
    'announcement',
]

In [104]:
# Module-level accumulator: parse_category() appends one metadata dict per
# parsed article. Re-running this cell resets the collection to empty.
articles_store = []

def parse_category(url, timeout=30):
    """Crawl a category listing and every following page, storing each article.

    For each listing page, every element with class ``post-content`` is treated
    as an article card: its link is read from the first child of the
    ``post-meta-title`` element, the article is parsed with ``parse_page`` and
    the result is appended to the module-level ``articles_store``.

    Pagination is followed with a loop (not recursion), so the number of pages
    is not bounded by the interpreter's recursion limit.

    Args:
        url: URL of the first listing page of the category.
        timeout: Seconds to wait for each listing-page request before giving up.

    Returns:
        None. Results are accumulated in ``articles_store``.

    Raises:
        requests.HTTPError: If a listing page responds with an error status.
    """
    visited = set()
    page_url = url

    # Stop when there is no "Next" link, or when a link points back to a page
    # already crawled (which would otherwise loop forever).
    while page_url is not None and page_url not in visited:
        visited.add(page_url)

        r = requests.get(page_url, headers=headers, timeout=timeout)
        # Fail fast instead of scraping an error page.
        r.raise_for_status()
        html = r.text.strip()
        soup = BeautifulSoup(html, 'lxml')

        article_cards = soup.find_all(class_='post-content')

        for article in article_cards:
            title = article.find(class_='post-meta-title')
            # The first child of the title element is expected to be the <a> tag.
            link = title.contents[0]['href']
            print('Parsing URL:', link)
            page = parse_page(link)
            articles_store.append(page)

        page_url = find_next_link(soup)

        if page_url is not None:
            print('Next page:', page_url)

    return None

def find_next_link(soup_item):
    """Return the URL of the next listing page, or None if there is none.

    Looks up the element with class ``navigation`` and inspects its last
    ``<a>`` tag; that tag counts as the next-page link only when its first
    child equals the string ``'Next'``.

    Args:
        soup_item: Parsed page (or any object offering a compatible ``find``).

    Returns:
        The ``href`` of the "Next" link, or None when the navigation block is
        missing, contains no links, the last link is not labelled ``'Next'``,
        or the link has no ``href`` attribute.
    """
    bottom_nav = soup_item.find(class_='navigation')
    if bottom_nav is None:
        return None

    links = bottom_nav.find_all('a')
    # A navigation block without anchors has no next page.
    if not links:
        return None

    next_page = links[-1]
    # Guard against an empty anchor before inspecting its first child.
    if next_page.contents and next_page.contents[0] == 'Next':
        return next_page.get('href')

    return None

In [105]:
def parse_page(url, timeout=30):
    """Fetch a single article page and return its metadata as a dict.

    Header fields (read time, title, author, categories, date) come from the
    element with class ``entry-header``; body metrics (word count, reading
    level, link count, image count) come from the element with class
    ``entry-content``.

    Args:
        url: URL of the article page.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        dict with keys ``reading_time``, ``title``, ``date``, ``month``,
        ``weekday``, ``author``, ``categories``, ``word_count``,
        ``reading_level``, ``link_count`` and ``image_count``.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the page has no ``entry-header`` or ``entry-content``
            element.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail fast instead of scraping an error page.
    r.raise_for_status()
    html = r.text.strip()
    soup = BeautifulSoup(html, 'lxml')

    header = soup.find(class_='entry-header')
    content = soup.find(class_='entry-content')
    if header is None or content is None:
        # Name the offending URL rather than failing later on a None lookup.
        raise ValueError('Unexpected page layout (missing entry header or content): ' + str(url))

    # Header Content
    read_time = extract_read_time(header)
    title = extract_title(header)

    author = extract_author(header)
    categories = extract_categories(header)

    date = extract_date(header)
    # Parse the human-readable date once and derive both calendar fields from it.
    dt = parser.parse(date)
    month = dt.strftime("%B")
    weekday = dt.strftime("%A")

    # Body Content
    body_text = content.text  # extracted once, reused for both text metrics
    word_count = len(body_text.split())
    reading_level = textstat.flesch_kincaid_grade(body_text)

    link_count = len(content.find_all("a"))
    image_count = len(content.find_all("img"))

    page_data = {
        'reading_time' : read_time,
        'title': title,
        'date': date,
        'month': month,
        'weekday': weekday,
        'author': author,
        'categories': categories,
        'word_count': word_count,
        'reading_level': reading_level,
        'link_count': link_count,
        'image_count': image_count
    }

    return page_data
    
def extract_read_time(header):
    """Return the article's read time as an integer.

    Takes the first child of the ``read-time`` element inside ``header`` and
    converts its first whitespace-separated token to ``int``.
    """
    badge = header.find(class_='read-time')
    label = badge.contents[0].strip().lower()
    first_token = label.split()[0]
    return int(first_token)

def extract_title(header):
    """Return the whitespace-stripped first child of the ``post-meta-title`` element."""
    title_node = header.find(class_='post-meta-title')
    return title_node.contents[0].strip()

def extract_date(header):
    """Return the whitespace-stripped first child of the ``single-post-date`` element."""
    date_node = header.find(class_='single-post-date')
    return date_node.contents[0].strip()

def extract_author(header):
    """Return the author's name from the header.

    The name is the whitespace-stripped first child of the first ``<a>`` tag
    found inside the ``author-name`` element.
    """
    author_block = header.find(class_='author-name')
    anchor = author_block.find('a')
    return anchor.contents[0].strip()

def extract_categories(header):
    """Return the article's category names as a list of lower-cased strings.

    One entry is produced per ``<a>`` tag inside the ``single-post-cat``
    element, taken from the tag's first child with surrounding whitespace
    removed.
    """
    cat_block = header.find(class_='single-post-cat')
    return [
        anchor.contents[0].strip().lower()
        for anchor in cat_block.findAll('a')
    ]

In [106]:
# Crawl each configured category; parse_category() follows the pagination and
# appends every parsed article to articles_store.
for category in categories:
    print('Parsing category', category)
    url = 'https://blog.frame.io/category/{}/'.format(category)
    parse_category(url)

Parsing category post-production
Parsing URL: https://blog.frame.io/2018/09/24/hevc-format-wars/
Parsing URL: https://blog.frame.io/2018/09/17/fcpx-shortcuts/
Parsing URL: https://blog.frame.io/2018/09/04/creating-video-for-blind-and-deaf/
Parsing URL: https://blog.frame.io/2018/08/27/deep-dive-fcpx-audio-tools/
Parsing URL: https://blog.frame.io/2018/07/23/tips-for-no-coverage/
Parsing URL: https://blog.frame.io/2018/06/11/adr-primer/
Parsing URL: https://blog.frame.io/2018/05/14/premiere-batch-syncing/
Parsing URL: https://blog.frame.io/2018/03/12/studying-6-editing-masterpieces/
Parsing URL: https://blog.frame.io/2018/02/19/avid-media-composer-troubleshooting-tips/
Parsing URL: https://blog.frame.io/2018/02/12/animated-masks-after-effects/
Parsing URL: https://blog.frame.io/2018/02/07/editorial-style/
Parsing URL: https://blog.frame.io/2018/01/31/fcpx-metadata/
Next page: https://blog.frame.io/category/post-production/page/2/
Parsing URL: https://blog.frame.io/2018/01/26/perfect-dia

Parsing URL: https://blog.frame.io/2018/03/05/oscar-2018-workflows/
Parsing URL: https://blog.frame.io/2018/01/17/remote-dailies-workflow/
Parsing URL: https://blog.frame.io/2017/10/04/turbo-charge-fcpx-workflow-davinci-resolve/
Parsing URL: https://blog.frame.io/2017/07/31/baby-driver-workflow/
Parsing URL: https://blog.frame.io/2017/05/15/sync-clips-in-davinci-resolve/
Parsing URL: https://blog.frame.io/2017/03/20/premiere-pro-proxies/
Parsing category behind-the-scenes
Parsing URL: https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/
Parsing URL: https://blog.frame.io/2018/09/10/made-in-frame-searching/
Parsing URL: https://blog.frame.io/2018/07/30/inside-mission-impossible-fallout/
Parsing URL: https://blog.frame.io/2018/07/16/made-in-frame-film-riot/
Parsing URL: https://blog.frame.io/2018/07/02/bts-hotel-artemis/
Parsing URL: https://blog.frame.io/2018/06/25/made-in-frame-hereditary/
Parsing URL: https://blog.frame.io/2018/06/18/making-incredibles-2/
Parsing 

In [107]:
# Number of article records collected so far (one per parsed article page).
len(articles_store)

191

In [108]:
# Show the first collected record to check the shape of the scraped data.
articles_store[0]

{'author': 'Seth Goldin',
 'categories': ['codecs', 'workflow'],
 'date': 'September 24, 2018',
 'image_count': 4,
 'link_count': 46,
 'month': 'September',
 'reading_level': 8.4,
 'reading_time': 11,
 'title': 'HEVC, VP9, and AV1: What You Need to Know About the Codec Wars',
 'weekday': 'Monday',
 'word_count': 2349}

In [109]:
import pickle

# Persist the scraped records. The context manager flushes and closes the file
# deterministically; a bare open() call would leave the handle to the garbage
# collector.
with open("articles.p", "wb") as pickle_file:
    pickle.dump(articles_store, pickle_file)

In [110]:
import json

# Serialize all collected article records to articles.json.
with open('articles.json', 'w') as f:
    f.write(json.dumps(articles_store))

# Summary Statistics to JSON

- Average reading time
- Count of posts by category
- Count of posts by author
- Posts by month
- Posts by weekday

In [111]:
from collections import Counter

In [112]:
# Summary Statistics
# Collect one value per article for each dimension, then aggregate once after
# collection (instead of rebuilding every Counter on each loop iteration).
times = [article['reading_time'] for article in articles_store]
months = [article['month'] for article in articles_store]
weekdays = [article['weekday'] for article in articles_store]
authors = [article['author'] for article in articles_store]

# An article can carry several categories, so flatten them into a single list.
# Kept under its own name so the `categories` slug list that drives the crawl
# is not overwritten, and the crawl cell stays re-runnable after this one.
category_names = [
    name
    for article in articles_store
    for name in article['categories']
]

# Average Reading Time, rounded to 2 decimals. None when no articles were
# collected, so the cells that follow still find the name defined.
average_time = round(sum(times) / float(len(times)), 2) if times else None

# Posts by Month
month_count = Counter(months)

# Posts by Weekday
weekday_count = Counter(weekdays)

# Count by Category
category_count = Counter(category_names)

# Count by Author
author_count = Counter(authors)

In [113]:
# Print every summary statistic, one labelled line each.
for label, value in (
    ("Average reading time:", average_time),
    ("Posts by month", month_count),
    ("Posts by weekday", weekday_count),
    ("Posts by category", category_count),
    ("Posts by author", author_count),
):
    print(label, value)

Average reading time: 10.2
Posts by month Counter({'August': 23, 'September': 20, 'June': 19, 'October': 18, 'May': 16, 'November': 16, 'July': 15, 'January': 15, 'March': 13, 'December': 13, 'April': 12, 'February': 11})
Posts by weekday Counter({'Monday': 87, 'Wednesday': 45, 'Tuesday': 24, 'Thursday': 22, 'Friday': 13})
Posts by category Counter({'editing': 39, 'announcement': 39, 'post-production': 23, 'career': 20, 'storytelling': 15, 'behind the scenes': 14, 'workflow': 11, 'codecs': 8, 'color correction': 7, 'business': 7, 'made in frame': 7, 'production': 5, 'sound': 3, 'cribs': 2, 'design': 2, 'technology': 1})
Posts by author Counter({'Emery Wells': 36, 'Mark Christiansen': 16, 'Ron Dawson': 15, 'Stephen Heleker': 12, 'Jason Boone': 10, 'Lisa McNamara': 9, 'Sofi Marshall': 8, 'David Kong': 8, 'Scott Strandberg': 7, 'Yuri Baranovsky': 6, 'Clara Lehmann': 5, 'T Payton': 5, 'Reuben Evans': 4, 'Seth Goldin': 3, 'Jeff Hinton': 3, 'Hilda Saffari': 3, 'Jarle Leirpoll': 3, 'Ryan Char

In [114]:
import json

# Headline numbers: the average reading time and the number of collected articles.
stats = {
    'reading_time': average_time,
    'num_articles': len(articles_store)
}

# Write each summary object to its own JSON file, in a fixed order.
for filename, payload in (
    ('stats.json', stats),
    ('weekday.json', weekday_count),
    ('month.json', month_count),
    ('category.json', category_count),
    ('author.json', author_count),
):
    with open(filename, 'w') as f:
        json.dump(payload, f)