# Scraping Goodreads

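This notebook scrapes Goodreads' "Best Books of 2017" and "Best Books of 2018" lists, collects a set of details for every book on those lists, and saves the results as JSON files.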

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import urllib.request
import Goodreads_helper_functions as good
import json

In [None]:
def get_book_urls(url, max_pages=19):
    """
    get_book_urls(url, max_pages=19):
    Gets the url for each book's respective website on a Goodreads' list of best books
    Params:
        url: Goodreads' url which has a list of links for top books in a given year
        max_pages: number of list pages to request, starting from page 1
                   (defaults to 19, the page count that used to be hard-coded)
    Returns:
        list of urls for each book, in the order the books appear on the pages
    """
    urls = []

    # go through each page (requested as ?page=1 ... ?page=max_pages)
    for page in range(1, max_pages + 1):
        page_url = str(url) + f'?page={page}'

        # the context manager closes the HTTP response as soon as it is parsed,
        # so connections are not left open while the remaining pages are fetched
        with urllib.request.urlopen(page_url) as response:
            soup = bs(response, 'html.parser')

        # iterate through each book on the page and grab its url
        for book in soup.find_all('div', {"data-resource-type": "Book"}):
            link = book.a
            href = link.get('href') if link is not None else None
            if href is None:
                # an entry without a link cannot be scraped later; skip it
                # rather than aborting the whole run
                continue
            urls.append('https://goodreads.com' + href)

    return urls

In [None]:
# Goodreads "Best Books of <year>" list pages that serve as the master lists
best_of_2018_list = 'https://www.goodreads.com/list/show/119307.Best_Books_of_2018'
best_of_2017_list = 'https://www.goodreads.com/list/show/107026.Best_Books_of_2017'

# collect the individual book page urls from each master list
urls_2018 = get_book_urls(url=best_of_2018_list)
urls_2017 = get_book_urls(url=best_of_2017_list)

In [None]:
def get_book_info(urls):
    """
    get_book_info(urls):
    Scrapes Goodreads' for a set of features for each book
    Params:
        urls: List of urls for each book to scrape
    Returns:
        list of dictionaries containing information for each book, one per url
        and in the same order as urls. Every dictionary has the keys 'title',
        'ISBN', 'author', 'series', 'genre', 'rating', 'publish_date',
        'publish_company', 'number_of_pages' and 'format'; the values are
        whatever the matching Goodreads_helper_functions helper returns.
    """
    list_of_books = []

    # iterate over the list of urls
    for url in urls:
        html_page = requests.get(url)
        soup = bs(html_page.content, 'html.parser')

        # grab a bunch of information for each book and append to the list;
        # a dict literal is evaluated top to bottom, so the helpers are still
        # called in the same order as before
        list_of_books.append({
            'title': good.get_title(soup),
            'ISBN': good.get_ISBN(soup),
            'author': good.get_author(soup),
            'series': good.get_series(soup),
            'genre': good.get_genre(soup),
            'rating': good.get_rating(soup),
            'publish_date': good.get_publish_date(soup),
            'publish_company': good.get_publishing_company(soup),
            'number_of_pages': good.get_pages(soup),
            'format': good.get_format(soup),
        })

    return list_of_books

In [None]:
# scrape the details of every book url gathered above, one year at a time
list_of_book_dicts_2018 = get_book_info(urls=urls_2018)
list_of_book_dicts_2017 = get_book_info(urls=urls_2017)

In [None]:
# write each year's scraped records to its own JSON file (2017 first, then 2018)
for json_name, book_dicts in (
    ('Goodreads_books_2017.json', list_of_book_dicts_2017),
    ('Goodreads_books_2018.json', list_of_book_dicts_2018),
):
    with open(json_name, 'w') as book_file:
        json.dump(book_dicts, book_file)