# Goodreads Web Scraper

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import urllib.request
import Goodreads_helper_functions as good
import json

In [None]:
def get_book_urls(url, max_pages=19):
    """
    get_book_urls():
    Function collects the book-page links found on the paginated pages of a Goodreads list
    Params:
        url: address of the list, without the '?page=' query string
        max_pages: number of list pages to request, starting at page 1
                   (defaults to 19, the pages 1-19 this scraper has always read)
    Returns:
        List of absolute 'https://goodreads.com/...' URLs, one for every
        div tagged data-resource-type="Book" on the requested pages
    """
    urls = []

    for page_number in range(1, max_pages + 1):
        page_url = f'{url}?page={page_number}'

        # the context manager closes the HTTP response once the page is parsed
        with urllib.request.urlopen(page_url) as open_url:
            soup = bs(open_url, 'html.parser')

        # the first link inside each book div carries a site-relative href
        for book_div in soup.find_all('div', {"data-resource-type": "Book"}):
            urls.append('https://goodreads.com' + book_div.a['href'])

    return urls

In [None]:
# Goodreads "Best Books of <year>" list pages used as the crawl starting points.
url17='https://www.goodreads.com/list/show/107026.Best_Books_of_2017'
url18='https://www.goodreads.com/list/show/119307.Best_Books_of_2018'
# Gather the individual book-page URLs from each list (requests the list pages over the network).
urls2018 = get_book_urls(url18)
urls2017 = get_book_urls(url17)

In [None]:
def get_book_info(urls):
    """
    get_book_info():
    Function downloads every book page and extracts its features with the Goodreads helper functions
    Params:
        urls: iterable of book-page URLs
    Returns:
        List of dictionaries, one per URL, with the keys 'title', 'ISBN',
        'author', 'series', 'genre', 'rating', 'publish_date',
        'publish_company', 'number_of_pages' and 'format'
    """
    scraped = []

    for book_url in urls:
        response = requests.get(book_url)
        parsed = bs(response.content, 'html.parser')

        # the helpers are called in the same order as the keys are listed
        scraped.append({
            'title': good.get_title(parsed),
            'ISBN': good.get_ISBN(parsed),
            'author': good.get_author(parsed),
            'series': good.get_series(parsed),
            'genre': good.get_genre(parsed),
            'rating': good.get_rating(parsed),
            'publish_date': good.get_publish_date(parsed),
            'publish_company': good.get_publishing_company(parsed),
            'number_of_pages': good.get_pages(parsed),
            'format': good.get_format(parsed),
        })

    return scraped

In [None]:
# Scrape the details of every book gathered from the 2018 and 2017 lists.
# Despite the "dict" in the names, each result is a list of per-book dictionaries.
book_dict_2018 = get_book_info(urls2018)
book_dict_2017 = get_book_info(urls2017)

In [None]:
# Save each year's scraped books to its own JSON file in the working directory.
with open('Goodreads_books_2017.json','w') as book_file:
    json.dump(book_dict_2017, book_file)
with open('Goodreads_books_2018.json','w') as book_file:
    json.dump(book_dict_2018, book_file)

# NYT Web Scraper

In [None]:
import requests
import time
import config
import json
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import Goodreads_helper_functions as good


In [None]:
best_sellers = []

def get_books():
    """
    get_books():
    Function downloads the New York Times bestseller history and stores it in
    the module-level list best_sellers
    Params:
        None
    Returns:
        True once every page has been requested. The books themselves are
        appended to best_sellers as dictionaries with the keys 'title',
        'author', 'publisher', 'ISBN' and 'publish_date'
    """
    url = 'https://api.nytimes.com/svc/books/v3/lists/best-sellers/history.json'

    # offset the pages by multiples of 20
    for i in range(0, 32326, 20):
        params = {'api-key': config.NYT_api_key,
                  'offset': i}
        response = requests.get(url, params=params)
        data = response.json()

        for book in data['results']:
            try:
                # build a fresh dictionary per book; it must not be named
                # best_sellers, or it would hide the master list
                best_sellers_dict = {
                    'title': book['title'],
                    'author': book['author'],
                    'publisher': book['publisher'],
                    'ISBN': book['isbns'],
                    'publish_date': book['ranks_history'][0]['published_date'],
                }
            except IndexError:
                # ranks_history is empty, so there is no date to record
                continue

            # add the dictionary to master list
            best_sellers.append(best_sellers_dict)

        # print what page we are on for auditing purposes
        print(i)

        # wait so we don't hit the API's per minute call limit
        time.sleep(6)

    return True

In [None]:
# Actually invoke the download; a bare `get_books` only references the function
# and leaves best_sellers empty.
get_books()

In [None]:
def get_NYT_ISBNS(data, years=('2019', '2018', '2017')):
    """
    get_NYT_ISBNS():
    Function returns a list of ISBNs for the NYT bestsellers whose 'publish_date' mentions one of the given years
    Params:
        data: list of dictionaries describing each book from the NYT API call;
              each needs a 'publish_date' string and an 'ISBN' list whose
              entries are dictionaries holding an 'isbn13' value
        years: year strings to keep (defaults to 2017 through 2019)
    Returns:
        List with the first ISBN-13 of each matching book; books whose
        'ISBN' list is empty are skipped
    """
    ISBNs = []
    for book in data:
        publish_date = book['publish_date']
        # only keep the book if its date contains one of the wanted years
        if not any(year in publish_date for year in years):
            continue
        try:
            ISBNs.append(book['ISBN'][0]['isbn13'])
        except IndexError:
            # no ISBN recorded for this book
            continue
    return ISBNs

# Pull the ISBNs out of the module-level best_sellers list.
ISBNs = get_NYT_ISBNS(best_sellers)


In [None]:
# Start the Selenium-controlled Chrome browser that get_NYTbook_info drives.
driver_options = webdriver.chrome.options.Options()
# Chrome preference handed to the browser through the "prefs" experimental option;
# presumably the value 2 blocks image loading to speed up page loads — TODO confirm
img = {"profile.managed_default_content_settings.images": 2}
driver_options.add_experimental_option("prefs", img)
driver = webdriver.Chrome(options=driver_options)

In [None]:
NYT_books = []

def get_NYTbook_info(ISBNs):
    """
    get_NYTbook_info():
    Function looks up each NYT bestseller on Goodreads and stores its features
    in the module-level list NYT_books
    Params:
        ISBNs: list of ISBNs for each NYT bestseller
    Returns:
        True once every ISBN has been processed. The scraped books are
        appended to NYT_books as dictionaries with the keys 'title', 'ISBN',
        'author', 'series', 'genre', 'rating', 'publish_date',
        'publish_company', 'number_of_pages' and 'format'
    """
    search_box_xpath = '/html/body/div[4]/main/div[1]/section[1]/div/div/footer/div[1]/div/form/input'

    for ISBN in ISBNs:

        # open the goodreads home page, which holds the search box
        driver.get('https://www.goodreads.com/')

        # type the ISBN into the search box and submit it.
        # find_element('xpath', ...) is used because the find_element_by_*
        # shortcuts no longer exist in current Selenium releases; calling one
        # raised AttributeError, which the handler below silently swallowed.
        # This call stays outside the try block so a driver problem is not
        # mistaken for a page that merely lacks book details.
        driver.find_element('xpath', search_box_xpath).send_keys(ISBN, Keys.ENTER)
        time.sleep(2)

        # grab the current url to scrape
        url = driver.current_url

        try:
            # Beautiful soup to scrape each book page for features
            html = requests.get(url)
            soup = bs(html.content, 'html.parser')

            book = {}

            book['title'] = good.get_title(soup)
            book['ISBN'] = ISBN
            book['author'] = good.get_author(soup)
            book['series'] = good.get_series(soup)
            book['genre'] = good.get_genre(soup)
            book['rating'] = good.get_rating(soup)
            book['publish_date'] = good.get_publish_date(soup)
            book['publish_company'] = good.get_publishing_company(soup)
            book['number_of_pages'] = good.get_pages(soup)
            book['format'] = good.get_format(soup)

            NYT_books.append(book)

        except AttributeError:
            # a helper could not find its element, so the page is skipped
            continue

        time.sleep(2)

    return True

In [None]:
# Look up every collected ISBN on Goodreads; the results accumulate in NYT_books.
get_NYTbook_info(ISBNs)

# Helper functions

In [None]:
def get_title(soup):
    """Return the text of the h1 with id 'bookTitle'.

    Newlines are turned into spaces and every pair of consecutive spaces is
    then removed. Raises AttributeError when the heading is not on the page.
    """
    heading = soup.find('h1', {'id': 'bookTitle'})
    flattened = heading.get_text().replace("\n", " ")
    return flattened.replace("  ", "")

def get_ISBN(soup):
    """Return the parenthesised ISBN13 from the second 'infoBoxRowItem' div.

    The div text is stripped of newlines and spaces, the part after the first
    "(" is kept, and the ")" and "ISBN13:" markers are removed. Returns ""
    when the page has fewer than two such divs or the text has no "(".
    """
    try:
        info_row = soup.find_all('div', class_='infoBoxRowItem')[1]
        # newline -> space -> removed is the same as dropping both outright
        compact = info_row.get_text().replace("\n", "").replace(" ", "")
        in_parens = compact.split("(")[1]
    except IndexError:
        return ""
    return in_parens.replace(")", "").replace("ISBN13:", "")

def get_author(soup):
    """Return the text of the first span whose itemprop is 'name'.

    Raises AttributeError when no such span is on the page.
    """
    return soup.find('span', itemprop='name').get_text()

def get_series(soup):
    """Return the series label taken from the first 'greyText' link.

    Newlines become spaces, every pair of consecutive spaces is removed and
    the surrounding parentheses are dropped. Returns "" when the page has no
    such link, matching the other optional-field helpers (get_genre,
    get_pages, get_format), so that a book without a series no longer raises
    AttributeError and aborts or skips the scrape.
    """
    try:
        series = soup.find('a', class_='greyText').get_text()
    except AttributeError:
        # find() returned None: the book does not belong to a series
        return ""
    return series.replace("\n", " ").replace("  ", "").replace("(", "").replace(")", "")

def get_genre(soup):
    """Return the genre slug from the first genre link's href.

    The "/genres/" prefix is removed from the href. Returns "" when the page
    has no 'actionLinkLite bookPageGenreLink' link.
    """
    try:
        genre_link = soup.find('a', class_='actionLinkLite bookPageGenreLink')
        # subscripting None (no link found) raises the TypeError handled below
        return genre_link['href'].replace("/genres/", "")
    except TypeError:
        return ""

def get_rating(soup):
    """Return the text of the 'ratingValue' span with newlines and spaces removed.

    The value stays a string. Raises AttributeError when the span is missing.
    """
    rating_span = soup.find('span', itemprop='ratingValue')
    raw_rating = rating_span.get_text()
    return raw_rating.replace("\n", " ").replace(" ", "")

def get_publish_date(soup):
    """Return the publication date taken from the second 'row' div.

    The div text is flattened (newlines to spaces, pairs of spaces removed),
    cut at the first "by", and the ' Published ' label is removed. Returns ""
    when the page has fewer than two 'row' divs.
    """
    try:
        details = soup.find_all('div', class_='row')[1].get_text()
    except IndexError:
        return ""
    flattened = details.replace("\n", " ").replace("  ", "")
    before_publisher = flattened.split('by')[0]
    return before_publisher.replace(' Published ', "")

def get_publishing_company(soup):
    """Return the publisher named after "by " in the second 'row' div.

    The div text is flattened (newlines to spaces, pairs of spaces removed)
    and everything after the first "by " is returned. Returns "" when the
    page has fewer than two 'row' divs or the text contains no "by ".
    """
    try:
        company = soup.find_all('div', class_='row')[1].get_text()
        flattened = company.replace("\n", " ").replace("  ", "")
        # split only once: a publisher whose own name contains "by "
        # (e.g. "Ruby Press") must not be cut off at that inner match
        clean_company = flattened.split('by ', 1)[1]
    except IndexError:
        clean_company = ""
    return clean_company

def get_pages(soup):
    """Return the first whitespace-separated token of the 'numberOfPages' span.

    The value stays a string. Returns "" when the span is not on the page.
    """
    try:
        page_text = soup.find('span', itemprop='numberOfPages').get_text()
        return page_text.split()[0]
    except AttributeError:
        return ""

def get_format(soup):
    """Return the text of the 'bookFormat' span, or "" when the page has none."""
    try:
        return soup.find('span', itemprop='bookFormat').get_text()
    except AttributeError:
        return ""