# NYT API calls & Goodreads scraping for features

This notebook uses the NYT API to collect a set of NYT bestsellers between 2017 and the present. 
It also scrapes Goodreads for features of the collected NYT bestsellers.

In [None]:
import requests
import time
import config
import json
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import Goodreads_helper_functions as good

## Call NYT API for best sellers

In [None]:
# The accumulator is a module-level list on purpose: if the API returns an
# error part-way through the download, the records gathered so far are still
# available in all_best_sellers instead of being lost with the function call.

all_best_sellers = []

def get_books():
    """
    get_books():
    Function downloads the New York Times best-seller history and appends one
    dictionary per book to the global list all_best_sellers
    Params:
        None
    Returns:
        True once every page has been requested. The book dictionaries
        (keys: title, author, publisher, ISBN, publish_date) are collected
        in all_best_sellers rather than returned.
    Notes:
        Books with an empty ranks_history are skipped. Any other error
        (e.g. a response without 'results') propagates to the caller; the
        records collected up to that point remain in all_best_sellers.
    """
    url = 'https://api.nytimes.com/svc/books/v3/lists/best-sellers/history.json'

    # offset the pages by multiples of 20
    for i in range(0, 32326, 20):
        params = {'api-key': config.NYT_api_key,
                  'offset': i}
        response = requests.get(url, params=params)
        data = response.json()

        for book in data['results']:
            try:
                # Build a brand-new dict for every book. Re-using one dict
                # for the whole page would make every appended entry point at
                # the same object, so all of them would end up showing the
                # last book of the page.
                best_seller = {
                    'title': book['title'],
                    'author': book['author'],
                    'publisher': book['publisher'],
                    'ISBN': book['isbns'],
                    'publish_date': book['ranks_history'][0]['published_date'],
                }
            except IndexError:
                # no ranks_history entry, so there is no publish date to record
                continue

            # add the dictionary to master list
            all_best_sellers.append(best_seller)

        # print what page we are on for auditing purposes
        print(i)

        # wait so we don't hit the API's per minute call limit
        time.sleep(6)

    return True

In [None]:
# run the download; get_books() fills the global all_best_sellers list
# and returns True when it has gone through every page
get_books()

In [None]:
# back up the collected best sellers to disk as JSON
with open('NYT_books.json', mode='w') as datafile:
    json.dump(all_best_sellers, fp=datafile)

In [None]:
# this function retrieves the ISBNs from the list returned by the API call above
def get_NYT_ISBNS(data, years=('2019', '2018', '2017')):
    """
    get_NYT_ISBNS():
    Function returns a list of ISBNs for the NYT bestsellers whose publish
    date falls in one of the requested years
    Params:
        data: list of dictionaries describing each book from the NYT API call;
              each one needs a 'publish_date' string and an 'ISBN' list of
              dictionaries that carry an 'isbn13' key
        years: year strings to keep (a single string or an iterable of
               strings); defaults to 2017 through 2019
    Returns:
        List holding the first isbn13 of every matching NYT bestseller, in
        the order the books appear in data. Books without any ISBN entry
        are skipped.
    """
    # accept a lone year string, and materialise generators so the years can
    # be re-checked for every book
    if isinstance(years, str):
        years = (years,)
    else:
        years = tuple(years)

    ISBNs = []
    for book in data:
        # only keep the book if its publish date mentions a requested year
        if not any(year in book['publish_date'] for year in years):
            continue
        try:
            # the first ISBN entry is the one recorded for the book
            ISBNs.append(book['ISBN'][0]['isbn13'])
        except IndexError:
            # the API returned an empty ISBN list for this book
            continue
    return ISBNs

In [None]:
# pull the ISBNs of the year-filtered best sellers out of the data
# collected from the NYT API
ISBNs = get_NYT_ISBNS(all_best_sellers)

## Scrape Goodreads for data on NYT bestsellers

In [None]:
# Chrome preferences for the selenium session. The image content setting is
# set to 2 (presumably "do not load images", to speed up page loads -- TODO confirm).
prefs = {"profile.managed_default_content_settings.images": 2}

# build the options object, attach the preferences, then start the browser
driver_options = webdriver.ChromeOptions()
driver_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=driver_options)

In [None]:
# Accumulator for the scraped Goodreads records. It lives at module level, so
# whatever has been gathered stays available after get_NYTbook_info() runs
# (the function itself only returns True).

list_of_NYT_books = []

def get_NYTbook_info(ISBNs):
    """
    get_NYTbook_info():
    Function looks up each ISBN on Goodreads and appends a dictionary of
    scraped features to the global list list_of_NYT_books
    Params:
        ISBNs: list of ISBNs for each NYT bestseller
    Returns:
        True once every ISBN has been attempted. The feature dictionaries
        (title, ISBN, author, series, genre, rating, publish_date,
        publish_company, number_of_pages, format) are collected in
        list_of_NYT_books rather than returned.
    """
    # location of the search input on the page shown after signing in
    search_box_xpath = '/html/body/div[4]/main/div[1]/section[1]/div/div/footer/div[1]/div/form/input'

    for ISBN in ISBNs:

        # every iteration starts from the Goodreads home page and signs in
        # NOTE(review): the sign-in is repeated for each ISBN -- confirm the
        # sign-in form is still present once the session is authenticated
        driver.get('https://www.goodreads.com/')

        # give the page time to load before touching the form
        time.sleep(2)
        email_field = driver.find_element_by_id('userSignInFormEmail')
        email_field.send_keys(config.email_address)
        password_field = driver.find_element_by_id('user_password')
        password_field.send_keys(config.pw, Keys.ENTER)
        time.sleep(2)

        try:
            # search for the ISBN; the browser lands on the matching page
            search_box = driver.find_element_by_xpath(search_box_xpath)
            search_box.send_keys(ISBN, Keys.ENTER)
            time.sleep(2)

            # fetch the page the browser ended up on and parse it with
            # Beautiful Soup so the helper functions can extract the features
            book_page = requests.get(driver.current_url)
            soup = bs(book_page.content, 'html.parser')

            book_dict = {
                'title': good.get_title(soup),
                'ISBN': ISBN,
                'author': good.get_author(soup),
                'series': good.get_series(soup),
                'genre': good.get_genre(soup),
                'rating': good.get_rating(soup),
                'publish_date': good.get_publish_date(soup),
                'publish_company': good.get_publishing_company(soup),
                'number_of_pages': good.get_pages(soup),
                'format': good.get_format(soup),
            }
        except AttributeError:
            # presumably raised when an expected element is missing from the
            # page; the book is skipped (verify against the helper module)
            continue
        else:
            list_of_NYT_books.append(book_dict)

        # short pause before moving on to the next ISBN
        time.sleep(2)

    return True

In [None]:
# scrape Goodreads for every collected ISBN; the results accumulate in the
# global list_of_NYT_books
get_NYTbook_info(ISBNs)

In [None]:
# write the scraped Goodreads records to disk as JSON
with open('list_of_NYT_books.json', mode='w') as datafile:
    json.dump(list_of_NYT_books, fp=datafile)