# NYT API calls & Goodreads scraping for features

In [64]:
import requests
import time
import config
import json
from bs4 import BeautifulSoup as bs
!pip install selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import Goodreads_helper_functions as good

## Call NYT API for best sellers

In [None]:
# this function calls the NYT API for all NYT bestsellers from 2017 until now
def get_books(years=('2017', '2018', '2019')):
    """Page through the NYT best-sellers history endpoint and collect books.

    A book is kept when the ``published_date`` of its first
    ``ranks_history`` entry contains any of the strings in *years*.

    Args:
        years: year strings to match against ``published_date``.

    Returns:
        A list of dicts, one per kept book, with the keys ``'title'``,
        ``'author'``, ``'publisher'`` and ``'ISBN'`` (the raw ``isbns``
        value from the API response).
    """
    url = 'https://api.nytimes.com/svc/books/v3/lists/best-sellers/history.json'
    all_best_sellers = []

    # offset the pages by multiples of 20
    # NOTE(review): 32326 is hard-coded; presumably the total number of
    # results at the time of writing -- confirm before re-running
    for i in range(0, 32326, 20):
        params = {'api-key': config.NYT_api_key,
                  'offset': i}
        response = requests.get(url, params=params)
        data = response.json()

        for book in data['results']:
            try:
                published_date = book['ranks_history'][0]['published_date']
            except IndexError:
                # if we don't know when the book was on a best seller list, skip it
                continue

            # only include books that made it on a list in one of `years`.
            # `('2017' or '2018' or '2019') in s` only ever tests '2017',
            # so every year has to be checked explicitly.
            if any(year in published_date for year in years):
                # build a fresh dict per book; reusing one dict would make
                # every appended entry point at the same object
                all_best_sellers.append({
                    'title': book['title'],
                    'author': book['author'],
                    'publisher': book['publisher'],
                    'ISBN': book['isbns'],
                })

        # print what page we are on
        print(i)

        # wait so we don't hit per minute call limit
        time.sleep(6)

    return all_best_sellers

In [None]:
# run the full NYT API crawl; get_books pauses 6 seconds after each of its
# ~1,600 requests, so this cell takes a long time to finish
NYT_books = get_books()

In [None]:
# save the API results to disk so the crawl does not have to be repeated
import json
with open('NYT_books.json','w') as datafile:
    json.dump(NYT_books, datafile)

In [9]:
# reload the saved API results from NYT_books.json
with open("NYT_books.json") as datafile:
    NYT_data = json.load(datafile)

In [22]:
# display the loaded records to check their structure
NYT_data

[{'title': "------, THAT'S DELICIOUS",
  'author': 'Action Bronson with Rachel Wharton',
  'publisher': 'Abrams',
  'ISBN': [{'isbn10': '1419726552', 'isbn13': '9781419726552'}]},
 {'title': '10-DAY GREEN SMOOTHIE CLEANSE',
  'author': 'J J Smith',
  'publisher': 'Atria',
  'ISBN': [{'isbn10': '0982301820', 'isbn13': '9780982301821'},
   {'isbn10': '1501100114', 'isbn13': '9781501100116'},
   {'isbn10': '1501100106', 'isbn13': '9781501100109'}]},
 {'title': '15TH AFFAIR',
  'author': 'James Patterson and Maxine Paetro',
  'publisher': 'Grand Central',
  'ISBN': [{'isbn10': '0316407070', 'isbn13': '9780316407076'},
   {'isbn10': '0316290033', 'isbn13': '9780316290036'},
   {'isbn10': '031629005X', 'isbn13': '9780316290050'},
   {'isbn10': '1455585270', 'isbn13': '9781455585274'}]},
 {'title': '42 FAITH',
  'author': 'Ed Henry',
  'publisher': 'Thomas Nelson',
  'ISBN': [{'isbn10': '0718088808', 'isbn13': '9780718088804'},
   {'isbn10': '0718089057', 'isbn13': '9780718089054'}]},
 {'titl

In [69]:
# collect one ISBN-13 per book from the NYT data loaded above
def get_NYT_ISBNS(data):
    """Return the ``isbn13`` of the first ``'ISBN'`` entry of each book.

    Books for which that lookup raises ``IndexError`` (for example an
    empty ``'ISBN'`` list) are left out of the result.
    """
    first_isbn13s = []
    for entry in data:
        try:
            isbn13 = entry['ISBN'][0]['isbn13']
        except IndexError:
            # nothing listed for this book, so there is nothing to collect
            continue
        else:
            first_isbn13s.append(isbn13)
    return first_isbn13s

In [70]:
# take the first ISBN-13 of every saved NYT book; these are the values
# typed into the Goodreads search box further down
ISBNs = get_NYT_ISBNS(NYT_data)

## Scrape Goodreads for data on NYT bestsellers

In [90]:
# set preferences for selenium
driver_options = webdriver.chrome.options.Options()
# presumably the value 2 tells Chrome to block image loading -- TODO confirm
prefs = {"profile.managed_default_content_settings.images": 2}
driver_options.add_experimental_option("prefs", prefs)
# start a Chrome session with these options; `driver` is shared by the cells below
driver = webdriver.Chrome(options=driver_options)

In [91]:
# log in to goodreads website
# NOTE(review): nothing in this cell navigates anywhere; presumably the
# sign-in form is already open in the browser when this runs -- confirm
# find_element(By.ID, ...) replaces find_element_by_id, which was removed
# from Selenium in 4.3; the By form works on both old and new versions
driver.find_element(By.ID, 'userSignInFormEmail').send_keys(config.email_address)
# typing ENTER after the password submits the form
driver.find_element(By.ID, 'user_password').send_keys(config.pw, Keys.ENTER)
# give the page a moment to load after submitting
time.sleep(2)

In [100]:
# this closes open browser
# quit() ends the whole WebDriver session (all windows plus the chromedriver
# process); close() only closes the current window and leaves the session behind
driver.quit()

InvalidSessionIdException: Message: invalid session id


In [92]:
# this function takes in a list of ISBNs that were gathered from the NYT API
# and looks each one up on Goodreads; one dictionary of data per bestseller
# is appended to the module-level list below

list_of_NYT_books = []

def get_NYTbook_info(ISBNs):
    """Scrape Goodreads for every ISBN in *ISBNs* not scraped already.

    Each ISBN is typed into the search box on the Goodreads home page with
    the shared selenium ``driver``; the page the browser lands on is then
    fetched with ``requests`` and parsed with the ``good`` helper functions.
    One dict per book is appended to ``list_of_NYT_books``.

    ISBNs that raise ``AttributeError`` while being processed are skipped.

    Args:
        ISBNs: iterable of ISBN strings to look up.

    Returns:
        True once every ISBN has been processed. The scraped data is in
        ``list_of_NYT_books``, not in the return value.
    """
    # ISBNs already present in the results. The list holds dicts, so a plain
    # `ISBN not in list_of_NYT_books` can never match; compare against the
    # stored 'ISBN' values instead so a re-run skips finished work.
    scraped_isbns = {book.get('ISBN') for book in list_of_NYT_books}

    for ISBN in ISBNs:
        if ISBN in scraped_isbns:
            continue

        # selenium
        driver.get('https://www.goodreads.com/')
        time.sleep(2)
        try:
            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which was removed from Selenium in 4.3
            search_box = driver.find_element(
                By.XPATH,
                '/html/body/div[4]/main/div[1]/section[1]/div/div/footer/div[1]/div/form/input')
            search_box.send_keys(ISBN, Keys.ENTER)
            time.sleep(2)

            url = driver.current_url

            # Beautiful soup
            html_page = requests.get(url)
            soup = bs(html_page.content, 'html.parser')

            book_dict = {}

            book_dict['title'] = good.get_title(soup)
            book_dict['ISBN'] = ISBN
            book_dict['author'] = good.get_author(soup)
            book_dict['series'] = good.get_series(soup)
            book_dict['genre'] = good.get_genre(soup)
            book_dict['rating'] = good.get_rating(soup)
            book_dict['publish_date'] = good.get_publish_date(soup)
            book_dict['publish_company'] = good.get_publishing_company(soup)
            book_dict['number_of_pages'] = good.get_pages(soup)
            book_dict['format'] = good.get_format(soup)

            list_of_NYT_books.append(book_dict)
            scraped_isbns.add(ISBN)

        except AttributeError:
            # presumably raised by a helper when the page is not a book
            # page -- confirm; either way this ISBN is skipped
            continue
        time.sleep(2)

    return True

In [93]:
# call the function to get NYT bestseller information from goodreads website;
# the scraped records accumulate in list_of_NYT_books (the call itself only
# returns True)
get_NYTbook_info(ISBNs)

In [98]:
# save the scraped Goodreads records to list_of_NYT_books.json
with open('list_of_NYT_books.json','w') as datafile:
    json.dump(list_of_NYT_books, datafile)