In [1]:
"""Required imports"""
import json
import urllib

from bs4 import BeautifulSoup

In [2]:
# Goodreads shelf pages are rendered mostly by JavaScript, so we load them in a real browser (Selenium) instead of fetching the raw HTML.
from contextlib import closing
from selenium.webdriver import Chrome # pip install selenium
from selenium.webdriver.support.ui import WebDriverWait

In [122]:
class Book(object):
    '''
    A Goodreads book, identified by its Goodreads id.

    Two books compare equal (and hash alike) when their ids match,
    regardless of title or author, so the same book scraped from
    different shelf pages collapses to a single entry in a set.

    Attributes:
        title: the title with every non-ASCII character dropped.
        author: the object passed in as `author`; `__str__` reads
            its `name` attribute.
        id: the Goodreads id (last segment of the book url).
        href: link to the book's Goodreads page.
    '''
    def __init__(self, title, author, gr_id):
        # 'ignore' silently discards characters that are not ASCII.
        self.title = title.encode('ascii', 'ignore')
        self.author = author
        self.id = gr_id
        self.href = 'https://www.goodreads.com/book/show/%s' % self.id

    def __eq__(self, other):
        # Defer to Python's default comparison for foreign types rather
        # than raising AttributeError on a missing `.id` (e.g. `== None`).
        if not isinstance(other, Book):
            return NotImplemented
        return self.id == other.id

    def __ne__(self, other):
        # Python 2 does not derive `!=` from `__eq__`; without this,
        # two equal books would still report `a != b` as True.
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        return not equal

    def __str__(self):
        return "%s by %s" % (self.title, self.author.name)

    def __hash__(self):
        # Hash on the same key used by __eq__ so sets/dicts stay consistent.
        return hash(str(self.id))

    def __repr__(self):
        return "%s: %s" % (str(self), self.href)
        
class Author(object):
    '''
    A Goodreads author, identified by their Goodreads id.

    Two authors compare equal (and hash alike) when their ids match,
    regardless of how the name was spelled on the page.

    Attributes:
        name: the name with every non-ASCII character dropped.
        id: the Goodreads id (last segment of the author url).
        href: link to the author's Goodreads page.
    '''
    def __init__(self, name, gr_id):
        # 'ignore' silently discards characters that are not ASCII.
        self.name = name.encode('ascii', 'ignore')
        self.id = gr_id
        self.href = 'https://www.goodreads.com/author/show/%s' % self.id

    def __eq__(self, other):
        # Defer to Python's default comparison for foreign types rather
        # than raising AttributeError on a missing `.id` (e.g. `== None`).
        if not isinstance(other, Author):
            return NotImplemented
        return self.id == other.id

    def __ne__(self, other):
        # Python 2 does not derive `!=` from `__eq__`; without this,
        # two equal authors would still report `a != b` as True.
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        return not equal

    def __str__(self):
        return self.name

    def __hash__(self):
        # Hash on the same key used by __eq__ so sets/dicts stay consistent.
        return hash(str(self.id))

    def __repr__(self):
        return "%s: %s" % (str(self), self.href)

In [123]:
def get_raw_html(browser, shelf, page=1):
    '''
    Fetches the raw html for the page corresponding to `shelf`
    and `page` number.

    Args:
        browser: an object exposing `get(url)` and `page_source`
            (a Selenium webdriver in this notebook).
        shelf: the shelf name; it is percent-encoded before being
            placed in the url path, so names made only of letters,
            digits, '-', '_' and '.' produce the same url as before.
        page: page number within the shelf listing.

    Returns:
        `browser.page_source` after navigating to the shelf page.
    '''
    # Local import so the same code resolves on Python 2 and Python 3.
    try:
        from urllib import quote, urlencode
    except ImportError:
        from urllib.parse import quote, urlencode

    service_url = 'https://www.goodreads.com/shelf/show/'
    # Python 2's quote() cannot handle non-ASCII unicode text, so hand
    # it UTF-8 bytes; Python 3's quote() accepts bytes as well.
    if not isinstance(shelf, bytes):
        shelf = shelf.encode('utf-8')
    # safe='' also escapes '/', which would otherwise change the path.
    url = service_url + quote(shelf, safe='') + '?' + urlencode({'page': page})
    browser.get(url)
    return browser.page_source

In [124]:
def get_shelf(browser, term, n = 100):
    '''
    Given a browser and a shelf name, returns a set of `Book`s
    as determined by Good Reads.

    Pages are fetched in order, starting at page 1, until at least
    `n` distinct books have been collected or a page contributes no
    book that was not already seen. The result may therefore hold
    more than `n` books (the last page is kept whole) or fewer than
    `n` (the shelf ran out).

    Args:
        browser: browser object handed through to `get_raw_html`.
        term: the shelf name.
        n: number of distinct books to aim for.

    Returns:
        A set of `Book` objects.
    '''
    page = 1
    books = set()
    while len(books) < n:
        html = get_raw_html(browser, term, page)
        count_before = len(books)
        books |= get_books(html)
        # A page that adds nothing new means the listing is exhausted;
        # without this check the loop would request pages forever.
        if len(books) == count_before:
            break
        page += 1
    return books

In [125]:
def extract_author(anchor):
    '''
    Builds an Author from an .authorName anchor tag.

    The Goodreads id is whatever follows the last '/' in the anchor's
    href; the name is the first child of the anchor's first <span>.
    '''
    author_id = anchor['href'].rsplit('/', 1)[-1]
    name_span = anchor.find('span')
    return Author(name=name_span.contents[0], gr_id=author_id)

def extract_book(div):
    '''
    Builds a Book from the .elementList div describing one book.

    The title and Goodreads id come from the div's a.bookTitle link
    (first child and last href segment, respectively); the author
    comes from its a.authorName link.
    '''
    title_link = div.find('a', attrs={'class': 'bookTitle'})
    book_title = title_link.contents[0]
    book_id = title_link['href'].rsplit('/', 1)[-1]
    author_link = div.find('a', attrs={'class': 'authorName'})
    return Book(
        title=book_title,
        author=extract_author(author_link),
        gr_id=book_id,
    )

In [126]:
def get_books(html):
    '''
    Parses raw page html and returns the set of Books listed in it.

    Only .elementList divs nested inside the page's div.leftContainer
    are considered; each one is converted with `extract_book`.
    '''
    document = BeautifulSoup(html, 'html.parser')
    container = document.find('div', attrs={'class': 'leftContainer'})
    entries = container.find_all('div', attrs={'class': 'elementList'})
    return {extract_book(entry) for entry in entries}

In [127]:
def login(browser, email, password):
    '''
    Signs in to GoodReads through its sign-in page so that more
    data is accessible from `browser` afterwards.
    '''
    browser.get('https://www.goodreads.com/user/sign_in')
    # Fill in the email field first, then the password field.
    form_values = (('user_email', email), ('user_password', password))
    for field_id, value in form_values:
        browser.find_element_by_id(field_id).send_keys(value)
    # Click the element named 'next' (presumably the form's submit button).
    browser.find_element_by_name('next').click()

In [137]:
def find_books(tags, n = 500, email = None, password = None):
    '''
    Given a list of tags, finds their intersection by looking
    at the top n books in each tag and intersecting the resulting
    sets.

    Args:
        tags: iterable of shelf names.
        n: how many books to collect per tag.
        email: Goodreads account email; defaults to the
            GOODREADS_EMAIL environment variable.
        password: Goodreads account password; defaults to the
            GOODREADS_PASSWORD environment variable.

    Returns:
        A set of Book objects (empty when `tags` is empty). It may be
        helpful to run something like:
        for book in books:
            print book

    Raises:
        ValueError: if no credentials were passed and the environment
            variables are not set.
    '''
    import os

    tags = list(tags)
    # Nothing to intersect; also avoids launching a browser for no work
    # and the TypeError set.intersection() raises with no arguments.
    if not tags:
        return set()

    # Credentials must never live in the notebook source.
    email = email or os.environ.get('GOODREADS_EMAIL')
    password = password or os.environ.get('GOODREADS_PASSWORD')
    if not email or not password:
        raise ValueError(
            'Goodreads credentials missing: pass email/password or set '
            'GOODREADS_EMAIL and GOODREADS_PASSWORD.')

    books_by_tag = []
    browser = Chrome('./chromedriver')
    try:
        login(browser, email, password)
        for tag in tags:
            books_by_tag.append(get_shelf(browser, tag, n = n))
            print("Finished collecting data for tag %s." % tag)
    finally:
        # quit() ends the driver session; close() would only close
        # the current window.
        browser.quit()
    return set.intersection(*books_by_tag)

In [None]:
# Books that appear on all four shelves, asking for n = 10000 books per shelf.
books = find_books(['strong-heroine', 'female-lead', 'fantasy', 'kindle-unlimited'], n = 10000)

In [142]:
# Show each matching book's Goodreads link next to its author.
for book in books:
    line = book.href + " author: " + str(book.author)
    print(line)

https://www.goodreads.com/book/show/6148028-catching-fire author: Suzanne Collins
https://www.goodreads.com/book/show/7260188-mockingjay author: Suzanne Collins
https://www.goodreads.com/book/show/2767052-the-hunger-games author: Suzanne Collins
https://www.goodreads.com/book/show/15863832-angelfall author: Susan Ee


In [None]:
# Goodreads data pulling over plain HTTP (no browser).
def get_shelf(shelf_name):
    '''
    Fetches the raw html of a Goodreads shelf page over plain HTTP.

    NOTE: this reuses the name `get_shelf`. Running this cell replaces
    the browser-based `get_shelf(browser, term, n)` defined earlier in
    the notebook, which `find_books` calls.

    Args:
        shelf_name: the shelf name, appended to the shelf url path.

    Returns:
        The response body as returned by `read()`.
    '''
    service_url = 'https://www.goodreads.com/shelf/show/'
    # The shelf name is a path segment; putting it after '?' requested
    # the bare /shelf/show/ page with a meaningless query string.
    url = service_url + shelf_name
    # closing() releases the connection even if read() raises.
    with closing(urllib.urlopen(url)) as response:
        return response.read()

In [None]:
# Try the plain-HTTP fetch on the "fiction" shelf.
get_shelf("fiction")