In [2]:
#import relevant libraries
import requests
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

In [34]:
#defining a function to scrape quotes from a single webpage
def get_page_quotes(url, timeout=30):
    '''
    Scrapes all of the quotes from a single page of goodreads quotes

    Args:
        url (text): a url to a single page of quotes on goodreads
        format "https://www.goodreads.com/author/quotes/[AUTHOR SPECIFIC INFO]?page=[NUMBER]"
        timeout (float): seconds to wait for the server before giving up (default 30)

    Returns:
        quotes (list): a list of all the quotes on the given url page. Each entry is the
                       first text node found inside a 'quoteText' div, kept exactly as it
                       appears in the html (no whitespace stripping is done here)

    Raises:
        requests.exceptions.RequestException: if the request times out, the connection
                       fails, or the server answers with an HTTP error status
    '''
    # without an explicit timeout requests may wait indefinitely on a stalled connection
    html_page = requests.get(url, timeout=timeout)
    # fail loudly on 4xx/5xx instead of parsing an error page and returning zero quotes
    html_page.raise_for_status()
    soup = BeautifulSoup(html_page.text, "html.parser")

    quotes = []
    for div in soup.find_all('div', class_='quoteText'):
        # first text node inside the div; None when the div contains no text at all,
        # in which case the div is skipped rather than aborting the whole page
        first_text = div.find(string=True)
        if first_text is not None:
            quotes.append(first_text)

    return quotes

In [51]:
#defining a function to scrape pages of quotes for a single author
def scrape_all_pages(url_base, author, numPages):
    '''
    Scrapes all of the quotes for a particular author (specified in url_base) from the goodreads quotes page starting
    from the first page of quotes to the indicated last page of quotes (last page specified as numPages). Returns a two
    column dataframe containing quotes in one column, and the author name (specified as author) in the other column

    Args:
        url_base (text): goodreads quotes url for a specific author, with page info removed
                         format "https://www.goodreads.com/author/quotes/[AUTHOR SPECIFIC INFO]?page="
        author (str):   Name of author being scraped
        numPages (int): number of pages of quotes to be scraped
                        must be less than or equal to the total number of pages that exist for the author

    Returns:
        quotes (dataframe): a two column dataframe containing: (1) 'Quote', all the quotes from the first
                            page of quotes to the indicated last page of quotes. Note that a page is
                            skipped if scraping it raises an error (e.g. an internet timeout); print
                            statements indicate for every page whether it was successfully scraped or
                            failed to scrape. (2) 'Author', the author name as written in the args to
                            the function
    '''
    quotes = []
    for page in range(1, numPages + 1):
        url = url_base + str(page)
        try:
            quotes.extend(get_page_quotes(url))
        except Exception:
            ## best effort: a page that errors out (e.g. the connection drops for a moment) is
            ## reported and skipped. Catching Exception rather than using a bare except means an
            ## interrupt (KeyboardInterrupt / SystemExit) still stops the loop
            print('Failed to Scrape Page #'+str(page))
        else:
            print('Successfully Scraped Page #'+str(page))

    df = pd.DataFrame(data = quotes, columns = ['Quote'])
    # scalar assignment broadcasts the author name to every row
    df['Author'] = author

    return df

In [53]:
# inputs shared by the scrape_all_pages calls below: one page-less goodreads quotes url
# per author, plus how many pages to request for each of them
Jane_Austen_url_base, JK_Rowling_url_base = (
    'https://www.goodreads.com/author/quotes/1265.Jane_Austen?page=',
    'https://www.goodreads.com/author/quotes/1077326.J_K_Rowling?page=',
)
## Jane Austen has ~120 pages; JK Rowling has ~210 pages
numPages = 100

In [54]:
# Jane Austen: scrape pages 1..numPages into a Quote/Author dataframe.
# (author's note from their own run: pages 11, 12, 14 and 72 failed to scrape)
Jane_quotes = scrape_all_pages(
    url_base=Jane_Austen_url_base,
    author='Jane Austen',
    numPages=numPages,
)

Successfully Scraped Page #1
Successfully Scraped Page #2
Successfully Scraped Page #3
Successfully Scraped Page #4
Successfully Scraped Page #5
Successfully Scraped Page #6
Successfully Scraped Page #7
Successfully Scraped Page #8
Successfully Scraped Page #9
Successfully Scraped Page #10
Failed to Scrape Page #11
Failed to Scrape Page #12
Successfully Scraped Page #13
Failed to Scrape Page #14
Successfully Scraped Page #15
Successfully Scraped Page #16
Successfully Scraped Page #17
Successfully Scraped Page #18
Successfully Scraped Page #19
Successfully Scraped Page #20
Successfully Scraped Page #21
Successfully Scraped Page #22
Successfully Scraped Page #23
Successfully Scraped Page #24
Successfully Scraped Page #25
Successfully Scraped Page #26
Successfully Scraped Page #27
Successfully Scraped Page #28
Successfully Scraped Page #29
Successfully Scraped Page #30
Successfully Scraped Page #31
Successfully Scraped Page #32
Successfully Scraped Page #33
Successfully Scraped Page #34
S

NameError: name 'scrape_all_page' is not defined

In [56]:
# JK Rowling: scrape pages 1..numPages into a Quote/Author dataframe.
# (author's note from their own run: pages 23, 78, 88 and 90 failed to scrape)
JK_quotes = scrape_all_pages(
    url_base=JK_Rowling_url_base,
    author='JK Rowling',
    numPages=numPages,
)

Successfully Scraped Page #1
Successfully Scraped Page #2
Successfully Scraped Page #3
Successfully Scraped Page #4
Successfully Scraped Page #5
Successfully Scraped Page #6
Successfully Scraped Page #7
Successfully Scraped Page #8
Successfully Scraped Page #9
Successfully Scraped Page #10
Successfully Scraped Page #11
Successfully Scraped Page #12
Successfully Scraped Page #13
Successfully Scraped Page #14
Successfully Scraped Page #15
Successfully Scraped Page #16
Successfully Scraped Page #17
Successfully Scraped Page #18
Successfully Scraped Page #19
Successfully Scraped Page #20
Successfully Scraped Page #21
Successfully Scraped Page #22
Failed to Scrape Page #23
Successfully Scraped Page #24
Successfully Scraped Page #25
Successfully Scraped Page #26
Successfully Scraped Page #27
Successfully Scraped Page #28
Successfully Scraped Page #29
Successfully Scraped Page #30
Successfully Scraped Page #31
Successfully Scraped Page #32
Successfully Scraped Page #33
Successfully Scraped Pa

In [60]:
# saving the quotes to a CSV file
# ignore_index=True renumbers the combined rows 0..n-1; without it both authors' rows keep
# their own 0-based labels and df ends up with duplicate index values
df = pd.concat([Jane_quotes, JK_quotes], ignore_index=True)
# NOTE: absolute, machine-specific Windows path - adjust it before running on another machine.
# index=False keeps the row labels out of the file, so the CSV has only Quote and Author columns
df.to_csv(path_or_buf=r'C:\Users\MainUser\Desktop\Quotes.csv', index=False)