In [1]:
import pandas as pd
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle
import time
import os

**Logging in manually once to bypass repeated login prompts.**
**This allows me to save session cookies so Goodreads doesn’t keep asking for login on future runs.**

In [None]:
driver = webdriver.Chrome()

# Navigating to the Goodreads login page
driver.get("https://www.goodreads.com/user/sign_in")

# Pausing the script to log in manually
input("press Enter to continue")

# Saving the session cookies to a file for later use
cookies = driver.get_cookies()
with open("goodreads_cookies.pkl", "wb") as f:
    pickle.dump(cookies, f)

driver.quit()

**Gathering top Listopia list links**

In [None]:
URL = 'https://www.goodreads.com/list' # Targeting the main Listopia page where thematic book lists are hosted
driver = webdriver.Chrome()

# Opening the homepage
driver.get("https://www.goodreads.com")  

# Applying saved cookies to log in
with open("goodreads_cookies.pkl", "rb") as f:
    cookies = pickle.load(f)
    for cookie in cookies:
        driver.add_cookie(cookie)

# Reloading the page after applying cookies
driver.get("https://www.goodreads.com")
driver.get(URL) # Navigating to the page that contains all the book lists

Genre_lists = set()

# Locating all Listopia links listed under themed categories
elems = driver.find_elements(By.XPATH, "//ul[@class='listTagsTwoColumn']//a[@class='actionLinkLite']")

# Extracting URLs of all themed book lists and storing it in a set to ensure uniqueness.
for elem in elems:
    href = elem.get_attribute('href')
    if href:
        Genre_lists.add(href)

for href in Genre_lists:
    print(href)
    
driver.close()

https://www.goodreads.com/list/tag/lgbt
https://www.goodreads.com/list/tag/romance
https://www.goodreads.com/list/tag/middle-grade
https://www.goodreads.com/list/tag/thriller
https://www.goodreads.com/list/tag/women
https://www.goodreads.com/list/tag/fiction
https://www.goodreads.com/list/tag/title-challenge
https://www.goodreads.com/list/tag/queer
https://www.goodreads.com/list/tag/nonfiction
https://www.goodreads.com/list/tag/historical-romance
https://www.goodreads.com/list/tag/science-fiction
https://www.goodreads.com/list/tag/love
https://www.goodreads.com/list/tag/biography
https://www.goodreads.com/list/tag/contemporary
https://www.goodreads.com/list/tag/fantasy
https://www.goodreads.com/list/tag/lgbtq
https://www.goodreads.com/list/tag/titles
https://www.goodreads.com/list/tag/gay
https://www.goodreads.com/list/tag/best
https://www.goodreads.com/list/tag/series
https://www.goodreads.com/list/tag/historical-fiction
https://www.goodreads.com/list/tag/horror
https://www.goodreads.

**Collecting URLs of top Listopia book lists from each subject category**

In [None]:
Book_list_Links = set()
driver = webdriver.Chrome()

# Using a while loop to limit the number of book list URLs collected.
# Each book list link typically contains 100–150 books
# so it's Capped at 80 book list URLs to prevent creating an excessively large dataset.

while len(Book_list_Links) < 80:
    for href in Genre_lists:
        if len(Book_list_Links) >= 80:
            break 

        # Locating all user-curated list links (e.g., "Best History Books", "Memoirs by Women") under the 'listTitle' class.
        driver.get(href)
        elems = driver.find_elements(By.CLASS_NAME, 'listTitle')

        # Extracting the href of each list and storing it in a set to ensure uniqueness.
        for elem in elems:
            book_href = elem.get_attribute('href') 
            if book_href: # if href is not a null add it in the Book_list_Links
                Book_list_Links.add(book_href)
    driver.quit()

    for link in Book_list_Links:
        print(link)

https://www.goodreads.com/list/show/122109.Best_M_M_Romance_of_2018
https://www.goodreads.com/list/show/79446.Bully_Love_Stories
https://www.goodreads.com/list/show/2491.Must_Read_Books_Different_Genres
https://www.goodreads.com/list/show/27443.100_Best_Lesbian_Fiction_Memoir_Books_Of_All_Time_
https://www.goodreads.com/list/show/1708.Best_Intro_to_Sci_Fi_for_Young_Readers
https://www.goodreads.com/list/show/14945.F_F_Paranormal_and_Urban_Fantasy
https://www.goodreads.com/list/show/26227.2013_Debut_Authors_Young_Adult_Middle_Grade_
https://www.goodreads.com/list/show/12066.College_Romance
https://www.goodreads.com/list/show/5490.YA_Books_Far_Better_than_Twilight
https://www.goodreads.com/list/show/5505.Best_Gay_Historical_Fiction
https://www.goodreads.com/list/show/226.Favorite_books_from_my_childhood
https://www.goodreads.com/list/show/5038.Best_Graphic_Novels_for_Children
https://www.goodreads.com/list/show/128754.Young_Adult_Novels_2020
https://www.goodreads.com/list/show/11525.2012

**Extracting individual book links from each Listopia book lists**

In [None]:
Book_links = set()
driver = webdriver.Chrome()

# Opening each Listopia book lists page to locate and extract individual book URLs
for href in Book_list_Links:
    driver.get(href) 

    # Locating all book links under the 'bookTitle' class.
    elems = driver.find_elements(By.XPATH, "//table[contains(@class, 'tableList')]//a[@class='bookTitle']")
    
    # Extracting the href of each book and storing it in a set to ensure uniqueness.
    for elem in elems:
        href = elem.get_attribute('href')
        if href: # if href is not a null add it in the Book_links
            Book_links.add(href)

print(len(Book_links))

driver.close()

8654


In [None]:
# Helper function to safely extract the text of a single element
def get_element_text(by, value):
    try:
        return driver.find_element(by, value).text
    except:
        return None  # Returns None if the element is not found or inaccessible

# Helper function to safely extract the text from multiple elements
def get_elements_text(by, value):
    try:
        elements = driver.find_elements(by, value)
        return [element.text for element in elements]
    except:
        return []  # Returns an empty list if no elements are found or an error occurs

In [None]:
data = {'BookTitle': [], 'Author': [],'Format': [],'Published': [],'ISBN': [],'ASIN': [],
        'Language': [],'Total_Ratings': [],'Reviews': [],'Rating': [],'Genres': []}

**Extracting Book Data**

In [None]:
# Setting Chrome options for headless scraping
# (Disabling images, extensions, sandbox, and GPU for performance)
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu") 
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument('--blink-settings=imagesEnabled=false') 
options.add_argument('--disable-extensions')
options.add_argument('--disable-infobars')

# Initializing the headless browser
driver = webdriver.Chrome(options=options)

# Opening the homepage to login 
driver.get("https://www.goodreads.com")

# Applying saved cookies to log in 
with open("goodreads_cookies.pkl", "rb") as f:
    cookies = pickle.load(f)
    for cookie in cookies:
        driver.add_cookie(cookie)

# Reloading the page after applying cookies
driver.get("https://www.goodreads.com")

# Opening each book links page to extract book Data
for href in Book_links:
    try:
        driver.get(href)
    except:
        continue # Skipping any broken links or load failures

    # Collecting all the data
    title = get_element_text(By.XPATH, "//h1[@data-testid='bookTitle']")
    authors = get_element_text(By.XPATH, "//span[@class='ContributorLink__name']")
    rating = get_element_text(By.CLASS_NAME,'RatingStatistics__rating')
    total_ratings = get_element_text(By.XPATH, "//span[@data-testid='ratingsCount']")
    reviews = get_element_text(By.XPATH, "//span[@data-testid='reviewsCount']")
    genres = get_elements_text(By.XPATH, "//a[contains(@class, 'Button--tag')]")
    format = get_element_text(By.XPATH, "//div[contains(@class, 'EditionDetails')]//div[contains(@class, 'DescListItem')]//dt[contains(text(), 'Format')]/following-sibling::dd//div[contains(@class, 'TruncatedContent')]//div[contains(@class, 'TruncatedContent__text') and @tabindex='-1']")
    Published = get_element_text(By.XPATH, "//div[contains(@class, 'EditionDetails')]//div[contains(@class, 'DescListItem')]//dt[contains(text(), 'Published')]/following-sibling::dd//div[contains(@class, 'TruncatedContent')]//div[contains(@class, 'TruncatedContent__text') and @tabindex='-1']")
    ISBN = get_element_text(By.XPATH, "//div[contains(@class, 'EditionDetails')]//div[contains(@class, 'DescListItem')]//dt[contains(text(), 'ISBN')]/following-sibling::dd//div[contains(@class, 'TruncatedContent')]//div[contains(@class, 'TruncatedContent__text') and @tabindex='-1']")
    ASIN = get_element_text(By.XPATH, "//div[contains(@class, 'EditionDetails')]//div[contains(@class, 'DescListItem')]//dt[contains(text(), 'ASIN')]/following-sibling::dd//div[contains(@class, 'TruncatedContent')]//div[contains(@class, 'TruncatedContent__text') and @tabindex='-1']")
    Language = get_element_text(By.XPATH, "//div[contains(@class, 'EditionDetails')]//div[contains(@class, 'DescListItem')]//dt[contains(text(), 'Language')]/following-sibling::dd//div[contains(@class, 'TruncatedContent')]//div[contains(@class, 'TruncatedContent__text') and @tabindex='-1']")
    
    # Storing all the data
    data['BookTitle'].append(title)
    data['Author'].append(authors)
    data['Total_Ratings'].append(total_ratings)
    data['Rating'].append(rating)
    data['Reviews'].append(reviews)
    data['Genres'].append(genres)
    data['Format'].append(format)
    data['Published'].append(Published)
    data['ISBN'].append(ISBN)
    data['ASIN'].append(ASIN)
    data['Language'].append(Language)
driver.quit()

In [None]:
data2 = pd.DataFrame(data)
data2.to_csv('goodreads_scrapper.csv')

**This code is able to extract both Paperback and Hardcover prices from Amazon. However, the process is quite slow because Goodreads does not store Amazon links in the page source or within identifiable HTML classes. Instead, the links are dynamically generated only after clicking the "Buy on Amazon" button, which opens a dropdown and redirects to Amazon in a new tab. This need for interaction and page switching significantly slows down the scraping process.**

In [None]:
chrome_options = Options()
chrome_options.add_argument("--headless")
options.add_argument("--disable-gpu") 
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument('--blink-settings=imagesEnabled=false') 
options.add_argument('--disable-extensions')
options.add_argument('--disable-infobars')

# Setting Chrome options for headless scraping
# (Disabling images, extensions, sandbox, and GPU for performance)
driver = webdriver.Chrome(options=chrome_options)

amazon_links = set()

# Opening Goodreads and applying cookies to log in
driver.get("https://www.goodreads.com")
with open("goodreads_cookies.pkl", "rb") as f:
    cookies = pickle.load(f)
    for cookie in cookies:
        driver.add_cookie(cookie)

# Reloading the homepage after applying cookies
driver.get("https://www.goodreads.com")


# Loop through each book link to extract Amazon price info
for href in Book_links:
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(href)
    try:
        # Clicking the "Buy on Amazon" dropdown button
        Viewlinks_button = driver.find_element(By.XPATH, "//button[contains(@class, 'Button--buy') and contains(@class, 'Button--medium') and contains(@class, 'Button--rounded')]")
        Viewlinks_button.click()

        # Clicking the Amazon link from the dropdown menu
        amazon_button = driver.find_element(By.XPATH,"//button[contains(@class,'DropdownMenu__item') and contains(@aria-label,'Shop this book on Amazon, link, opens in new tab')]")
        amazon_button.click()

        new_tab = driver.window_handles[-1]  # Get the handle of the newly opened tab
        driver.switch_to.window(new_tab)     # Switch to the new tab
    except Exception as e: 
        print(f"Error encountered: {e}")   

    # Extracting Paperback and Hardcover prices from the Amazon page
    PaperBack_price = get_element_text(By.XPATH, "//span[@class='slot-price' and preceding::span[contains(@aria-label, 'Paperback')]]")
    HardCover_price = get_element_text(By.XPATH, "//span[@class='slot-price' and preceding::span[contains(@aria-label, 'Hardcover')]]")

    # Storing the prices in the dataset
    data['PaperBack_price'].append(PaperBack_price)
    data['HardCover_price'].append(HardCover_price)


    print(data)

    driver.close()

driver.quit()


{'BookTitle': ['Vampire Academy', 'You', 'Grey', "Wizard's First Rule", "The Qur'an", 'Hillbilly Elegy: A Memoir of a Family and Culture in Crisis', 'Dragonbride', 'Blood Promise', 'Hades', 'Breaking Dawn', 'Prince Lestat', 'New Book', 'Vampire Academy', 'You', 'Grey', "Wizard's First Rule", "The Qur'an", 'Hillbilly Elegy: A Memoir of a Family and Culture in Crisis', 'Dragonbride', 'Blood Promise', 'Hades', 'Breaking Dawn', 'Prince Lestat', 'New Book', 'Passion', 'The Selection', 'City of Heavenly Fire', 'Fifty Shades Freed', 'City of Glass', 'Frostbite', 'Crown of Starlight', 'Shadow Kiss', 'The Enlightenment of Alexander', 'The Gollywhopper Games', 'Spirit Bound', "Tiger's Curse", 'The Giver', 'Fifty Shades of Grey', 'Shiver', 'Awake at Dawn', "Harry Potter and the Sorcerer's Stone", 'Only Ever Yours', 'City of Bones', 'A Game of Thrones', 'Starcrossed', 'Torment', 'Angel Star', 'Freed', 'Silence', 'Max', 'The Lord of the Rings', 'Clockwork Angel', 'The Short Second Life of Bree Tann

KeyboardInterrupt: 