In [9]:
import httpx
from bs4 import BeautifulSoup
import re
from datetime import datetime
from calendar import monthrange
import pandas as pd
from pydantic import BaseModel


In [22]:
# Rating labels (read later from the `title` attribute of the star span) mapped to
# a numeric star value. The labels are listed from 1 star up to 5 stars, so a
# label's position in the tuple is its rating.
rating_map = {
    label: stars
    for stars, label in enumerate(
        (
            "did not like it",
            "it was ok",
            "liked it",
            "really liked it",
            "it was amazing",
        ),
        start=1,
    )
}

In [36]:
class Book(BaseModel):
    """One book scraped from a single review row of a Goodreads shelf page."""

    # Title text taken from the link in the row's title column.
    title: str
    # Author name as shown in the row's author column (e.g. "Huxley, Aldous").
    authorName: str
    # Date the book was read. In this notebook it comes from parse_date(), which
    # maps month-only dates to the last day of that month.
    dateRead: datetime
    # Star rating. The scraping cell in this notebook stores 1-5 via rating_map
    # and 0 when no recognised rating label is found.
    userRating: int
    # Free-text review; None when the row has no review text.
    review: str | None = None

In [33]:
def parse_date(date_str):
    for fmt in ("%b %d, %Y", "%b %Y"):
        try:
            dt = datetime.strptime(date_str, fmt)
            if fmt == "%b %Y":
                # Set to last day of the month
                last_day = monthrange(dt.year, dt.month)[1]
                dt = dt.replace(day=last_day)
            return dt
        except Exception:
            continue
    return None

In [43]:
# Shelf page to scrape: the "read" shelf of Goodreads review list 177424567.
url = "https://www.goodreads.com/review/list/177424567?shelf=read"


def get_response(url):
    """Fetch ``url`` with a browser-like User-Agent and return the body as text.

    Redirects are followed, so the status that is checked is that of the final
    response.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as text when the final status is 200.

    Raises:
        ValueError: If the final status is 404 (the message includes the final
            URL) or any other non-200 status (the message includes the code).
    """
    # NOTE(review): presumably sent so the request looks like a regular browser;
    # confirm whether the site actually requires it.
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    with httpx.Client(headers=browser_headers, follow_redirects=True) as session:
        response = session.get(url)
        if response.status_code == 404:
            raise ValueError(f"404 Not Found: {response.url}")
        if response.status_code != 200:
            raise ValueError(f"Unexpected status: {response.status_code}")
        return response.text

In [44]:
# Download the shelf page once and parse it with the stdlib-backed HTML parser.
# `html` keeps the raw page source; `soup` is the parsed tree that the following
# cells query for review rows (<tr id="review_...">) and pagination links.
html = get_response(url)
soup = BeautifulSoup(markup=html, features="html.parser")


In [49]:
def _first_text(parent, tag, **filters) -> str | None:
    """Return the stripped text of the first ``tag`` under ``parent`` matching ``filters``.

    Returns ``None`` when ``parent`` is ``None`` or no element matches.
    """
    if parent is None:
        return None
    node = parent.find(tag, **filters)
    return node.get_text(strip=True) if node is not None else None


def _row_rating(row) -> int:
    """Return the numeric rating of a review row, or 0 when none is recognised.

    The rating label is read from the ``title`` attribute of the
    ``staticStars`` span inside the row's ``field rating`` cell and translated
    through ``rating_map``.
    """
    rating_td = row.find('td', class_='field rating')
    stars = rating_td.find('span', class_='staticStars') if rating_td is not None else None
    label = stars.get('title') if stars is not None else None
    # A missing or unknown label (including None) falls back to 0.
    return rating_map.get(label, 0)


def parse_review_row(row) -> "Book | None":
    """Build a ``Book`` from one ``<tr id="review_...">`` row.

    Args:
        row: Parsed table row for a single shelf entry.

    Returns:
        A ``Book``, or ``None`` when the row lacks an author, a title, or a
        date that ``parse_date`` can parse; such rows are skipped by the caller.
    """
    date_text = _first_text(row, 'span', class_='date_read_value')
    date_read = parse_date(date_text) if date_text else None

    author_name = _first_text(row.find('td', class_='field author'), 'a')
    title = _first_text(row.find('td', class_='field title'), 'a')

    # Only build a Book if required fields are present.
    if not (author_name and title) or date_read is None:
        return None

    return Book(
        authorName=author_name,
        title=title,
        dateRead=date_read,
        userRating=_row_rating(row),
        review=_first_text(row, 'span', id=re.compile(r'^freeTextContainerreview')),
    )


# Every shelf entry is a <tr> whose id starts with "review_".
review_trs = soup.find_all('tr', id=re.compile(r'^review_'))

# Rows that are missing a required field are dropped.
data = [book for book in map(parse_review_row, review_trs) if book is not None]

In [50]:
# Spot-check: display the first parsed Book to confirm that title, author,
# read date and rating were picked up from the page.
data[0]

Book(title='Brave New World', authorName='Huxley, Aldous', dateRead=datetime.datetime(2025, 7, 6, 0, 0), userRating=4, review=None)

In [47]:
# Preview only: displaying the whole page source floods the notebook output,
# so show just the beginning of the document.
html[:1000]

'<!DOCTYPE html>\n<html class="desktop\n">\n<head>\n  <title>Faith’s &#39;read&#39; books on Goodreads (161 books)</title>\n\n<meta content=\'Faith has 161 books on her read shelf: Brave New World by Aldous Huxley, The Hellbound Heart by Clive Barker, One Day, Everyone Will Have Always Been Aga...\' name=\'description\'>\n<meta content=\'telephone=no\' name=\'format-detection\'>\n<link href=\'https://www.goodreads.com/review/list/177424567?shelf=read\' rel=\'canonical\'>\n  <meta property="og:title" content="Faith’s &#39;read&#39; books on Goodreads (161 books)"/>\n  <meta property="og:type" content="website"/>\n  <meta property="og:site_name" content="Goodreads"/>\n  <meta property="og:description" content="Faith has 161 books on her read shelf: Brave New World by Aldous Huxley, The Hellbound Heart by Clive Barker, One Day, Everyone Will Have Always Been Aga..."/>\n    <meta property="og:image" content="https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1298180450l/

In [48]:
# `find` returns None when the page has no pagination div (presumably the case
# for a shelf that fits on a single page; TODO confirm), so guard before
# searching inside it.
pagination_div = soup.find('div', id='reviewPagination')

# Extract all page links; an absent pager simply yields no links.
page_links = pagination_div.find_all('a') if pagination_div is not None else []

# Keep only purely numeric link labels (this skips e.g. "next »").
# isdecimal() guarantees that int() can convert the label.
page_labels = [a.get_text(strip=True) for a in page_links]
page_numbers = [int(label) for label in page_labels if label.isdecimal()]

# The highest page number is the page count; with no numeric links there is
# just the one page that was fetched.
total_pages = max(page_numbers, default=1)

# Output
print(f"Total pages: {total_pages}")

Total pages: 9
