In [49]:
import httpx
from bs4 import BeautifulSoup
import re
from datetime import datetime
from calendar import monthrange
import pandas as pd
from pydantic import BaseModel, model_validator


### TODO:
1. Function-ify everything.
2. Extract series from HTML if present.
3. Add "Library"
4. Code optimizations

In [2]:
# Rating phrases in ascending order; a phrase's position (counting from 1)
# is the number of stars it maps to.
_RATING_LABELS = (
    "did not like it",
    "it was ok",
    "liked it",
    "really liked it",
    "it was amazing",
)

# Lookup from rating phrase to its 1-5 star value.
rating_map = {label: stars for stars, label in enumerate(_RATING_LABELS, start=1)}

In [50]:
class Book(BaseModel):
    """One book entry together with the reader's rating and optional review.

    ``series`` and ``seriesNumber`` are optional but must be provided
    together; this is enforced by ``check_series_fields``.
    """

    title: str
    # Series name and position within it; either both set or both None.
    series: str | None = None
    seriesNumber: int | None = None
    authorName: str
    dateRead: datetime
    # Integer star rating supplied by the caller.
    userRating: int
    # Free-text review, if any.
    review: str | None = None

    @property
    def full_title(self) -> str:
        """Return ``"<title> (<series>, #<n>) by <author>"``.

        The parenthesised series part is left out when ``series`` is
        empty or None.
        """
        pieces = [self.title]
        if self.series:
            pieces.append(f"({self.series}, #{self.seriesNumber})")
        pieces.append(f"by {self.authorName}")
        return " ".join(pieces)

    @model_validator(mode="after")
    def check_series_fields(self) -> "Book":
        """Reject models where only one of series / seriesNumber is set."""
        has_series = self.series is not None
        has_number = self.seriesNumber is not None
        if has_series is not has_number:
            raise ValueError("Both series and seriesNumber must be set together or both be None.")
        return self

In [33]:
def parse_date(date_str: "str | None") -> "datetime | None":
    """Parse a "date read" string into a ``datetime``.

    Two layouts are recognised, tried in this order:

    * ``"%b %d, %Y"`` (e.g. ``"Jan 05, 2024"``) -> that exact day.
    * ``"%b %Y"`` (e.g. ``"Jan 2024"``) -> the last day of that month,
      since the day is unknown.

    Leading and trailing whitespace is ignored.

    Args:
        date_str: The raw date text. ``None`` and other non-string values
            are accepted and simply yield ``None``.

    Returns:
        A ``datetime`` at midnight, or ``None`` when the value is not a
        string or matches neither layout.
    """
    if not isinstance(date_str, str):
        return None

    text = date_str.strip()
    # NOTE: %b matches the *locale's* abbreviated month names, so parsing
    # English month names assumes an English/C LC_TIME locale.
    for fmt in ("%b %d, %Y", "%b %Y"):
        try:
            dt = datetime.strptime(text, fmt)
        except ValueError:
            # Only a format mismatch is expected here; anything else
            # should surface instead of being silently swallowed.
            continue
        if fmt == "%b %Y":
            # No day given: pin to the last day of the month.
            last_day = monthrange(dt.year, dt.month)[1]
            dt = dt.replace(day=last_day)
        return dt
    return None

In [34]:
# Code completion recommendations are disabled in this notebook.
def format_url(user_id: int, page: int = 1) -> str:
    """Build the Goodreads review-list URL for a user's "read" shelf.

    Args:
        user_id: Numeric Goodreads user id placed in the URL path.
        page: Page number for the ``page`` query parameter (defaults to 1).

    Returns:
        The fully formatted URL string.
    """
    return f"https://www.goodreads.com/review/list/{user_id}?page={page}&shelf=read"



In [35]:
def get_response(url):
    """GET ``url`` (following redirects) and return the response body text.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: The URL to fetch.

    Returns:
        The response body as text when the status code is 200.

    Raises:
        ValueError: If the status code is 404, or any other non-200 value.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    with httpx.Client(headers=browser_headers, follow_redirects=True) as client:
        response = client.get(url)
        status = response.status_code

        # Guard clauses: fail fast on anything that is not a plain 200.
        if status == 404:
            raise ValueError(f"404 Not Found: {response.url}")
        if status != 200:
            raise ValueError(f"Unexpected status: {status}")

        return response.text

def _stripped_text(node):
    """Return the whitespace-stripped text of ``node``, or None if it is missing."""
    return node.get_text(strip=True) if node else None


def _parse_title_link(title_a):
    """Split a title ``<a>`` element into ``(title, series, series_number)``.

    The title is taken from the anchor's first child when that child is a
    text node, so that a nested series ``<span>`` is not included. If the
    first child is not text (or the anchor is empty) the anchor's full text
    is used instead of failing.
    """
    first_child = title_a.contents[0] if title_a.contents else None
    # bs4 text nodes (NavigableString) subclass ``str``; a Tag does not, and
    # calling ``.strip()`` on a Tag raises TypeError.
    if isinstance(first_child, str):
        title = first_child.strip()
    else:
        title = title_a.get_text(strip=True)

    series = None
    series_number = None
    series_span = title_a.find('span', class_='darkGreyText')
    if series_span:
        # Example: "(The Stormlight Archive, #1)". Only whole-number
        # positions match; anything else leaves the series unset.
        match = re.match(r"\((.*),\s*#(\d+)\)", series_span.get_text(strip=True))
        if match:
            series = match.group(1)
            series_number = int(match.group(2))

    return title, series, series_number


def extract_books_from_html(html):
    """Parse one review-list HTML page into a list of ``Book`` objects.

    Every ``<tr>`` whose id starts with ``review_`` is treated as one book.
    Rows missing an author, a title, or a parseable read date are skipped.

    Args:
        html: The page's HTML source as a string.

    Returns:
        A list of ``Book`` instances in page order. ``userRating`` is 0 when
        the row has no rating or the rating text is not in ``rating_map``.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Compile the id patterns once instead of once per row.
    review_row_id = re.compile(r'^review_')
    review_text_id = re.compile(r'^freeTextContainerreview')

    books = []
    for tr in soup.find_all('tr', id=review_row_id):
        # Date read
        date_str = _stripped_text(tr.find('span', class_='date_read_value'))
        date_read = parse_date(date_str) if date_str else None

        # Review text
        review_text = _stripped_text(tr.find('span', id=review_text_id))

        # Author
        author_td = tr.find('td', class_='field author')
        author_name = _stripped_text(author_td.find('a') if author_td else None)

        # Title and series
        title_td = tr.find('td', class_='field title')
        title_a = title_td.find('a') if title_td else None
        if title_a:
            title, series, series_number = _parse_title_link(title_a)
        else:
            title, series, series_number = None, None, None

        # Rating: the stars' ``title`` attribute holds the rating phrase.
        rating_td = tr.find('td', class_='field rating')
        rating_span = rating_td.find('span', class_='staticStars') if rating_td else None
        rating_text = rating_span.get('title') if rating_span else None
        rating = rating_map.get(rating_text, 0) if rating_text else 0

        # Only keep rows where every required field was found.
        if author_name and title and date_read is not None:
            books.append(
                Book(
                    authorName=author_name,
                    title=title,
                    dateRead=date_read,
                    userRating=rating,
                    review=review_text,
                    series=series,
                    seriesNumber=series_number,
                )
            )
    return books

In [36]:
# Goodreads user whose shelf is fetched by the cells below.
user_id = 177424567

# URL of the first page of that user's shelf (page defaults to 1).
url = format_url(user_id)

In [48]:
# Fetch the first page: it holds the first batch of reviews and the
# pagination links used to work out how many pages there are.
html = get_response(url)
soup = BeautifulSoup(html, "html.parser")

# The pagination container may be missing from the page, so guard against
# None instead of failing with AttributeError.
pagination_div = soup.find('div', id='reviewPagination')

# Extract all page number links
page_links = pagination_div.find_all('a') if pagination_div else []

# Extract page numbers and convert to integers (ignore "next »" text)
page_numbers = [int(a.text) for a in page_links if a.text.isdigit()]

# Get the highest page number; with no numbered links there is only page 1
total_pages = max(page_numbers) if page_numbers else 1

# Books from the page that was already downloaded
books = extract_books_from_html(html)

# Fetch and parse the remaining pages in order
for page in range(2, total_pages+1):
    next_url = format_url(user_id, page)
    html = get_response(next_url)
    books += extract_books_from_html(html)

In [51]:
# Print one line per book: title, optional "(series, #n)", and author.
for book in books:
    print(book.full_title)

Brave New World by Huxley, Aldous
The Hellbound Heart by Barker, Clive
One Day, Everyone Will Have Always Been Against This by El Akkad, Omar
The Haunting of Hill House by Jackson, Shirley
Fry Bread: A Native American Family Story by Maillard, Kevin Noble
The Last Cuentista by Higuera, Donna Barba
Beetle & the Hollowbones (The Beetle Books, #1) by Layne, Aliza
The Rock from the Sky by Klassen, Jon
Life in the Present: A Joyful Collection of Comics About Living in the Moment by Climo, Liz
Most Ardently by Novoa, Gabe Cole
Night Owls by Vishny, A.R.
The Employees by Ravn, Olga
The Wood at Midwinter by Clarke, Susanna
Alcatraz vs. the Evil Librarians (Alcatraz vs. the Evil Librarians, #1) by Sanderson, Brandon
The Spook's Sacrifice (The Last Apprentice / Wardstone Chronicles, #6) by Delaney, Joseph
The Librarian’s Guide to Book Programs and Author Events by Hooper, Brad
Book Club Reboot: 71 Creative Twists by Ostman, Sarah
Joyful Song: A Naming Story by Newman, Lesléa
Tread of Angels by R

In [None]:
def get_library(user_id: int) -> list[Book]:
    """Fetch every page of a user's shelf and return all parsed books.

    The first page is downloaded once and used both for its books and for
    its pagination links, from which the total page count is derived. When
    no numbered page links are found, only the first page is read.

    Args:
        user_id: Numeric Goodreads user id.

    Returns:
        All ``Book`` objects from page 1 through the last page, in order.
    """
    first_page_html = get_response(format_url(user_id))

    # Work out the last page number from the pagination links, if any.
    pager = BeautifulSoup(first_page_html, "html.parser").find('div', id='reviewPagination')
    anchors = pager.find_all('a') if pager else []
    numbered = [int(anchor.text) for anchor in anchors if anchor.text.isdigit()]
    last_page = max(numbered, default=1)

    library = extract_books_from_html(first_page_html)
    for page in range(2, last_page + 1):
        page_html = get_response(format_url(user_id, page))
        library.extend(extract_books_from_html(page_html))
    return library