In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Exploring web scraping of the Goodreads website

In [24]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from pydantic import BaseModel


In [25]:
USERID = 144433284
BASE_URL = "https://www.goodreads.com/review/list/"
# Build the URL from USERID so the constant above is the single place to
# change when pointing the notebook at a different user.
res = requests.get(
    BASE_URL + str(USERID),
    timeout=30,  # requests waits indefinitely by default; do not hang the kernel
)
res.status_code


200

In [26]:
# Parse the body of the fetched response into a BeautifulSoup tree using the
# "html.parser" backend; the following cells search this tree.
books_soup = BeautifulSoup(res.content, "html.parser")


In [27]:
# Locate the <table id="books"> element; the header and body rows used by the
# following cells are looked up inside it.
books_table = books_soup.find("table", id="books")
# find() returns None when nothing matches: fail here with a clear message
# rather than with an AttributeError on None in a later cell.
assert books_table is not None, 'no <table id="books"> found in the fetched page'


In [28]:
# Column names of the books table: take the text of the header row, strip each
# line, and keep only the lines that are not empty.
table_header = [
    cell
    for cell in map(
        str.strip, books_table.find("tr", id="booksHeader").text.splitlines()
    )
    if cell
]
table_header[:5]


['#', 'cover', 'title', 'author', 'isbn']

In [29]:
from typing import Any, Optional
from datetime import date, datetime

class BookData(BaseModel):
    """One book row scraped from the Goodreads books table.

    Instances are built by ``parse_book_table_row`` from a single ``<tr>``
    element. The comments on each field describe where that function reads
    the value from.
    """

    # ``href`` attribute of the link in the "field title" cell.
    book_url: str
    # ``src`` attribute of the image in the "field cover" cell.
    cover_url: str
    # ``title`` attribute of the title link.
    full_title: str
    # Visible text of the title link, stripped of surrounding whitespace.
    show_title: str
    # Visible text of the link in the "field author" cell (not stripped).
    author: str
    # ``href`` attribute of the author link.
    author_url: str
    # Identifier cells; the parser passes None when the cell text is empty.
    isbn: Optional[str]
    isbn13: Optional[str]
    asin: Optional[str]
    # Passed in as the cell's text; the model is relied on to convert it to float.
    avg_rating: float
    # Cell text with "," removed before it is handed to the model.
    num_ratings: int
    # Publication dates; None when the cell text is exactly "unknown".
    date_pub: Optional[date]
    date_pub_edition: Optional[date]
    # Number of "staticStar p10" elements in the rating cell; None when there are none.
    rating: Optional[int]
    # NOTE(review): shelves are not captured yet -- presumably a planned field.

class UnknownDateFormatException(Exception):
    """Raised when a date string matches none of the supported formats.

    The offending string is passed as the exception's only argument.

    Derives from ``Exception`` rather than ``BaseException`` so that ordinary
    ``except Exception`` handlers catch it; ``BaseException`` is meant for
    interpreter-level events such as ``KeyboardInterrupt`` and ``SystemExit``.
    """

def _search_table_row_td(tr: Tag, class_: str, find: str) -> Tag:
    return tr.find("td", class_=class_).find(find)

def _get_table_row_td_value(tr: Tag, class_: str) -> str:
    return tr.find(class_=class_).find("div", class_="value").text.strip()

def _parse_dates(date_str: str) -> Optional[date]:
    if date_str == "unknown":
        return None
    fmt_date: Optional[date] = None
    for fmt in ["%b %Y", "%b %d, %Y", "%Y"]:
        try:
            fmt_date = datetime.strptime(date_str, fmt).date() 
            return fmt_date
        except ValueError:
            pass
    if fmt_date is None:
        raise UnknownDateFormatException(date_str)

def _extract_star_rating(star_tag: Optional[Tag]) -> int:
    stars_on = list(star_tag.find_all(class_="staticStar p10"))
    if len(stars_on) == 0:
        return None
    return len(stars_on)

def parse_book_table_row(tr: Tag) -> BookData:
    """Build a ``BookData`` record from one row of the books table.

    Args:
        tr: A ``<tr>`` element from the table body.

    Returns:
        The validated ``BookData`` for that row.

    Raises:
        UnknownDateFormatException: If a publication date cell has text that
            ``_parse_dates`` cannot parse.
    """
    # Cover image URL.
    cover_url = _search_table_row_td(tr, "field cover", "img")["src"]

    # Title link: full title (attribute), displayed title (text) and book URL.
    title_link = _search_table_row_td(tr, "field title", "a")
    full_title = title_link["title"]
    show_title = title_link.text.strip()
    book_url = title_link["href"]

    # Author link: name and profile URL.
    author_link = _search_table_row_td(tr, "field author", "a")
    author_name = author_link.text
    author_url = author_link["href"]

    # ISBN, ISBN13, ASIN and average rating: an empty cell becomes None.
    text_fields = {
        name: _get_table_row_td_value(tr, class_=f"field {name}") or None
        for name in ("isbn", "isbn13", "asin", "avg_rating")
    }

    # Number of ratings, with the "," separators removed.
    num_ratings = _get_table_row_td_value(tr, class_="field num_ratings").replace(
        ",", ""
    )

    # Date published and date published (edition).
    date_fields = {
        name: _parse_dates(_get_table_row_td_value(tr, class_=f"field {name}"))
        for name in ("date_pub", "date_pub_edition")
    }

    # Star rating read from the rating cell.
    rating = _extract_star_rating(tr.find("td", "field rating"))

    return BookData(
        cover_url=cover_url,
        full_title=full_title,
        show_title=show_title,
        book_url=book_url,
        author=author_name,
        author_url=author_url,
        num_ratings=num_ratings,
        rating=rating,
        **text_fields,
        **date_fields,
    )

In [30]:
# Turn every <tr> found in the table body into a BookData record.
table_body = books_table.find("tbody", id="booksBody")
book_datas: list[BookData] = [
    parse_book_table_row(row) for row in table_body.find_all("tr")
]

# Show how many rows were parsed, plus one sample record.
print(len(book_datas))
print(book_datas[10])

30
book_url='/book/show/21563974-the-world-atlas-of-coffee' cover_url='https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1403197222l/21563974._SX50_.jpg' full_title='The World Atlas of Coffee: From Beans to Brewing -- Coffees Explored, Explained and Enjoyed' show_title='The World Atlas of Coffee: From Beans to Brewing -- Coffees Explored, Explained and Enjoyed' author='Hoffmann, James' author_url='/author/show/205889.James_Hoffmann' isbn='1770854703' isbn13='9781770854703' asin=None avg_rating=4.42 num_ratings=2065 date_pub=datetime.date(2014, 10, 1) date_pub_edition=datetime.date(2014, 11, 21) rating=None
