In [23]:
# Auto-reload imported modules so that edits to their source are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Exploring web scraping of the Goodreads website

In [42]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from pydantic import BaseModel

In [25]:
# Goodreads user whose shelf is fetched, and the shelf-listing endpoint.
USERID = 144433284
BASE_URL = "https://www.goodreads.com/review/list/"

# The URL is built from USERID (the id used to be duplicated as a string
# literal, leaving the constant unused). The timeout keeps a stalled
# connection from blocking the kernel indefinitely.
res = requests.get(
    f"{BASE_URL}{USERID}",
    params={"ref": "nav_mybooks", "per_page": "10"},
    timeout=30,
)
res.status_code

200

In [26]:
# Parse the raw response body with Python's built-in HTML parser.
books_soup = BeautifulSoup(markup=res.content, features="html.parser")

In [27]:
# The shelf listing is the <table> element whose id is "books".
books_table = books_soup.find("table", attrs={"id": "books"})

In [28]:
# Column labels: one label per non-blank line of the header row's text,
# with surrounding whitespace removed.
table_header = [
    label
    for label in map(str.strip, books_table.find("tr", id="booksHeader").text.splitlines())
    if label
]
table_header

['#',
 'cover',
 'title',
 'author',
 'isbn',
 'isbn13',
 'asin',
 'pages',
 'rating',
 'ratings',
 'pub',
 '(ed.)',
 'rating',
 'my rating',
 'review',
 'notes',
 'recommender',
 'comments',
 'votes',
 'count',
 'started',
 'read',
 'added',
 'purchased',
 'owned',
 'location',
 'condition',
 'format']

In [86]:
from typing import Any, Optional

class BookData(BaseModel):
    """One book row scraped from a Goodreads shelf table.

    The identifier fields (``isbn``, ``isbn13``, ``asin``) are ``None`` when
    the book has no such identifier. They default to ``None`` explicitly so
    the model behaves the same on pydantic v1 (where a bare ``Optional`` field
    is implicitly optional) and pydantic v2 (where a bare ``Optional`` field
    without a default is required).
    """

    # Link to the book page and to its cover image, stored as scraped.
    book_url: str
    cover_url: str
    # Full title versus the (possibly different) title text displayed in the table.
    full_title: str
    show_title: str
    # Author name as displayed, and the link to the author page.
    author: str
    author_url: str
    # Book identifiers; None when not available.
    isbn: Optional[str] = None
    isbn13: Optional[str] = None
    asin: Optional[str] = None
    # Average rating and the number of ratings it is based on.
    avg_rating: float
    num_ratings: int

def _search_table_row_td(tr: Tag, class_: str, find: str) -> Tag:
    """Return the first ``find`` element inside the row's ``td`` of class ``class_``.

    Args:
        tr: Table row to search.
        class_: Class filter used to locate the ``td`` cell.
        find: Tag name to look up inside that cell.

    Returns:
        Whatever the inner ``find`` lookup yields for the located cell.
    """
    cell = tr.find("td", class_=class_)
    return cell.find(find)

def _get_table_row_td_value(tr: Tag, class_: str) -> str:
    """Return the stripped text of the ``div.value`` inside the row's ``class_`` element.

    Args:
        tr: Table row to search.
        class_: Class filter used to locate the cell (no tag name is imposed).

    Returns:
        Text of the cell's ``<div class="value">`` with surrounding
        whitespace removed.
    """
    cell = tr.find(class_=class_)
    value_div = cell.find("div", class_="value")
    return value_div.text.strip()

def parse_book_table_row(tr: Tag) -> BookData:
    """Build a :class:`BookData` from one row of the books table.

    Args:
        tr: A table row holding the ``field cover``, ``field title``,
            ``field author``, ``field isbn``, ``field isbn13``, ``field asin``,
            ``field avg_rating`` and ``field num_ratings`` cells.

    Returns:
        The book record built from the row's cells.

    Raises:
        Lookups are not guarded: a row that lacks one of the expected cells,
        elements or attributes raises from the failed lookup, and values that
        do not fit the ``BookData`` field types are rejected by pydantic.
    """
    _data: dict[str, Any] = {}

    # Cover: URL of the thumbnail image.
    cover_img = _search_table_row_td(tr, "field cover", "img")
    _data["cover_url"] = cover_img["src"]

    # Title: the link carries the full title (attribute), the displayed
    # title (text) and the book URL (href).
    title_link = _search_table_row_td(tr, "field title", "a")
    _data["full_title"] = title_link["title"]
    _data["show_title"] = title_link.text.strip()
    _data["book_url"] = title_link["href"]

    # Author: text is stripped the same way as the title text.
    author_link = _search_table_row_td(tr, "field author", "a")
    _data["author"] = author_link.text.strip()
    _data["author_url"] = author_link["href"]

    # ISBN, ISBN13, ASIN and average rating: plain text cells, where an
    # empty cell is recorded as None rather than "".
    for field in ("isbn", "isbn13", "asin", "avg_rating"):
        value = _get_table_row_td_value(tr, class_=f"field {field}")
        _data[field] = value or None

    # Number of ratings: drop thousands separators; the remaining digit
    # string is converted to int by the BookData model.
    n_ratings = _get_table_row_td_value(tr, class_="field num_ratings")
    _data["num_ratings"] = n_ratings.replace(",", "")
    return BookData(**_data)
    

In [87]:
# Parse every row of the table body into a BookData record.
table_body = books_table.find("tbody", id="booksBody")
book_datas: list[BookData] = [
    parse_book_table_row(row) for row in table_body.find_all("tr")
]

# Quick check: number of parsed rows and the first record.
print(len(book_datas))
print(book_datas[0])

30
book_url='/book/show/42683.On_Writing' cover_url='https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1435405010l/42683._SY75_.jpg' full_title='On Writing' show_title='On Writing' author='Hemingway, Ernest' author_url='/author/show/1455.Ernest_Hemingway' isbn='0684854295' isbn13='9780684854298' asin=None avg_rating=4.04 num_ratings=3759
