# Crawl Goodreads Book Pages in Multiple Languages

The aim is to gather book information and book reviews for a list of book titles, for a number of languages.

In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2


In [2]:
import os
import re

import pandas as pd

# The core dataset with book titles, ISBNs and Goodreads URLs
pianzola_file = '../../data/reviews/Multilingual/Shared_Meta_EN_fin_LOBO_v0_2.csv'

# The output directory for downloaded book pages.
# NOTE(review): html_dir is reassigned in a later cell to a directory under
# 'HTML-2025-10-22/' -- confirm which of the two locations is intended.
html_dir = '../../data/reviews/Multilingual/Goodreads/HTML/Canonical_book_pages/'

# Force the ISBN column to be read as text
# (presumably so that ISBNs are not parsed as numbers -- confirm).
dtype = {
    'ISBN': str
}

# Read the core dataset as a pandas DataFrame (first CSV column becomes the
# index) and show the Goodreads link column
df = pd.read_csv(pianzola_file, index_col=0, dtype=dtype)
df.gr_EN_link

0      https://www.goodreads.com/book/show/16299.And_...
1      https://www.goodreads.com/book/show/140345.Abs...
2      https://www.goodreads.com/book/show/853510.Mur...
3      https://www.goodreads.com/book/show/2975046-a-...
4      https://www.goodreads.com/book/show/16322.The_...
                             ...                        
130    https://www.goodreads.com/book/show/50157589-u...
131    https://www.goodreads.com/book/show/39618887-t...
132    https://www.goodreads.com/book/show/797192.I_H...
133    https://www.goodreads.com/book/show/40796117-d...
134    https://www.goodreads.com/book/show/25525419-p...
Name: gr_EN_link, Length: 135, dtype: object

In [3]:
def get_goodreads_number(gr_EN_link):
    """Extract the numeric Goodreads book id from a Goodreads book URL.

    The id is the run of digits at the start of the last URL path segment,
    e.g. ``.../book/show/16299.And_Then_There_Were_None`` -> ``16299``.

    Parameters
    ----------
    gr_EN_link : str or any
        The Goodreads book URL. Non-string values (e.g. NaN for a missing
        link) are returned unchanged.

    Returns
    -------
    int
        The book id, or the input itself when it is not a string.

    Raises
    ------
    ValueError
        If the last path segment does not start with digits.
    """
    if not isinstance(gr_EN_link, str):
        return gr_EN_link
    # URLs always use '/' as separator, so split on it directly instead of
    # relying on OS-specific filesystem path rules; a trailing slash is
    # stripped first so it does not yield an empty last segment.
    book_name = gr_EN_link.rstrip('/').rsplit('/', 1)[-1]
    match = re.match(r"(\d+)", book_name)
    if match is None:
        raise ValueError(f'cannot extract book_number from book_name {book_name}')
    return int(match.group(1))
    
# Derive the numeric Goodreads id for every row from its English Goodreads
# link and store it as a new column, then display the resulting frame.
df['gr_EN_number'] = df.gr_EN_link.apply(get_goodreads_number)
df

Unnamed: 0,Title,Title_EN,ISBN,Author,Auth_Sugg,az_EN_link,gr_EN_link,az_IT_link,an_IT_link,Yes24_link,...,Num_Voti_gr,Num_Rev_gr,Pub_Date_gr,Rating_az,Distr_Rating_az,Num_Voti_az,Num_Review_az,Title_DE,LOBO_IDs,gr_EN_number
0,그리고 아무도 없었다,And there was no one.,9788938201010,애거사 크리스티,Agatha Christie,https://www.amazon.it/Then-There-Were-None/dp/...,https://www.goodreads.com/book/show/16299.And_...,https://www.amazon.it/Ridere-N-19-20/dp/880472...,https://www.anobii.com/books/e-poi-non-rimase-...,http://www.yes24.com/Product/Goods/70266,...,989788.0,41925.0,Published May 3rd 2004 by St. Martin's Press (...,4.5,"['72%', '17%', '7%', '2%', '3%']",14290.0,4375.0,Und dann gab's keines mehr,B001375 B001376 B001377,16299
1,봄에 나는 없었다,Absent in spring,9788954623865,애거사 크리스티,Agatha Christie,https://www.amazon.com/Absent-Spring-AGATHA-CH...,https://www.goodreads.com/book/show/140345.Abs...,https://www.amazon.it/deserto-del-cuore-Mary-W...,https://www.anobii.com/books/il-deserto-del-cu...,http://www.yes24.com/Product/Goods/12000664,...,,,,,,,,Ein Frühling ohne Dich,Not found,140345
2,오리엔트 특급 살인,Orient Express Murder,9788960177765,애거사 크리스티,Agatha Christie,https://www.amazon.com/Murder-Orient-Express-A...,https://www.goodreads.com/book/show/853510.Mur...,https://www.amazon.it/Assassinio-sullOrient-Ex...,https://www.anobii.com/books/assassinio-sull-o...,http://www.yes24.com/Product/Goods/11865812,...,457294.0,28262.0,Published June 4th 2007 by HarperCollins (firs...,4.6,"['76%', '15%', '6%', '2%', '1%']",145.0,59.0,Mord im Orientexpress,B001140 B001221 B001304 B001322 B001323,853510
3,딸은 딸이다,Daughter is a daughter.,9788954624664,애거사 크리스티,Agatha Christie,https://www.amazon.com/Daughters-Daughter-Othe...,https://www.goodreads.com/book/show/2975046-a-...,https://www.amazon.it/figlia-sempre-Oscar-best...,https://www.anobii.com/books/una-figlia-per-se...,http://www.yes24.com/Product/Goods/12818848,...,,,,,,,,Sie ist meine Tochter,Not found,2975046
4,ABC 살인사건,ABC murder case,9788938201041,애거사 크리스티,Agatha Christie,https://www.amazon.com/b-c-Murders-Hercule-Poi...,https://www.goodreads.com/book/show/16322.The_...,https://www.amazon.it/serie-infernale-Oscar-gi...,https://www.anobii.com/books/la-serie-infernal...,http://www.yes24.com/Product/Goods/103160,...,122639.0,6463.0,Published September 1st 2006 by Black Dog & Le...,4.6,"['71%', '20%', '6%', '1%', '2%']",2648.0,746.0,Die Morde des Herrn ABC,B001236 B001237 B001348 B001349,16322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,,Untold Night and Day,,,Bae Suah,https://www.amazon.com/gp/product/B07V3335HK/,https://www.goodreads.com/book/show/50157589-u...,https://www.amazon.it/gp/product/B08KTR3GL6,https://www.anobii.com/books/notti-invisibili-...,http://www.yes24.com/Product/Goods/8736913,...,,,,,,,,,,50157589
131,,The plotter,,,Un-Su Kim,https://www.amazon.com/gp/product/1925603768,https://www.goodreads.com/book/show/39618887-t...,https://www.amazon.it/I-cospiratori-Kim-ebook/...,https://www.anobii.com/books/i-cospiratori/978...,http://www.yes24.com/Product/Goods/69284667,...,,,,,,,,Die Plotter,B158436,39618887
132,,I Have the Right to Destroy Myself,,,Young-ha Kim,https://www.amazon.com/gp/product/0156030802,https://www.goodreads.com/book/show/797192.I_H...,https://www.amazon.it/Ho-diritto-distruggermi-...,https://www.anobii.com/books/Ho_il_diritto_di_...,http://www.yes24.com/Product/Goods/4077402,...,,,,,,,,Das Gottesspiel,B165983,797192
133,,Diary of a murderer,,,Young-ha Kim,https://www.amazon.com/gp/product/1328545423,https://www.goodreads.com/book/show/40796117-d...,https://www.amazon.it/Memorie-assassino-Young-...,https://www.anobii.com/books/memorie-di-un-ass...,http://www.yes24.com/Product/Goods/9196011,...,,,,,,,,,,40796117


In [3]:
import requests


# Smoke test: fetch only the first book page of the core dataset.
# `url` and `response` are reused by the cells below.
url = df.gr_EN_link.iloc[0]
print(url)
# Without a timeout requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)

print(f'response code (should be 200): {response.status_code}\n')

# Preview only the start of the page; the full HTML would flood the output.
response.content[:500]

https://www.goodreads.com/book/show/16299.And_Then_There_Were_None
response code (should be 200): 200





Below are a few short functions to help download book pages and store them locally.

Test that writing a book page to disk works.

In [5]:
from download import write_book_page

# From here on, book pages are stored under a dated directory; this replaces
# the html_dir value set in the configuration cell at the top.
html_dir = '../../data/reviews/Multilingual/Goodreads/HTML-2025-10-22/Canonical_book_pages/'
# Show whether the output directory is already present.
print(os.path.exists(html_dir))
# NOTE(review): this call passes the Response object, whereas the
# language-page download loop further down passes response.text -- confirm
# which type write_book_page expects.
write_book_page(html_dir, url, response)

True


In [6]:
from parse import get_page_filename

# Work out where the page for `url` is stored under html_dir and report
# whether that file is actually on disk.
filename = get_page_filename(html_dir, url)
file_on_disk = os.path.exists(filename)
print('local file:', filename)
print('local file exists:', file_on_disk)

local file: ../../data/reviews/Multilingual/Goodreads/HTML/Canonical_book_pages/16299.And_Then_There_Were_None.html
local file exists: True


Next, download and store the book pages for all URLs in the core dataset.

In [7]:
from download import download_urls

# Hand every Goodreads link of the core dataset to the bulk downloader,
# which stores the pages under html_dir.
book_page_urls = df['gr_EN_link'].tolist()
download_urls(book_page_urls, html_dir)

## Identifying additional books

In [20]:
from parse import get_book_list_pagination_urls

# NOTE(review): `soup` is not defined in this cell or in any cell above it; it
# is only assigned inside the book-list loop in the following cell, so on a
# fresh kernel this cell fails unless that cell has been run first.
# NOTE(review): the return value of this call is discarded here -- confirm
# whether the call is still needed.
get_book_list_pagination_urls(soup)
# Count the rows of the first table on the page.
table = soup.find('table')
trs = table.find_all('tr')
len(trs)

100

In [10]:
import glob
from bs4 import BeautifulSoup


# Directory with locally saved Goodreads book-list pages (one HTML file per list).
book_list_dir = '../../data/reviews/Multilingual/Goodreads/HTML-2025-10-22/Book_list_pages/'
book_list_files = glob.glob(os.path.join(book_list_dir, '*.html'))

print(f"number of book_list_files: {len(book_list_files)}")

# book_map: book_id -> book record (its 'book_lists' entry collects every list
# the book was seen on); pagination_urls: list name -> pagination URLs.
book_map = {}
pagination_urls = {}

# NOTE(review): get_book_list_books is not imported in any cell visible in
# this part of the notebook -- confirm that it is imported before this cell.
for blf in book_list_files:
    print(blf)
    _, filename = os.path.split(blf)
    # File names look like '<list name> (<N> books) _ Goodreads.html'.
    m = re.match(r"^(.*) \((\d+) books\)", filename)
    if m is None:
        # Without a match there is no list name for this file; skip it rather
        # than fail with a NameError or silently reuse the previous file's name.
        print(f"skipping file with unexpected name: {blf}")
        continue
    book_list = m.group(1)
    num_books = m.group(2)
    # Explicit encoding so reading does not depend on the platform default
    # (assumes the saved pages are UTF-8 -- TODO confirm).
    with open(blf, 'rt', encoding='utf-8') as fh:
        soup = BeautifulSoup(fh, features='xml')
    pagination_urls[book_list] = get_book_list_pagination_urls(soup)
    books = get_book_list_books(soup, book_list)
    for book in books:
        if book['book_id'] in book_map:
            # Book already seen on another list: only record the extra list.
            book_map[book['book_id']]['book_lists'].append(book_list)
        else:
            book_map[book['book_id']] = book
        



number of book_list_files: 15
../../data/reviews/Multilingual/Goodreads/HTML-2025-10-22/Book_list_pages/سلسله المكتبة الخضراء للأطفال (65 books) _ Goodreads.html
../../data/reviews/Multilingual/Goodreads/HTML-2025-10-22/Book_list_pages/Africa (fiction and nonfiction) (1747 books) _ Goodreads.html
../../data/reviews/Multilingual/Goodreads/HTML-2025-10-22/Book_list_pages/International Women_ Female Leads From All Over the World (758 books) _ Goodreads.html
../../data/reviews/Multilingual/Goodreads/HTML-2025-10-22/Book_list_pages/Women Around the World (1157 books) _ Goodreads.html
../../data/reviews/Multilingual/Goodreads/HTML-2025-10-22/Book_list_pages/Best Indian Fiction Books (914 books) _ Goodreads.html
../../data/reviews/Multilingual/Goodreads/HTML-2025-10-22/Book_list_pages/100 Best Books of All Time_ The World Library List (100 books) _ Goodreads.html
../../data/reviews/Multilingual/Goodreads/HTML-2025-10-22/Book_list_pages/China Expat Books (99 books) _ Goodreads.html
../../data

In [21]:
# Number of distinct books collected across all book-list pages.
len(book_map)

1261

In [22]:
# Report the books that occur on more than three of the parsed book lists,
# together with the number of lists they occur on.
for book_id in book_map:
    num_book_lists = len(book_map[book_id]['book_lists'])
    if num_book_lists > 3:
        print(book_id, num_book_lists)

669780 4
2364284 4
3554772 4
3438000 4
5295735 4
7704143 4
3243517 4
8239301 4
6219415 4
2501458 4


### Parsing Book Pages, Extracting Language Links

The book pages contain links to pages about the same book, but in different languages. 

In [30]:
# Language codes used to select the language-specific book pages.
# source: https://hreflang.org/list-of-hreflang-codes/

target_langs = [
    'it', # Italian
    'de', # German
    'en', # English
    'es', # Spanish
    'fa', # Persian
    'ps', # Pashto
    'ur', # Urdu
    'nl', # Dutch
    'tr', # Turkish
    'ja', # Japanese
    'zh', # Chinese (macro-language label)

    'pt', # Portuguese
    'fr', # French
    'ko', # Korean
    'ar', # Arabic
    'no', # Norwegian
    'da', # Danish
    'fi', # Finnish
    'sv', # Swedish
    'cs', # Czech
    'pl', # Polish
    'ru', # Russian
    'uk', # Ukrainian
    'sk', # Slovak
    'sl', # Slovenian
    'sr', # Serbian
    'el', # Greek
    'hi', # Hindi
    'hu', # Hungarian
    'id', # Indonesian
]

# Preview the language-specific URL variants of the test book page: the
# language code is inserted as a path segment in front of '/book/'.
for target_lang in target_langs:
    print(url.replace('/book/', f'/{target_lang}/book/'))

https://www.goodreads.com/it/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/de/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/en/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/es/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/fa/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/ps/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/ur/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/nl/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/tr/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/ja/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/zh/book/show/16299.And_Then_There_Were_None


The functions below extract those links for any book page.

In [31]:
from bs4 import BeautifulSoup as bsoup
from parse import get_language_links, filter_language_links

    
# Parse the most recently fetched page (the global `response`) and keep only
# the language links whose language is in target_langs.
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'NoneType' object has no attribute 'find_all'" --
# presumably a lookup inside get_language_links found no matching element on
# this page; confirm against the parse module.
page_soup = bsoup(response.text, 'lxml')
links = get_language_links(page_soup)
filter_language_links(links, target_langs)

AttributeError: 'NoneType' object has no attribute 'find_all'

**Important**: when bulk downloading/crawling, build in some wait time between requests to not overload the Goodreads servers (netiquette) and to not get blacklisted by Goodreads.

Adding a random amount of extra wait time on top of a fixed minimum might help postpone the moment that Goodreads starts throttling server access.

In [34]:
from download import sleep
from parse import read_html_file, get_page_filename


# All canonical book pages that were downloaded earlier.
book_page_files = glob.glob(os.path.join(html_dir, '*.html'))

# Language-specific pages are stored in one sub-directory per language code.
# NOTE(review): this is the undated 'HTML/' directory, while html_dir points
# into 'HTML-2025-10-22/' -- confirm that this is intended.
base_dir = '../../data/reviews/Multilingual/Goodreads/HTML/'


for fname in book_page_files:
    page_soup = read_html_file(fname)
    links = get_language_links(page_soup)
    links = filter_language_links(links, target_langs)
    for link in links:
        lang_dir = os.path.join(base_dir, link.attrs['hreflang'])
        if not os.path.isdir(lang_dir):
            # makedirs also creates missing parent directories.
            os.makedirs(lang_dir, exist_ok=True)
            print(lang_dir)
        lang_file = get_page_filename(lang_dir, link['href'])
        if os.path.exists(lang_file):
            print('file exists:', lang_file)
            continue
        print('downloading', link['href'])
        try:
            # Without a timeout a stalled connection would block the crawl forever.
            response = requests.get(link['href'], timeout=30)
        except requests.RequestException as err:
            print('download failed:', link['href'], err)
        else:
            if response.ok:
                # NOTE(review): the earlier single-page test passes the Response
                # object itself to write_book_page -- confirm which is expected.
                write_book_page(lang_dir, link['href'], response.text)
            else:
                # Do not store error or throttling pages: once written, the
                # exists-check above would skip this URL on every later run.
                print('download failed:', response.status_code, link['href'])
        # Wait after every request attempt so the server is not overloaded.
        sleep(min_sleep_time=5, max_random_time=5)


file exists: ../../data/reviews/Multilingual/Goodreads/HTML/it/50157589-untold-night-and-day.html
file exists: ../../data/reviews/Multilingual/Goodreads/HTML/en/50157589-untold-night-and-day.html
file exists: ../../data/reviews/Multilingual/Goodreads/HTML/fa/50157589-untold-night-and-day.html
file exists: ../../data/reviews/Multilingual/Goodreads/HTML/de/50157589-untold-night-and-day.html
file exists: ../../data/reviews/Multilingual/Goodreads/HTML/es/50157589-untold-night-and-day.html
file exists: ../../data/reviews/Multilingual/Goodreads/HTML/tr/50157589-untold-night-and-day.html
file exists: ../../data/reviews/Multilingual/Goodreads/HTML/nl/50157589-untold-night-and-day.html
file exists: ../../data/reviews/Multilingual/Goodreads/HTML/en/32940867-the-chemist.html
file exists: ../../data/reviews/Multilingual/Goodreads/HTML/es/32940867-the-chemist.html
file exists: ../../data/reviews/Multilingual/Goodreads/HTML/de/32940867-the-chemist.html
file exists: ../../data/reviews/Multilingual/Go