# Extracting Book Metadata from Book Pages


In [1]:
from parse import read_book_review_files

main_dir = '../../data/reviews/Multilingual/Goodreads/HTML/Canonical_book_pages/'
html_dir = '../../data/reviews/Multilingual/Goodreads/HTML/'


book_files = read_book_review_files(html_dir)



In [2]:
book_ids = [book_id for book_id in book_files.keys()]
print('number of book IDs:', len(book_ids))

[book_id for book_id in book_ids if book_id.startswith('39')]

number of book IDs: 265


['39988',
 '393199',
 '39618887',
 '39618887-the-plotters',
 '393199.Down_and_Out_in_Paris_and_London',
 '39988.Matilda']

## Extract Book Metadata and Reviews from Goodreads Book Pages

In [3]:
from parse import get_book_metadata, get_book_reviews
from parse import read_html_file


all_book_metadata = []
all_reviews = []
for book_id in book_files:
    print(book_id, len(book_files[book_id]))
    for book_review_file in book_files[book_id]:
        #if '/nl/' not in filepath:
        #    continue
        #print(book_review_file)
        page = read_html_file(book_review_file)
        book_metadata = get_book_metadata(book_id, book_review_file, page)
        all_book_metadata.append(book_metadata)
        reviews = get_book_reviews(book_id, book_review_file, page)
        all_reviews.extend(reviews)
    #print(json.dumps(reviews[0], indent=4))    
    #break

1656001 3
39988 4
170448 5
7823678 4
7912007 4
12986521 3
41804 5
13661 4
22540125 3
32829 3
16793 4
31912730 2
41865 5
6691 2
38355410 4
1162543 3
46015758 4
6413193 3
13440 3
71904 1
7631105 3
910046 2
3131741 3
84786 3
853510 5
50157589 3
72579 4
428263 3
20926278 4
472343 4
17333227 4
19312 3
960 5
25525419 1
61439040 5
20518872 5
30091914 5
6320534 5
386162 4
17707648 4
9460487 5
13079982 5
32507 3
239399 5
140345 3
22557520 3
13667 4
17347634 5
23492589 4
42975172 5
11564 3
41044147 4
19288043 5
19302 3
797192 3
9646 5
6411961 5
11082037 3
36327117 1
41839602 3
17644 3
16130549 4
393199 5
10569 5
15837724 3
40097951 4
33124137 4
17647 2
36510196 4
16328 5
11588 4
9014 4
5886881 4
23164983 4
18775247 5
36804340 5
10567 4
77289 2
39618887 5
32504 4
35297901 5
18116 3
10644930 5
7864437 5
13648 4
16299 5
6043781 4
4502877 3
7937462 3
2125922 2
389627 5
47701 3
38447 5
32508 5
17650 4
8574333 5
365 4
37903770 5
13596166 4
18209505 2
30553 4
2975046 1
7936809 3
25489025 5
46654 3
968 

How many different book metadata records and reviews do we get?

In [48]:
from collections import Counter


print('number of book metadata records (all languages):', len(all_book_metadata))
for lang, num in Counter([bm['review_file_language'] for bm in all_book_metadata]).most_common():
    print(f"\tlanguage: {lang}\trecords: {num}")
print('number of reviews (all languages):', len(all_reviews))
for lang, num in Counter([bm['review_lang'] for bm in all_reviews]).most_common():
    print(f"\tlanguage: {lang}\trecords: {num}")


number of book metadata records (all languages): 626
	language: Ca	records: 135
	language: it	records: 128
	language: de	records: 120
	language: nl	records: 115
	language: zh	records: 83
	language: ko	records: 45
number of reviews (all languages): 12192
	language: Ca	records: 3870
	language: it	records: 2880
	language: de	records: 2520
	language: nl	records: 2281
	language: zh	records: 532
	language: ko	records: 109


## From Records to DataFrame

Add some book identifier fields derived from other fields:

In [49]:
import os
import pandas as pd

from parse import parse_edition_isbn


df_reviews = pd.DataFrame(all_reviews)
df_meta = pd.DataFrame(all_book_metadata)

df_reviews['reviewed_isbn'] = df_reviews.edition.apply(parse_edition_isbn)
df_meta['book_isbn'] = df_meta.book_isbn.apply(lambda b: None if b == 'null' else b)
df_meta['work_id'] = df_meta.book_url.apply(lambda x: x if pd.isna(x) else os.path.split(x)[-1])

df_reviews.review_lang.value_counts()


Ca    3870
it    2880
de    2520
nl    2281
zh     532
ko     109
Name: review_lang, dtype: int64

The edition field shows that for many records, the edition information only contains the type of edition, for others also the specific book identifier.

In [50]:
df_reviews.edition.value_counts()


Kindle Edition             1032
Paperback                   355
Mass Market Paperback       118
Hardcover                   102
Audible Audio                74
                           ... 
Audio CD 9783867175463        1
Paperback 9788817095471       1
Paperback 9789863613022       1
ebook 9781594746208           1
Paperback 9789029542364       1
Name: edition, Length: 1648, dtype: int64

How many of the reviews list the specific ISBN that has been reviewed (this information is missing for reviews where the edition information has no identifier)?


In [51]:
df_reviews.reviewed_isbn.isna().value_counts()

False    6543
True     5649
Name: reviewed_isbn, dtype: int64

In [6]:
reviewed_isbns = [isbn for isbn in df_reviews.reviewed_isbn.unique() if isbn is not None]
book_isbns = [isbn for isbn in df_meta.book_isbn.unique() if isbn is not None]
gr_book_ids = [gr_id for gr_id in df_meta.goodreads_book_id.unique() if gr_id is not None]
work_ids = [work_id for work_id in df_meta.work_id.unique() if work_id is not None]

len(reviewed_isbns), len(book_isbns), len(gr_book_ids), len(work_ids)

(1637, 114, 265, 137)

## Linking found Book Identifiers to The Impact and Fiction Review Corpus

In [8]:
import pandas as pd

review_file = '../../data/review_features/reviews-stats.tsv.gz'
work_file = '../../data/book_metadata/work_isbn_title_genre.tsv.gz'

work_genre = pd.read_csv(work_file, sep='\t', compression='gzip', dtype={'unesco': str, 'brinkman': str})
work_genre.head(2)

Unnamed: 0,work_id,record_id,record_id_type,work_year,work_author,work_title,work_length,nur,thema,bisac,brinkman,unesco
0,impfic-work-1,97779,odbr,2010.0,['Ingalill Roos'],['Energievreters -- omgaan met mensen waar je ...,,['770'],,,['sociale relaties'],"['Sociologie, statistiek']"
1,impfic-work-1,326085068,ppn,2010.0,['Ingalill Roos'],['Energievreters -- omgaan met mensen waar je ...,,['770'],,,['sociale relaties'],"['Sociologie, statistiek']"


The ImpFic book metadata file has records based on different book identifiers:

In [9]:
work_genre.record_id_type.value_counts()

isbn         189038
ppn          111613
odbr          74758
bvr           47458
goodreads     15980
apart         15745
Name: record_id_type, dtype: int64

In [10]:
reviews = pd.read_csv(review_file, sep='\t', compression='gzip')
reviews.head(2)

Unnamed: 0,source,user_id,review_id,review_num_terms,review_num_words,num_sentences,review_date,review_text,rating,work_id
0,NBD_Biblion,impfic-user-210320,impfic-review-1,206,185,13,,"Als Tom 15 jaar geworden is, verandert zijn le...",,impfic-work-3723
1,NBD_Biblion,impfic-user-210320,impfic-review-2,212,185,7,,Bushnell beschrijft de levens van een aantal b...,,impfic-work-36913


In [22]:
# From the extracted reviews, zoom in on the Goodreads identifiers and ISBNs
gr_book_isbn = df_reviews[['goodreads_book_id', 'goodreads_book_num', 'reviewed_isbn']].dropna().drop_duplicates()
gr_book_isbn

# From the ImpFic book metadata, zoom in on the records with identifiers from Goodreads and ISBNs
work_genre_gr = work_genre[work_genre.record_id_type == 'goodreads']
work_genre_isbn = work_genre[work_genre.record_id_type == 'isbn']

# Find the overlap between ISBNs from the multilingual dataset and the ImpFic dataset
isbn_map = pd.merge(gr_book_isbn, work_genre_isbn[['work_id', 'record_id']], left_on='reviewed_isbn', right_on='record_id', how='inner')

# Find the overlap between Goodreads book identifiers from the multilingual dataset and the ImpFic dataset
gr_id_map = pd.merge(gr_book_isbn, work_genre_gr[['work_id', 'record_id']], left_on='goodreads_book_id', right_on='record_id', how='inner')

# Find the overlap between Goodreads book numbers from the multilingual dataset and the ImpFic dataset
gr_num_map = pd.merge(gr_book_isbn, work_genre_gr[['work_id', 'record_id']], left_on='goodreads_book_num', right_on='record_id', how='inner')

# merge all mappings above into a single frame
gr_impfic_work = pd.concat([isbn_map, gr_id_map, gr_num_map])[['goodreads_book_id', 'goodreads_book_num', 'work_id']].drop_duplicates()
gr_impfic_work


Unnamed: 0,goodreads_book_id,goodreads_book_num,work_id
0,1656001,1656001,impfic-work-6748
2,39988,39988,impfic-work-64050
3,39988,39988,impfic-work-123080
4,39988.Matilda,39988,impfic-work-123080
5,170448,170448,impfic-work-118882
...,...,...,...
66,38447.The_Handmaid_s_Tale,38447,impfic-work-40088
71,7747374-i-am-number-four,7747374,impfic-work-112938
75,18116.His_Dark_Materials,18116,impfic-work-4158
84,36804340-the-good-son,36804340,impfic-work-115991


This extra source of book-work mapping shows that two works in the Impact and Fiction dataset should be merged:

In [16]:
work_genre[work_genre.work_id.isin(['impfic-work-64050', 'impfic-work-123080'])][['work_id', 'work_title', 'work_author']].drop_duplicates()

Unnamed: 0,work_id,work_title,work_author
322470,impfic-work-64050,['Matilda'],['Roald Dahl']
448984,impfic-work-123080,['Matilda'],[]


In [23]:
# The number of distinct ImpFic work IDs
gr_impfic_work.work_id.nunique()

142

In [24]:
gr_impfic_work

Unnamed: 0,goodreads_book_id,goodreads_book_num,work_id
0,1656001,1656001,impfic-work-6748
2,39988,39988,impfic-work-64050
3,39988,39988,impfic-work-123080
4,39988.Matilda,39988,impfic-work-123080
5,170448,170448,impfic-work-118882
...,...,...,...
66,38447.The_Handmaid_s_Tale,38447,impfic-work-40088
71,7747374-i-am-number-four,7747374,impfic-work-112938
75,18116.His_Dark_Materials,18116,impfic-work-4158
84,36804340-the-good-son,36804340,impfic-work-115991


It also shows that in the ImpFic dataset, some books in the same series have been mapped to a single work:

In [32]:
gr_impfic_work[['goodreads_book_num', 'work_id']].drop_duplicates().work_id.value_counts()

impfic-work-12095     3
impfic-work-48738     2
impfic-work-6748      1
impfic-work-40088     1
impfic-work-45423     1
                     ..
impfic-work-10657     1
impfic-work-15681     1
impfic-work-27088     1
impfic-work-90392     1
impfic-work-114830    1
Name: work_id, Length: 142, dtype: int64

In [35]:
# Which books have been merged into a single work ID in the ImpFic dataset?
df_meta[df_meta.goodreads_book_num.isin(gr_impfic_work[gr_impfic_work.work_id == 'impfic-work-12095'].goodreads_book_num)]

Unnamed: 0,goodreads_book_id,goodreads_book_num,source_url,review_file_language,book_title,book_description,book_url,book_image,book_type,book_author,book_isbn,book_page_count,work_id
44,41865,41865,https://www.goodreads.com/it/book/show/41865.T...,it,"Twilight (The Twilight Saga, #1)",About three things I was absolutely positive. ...,https://www.goodreads.com/work/best_book/32122...,https://i.gr-assets.com/images/S/compressed.ph...,books.book,https://www.goodreads.com/author/show/941441.S...,9780316015844.0,498.0,3212258-twilight
45,41865,41865,https://www.goodreads.com/book/show/41865.Twil...,zh,"Twilight (The Twilight Saga, #1)",About three things I was absolutely positive. ...,https://www.goodreads.com/work/3212258-twilight,https://images-na.ssl-images-amazon.com/images...,books.book,,,,3212258-twilight
46,41865,41865,https://www.goodreads.com/nl/book/show/41865.T...,nl,"Twilight (The Twilight Saga, #1)",About three things I was absolutely positive. ...,https://www.goodreads.com/work/best_book/32122...,https://i.gr-assets.com/images/S/compressed.ph...,books.book,https://www.goodreads.com/author/show/941441.S...,9780316015844.0,498.0,3212258-twilight
47,41865,41865,https://www.goodreads.com/de/book/show/41865.T...,de,"Twilight (The Twilight Saga, #1)",About three things I was absolutely positive. ...,https://www.goodreads.com/work/best_book/32122...,https://i.gr-assets.com/images/S/compressed.ph...,books.book,https://www.goodreads.com/author/show/941441.S...,9780316015844.0,498.0,3212258-twilight
48,41865,41865,https://www.goodreads.com/ko/book/show/41865.T...,ko,"Twilight (The Twilight Saga, #1)",About three things I was absolutely positive. ...,https://www.goodreads.com/work/best_book/32122...,https://i.gr-assets.com/images/S/compressed.ph...,books.book,https://www.goodreads.com/author/show/941441.S...,9780316015844.0,498.0,3212258-twilight
55,1162543,1162543,https://www.goodreads.com/it/book/show/1162543...,it,"Breaking Dawn (The Twilight Saga, #4)","""Don't be afraid,"" I murmured. ""We belong toge...",https://www.goodreads.com/work/best_book/29605...,https://i.gr-assets.com/images/S/compressed.ph...,books.book,https://www.goodreads.com/author/show/941441.S...,9780316067928.0,756.0,2960529-breaking-dawn
56,1162543,1162543,https://www.goodreads.com/nl/book/show/1162543...,nl,"Breaking Dawn (The Twilight Saga, #4)","""Don't be afraid,"" I murmured. ""We belong toge...",https://www.goodreads.com/work/best_book/29605...,https://i.gr-assets.com/images/S/compressed.ph...,books.book,https://www.goodreads.com/author/show/941441.S...,9780316067928.0,756.0,2960529-breaking-dawn
57,1162543,1162543,https://www.goodreads.com/de/book/show/1162543...,de,"Breaking Dawn (The Twilight Saga, #4)","""Don't be afraid,"" I murmured. ""We belong toge...",https://www.goodreads.com/work/best_book/29605...,https://i.gr-assets.com/images/S/compressed.ph...,books.book,https://www.goodreads.com/author/show/941441.S...,9780316067928.0,756.0,2960529-breaking-dawn
422,49041,49041,https://www.goodreads.com/it/book/show/49041.N...,it,"New Moon (The Twilight Saga, #2)",There is an alternate cover edition for ISBN13...,https://www.goodreads.com/work/best_book/32039...,https://i.gr-assets.com/images/S/compressed.ph...,books.book,https://www.goodreads.com/author/show/89021.Sy...,,563.0,3203964-new-moon
423,49041,49041,https://www.goodreads.com/zh/book/show/49041.N...,zh,"New Moon (The Twilight Saga, #2)",There is an alternate cover edition for ISBN13...,https://www.goodreads.com/work/best_book/32039...,https://i.gr-assets.com/images/S/compressed.ph...,books.book,https://www.goodreads.com/author/show/89021.Sy...,,563.0,3203964-new-moon


In [52]:
print('number of distinct reviews:', reviews[reviews.work_id.isin(gr_impfic_work.work_id)].shape[0])

gr_impfic_reviews = pd.merge(gr_impfic_work[['goodreads_book_num', 'work_id']].drop_duplicates(), reviews, on='work_id', how='inner')

print('number of reviews mapped to goodreads book numbers:', len(gr_impfic_reviews))

# distribution of reviews from differences platforms:
gr_impfic_reviews.source.value_counts()

number of distinct reviews: 5326
number of reviews mapped to goodreads book numbers: 6278


Bol            3099
Hebban         1795
WLJN            796
Goodreads       270
Dizzie          264
NBD_Biblion      54
Name: source, dtype: int64

In [53]:
gr_impfic_reviews.to_csv('../data/multilingual_books-impfic_reviews.tsv.gz', sep='\t', index=False, compression='gzip')