In [1]:
# Reload the autoreload extension and use mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2


In [3]:
import glob
import gzip
import json
import os
import spacy

from language import lang_code_map, spacy_model_map

lang_code_map

{'Arabic': 'ar',
 'Czech': 'cs',
 'Danish': 'da',
 'German': 'de',
 'Greek': 'el',
 'English': 'en',
 'Spanish': 'es',
 'Persian': 'fa',
 'Finnish': 'fi',
 'French': 'fr',
 'Hindi': 'hi',
 'Hungarian': 'hu',
 'Indonesian': 'id',
 'Italian': 'it',
 'Japanese': 'ja',
 'Korean': 'ko',
 'Dutch': 'nl',
 'Norwegian': 'no',
 'Polish': 'pl',
 'Pashto': 'ps',
 'Portuguese': 'pt',
 'Russian': 'ru',
 'Slovak': 'sk',
 'Slovenian': 'sl',
 'Serbian': 'sr',
 'Swedish': 'sv',
 'Turkish': 'tr',
 'Ukranian': 'uk',
 'Urdu': 'ur',
 'Chinese': 'zh'}

In [4]:
# Languages whose reviews should be linguistically parsed; extend this list
# with any other name that is a key of lang_code_map.
languages = ['English', 'Italian', 'Spanish']

# Look up the code that lang_code_map assigns to each selected language name.
lang_codes = [lang_code_map[name] for name in languages]
lang_codes

['en', 'it', 'es']

## Load the spaCy models

You need these models either to parse the review text yourself or to read the pre-parsed reviews from file (loading the pre-parsed reviews uses each model's vocabulary).

In [5]:
from language import load_language_nlp_model

# Map each selected language code to whatever load_language_nlp_model returns
# for it (presumably the loaded spaCy pipeline — confirm in language.py).
lang_nlp = {lang: load_language_nlp_model(lang) for lang in lang_codes}

# Reading book metadata and reviews

In [30]:
import pandas as pd

# Relative path, resolved against the notebook's working directory.
book_meta_file = '../data/book_metadata.csv'
# Despite the .csv extension the file is read as tab-separated (sep='\t').
book_df = pd.read_csv(book_meta_file, sep='\t')
# Preview the first two rows.
book_df.head(2)

Unnamed: 0,source_file,source_url,book_id,book_title,book_description,book_author,book_author_url,genres,format,num_pages,publication_date,rating_avg,rating_count,review_count,canonical_url
0,../data/Book_language_pages/en/19288043-gone-g...,https://www.goodreads.com/en/book/show/1928804...,19288043,Gone Girl,An alternative cover edition for this ISBN can...,['Gillian Flynn'],['https://www.goodreads.com/author/show/2383.G...,"['Fiction', 'Mystery', 'Thriller', 'Book Club'...",Paperback,415.0,2012-05-24T00:00:00,4.14,3399892,167690,https://www.goodreads.com/book/show/19288043-g...
1,../data/Book_language_pages/en/41865.Twilight....,https://www.goodreads.com/en/book/show/41865.T...,41865,Twilight,About three things I was absolutely positive. ...,['Stephenie Meyer'],['https://www.goodreads.com/author/show/941441...,"['Fantasy', 'Young Adult', 'Romance', 'Fiction...",Paperback,498.0,2005-10-05T00:00:00,3.67,7211130,146232,https://www.goodreads.com/book/show/41865.Twil...


In [31]:
def read_reviews(review_file):
    """Read reviews from a gzip-compressed JSON Lines file.

    Parameters
    ----------
    review_file : str or path-like
        Path to a gzipped text file holding one JSON document per line.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII review text does not depend on
    # the platform's default text encoding.
    with gzip.open(review_file, 'rt', encoding='utf-8') as fh:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
        return [json.loads(line) for line in fh if line.strip()]

review_dir = '../data/lang_reviews/'
review_files = glob.glob(os.path.join(review_dir, '*'))
# Key every review file by the two characters that follow the last 'lang_' in
# its path. NOTE(review): review_dir itself contains 'lang_', so a file whose
# name lacks 'lang_' would be keyed by text taken from the directory name.
review_file_map = {rf.split('lang_')[-1][:2]: rf for rf in review_files}

# Read the reviews of every selected language and report how many were found.
reviews = {}
for lang in lang_codes:
    reviews[lang] = read_reviews(review_file_map[lang])
    print(f"{len(reviews[lang])} reviews for language {lang}")

# Flatten the per-language lists into a single frame, one row per review.
review_df = pd.DataFrame([review for lang in reviews for review in reviews[lang]])

# pd.merge defaults to an inner join, so reviews whose book_id is absent from
# book_df are dropped without notice. Count them first and report the loss.
num_unmatched = int((~review_df['book_id'].isin(book_df['book_id'])).sum())
if num_unmatched:
    print(f"warning: {num_unmatched} reviews have no matching book_id in book_df and are dropped by the merge")

# Attach the book title and author to every review that has book metadata.
review_df = pd.merge(review_df, book_df[['book_id', 'book_title', 'book_author']], on='book_id')

8924 reviews for language en
6128 reviews for language it
6254 reviews for language es


In [8]:
# Preview the first two rows of the merged review/book frame.
review_df.head(2)

Unnamed: 0,review_text,user_id,review_id,review_date,shelf_status,user_shelves,rating,book_id,source_url,review_lang,book_title,book_author
0,"Oh, Emma. Emma, Emma, Emma. Darling, why must ...",4189f79d23c4bd1eac89950d91d80c5af9c51261eb6539...,33fd62c4107560c5153864c4394a7d9ea4120c76e9d41d...,2011-07-05T00:00:00,,"[19th-century, examined-lives, fiction]",4.0,2175,https://goodreads.com/book/show/2175.Madame_Bo...,en,Madame Bovary,"['Gustave Flaubert', 'Hans van Pinxteren']"
1,welcome to...MADAME BOVUARY! you know what tim...,a2e726dd7be2f50e9a00a9327f153af4676a8bb0f5f7f6...,46f530f863cad4fbda780805ae3360768ad3db4f3c0f2e...,2023-02-26T00:00:00,,"[3-and-a-half-stars, classics, gorgeous-covers]",3.0,2175,https://goodreads.com/book/show/2175.Madame_Bo...,en,Madame Bovary,"['Gustave Flaubert', 'Hans van Pinxteren']"


In [32]:
# Count reviews per language for every book (one row per book, one column per
# review language); combinations without reviews are shown as 0.0.
(
    review_df
    .groupby(['book_id', 'book_title', 'book_author'])['review_lang']
    .value_counts()
    .unstack()
    .fillna(0.0)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,review_lang,es,it
book_id,book_title,book_author,Unnamed: 3_level_1,Unnamed: 4_level_1
11,The Hitchhiker’s Guide to the Galaxy,['Douglas Adams'],30.0,30.0
93,Heidi,"['Johanna Spyri', 'Angelo Rinaldi', 'Beverly Cleary']",30.0,30.0
320,One Hundred Years of Solitude,"['Gabriel García Márquez', 'Gregory Rabassa']",30.0,30.0
343,Perfume: The Story of a Murderer,"['Patrick Süskind', 'John E. Woods']",30.0,30.0
656,War and Peace,"['Leo Tolstoy', 'Aylmer Maude', 'Louise Maude']",30.0,30.0
...,...,...,...,...
61439040,1984,"['George Orwell', 'Thomas Pynchon']",30.0,30.0
77265004,The Iliad,"['Homer', 'Emily Wilson']",30.0,30.0
127441416,The Diary of a Young Girl,['Anne Frank'],30.0,30.0
129915654,Pride and Prejudice,['Jane Austen'],30.0,30.0


In [10]:
# show an example review; ensure_ascii=False prints accented characters as-is
# instead of \uXXXX escapes, which keeps non-English reviews readable
print(json.dumps(reviews['it'][0], indent=4, ensure_ascii=False))

{
    "review_text": "C'EST MOI   Meravigliosa come sempre, semplicemente perfetta, Isabelle Huppert nell\u2019adattamento del 1991 firmato da Claude Chabrol. Letto un paio di volte e sempre amato. Uno dei massimi capolavori della letteratura, secondo me. Flaubert \u00e8 uno dei sommi: me lo immagino di notte, solo nella sua casa di Rouen, che sono ovviamente stato a visitare, al lume di candela, che 'recita' le parole scritte, ancora e ancora, urlandole, cancellando, limando, riscrivendo, fino a trovare la formula giusta, quella perfetta. Le mot juste.  Perch\u00e9, lui \u00e8 con la perfezione che si misurava.  E alla perfezione si \u00e8 avvicinato, e, secondo me, la perfezione ha raggiunto. Realistico, il romanzo certamente lo \u00e8: non contiene nulla che non sia esistito nella vita reale (e facilissimo da riscontrare attraverso sopralluoghi e testimonianze); e anche se sbuffa ogni tanto \"nulla in questa storia \u00e8 tratto dalla vita, \u00e8 totalmente inventata\", non c'\u00e

## Loading parsed reviews

In [12]:
from analyse import read_from_doc_bins


# Directory that is handed to read_from_doc_bins for every language.
parsed_dir = '../data/spacy_doc_bins/'
# Parsed reviews per language code.
parsed_reviews = {}
for lang in lang_codes:
    # NOTE(review): parsed_file is built here but not used in this cell;
    # read_from_doc_bins receives the directory, not this path — confirm intent.
    parsed_file = os.path.join(parsed_dir, f"parsed_reviews-{lang}.doc_bin")
    # The vocab of the language's loaded model is passed along; presumably it
    # is needed to deserialize the stored docs — TODO confirm in analyse.py.
    parsed_reviews[lang] = read_from_doc_bins(lang, parsed_dir, lang_nlp[lang].vocab)
    # Report how many parsed items were loaded for this language.
    print(lang, len(parsed_reviews[lang]))

en 5580
it 6128
es 6254


In [13]:
from collections import defaultdict, Counter

lang = 'en'
# number of parsed reviews (documents) for the chosen language
num_reviews = len(parsed_reviews[lang])

# term_freq: how often each term occurs across all reviews
# doc_freq: in how many reviews each term occurs at least once
term_freq, doc_freq = Counter(), Counter()

for doc in parsed_reviews[lang]:
    # case-insensitive token texts of this review
    terms = [token.text.lower() for token in doc]
    term_freq.update(terms)
    # de-duplicate so that a term counts only once per review
    doc_freq.update(set(terms))
    

In [17]:
# len(term_freq) is the number of distinct lowercased terms; the sum of its
# counts is the total number of tokens.
print(f"total number of terms: {len(term_freq):,}\ttokens: {sum(term_freq.values()):,}")

total number of terms: 139,286	tokens: 2,979,373


In [18]:
# The ten terms with the highest total frequency.
term_freq.most_common(10)

[(',', 133851),
 ('.', 113942),
 ('the', 111651),
 ('and', 65077),
 ('of', 60654),
 ('a', 56884),
 ('to', 53771),
 ('i', 41909),
 ('is', 36210),
 ('in', 35373)]

In [19]:
# The ten terms that occur in the largest number of reviews.
doc_freq.most_common(10)

[('.', 5259),
 (',', 4776),
 ('a', 4570),
 ('the', 4428),
 ('and', 4359),
 ('to', 4288),
 ('of', 4277),
 ('i', 4186),
 ('this', 4109),
 ('in', 4102)]

In [24]:
stopword_file = '../resources/stopwords-en.json'
# Read as UTF-8 explicitly: without encoding=, open() falls back to the
# platform's default text encoding, which may not be UTF-8.
with open(stopword_file, 'rt', encoding='utf-8') as fh:
    # a set removes duplicate entries and allows fast membership tests
    stopwords = set(json.load(fh))

print(f"number of stopwords: {len(stopwords)}")

number of stopwords: 1298


In [25]:
# minimum document frequency: terms found in fewer than this many reviews are
# treated as rare and left out
min_df = 10
# keep only terms that are frequent enough and are not stopwords
vocab = {
    term
    for term, num_docs in doc_freq.items()
    if num_docs >= min_df and term not in stopwords
}
print(f"original vocabulary size: {len(doc_freq)}")
print(f"filtered vocabulary size: {len(vocab)}")


original vocabulary size: 139286
filtered vocabulary size: 13099


In [26]:
# One row per vocabulary term with its total frequency (tf) and document frequency (df).
pd.DataFrame([{'term': term, 'tf': term_freq[term], 'df': doc_freq[term]} for term in vocab])

Unnamed: 0,term,tf,df
0,decay,49,27
1,playing,174,147
2,«,540,134
3,لغة,19,17
4,concepto,14,12
...,...,...,...
13094,noisy,16,16
13095,label,29,26
13096,flynn,150,56
13097,indulge,22,21
