# Part 1: Looking at metadata and some term frequencies

# 0. Preparatory stuff
First we need to set up some stuff, e.g. importing the right modules and libs.
It's not particularly interesting, but very much needed.

In [None]:
import sys

!{sys.executable} -m pip install stopwordsiso


In [None]:
# Do all imports
import glob
import gzip
import json
import os
import requests

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import stopwordsiso
from scipy import stats


# Download locations (SURF drive) of the data files, keyed by two-letter language code.

# Per-language review files; download_reviews() further down reads each one as gzipped JSON lines.
review_json_urls = {
    "ar": "https://surfdrive.surf.nl/public.php/dav/files/6EjpYBae65JKq35",
    "da": "https://surfdrive.surf.nl/public.php/dav/files/QyRbNXRjKH5qAoF",
    "de": "https://surfdrive.surf.nl/public.php/dav/files/qs3iD76PXpDZjdL",
    "en": "https://surfdrive.surf.nl/public.php/dav/files/FDWCKim3mQAn592",
    "es": "https://surfdrive.surf.nl/public.php/dav/files/7wYxiRNfdtyH6kd",
    "fa": "https://surfdrive.surf.nl/public.php/dav/files/mKYKiMRTMWesFDX",
    "fr": "https://surfdrive.surf.nl/public.php/dav/files/PXnYeBFgXb4iHpN",
    "it": "https://surfdrive.surf.nl/public.php/dav/files/Bpmc9Nf5Re2s5Eq",
    "ja": "https://surfdrive.surf.nl/public.php/dav/files/BceKXZ4QiM5G8mc",
    "ko": "https://surfdrive.surf.nl/public.php/dav/files/2o6P6aq4csEWr3g",
    "nl": "https://surfdrive.surf.nl/public.php/dav/files/GHtmzgGaTJwiRaR",
    "pl": "https://surfdrive.surf.nl/public.php/dav/files/eBe3XPzgm8sBzy3",
    "pt": "https://surfdrive.surf.nl/public.php/dav/files/MBazLdF5doNm9Rt",
    "sl": "https://surfdrive.surf.nl/public.php/dav/files/y3ZdqAwTfSPC4Ay",
    "sv": "https://surfdrive.surf.nl/public.php/dav/files/qRr9SbbtgLP2R9n",
    "tr": "https://surfdrive.surf.nl/public.php/dav/files/2rmd6M5EcBrKKTD",
    "uk": "https://surfdrive.surf.nl/public.php/dav/files/KTDMfpnKYP8LQDP"
}
# Per-language files with pre-parsed reviews; download_spacy_doc_bin() further down
# loads each one as a spaCy DocBin.
# Note: there are no entries for "ar" and "tr" here, although both have review files above.
spacy_doc_bin_urls = {
    "da": "https://surfdrive.surf.nl/public.php/dav/files/ewCq2dwEF5AR9B9",
    "de": "https://surfdrive.surf.nl/public.php/dav/files/8or8rxeiZjEZsLt",
    "en": "https://surfdrive.surf.nl/public.php/dav/files/zyCbNoSLLHrCkij",
    "es": "https://surfdrive.surf.nl/public.php/dav/files/GFk53JrcYSg3Gk3",
    "fa": "https://surfdrive.surf.nl/public.php/dav/files/yH6krzA8YoELSiR",
    "fr": "https://surfdrive.surf.nl/public.php/dav/files/nxqsbtWHz5BQASJ",
    "it": "https://surfdrive.surf.nl/public.php/dav/files/qatCX5Af6RkCX5E",
    "ja": "https://surfdrive.surf.nl/public.php/dav/files/dBrkkpt2g29cwNS",
    "ko": "https://surfdrive.surf.nl/public.php/dav/files/Bbrb2xaEDj6zndm",
    "nl": "https://surfdrive.surf.nl/public.php/dav/files/CFCkFWdHs27GsNE",
    "pl": "https://surfdrive.surf.nl/public.php/dav/files/kDxtrb3Tp4aPezP",
    "pt": "https://surfdrive.surf.nl/public.php/dav/files/nwkTAsTedbBiYyH",
    "sl": "https://surfdrive.surf.nl/public.php/dav/files/gYDJTB54CkaQob2",
    "sv": "https://surfdrive.surf.nl/public.php/dav/files/QNYX5dj2LigPDof",
    "uk": "https://surfdrive.surf.nl/public.php/dav/files/BCRg3SnZsa3deYT"
}

# Maps two-letter language codes to English language names.
# NOTE: this list is broader than the set of languages with downloadable reviews;
# e.g. 'cs' has no entry in review_json_urls.
code_lang_map = {
    'ar': 'Arabic',
    'cs': 'Czech',
    'da': 'Danish',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'es': 'Spanish',
    'fa': 'Persian',
    'fi': 'Finnish',
    'fr': 'French',
    'hi': 'Hindi',
    'hu': 'Hungarian',
    'id': 'Indonesian',
    'it': 'Italian',
    'ja': 'Japanese',
    'ko': 'Korean',
    'nl': 'Dutch',
    'no': 'Norwegian',
    'pl': 'Polish',
    'ps': 'Pashto',
    'pt': 'Portuguese',
    'ru': 'Russian',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'sr': 'Serbian',
    'sv': 'Swedish',
    'tr': 'Turkish',
    'uk': 'Ukrainian',  # was misspelled 'Ukranian', so looking up 'Ukrainian' failed
    'ur': 'Urdu',
    'zh': 'Chinese' # (macro-language label)
}

# Reverse lookup: language name -> two-letter code.
lang_code_map = {lang: code for code, lang in code_lang_map.items()}
# Keep the earlier misspelling working for any code that still looks it up.
lang_code_map.setdefault('Ukranian', 'uk')


In [None]:
# Names of the languages whose reviews you want to parse and analyse.
# Extend the list with further language names to include more of them.
languages = ['English', 'Italian', 'Spanish', 'German']

# Look up the two-letter code that belongs to each chosen language name.
lang_codes = [lang_code_map[name] for name in languages]
lang_codes

## Load the SpaCy models

You only need the models for individual languages if you want to do your own parsing of the review text. In this notebook, you read the pre-parsed reviews from file, so only a single model is needed.

In [None]:
# For now we'll just load the English parser model.
# Its vocab is what gets passed to download_spacy_doc_bin() further down, where the
# pre-parsed reviews are reconstructed.
# NOTE(review): 'en_core_web_lg' has to be installed already; no cell visible in this
# notebook downloads it — confirm for your environment.
lang_nlp = spacy.load('en_core_web_lg')

# 1. Reading and inspecting book metadata

The book metadata that we scraped from Goodreads is stored in the file `book_metadata.csv`. Let's load that and see what fields (columns) the metadata contains.

In [None]:
# the URL for the book metadata
book_meta_url = "https://surfdrive.surf.nl/public.php/dav/files/bN8qFtH4BKtJADC/"

# Always pass a timeout: without one, requests waits indefinitely when the server
# stops responding, which would leave this cell hanging.
response = requests.get(book_meta_url, timeout=60)
if response.status_code != 200:
    raise ValueError(f"Failed to download book metadata file, with HTTP code {response.status_code}")
# The response body is parsed as JSON and then loaded into a DataFrame.
book_meta_data = response.json()

book_df = pd.DataFrame(book_meta_data)
# Show the first two rows to get a feel for the available columns.
book_df.head(2)

### Discussion questions
* Are there possible fields that you miss in the metadata?
* What potentially interesting questions would you not be able to answer with these metadata?
* Would there be other ways to obtain that data?

## Basic stats
Let's do some basic inspection of the metadata we have. Let's start with a histogram that tells us about how the number of times books get reviewed is distributed. 

In [None]:
# Histogram of the 'rating_count' column, split into 10 bins.
book_df.hist( column='rating_count', bins=10 )

Wow, those numbers are huge. What is the highest and lowest number of ratings?

In [None]:
# Largest and smallest value found in the 'rating_count' column.
print( 'max:', book_df['rating_count'].max() )
print( 'min:', book_df['rating_count'].min() )

### Exercise
When dealing with huge numbers it is often more insightful to use a log scale. <br/>
Create a histogram based on the base-10 logarithm (log10) of 'rating_count'. <br/>
Based on the histogram estimate what the most frequent number of times is that books receive a rating.

In [None]:
# Add a column holding the base-10 logarithm of 'rating_count' and plot its histogram.
# NOTE(review): a rating_count of 0 would map to -inf here — confirm the data has none.
book_df['log_rating_count'] = np.log10(book_df['rating_count'])
book_df.hist( column='log_rating_count', bins=25 )

## Popularity vs rating

The number of ratings and reviews a book gets is probably a good proxy for its popularity. It is also not unimaginable that popularity relates to rating, in the sense that popular books may be rated higher than not so popular books.

To investigate this, let's first create a scatter plot of review counts versus the mean rating.

In [None]:
# Add a column holding the logarithm of 'review_count' and plot it against the average rating.
# Note: np.log is the natural logarithm, whereas 'log_rating_count' was computed with np.log10.
book_df['log_review_count'] = np.log(book_df['review_count'])
book_df.plot.scatter( x='log_review_count', y='rating_avg' )

That does not seem to give us much reason to suspect a strong relation between popularity and average rating. <br/>
But how can we know for sure? Let's try a linear regression.

In [None]:
# Fit a straight line that predicts 'rating_avg' from 'log_review_count'.
log_review_counts = book_df['log_review_count']
slope, intercept, pearson_r, p_value, std_err = stats.linregress( log_review_counts, book_df['rating_avg'] )

def lin_reg_func(x):
    """Return the value of the fitted regression line at x."""
    return intercept + slope * x

# Evaluate the fitted line at every observed x value.
lin_model = [ lin_reg_func(x) for x in log_review_counts ]

# Draw the scatter plot again and overlay the fitted line on the same axes.
ax = book_df.plot.scatter( x='log_review_count', y='rating_avg' )
ax.plot( log_review_counts, lin_model )


### Question
How do you evaluate the relation between review count and mean rating?

In [None]:
# The correlation coefficient returned by the regression above, shown with two decimals.
f'{pearson_r:.2f}'

# 2. Investigating review level metadata

If you are using the Google Colab version of the notebook, the 'raw' reviews need to be downloaded from SURF drive. <br/>
We need a bit of file reading and data mangling to read in the review data for the chosen languages…

In [None]:
import io


def download_reviews(lang_code):
    """Download and parse the reviews for one language.

    Args:
        lang_code: a key of review_json_urls (a two-letter language code).

    Returns:
        A list with one parsed JSON object per non-empty line of the
        downloaded gzip file.

    Raises:
        KeyError: if lang_code has no entry in review_json_urls.
        ValueError: if the download does not answer with HTTP status 200.
    """
    if lang_code not in review_json_urls:
        choices = list(review_json_urls.keys())
        raise KeyError(f"'{lang_code}' is not a valid language code. Please choose from {choices}")
    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    response = requests.get(review_json_urls[lang_code], timeout=60)
    if response.status_code != 200:
        # Fail here with a clear message instead of returning None, which would
        # only surface later as a confusing TypeError in the calling code.
        raise ValueError(f"Failed to download reviews for '{lang_code}', with HTTP code {response.status_code}")
    # Decode explicitly as UTF-8: the default text encoding depends on the local
    # platform and can garble or reject the non-ASCII characters in the reviews.
    with gzip.open(io.BytesIO(response.content), 'rt', encoding='utf-8') as fh:
        # Skip blank lines (e.g. a trailing newline), which are not valid JSON.
        return [json.loads(line) for line in fh if line.strip()]


# Fetch the reviews of every selected language and report how many there are.
reviews = {}
for lang in lang_codes:
    lang_reviews = download_reviews(lang)
    reviews[lang] = lang_reviews
    print(f"{len(lang_reviews)} reviews for language {lang}")

# Collect the reviews of all languages in a single table and attach the title
# and author of the reviewed book to every row.
all_reviews = [review for reviews_of_lang in reviews.values() for review in reviews_of_lang]
book_info = book_df[['book_id', 'book_title', 'book_author']]
review_df = pd.DataFrame(all_reviews).merge(book_info, on='book_id')

Let's see what the review data looks like…

In [None]:
# Show the first two rows of the combined review table.
review_df.head(2)

### Exercise 
We can also inspect individual reviews, using the code below.
Change and rerun the code several times to get an impression of reviews in different languages.

In [None]:
# show an example review
# ensure_ascii=False keeps accented and non-Latin characters readable; by default
# json.dumps would print them as \uXXXX escape sequences.
print(json.dumps(reviews['it'][0], indent=4, ensure_ascii=False))

### Question

You might be tempted to investigate the popularity of the same book in different languages by comparing review counts between languages. <br/>
The code below shows you why this data set is not adequate for such an approach. Can you explain why that is? <br/>
Can you think of a different approach that might be attempted with this data?

In [None]:
# Count the reviews per book and review language: one row per book, one column per
# language, with 0.0 where a book has no reviews in that language.
review_df.groupby(['book_id', 'book_title', 'book_author']).review_lang.value_counts().unstack().fillna(0.0)

In [None]:
def avg_rating_by_lang( book_id, langs=( 'en', 'de', 'it', 'es' ), df=None ):
    """Print the mean rating of one book for each of the given review languages.

    Args:
        book_id: value to match against the 'book_id' column.
        langs: review language codes to report on; defaults to the four
            languages used throughout this notebook.
        df: frame with 'book_id', 'book_title', 'review_lang' and 'rating'
            columns; defaults to the notebook-level review_df.

    Raises:
        IndexError: if no row has the given book_id.
    """
    if df is None:
        df = review_df
    # Select the rows of this book once and filter by language within that
    # selection. Chaining two boolean filters with a mask built from the full
    # frame makes pandas reindex the mask and emit a warning.
    book_reviews = df.loc[ df['book_id'] == book_id ]
    if book_reviews.empty:
        raise IndexError( f'No reviews found for book_id {book_id}' )
    book_title = book_reviews['book_title'].iloc[0]
    print( f'{book_title}…' )
    for lang in langs:
        # The mean is NaN (printed as 'nan') when a language has no reviews of this book.
        mean_rating = book_reviews.loc[ book_reviews['review_lang'] == lang, 'rating' ].mean()
        print( f'{lang}: {mean_rating:.3f}' )
    print( '---' )
    
# Example calls for two books.
# NOTE(review): 320 and 11 are assumed to be book_id values present in review_df — confirm.
avg_rating_by_lang( 320 )
avg_rating_by_lang( 11 )


### Exercise
The above observation might lead to the hypothesis that different languages (i.e. different cultures?) rate differently. <br/>
For instance: might Italians rate books higher on average than Germans? <br/>
Can you adapt the `groupby` statement above to generate a general average rating per language?


In [None]:
# Starting point for the exercise: the per-book, per-language review counts, unchanged.
review_df.groupby(['book_id', 'book_title', 'book_author']).review_lang.value_counts().unstack().fillna(0.0)

# One possible solution (uncomment to try it): the mean rating per review language.
# review_df.groupby( [ 'review_lang' ] ).rating.mean()


### Question
Is there a significant difference between the languages? How would you evaluate?

# 3. Inspecting reviews

## Loading parsed reviews

It is possible to parse the texts of reviews, giving you POS tagging, lemmatization, syntactic information, etc. However the process takes quite some time. So, for the practical purpose of this workshop we are using pre-parsed data. We leave it as an exercise to the user to try out the various language parsers.

In [None]:
from spacy.tokens import DocBin


def download_spacy_doc_bin(lang_code, vocab):
    """Download the pre-parsed reviews for one language.

    Args:
        lang_code: a key of spacy_doc_bin_urls (a two-letter language code).
        vocab: passed to DocBin.get_docs() to reconstruct the parsed documents.

    Returns:
        A list with the documents stored in the downloaded DocBin.

    Raises:
        KeyError: if lang_code has no entry in spacy_doc_bin_urls.
        ValueError: if the download does not answer with HTTP status 200.
    """
    if lang_code not in spacy_doc_bin_urls:
        choices = list(spacy_doc_bin_urls.keys())
        raise KeyError(f"'{lang_code}' is not a valid language code. Please choose from {choices}")
    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    response = requests.get(spacy_doc_bin_urls[lang_code], timeout=60)
    if response.status_code != 200:
        # Fail here with a clear message instead of returning None, which would
        # only surface later as a confusing TypeError in the calling code.
        raise ValueError(f"Failed to download parsed reviews for '{lang_code}', with HTTP code {response.status_code}")
    doc_bin = DocBin().from_bytes(response.content)
    return list(doc_bin.get_docs(vocab))


# Load the pre-parsed reviews of every selected language and report how many
# documents each language has.
parsed_reviews = {}
for lang in lang_codes:
    lang_docs = download_spacy_doc_bin(lang, lang_nlp.vocab)
    parsed_reviews[lang] = lang_docs
    print(lang, len(lang_docs))

## Comparing stats by language
We have similar numbers of reviews per language. Use the code below to gauge if this means we also have comparable amounts of tokens and words too.

In [None]:
from collections import defaultdict, Counter

# Word statistics per language. Every list is filled in the order of 'languages'.
frequencies = {
    # NOTE(review): hard-coded rather than taken from lang_codes; every code listed
    # here must be a key of parsed_reviews, or the loop below raises a KeyError.
    'languages': [ 'en', 'de', 'it', 'es' ],
    'term_freq': [],     # Counter: how often each term occurs
    'total_terms': [],   # number of distinct terms
    'doc_freq': [],      # Counter: in how many reviews each term occurs
    'total_tokens': []   # number of term occurrences, summed over all terms
}

for lang in frequencies['languages']:
    # the total frequency of words
    term_freq = Counter()
    # the document frequency of words, that is, in how many reviews does a word occur?
    doc_freq = Counter()
    # Load language-specific stopwords
    stopwords = stopwordsiso.stopwords(lang)

    for doc in parsed_reviews[lang]:
        # list all words in the review, ignoring case: punctuation is dropped, and so
        # are tokens whose lemma is at most two characters long
        terms = [token.text.lower() for token in doc if token.pos_ != 'PUNCT' and len(token.lemma_) > 2]
        # remove language-specific stopwords
        terms = [term for term in terms if term not in stopwords]
        term_freq.update(terms)
        # a set counts each term at most once per review
        doc_freq.update(set(terms))

    frequencies['term_freq'].append( term_freq )
    frequencies['total_terms'].append( len( term_freq ) )
    frequencies['doc_freq'].append( doc_freq )
    frequencies['total_tokens'].append( sum( term_freq.values() ) )

frequencies_df = pd.DataFrame( frequencies, index=frequencies['languages'] )
frequencies_df.plot.bar( rot=0 )

### Question
What are some possible reasons for the differences you see?

## Most used words per language

In [None]:
# One panel per language, each showing that language's ten most frequent terms.
plot_langs = frequencies['languages']
n_cols = 2
# Enough rows to give every language its own panel (rounding up).
n_rows = max( 1, ( len( plot_langs ) + n_cols - 1 ) // n_cols )
# squeeze=False always yields a 2-D array of axes, whatever the grid size.
fig, axs = plt.subplots( n_rows, n_cols, figsize=( 10, 4 * n_rows ), squeeze=False )
axsl = list( axs.ravel() )
for idx, lang in enumerate( plot_langs ):
    mc_df = pd.DataFrame( frequencies['term_freq'][idx].most_common(10), columns=['term','term_freq'] )
    # The title tells the reader which language a panel belongs to.
    mc_df.plot.bar( x='term', y='term_freq', ax=axsl[idx], title=lang )
    axsl[idx].set_xticklabels(axsl[idx].get_xticklabels(), rotation=45, ha='right')
# Hide panels that are left over when the number of languages is odd.
for unused_ax in axsl[len( plot_langs ):]:
    unused_ax.set_visible( False )
# Adjust the layout only after everything is drawn, so that the rotated tick labels
# are taken into account. Calling plt.tight_layout() before the figure exists
# creates a stray empty figure and leaves the subplots untouched.
fig.tight_layout()


# A naive approach to comparing reviews across languages

We might have intuitions about how people express themselves when they are impressed by reading a particular book. Thus, if we want to quantify this we might approach that by thinking of some expressions, translating these in the target languages and finding matching reviews. <br/>
This is a naive approach as our intuitions might not match what is actually happening in the data, but it is an informative first step towards a more exhaustive method of analysis. <br/>
Let's start from the word “fantastic” in English, which we surmise to be present in highly excited reviews. It translates to “fantastisch” in German, “fantastico” in Italian, and “fantástico” in Spanish.

In [None]:
# Another look at the first two rows of the review table, as a reminder of its columns.
review_df.head(2)

In [None]:
# The word "fantastic" and its Italian, German and Spanish counterparts.
terms = [ 'fantastic', 'fantastico', 'fantastisch', 'fantástico' ]

# Mark every review whose text contains at least one of the terms. This is a plain,
# case-sensitive substring test — deliberately naive.
has_term = review_df['review_text'].apply( lambda rev_txt: any( term in rev_txt for term in terms ) )
fanta_df = review_df[ has_term ]

# Bar chart of the number of matching reviews per review language.
lang_counts = fanta_df['review_lang'].value_counts()
pd.DataFrame( lang_counts ).plot.bar()


### Discussion question
It is clear that English speaking people are much more prone to be excited about books, right? ;-) <br/>
This approach to analyzing impact terms across languages is certainly flawed (hence 'naive'). 
Discuss confounding factors that might be skewing the result.