# Preparing Impact Datasets

In [1]:
# Development aid only: the autoreload extension picks up edits to imported
# project code (e.g. `src.topic_summary`) without restarting the kernel.
# Both magics can be removed once the notebook code is stable.
%reload_ext autoreload
%autoreload 2

In [2]:
import ast

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from src.topic_summary import ModelAnalyser, NurGenreMapper, ReviewExtractor

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


## Set paths and load functions

In [3]:
# Input files. Please adjust the following paths to reflect where these files are
# located in your local directory.

# impact-term matches per review (passed to ReviewExtractor further down)
impact_file = '../data/review-impact_matches.tsv.gz'
# review metadata / statistics (read with pd.read_csv and passed to ReviewExtractor)
review_stats_file = '../data/reviews-stats.tsv.gz'
# NOTE(review): isbn_map is not referenced in the visible part of this notebook — confirm it is still needed
isbn_map = "../data/work-isbn-mapping.tsv"
# mapping read with read_genre_file to obtain work_id, record ids and nur_genre
isbn_work_id_mappings_file = "../data/work_isbn_title_genre.tsv.gz"
# per-ISBN table with a ';'-separated `themes` column
isbn_topic_file = '../data/isbn_topic_scores.tsv.gz'

## Prepare review data

In [5]:
from impfic_core.map.map_genre import read_genre_file

# read review metadata
review_stats = pd.read_csv(review_stats_file, sep='\t', compression='gzip')

# read the work-to-genre mapping and keep one row per (work_id, nur_genre) pair
work_genre = read_genre_file(isbn_work_id_mappings_file)
work_genre = work_genre[['work_id', 'nur_genre']].drop_duplicates()

# merge review metadata and genre data; the left join keeps reviews of works without a genre
review_stats = pd.merge(review_stats, work_genre, on='work_id', how='left')

# remove professional reviews
review_stats = review_stats[review_stats.source != 'NBD_Biblion']

# remove reviews with no words
review_stats = review_stats[review_stats.review_num_words > 0]

# remove duplicate reviews with alternative work ID.
# A whole row is kept per review, so work_id and nur_genre always belong together.
# (groupby(...).first() would take the first non-null value *per column* and could
# combine the work_id of one duplicate with the nur_genre of another.)
# Rows with a known genre are preferred; ties keep the original row order.
review_stats = (review_stats
                .assign(_genre_missing=lambda df: df.nur_genre.isna())
                .sort_values('_genre_missing', kind='mergesort')  # stable: known genre first
                .drop_duplicates(subset='review_id', keep='first')
                .drop(columns='_genre_missing')
                .sort_values('review_id')  # same row order as the former groupby
                .reset_index(drop=True))

# add unknown when no nur_genre is provided
review_stats['nur_genre'] = review_stats.nur_genre.fillna('unknown')


In [6]:
# number of reviews per source after the filtering and de-duplication steps above
review_stats.source.value_counts()

Bol          254080
Hebban       210583
Goodreads     90600
WLJN          38210
Dizzie        26873
Boekmeter      7250
LTL            7004
Name: source, dtype: int64

In [7]:
# Write the cleaned review metadata to disk, restricted to a fixed set of columns.
review_stats_clean_file = '../data/review-stats-clean.tsv.gz'
select_cols = [
    'review_id', 'source', 'work_id', 'user_id',
    'nur_genre', 'review_num_words', 'review_date', 'rating',
]
review_stats.loc[:, select_cols].to_csv(
    review_stats_clean_file, sep='\t', index=False, compression='gzip'
)

## Read information on ISBN, Topic and Theme

In [8]:
# Load the per-ISBN topic table; ISBNs are read as strings.
isbn_topic = pd.read_csv(isbn_topic_file, sep='\t', compression='gzip', dtype={'isbn': str})

# The themes column is a ';'-separated string: turn it into a Python list per ISBN.
isbn_topic['themes'] = isbn_topic['themes'].str.split(';')

# Explode the theme lists to one row per (ISBN, theme) pair, one-hot encode the
# theme and sum per ISBN, so that each theme is a column and each row indicates
# which themes the ISBN has been assigned to.
isbn_theme = (
    pd.get_dummies(isbn_topic.explode('themes')[['isbn', 'themes']].set_index('isbn'))
    .reset_index()
    .groupby('isbn')
    .sum()
    .reset_index()
)
isbn_theme

Unnamed: 0,isbn,themes_behaviours / feelings,themes_city & travel,themes_crime,themes_culture,themes_economy & work,themes_family,themes_geography & setting,themes_history,themes_lifestyle & sport,themes_medicine / health,themes_other,themes_politics,themes_reli. / spirit. / phil.,themes_romance & sex,themes_science,themes_society,"themes_super., fantasy & sci-fi",themes_war,themes_wildlife / nature
0,9789020200485,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,9789020201451,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,9789020202106,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
3,9789020205008,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,9789020205268,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18462,9789493189560,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
18463,9789493189584,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18464,9789493189607,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
18465,9789493189669,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [9]:
# Names of the one-hot theme columns, and the bare theme labels without the prefix.
theme_cols = list(filter(lambda name: name.startswith('themes_'), isbn_theme.columns))
themes = [name.replace('themes_', '') for name in theme_cols]
(len(theme_cols), len(themes))

(19, 19)

## Combine work identifiers and genre with topic and theme information

In [10]:
from impfic_core.map.map_genre import read_genre_file

# Read the work-genre mapping, keep only the ISBN records and expose the record
# identifier under the column name `isbn`.
work_genre = (
    read_genre_file(isbn_work_id_mappings_file)
    .loc[lambda df: df.record_id_type == 'isbn']
    .rename(columns={'record_id': 'isbn'})
    .loc[:, ['work_id', 'isbn', 'nur_genre']]
)
work_genre.shape

(189038, 3)

In [11]:
# Join genre and theme information on the ISBN, then discard the ISBN column:
# only the work_id is needed to link topic and theme to the reviews and impact terms.
work_genre_theme = work_genre.merge(isbn_theme, on='isbn').drop(columns='isbn')

# the collection has 18,465 books, so there should be 18,465 rows
work_genre_theme.shape

(18465, 21)

In [12]:
# Count the rows per work ID. A count above 1 means the work has multiple ISBNs,
# whose themes might differ.
# Solution (applied in the following cells): assume the work has all the themes of its ISBNs.

work_genre_theme.work_id.value_counts()

impfic-work-26287     12
impfic-work-31916      5
impfic-work-15072      4
impfic-work-12518      4
impfic-work-11806      4
                      ..
impfic-work-34207      1
impfic-work-34221      1
impfic-work-34223      1
impfic-work-34241      1
impfic-work-118718     1
Name: work_id, Length: 18205, dtype: int64

In [13]:
# rows per work ID once exact duplicate rows (same work, genre and theme profile) are ignored
work_genre_theme.drop_duplicates().work_id.value_counts()

impfic-work-10225     3
impfic-work-6840      3
impfic-work-15030     3
impfic-work-50896     3
impfic-work-17557     3
                     ..
impfic-work-33799     1
impfic-work-33800     1
impfic-work-33801     1
impfic-work-33803     1
impfic-work-118718    1
Name: work_id, Length: 18205, dtype: int64

In [14]:
# If ISBNs of the same work have the same genre and theme profile, their rows are
# identical: collapse them into a single row.
work_genre_theme = work_genre_theme.drop_duplicates()

In [15]:
# inspect one example work to see whether its remaining rows carry different theme profiles
work_genre_theme[work_genre_theme.work_id == 'impfic-work-1681']

Unnamed: 0,work_id,nur_genre,themes_behaviours / feelings,themes_city & travel,themes_crime,themes_culture,themes_economy & work,themes_family,themes_geography & setting,themes_history,...,themes_medicine / health,themes_other,themes_politics,themes_reli. / spirit. / phil.,themes_romance & sex,themes_science,themes_society,"themes_super., fantasy & sci-fi",themes_war,themes_wildlife / nature
317,impfic-work-1681,Suspense,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
318,impfic-work-1681,Suspense,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
319,impfic-work-1681,Suspense,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


Indeed, different ISBNs of the same work have different themes. We combine them at the work level.

In [16]:
# Collapse to one row per (work_id, nur_genre). Taking the column-wise maximum of the
# theme indicators gives a work every theme assigned to at least one of its ISBNs.
work_genre_theme = (
    work_genre_theme
    .groupby(['work_id', 'nur_genre'])[theme_cols]
    .max()
    .reset_index()
)
# the example work inspected earlier should now have a single, combined row
work_genre_theme[work_genre_theme.work_id == 'impfic-work-1681']


Unnamed: 0,work_id,nur_genre,themes_behaviours / feelings,themes_city & travel,themes_crime,themes_culture,themes_economy & work,themes_family,themes_geography & setting,themes_history,...,themes_medicine / health,themes_other,themes_politics,themes_reli. / spirit. / phil.,themes_romance & sex,themes_science,themes_society,"themes_super., fantasy & sci-fi",themes_war,themes_wildlife / nature
1993,impfic-work-1681,Suspense,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [17]:
# (rows, columns) after collapsing to the work level
work_genre_theme.shape

(18205, 21)

In [18]:
# Some insight into the distribution of works over NUR genres (fraction of works per genre)
work_genre_theme.nur_genre.value_counts() / len(work_genre_theme)

Literary_fiction      0.296347
Non-fiction           0.287613
Other fiction         0.109091
Suspense              0.091733
Regional_fiction      0.047405
Literary_thriller     0.044438
Children_fiction      0.030321
Romance               0.028838
Young_adult           0.026641
Fantasy_fiction       0.019555
Historical_fiction    0.018017
Name: nur_genre, dtype: float64

In [19]:
# recompute the list of theme columns, this time from the work-level table
theme_cols = [col for col in work_genre_theme.columns if col.startswith('themes_')]
theme_cols

['themes_behaviours / feelings',
 'themes_city & travel',
 'themes_crime',
 'themes_culture',
 'themes_economy & work',
 'themes_family',
 'themes_geography & setting',
 'themes_history',
 'themes_lifestyle & sport',
 'themes_medicine / health',
 'themes_other',
 'themes_politics',
 'themes_reli. / spirit. / phil.',
 'themes_romance & sex',
 'themes_science',
 'themes_society',
 'themes_super., fantasy & sci-fi',
 'themes_war',
 'themes_wildlife / nature']

In [20]:
# Force every theme indicator to be strictly binary: 0 stays 0, any other value becomes 1.
# Done for all theme columns in one vectorised comparison instead of calling a
# Python lambda once per cell value.
work_genre_theme[theme_cols] = (work_genre_theme[theme_cols] != 0).astype('int64')

In [21]:
# preview the first two works with their genre and theme indicators
work_genre_theme.head(2)

Unnamed: 0,work_id,nur_genre,themes_behaviours / feelings,themes_city & travel,themes_crime,themes_culture,themes_economy & work,themes_family,themes_geography & setting,themes_history,...,themes_medicine / health,themes_other,themes_politics,themes_reli. / spirit. / phil.,themes_romance & sex,themes_science,themes_society,"themes_super., fantasy & sci-fi",themes_war,themes_wildlife / nature
0,impfic-work-10,Non-fiction,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,impfic-work-1001,Suspense,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Work-theme assignments should be 0 or 1, so the maximum value of each theme column should be 1:

In [22]:
# column-wise maximum over all works; every themes_* column is expected to report 1
work_genre_theme.max()

work_id                            impfic-work-9986
nur_genre                               Young_adult
themes_behaviours / feelings                      1
themes_city & travel                              1
themes_crime                                      1
themes_culture                                    1
themes_economy & work                             1
themes_family                                     1
themes_geography & setting                        1
themes_history                                    1
themes_lifestyle & sport                          1
themes_medicine / health                          1
themes_other                                      1
themes_politics                                   1
themes_reli. / spirit. / phil.                    1
themes_romance & sex                              1
themes_science                                    1
themes_society                                    1
themes_super., fantasy & sci-fi                   1
themes_war  

In [23]:
# NOTE(review): leftover cell. The visible cells import `read_genre_file` directly and never
# use the dotted module name, so this import looks unnecessary — confirm and remove.
#from impfic_core.map.map_theme import read_theme_file
import impfic_core.map.map_genre

In [16]:
from impfic_core.map.map_genre import read_genre_file

# NOTE: this cell repeats the review preparation from the "Prepare review data" section
# and overwrites `review_stats` and `work_genre`. It therefore has to apply the same
# de-duplication, otherwise reviews with an alternative work ID come back as duplicates.

# read review metadata
review_stats = pd.read_csv(review_stats_file, sep='\t', compression='gzip')

# read the work-to-genre mapping (all record types, all columns)
work_genre = read_genre_file(isbn_work_id_mappings_file)

# merge review metadata and genre data; the left join keeps reviews of works without a genre
review_stats = pd.merge(review_stats, work_genre[['work_id', 'nur_genre']].drop_duplicates(),
                        on='work_id', how='left')

# remove professional reviews
review_stats = review_stats[review_stats.source != 'NBD_Biblion']

# remove reviews with no words
review_stats = review_stats[review_stats.review_num_words > 0]

# remove duplicate reviews with alternative work ID: keep one whole row per review,
# preferring a row with a known genre (ties keep the original row order)
review_stats = (review_stats
                .assign(_genre_missing=lambda df: df.nur_genre.isna())
                .sort_values('_genre_missing', kind='mergesort')  # stable: known genre first
                .drop_duplicates(subset='review_id', keep='first')
                .drop(columns='_genre_missing')
                .sort_values('review_id')
                .reset_index(drop=True))

In [17]:
# label reviews whose work has no NUR genre as 'unknown'
review_stats['nur_genre'] = review_stats['nur_genre'].fillna('unknown')


In [24]:
# list the columns available in the prepared review metadata
review_stats.columns

Index(['review_id', 'source', 'user_id', 'review_num_terms',
       'review_num_words', 'num_sentences', 'review_date', 'review_text',
       'rating', 'work_id', 'nur_genre'],
      dtype='object')

### Review Statistics

In [25]:
# total number of reviews, followed by the fraction of reviews per NUR genre
# ('unknown' marks reviews whose work has no genre)
print('number of reviews:', len(review_stats))
review_stats.nur_genre.value_counts() / len(review_stats)

number of reviews: 634600


Literary_fiction      0.305585
Non-fiction           0.149830
unknown               0.142778
Literary_thriller     0.119333
Suspense              0.104564
Other fiction         0.054743
Young_adult           0.045816
Children_fiction      0.039512
Fantasy_fiction       0.020645
Romance               0.009847
Historical_fiction    0.005183
Regional_fiction      0.002165
Name: nur_genre, dtype: float64

## Combining work-genre-theme info with review info

In [47]:
# Attach genre and theme indicators to each review via its work ID. The default inner
# join keeps only reviews whose work appears in work_genre_theme.
review_theme = pd.merge(review_stats[['review_id', 'review_num_words', 'work_id']],
                        work_genre_theme, on='work_id')
review_theme.head(5)

Unnamed: 0,review_id,review_num_words,work_id,nur_genre,themes_behaviours / feelings,themes_city & travel,themes_crime,themes_culture,themes_economy & work,themes_family,...,themes_medicine / health,themes_other,themes_politics,themes_reli. / spirit. / phil.,themes_romance & sex,themes_science,themes_society,"themes_super., fantasy & sci-fi",themes_war,themes_wildlife / nature
0,impfic-review-36736,51,impfic-work-35144,Historical_fiction,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,impfic-review-36737,587,impfic-work-35144,Historical_fiction,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,impfic-review-36738,38,impfic-work-35144,Historical_fiction,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,impfic-review-36739,396,impfic-work-35144,Historical_fiction,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,impfic-review-36740,131,impfic-work-35144,Historical_fiction,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [48]:
# (rows, columns) of the review-level genre/theme table
review_theme.shape

(125287, 23)

In [49]:
# save the wide review/genre/theme table (one indicator column per theme)
review_theme.to_csv('../data/review_work_genre_theme.tsv.gz', 
                    index=False, sep='\t', compression='gzip')

In [50]:
# Build the long-format review/theme table: one row per (review, theme) pair,
# restricted to the themes that are assigned to the reviewed work.
# The merge is repeated here so the cell can be re-run on its own.
review_theme = pd.merge(review_stats[['review_id', 'review_num_words', 'work_id']],
                        work_genre_theme, on='work_id')
review_theme = review_theme.melt(
    id_vars=['review_id', 'review_num_words', 'work_id', 'nur_genre'],
    value_vars=theme_cols,
    var_name='theme',
    value_name='has_theme',
)
# keep only the themes that are actually assigned
review_theme = review_theme[review_theme.has_theme == 1]
review_theme

Unnamed: 0,review_id,review_num_words,work_id,nur_genre,theme,has_theme
53,impfic-review-36801,57,impfic-work-12385,Literary_thriller,themes_behaviours / feelings,1
54,impfic-review-36802,384,impfic-work-12385,Literary_thriller,themes_behaviours / feelings,1
55,impfic-review-36803,227,impfic-work-12385,Literary_thriller,themes_behaviours / feelings,1
56,impfic-review-36804,25,impfic-work-12385,Literary_thriller,themes_behaviours / feelings,1
57,impfic-review-36805,614,impfic-work-12385,Literary_thriller,themes_behaviours / feelings,1
...,...,...,...,...,...,...
2380248,impfic-review-593744,162,impfic-work-37367,Other fiction,themes_wildlife / nature,1
2380251,impfic-review-595292,73,impfic-work-44763,Other fiction,themes_wildlife / nature,1
2380327,impfic-review-621714,423,impfic-work-2742,Non-fiction,themes_wildlife / nature,1
2380408,impfic-review-652244,13,impfic-work-21968,Other fiction,themes_wildlife / nature,1


# Preparing the Review impact dataset

## Load custom-made classes from `topic_summary.py`

In [26]:
# ReviewExtractor is constructed from the impact-match file and the review-stats file;
# its get_impact_reviews() method (called below) produces the impact reviews table
extractor = ReviewExtractor(impact_file, review_stats_file)

## Prepare dataset

The impact matches dataset consists of impact terms extracted from the reviews of books by the impact model and scored according to _affect_, _style_, _narrative_ and _reflection_. 

The _reflection_ category is not validated by the manual annotations (see Boot & Koolen 2021), so we remove all _reflection_-only matches from the dataset:

In [51]:
# this is our impact reviews dataset, as returned by the extractor:
review_impact = extractor.get_impact_reviews()

# preview the first rows
review_impact.head()

Unnamed: 0,work_id,review_id,affect,style,narrative,reflection,impact_term,review_num_words
0,impfic-work-3723,impfic-review-1,1,0,0,0,fantastisch,185
1,impfic-work-3723,impfic-review-1,1,0,1,0,fantastisch,185
2,impfic-work-3723,impfic-review-1,1,0,1,0,spanning,185
3,impfic-work-36913,impfic-review-2,1,1,0,0,prachtig,185
4,impfic-work-31725,impfic-review-3,1,0,0,0,leuk,217


In [52]:
# some reviews are linked to multiple work IDs: count the distinct (review_id, work_id)
# pairs per review; a count above 1 marks such a review
review_impact[['review_id', 'work_id']].drop_duplicates().review_id.value_counts()

impfic-review-48033     2
impfic-review-74593     2
impfic-review-1         1
impfic-review-455655    1
impfic-review-455649    1
                       ..
impfic-review-225818    1
impfic-review-225817    1
impfic-review-225816    1
impfic-review-225814    1
impfic-review-671245    1
Name: review_id, Length: 549254, dtype: int64

In [54]:
# Solve this by discarding the work ID column of the impact matches and taking the
# work ID from the review stats instead, where this issue has already been solved.
# The default inner join also drops matches of reviews that are absent from review_stats.
review_impact = (
    review_impact
    .drop(columns='work_id')
    .merge(review_stats[['review_id', 'work_id']], on=['review_id'])
)

review_impact.shape

(2305329, 8)

In [65]:
# re-check: every review ID should now be linked to exactly one work ID
review_impact[['review_id', 'work_id']].drop_duplicates().review_id.value_counts()

impfic-review-36635     1
impfic-review-463458    1
impfic-review-463480    1
impfic-review-463479    1
impfic-review-463478    1
                       ..
impfic-review-244074    1
impfic-review-244073    1
impfic-review-244072    1
impfic-review-244071    1
impfic-review-671245    1
Name: review_id, Length: 515239, dtype: int64

In [56]:
# Attach the NUR genre of each work to the impact matches. The left join keeps matches
# whose work has no genre; their nur_genre is missing after this step.
review_impact = review_impact.merge(
    work_genre[['work_id', 'nur_genre']].drop_duplicates(),
    how='left',
    on='work_id',
)

In [64]:
# preview the impact matches together with work ID and NUR genre
review_impact.head()

Unnamed: 0,review_id,affect,aesthetic,narrative,impact_term,review_num_words,work_id,nur_genre
0,impfic-review-36635,0,0,1,onverwacht,708,impfic-work-118720,unknown
2,impfic-review-36635,0,1,0,stijl,708,impfic-work-118720,unknown
5,impfic-review-36636,0,0,1,indrukwekkend,106,impfic-work-85015,Other fiction
6,impfic-review-36636,0,1,0,goed (geschreven|omschreven|beschreven),106,impfic-work-85015,Other fiction
7,impfic-review-36636,0,1,0,beschrijven,106,impfic-work-85015,Other fiction


In [61]:

# label impact matches whose work has no NUR genre as 'unknown'
review_impact['nur_genre'] = review_impact['nur_genre'].fillna('unknown')

# drop reflection because the reflection rules were not validated in Boot & Koolen 2020:
# keep only matches with at least one of affect/style/narrative set, then remove the column
review_impact = (
    review_impact
    .loc[lambda df: df[['affect', 'style', 'narrative']].sum(axis=1) > 0]
    .drop('reflection', axis=1)
)

# from here on the style category is called aesthetic
review_impact = review_impact.rename(columns={'style': 'aesthetic'})
impact_types = ['affect', 'aesthetic', 'narrative']

# Recode affect as |affect + aesthetic + narrative - 2|. With 0/1 indicators this gives
# 1 when exactly one (or all three) of them is set and 0 when exactly two are set.
# NOTE(review): presumably every retained match has affect == 1, so this makes affect
# mutually exclusive with aesthetic/narrative — confirm against the impact model output.
review_impact['affect'] = abs(review_impact[impact_types].sum(axis=1) - 2)

review_impact.head()

Unnamed: 0,review_id,affect,aesthetic,narrative,impact_term,review_num_words,work_id,nur_genre
0,impfic-review-36635,0,0,1,onverwacht,708,impfic-work-118720,unknown
2,impfic-review-36635,0,1,0,stijl,708,impfic-work-118720,unknown
5,impfic-review-36636,0,0,1,indrukwekkend,106,impfic-work-85015,Other fiction
6,impfic-review-36636,0,1,0,goed (geschreven|omschreven|beschreven),106,impfic-work-85015,Other fiction
7,impfic-review-36636,0,1,0,beschrijven,106,impfic-work-85015,Other fiction


In [62]:
# (rows, columns) of the impact matches after the clean-up step
review_impact.shape

(2037431, 8)

In [63]:
# make sure that reviews map to only one work ID and one NUR genre:
# the selection displayed below should be empty
combos_per_review = (
    review_impact[['review_id', 'work_id', 'nur_genre']]
    .drop_duplicates()
    .review_id
    .value_counts()
)
multi_review = combos_per_review[combos_per_review > 1].index
review_impact[review_impact.review_id.isin(multi_review)]

Unnamed: 0,review_id,affect,aesthetic,narrative,impact_term,review_num_words,work_id,nur_genre


In [66]:
# save the term-level impact table (with work ID and NUR genre) to disk
review_impact.to_csv('../data/review_work_genre-impact_terms.tsv.gz', 
                     sep='\t', index=False, compression='gzip')

In [67]:
# Aggregate to one row per (review, work, review length, genre): sum the
# affect/aesthetic/narrative indicators over all impact-term matches of a review
# and write the resulting counts to disk.
(review_impact
 .groupby(['review_id', 'work_id', 'review_num_words', 'nur_genre'])[impact_types]
 .sum()
 .reset_index()
 .to_csv('../data/review_work_genre-impact_counts.tsv.gz', 
         index=False, sep='\t', compression='gzip'))