# Keyness analysis

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from top2vec import Top2Vec
from scipy.spatial.distance import cosine 
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import sys
from tqdm import tqdm

from src.topic_summary import ModelAnalyser, NurGenreMapper, ReviewExtractor

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


#### Set paths and load functions

In [2]:
# please adjust the following paths to reflect the location of the following files in your local directory

review_dir = '../../models/reviews/review-impact_matches.tsv.gz'
raw_review_data = '../../models/reviews/reviews-stats.tsv.gz'
isbn_map = "../../data/work-isbn-mapping.tsv"
isbn_work_id_mappings_file = "../../data/work_isbn_genre.tsv"

impact_file = '../data/review-impact_matches.tsv.gz'
raw_review_data = '../data/reviews-stats.tsv.gz'
isbn_map = "../data/work-isbn-mapping.tsv"
isbn_work_id_mappings_file = "../data/work_isbn_title_genre.tsv.gz"

#### Load custom-made classes from `topic_summary.py`

In [3]:
# this class helps to preprocess the inputs and output a genre mapping file
mapper = NurGenreMapper(isbn_map, isbn_work_id_mappings_file)

# this class produces as output the impact_reviews
extractor = ReviewExtractor(impact_file, raw_review_data)

##### `map_function` Identifies for each word, the impact that has the maximum value (if the word has multiple mappings), and thus the prevalent impact.

In [4]:
def map_function(row):
    """
    Identifies and returns the column names with the maximum value in a subset of the given row. 

    This function processes a row of a dataframe. It focuses on the columns 
    'affect', 'style', 'narrative' and 'reflection'. Among these columns, it identifies the one(s) 
    with the maximum value in the row. If multiple columns share the same maximum value, 
    their names are concatenated and returned as a string separated by slashes.

    Parameters:
    - row (pd.Series): A row of a pandas DataFrame, expected to contain the columns 
                     'affect', 'style', 'narrative', and 'reflection'.

    Returns:
    - str: A string representing the column name(s) with the maximum value in the specified subset of columns. 
         If there are multiple columns with the same maximum value, their names are concatenated and 
         separated by slashes.
    """
    col_names = ['affect', 'style', 'narrative', 'reflection']
    row_subset = row[col_names]
    max_value = row_subset.max()
    prevalent_columns = list(row_subset.index[row_subset==max_value])
    return "/".join(prevalent_columns)

##### `compute_keyness` computes the keyness score for each word-genre pair according to the formula present in the paper, section `Data and Method` - `Keyness analysis on impact terms` subsection

In [5]:
def compute_keyness(df: pd.DataFrame, key_column: str) -> pd.DataFrame:
    """
    Compute the keyness score for each word-genre pair in a DataFrame.
    Note that this function takes into consideration also the size of the subsets in several ways:
    - in NF_W and NF_WC, by dividing the count of each 'impact_term' in a given key column 
    by the total count of words in that key column (total_words_per_key), 
    it accounts for the relative size of each category. This normalization step ensures that the 
    frequency of terms is comparable across different categories, 
    regardless of the actual size of each category. Moreover, in the keyness (i.e., log likelihood of the ratio of NF_W to NF_WC) 
    This comparison inherently takes into account the size differences between the subsets (categories) 
    and the entire corpus
    
    Parameters:
    - df_expanded: A pandas DataFrame containing the columns 'impact_term' and + key column.
    - genre_column: The name of the column in df_expanded that contains the genre or category information.
    
    Returns:
    - A pandas DataFrame with columns ['key_column', 'impact_term', 'NF_W', 'NF_WC', 'Keyness']
      representing the genre, the impact term, normalized frequency within the key column, normalized frequency
      in corpus, and the keyness score respectively.
    """

    # Count how many times each word appears in each key_column
    word_counts_per_key = df.groupby([key_column, 'impact_term']).size()

    # Count the total number of words in each key_column
    total_words_per_key = df.groupby(key_column)['impact_term'].count()

    # Compute normalised frequency for each word in the key_column
    NF_W = word_counts_per_key / total_words_per_key
    NF_W_reset = NF_W.reset_index().rename(columns={0: 'NF_W'})

    # Count how many times each word appears in the corpus
    word_counts_corpus = df['impact_term'].value_counts()

    # Compute the total number of words in the corpus
    total_words_corpus = df['impact_term'].count()

    # Compute normalised frequency of impact words in the corpus
    NF_WC = word_counts_corpus / total_words_corpus
    NF_WC_reset = NF_WC.reset_index().rename(columns={'index': 'impact_term', 'impact_term': 'NF_WC'})

    # Merge the two dataframes based on the 'impact_term' column
    df_keyness = NF_W_reset.merge(NF_WC_reset, on='impact_term', how='left')

    # Compute keyness - use log likelihood
    df_keyness['Keyness'] = np.log(df_keyness['NF_W'] / df_keyness['NF_WC']).replace([np.inf, -np.inf], np.nan)

    # Drop rows with nan values (which occur if NF_W or NF_WC is 0)
    df_keyness.dropna(subset=['Keyness'], inplace=True)

    return df_keyness

#### Prepare dataset

In this section we need to put together multiple datasets containing the mapping between words and isbns, and the relative reviews. 
To this end, I made custom-made classes that handle the multiple steps under the hood. These classes are provided in the `src` folder, `topic_summary.py` file.

In [6]:
# get the mapping file which contains `word_id` and `isbn` columns. These are necessary to merge the reviews
mapped_df = mapper.process_genre_mapping()

# this is our impact reviews dataset:
reviews = extractor.get_impact_reviews()

# NB. left-join is the best way to merge the files without losing data
reviews_merged_with_genre = pd.merge(reviews, mapped_df, on = 'work_id', how = 'left')

Dataset consists of impact terms extracted from the reviews of books by the impact model and scored according to affect, style, narrative and reflection. 

In [7]:
dt = reviews_merged_with_genre

In [8]:
dt.head()

Unnamed: 0,work_id,review_id,affect,style,narrative,reflection,impact_term,review_num_words,isbn,nur_genre
0,impfic-work-3723,impfic-review-1,1,0,0,0,fantastisch,185,9789048416547,Young_adult
1,impfic-work-3723,impfic-review-1,1,0,0,0,fantastisch,185,9789400804876,Young_adult
2,impfic-work-3723,impfic-review-1,1,0,0,0,fantastisch,185,9789490767648,Young_adult
3,impfic-work-3723,impfic-review-1,1,0,1,0,fantastisch,185,9789048416547,Young_adult
4,impfic-work-3723,impfic-review-1,1,0,1,0,fantastisch,185,9789400804876,Young_adult


Step below takes all impact terms and if there are any multiwords it appends the word as an additional row below.

In [9]:
dt_expanded = dt.assign(impact_term=dt['impact_term'].str.split()).explode('impact_term')
dt_expanded.head()

Unnamed: 0,work_id,review_id,affect,style,narrative,reflection,impact_term,review_num_words,isbn,nur_genre
0,impfic-work-3723,impfic-review-1,1,0,0,0,fantastisch,185,9789048416547,Young_adult
1,impfic-work-3723,impfic-review-1,1,0,0,0,fantastisch,185,9789400804876,Young_adult
2,impfic-work-3723,impfic-review-1,1,0,0,0,fantastisch,185,9789490767648,Young_adult
3,impfic-work-3723,impfic-review-1,1,0,1,0,fantastisch,185,9789048416547,Young_adult
4,impfic-work-3723,impfic-review-1,1,0,1,0,fantastisch,185,9789400804876,Young_adult


## Compute prevalence of affect, style, narrative and reflection

Every word in 'impact term' has been categorised as belonging to affect, style, narrative and reflection. However, the same word might be used in multiple contexts, across reviews of multiple books, and as such, receive different classifications.

We wish to know what was the "major"/"most prevalent" classification for each word. To do so, we sum for each word within each genre their relative score for each type of impact.

In [10]:
aggregated_data = dt_expanded.groupby(['impact_term', 'nur_genre'], as_index=False).agg(
    affect=('affect', 'sum'),
    style=('style', 'sum'),
    narrative=('narrative', 'sum'),
    reflection=('reflection', 'sum')
)

counts = dt_expanded.groupby(['impact_term', 'nur_genre']).size().reset_index(name='impact_term_count')

table = pd.merge(aggregated_data, counts, on=['impact_term', 'nur_genre'])


In [11]:
table.head()

Unnamed: 0,impact_term,nur_genre,affect,style,narrative,reflection,impact_term_count
0,(begrijp|begrijpt|begreep|begrepen|begrijpen)....,Children_fiction,0,0,0,449,449
1,(begrijp|begrijpt|begreep|begrepen|begrijpen)....,Fantasy_fiction,0,0,0,538,538
2,(begrijp|begrijpt|begreep|begrepen|begrijpen)....,Historical_fiction,0,0,0,75,75
3,(begrijp|begrijpt|begreep|begrepen|begrijpen)....,Literary_fiction,0,0,0,10504,10504
4,(begrijp|begrijpt|begreep|begrepen|begrijpen)....,Literary_thriller,0,0,0,3372,3372


Identify the impact that has the maximum value, and thus it is the prevalent impact:

In [12]:
# use `map_function` for this. This function is available at the top of the page.
table['prevalent_impact'] = table.apply(map_function,axis=1)

In [13]:
table['impact_term'] = table['impact_term'].astype(str)
table['nur_genre'] = table['nur_genre'].astype(str)
dt_expanded['impact_term'] = dt_expanded['impact_term'].astype(str)
dt_expanded['nur_genre'] = dt_expanded['nur_genre'].astype(str)

In [14]:
table_minimal = table[['impact_term', 'nur_genre', 'prevalent_impact']]

The result of applying `map_function` is that for each impact term, there will be a unique impact value. This is stored in the `prevalent_impact` column.

For example:

In [15]:
table_minimal.head(2)

Unnamed: 0,impact_term,nur_genre,prevalent_impact
0,(begrijp|begrijpt|begreep|begrepen|begrijpen)....,Children_fiction,reflection
1,(begrijp|begrijpt|begreep|begrepen|begrijpen)....,Fantasy_fiction,reflection


We're ready to merge this info with the impact model output:

In [16]:
prevalence = pd.merge(dt_expanded, table_minimal, how='left', on = ['impact_term', 'nur_genre'])

In [17]:
prevalence.head(2)

Unnamed: 0,work_id,review_id,affect,style,narrative,reflection,impact_term,review_num_words,isbn,nur_genre,prevalent_impact
0,impfic-work-3723,impfic-review-1,1,0,0,0,fantastisch,185,9789048416547,Young_adult,affect
1,impfic-work-3723,impfic-review-1,1,0,0,0,fantastisch,185,9789400804876,Young_adult,affect


great, we have our dataframe now with the prevalent_impact column, we can start computing keyness

In [18]:
# note that by merging files we might end up with duplicates.
# we get rid of those with drop_duplicates()

prevalence_copy = prevalence.drop_duplicates()

In [19]:
# see the duplicates here:
prevalence_copy.shape, prevalence.shape

((6724367, 11), (7635611, 11))

## Compute keyness

In [20]:
# use the function `compute_keyness`. This can be found at the top of the page.
keyness = compute_keyness(prevalence_copy, 'nur_genre')

In [21]:
keyness.shape

(2274, 5)

We're ready to add in this info with the rest of the dataframe. We select however the most useful columns to make the dataframe tidy.

In [22]:
prevalence_keyness = pd.merge(keyness, prevalence[['impact_term', 'nur_genre', 'prevalent_impact']], on=['impact_term', 'nur_genre'], how='left')

In [23]:
# again, remove duplicates, if any
prevalence_keyness_copy = prevalence_keyness.drop_duplicates()

In [24]:
# there were indeed duplicates:
prevalence_keyness_copy.shape, prevalence_keyness.shape

((2274, 6), (7635611, 6))

This is our final dataframe:

In [25]:
prevalence_keyness_copy.head()

Unnamed: 0,nur_genre,impact_term,NF_W,NF_WC,Keyness,prevalent_impact
0,Children_fiction,(begrijp|begrijpt|begreep|begrepen|begrijpen)....,0.002271,0.003228,-0.351958,reflection
449,Children_fiction,(boek|verhaal|verhalen|portret*|wijze|verslag|...,0.000135,0.000176,-0.268096,reflection
475,Children_fiction,(dit|dat|het),0.000451,0.000916,-0.708456,reflection
562,Children_fiction,(geeft|geven|gegeven|biedt|bieden|geboden|verg...,0.000415,0.001625,-1.36566,reflection
647,Children_fiction,(geschreven|omschreven|beschreven),0.006153,0.009402,-0.423902,affect/style


This dataframe is used later on for producing figures. These figures are rendered in R because the graphical capabilities of this software allows more flexibility for producing plots.

The code to reproduce the scatter plots in R present in the paper is `plot_keyness.R`. It takes as input `prevalence_keyness.csv` and it saves automatically the plots.

In [26]:
prevalence_keyness_copy.to_csv('prevalence_keyness.csv', index = False)