# This File Produces a Substantiveness Score for Book Reviews. 
### Goal: Score reviews 0-5 based on how detailed/informative they are

In [2]:
# Install required NLP libraries
!pip install nltk spacy

# Install / download spaCy English model
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m476.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


### Imports and Setup

In [27]:
import pandas as pd
import re
import nltk
import spacy

# Download the NLTK tokenizer data used by sent_tokenize / word_tokenize.
# Older NLTK releases load 'punkt'; newer releases look for 'punkt_tab' instead.
# The install cell does not pin a version, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/laurenrutledge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read in Cleaned Data CSV:

In [18]:
# Load the cleaned review data.
df = pd.read_csv('datasets/processed/goodreads_reviews_mystery_thriller_crime_with_links_flag.csv')

# Every feature computed below reads this column, so fail early if it is missing.
assert 'review_text' in df.columns, "expected a 'review_text' column in the cleaned CSV"

print("Post-Data Clean row count:", len(df),"\n")

# Rich display (bare last expression) instead of print(): the frame renders as a
# table rather than as wrapped, truncated plain text.
df.head()


Post-Data Clean row count: 1685280 

                            user_id                         review_id  \
0  8842281e1d1347389f2ab93d60773d4d  5e212a62bced17b4dbe41150e5bb9037   
1  8842281e1d1347389f2ab93d60773d4d  2ede853b14dc4583f96cf5d120af636f   
2  8842281e1d1347389f2ab93d60773d4d  022bb6daffa49adc27f6b20b6ebeb37d   
3  8842281e1d1347389f2ab93d60773d4d  0e317947e1fd341f573192111bb2921d   
4  8842281e1d1347389f2ab93d60773d4d  4276918357312212384ac6415ceb9159   

                                         review_text  rating  \
0  I haven't read a fun mystery book in a while a...       3   
1  A fun, fast paced science fiction thriller. I ...       3   
2  An amazing and unique creation: JJ Abrams and ...       4   
3  The Name of the Rose is a thrilling Dan Brown-...       3   
4  ** spoiler alert ** \n Hooked me equally as we...       3   

                       date_added  n_votes  contains_link  
0  Mon Jul 24 02:48:17 -0700 2017        6          False  
1  Tue Nov 15 11:29

# Define NLP Feature Functions

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize

### Count Sentences in Text Review (NLTK)

In [19]:
# Sentence count
def count_sentences(text):
    """
    Return the number of sentences NLTK's sent_tokenize finds in `text`.

    Non-string values (e.g. the NaN pandas uses for a missing review) count as
    0 sentences instead of raising inside the tokenizer.
    """
    if not isinstance(text, str):
        return 0
    return len(sent_tokenize(text))

# Add the per-review sentence count as a new column (element-wise over review_text)
df['sentence_count'] = df['review_text'].map(count_sentences)

### Count Words in Text Review (NLTK)

In [23]:
# Word count
def count_words(text):
    """
    Return the number of tokens NLTK's word_tokenize produces for `text`.

    Non-string values (e.g. the NaN pandas uses for a missing review) count as
    0 tokens instead of raising inside the tokenizer.
    """
    if not isinstance(text, str):
        return 0
    return len(word_tokenize(text))

# Add the per-review token count as a new column (element-wise over review_text)
df['word_count'] = df['review_text'].map(count_words)

### Determine Average Words per Sentence in Text Review

In [20]:
# Average words per sentence
def avg_words_per_sentence(text):
    """
    Return the mean number of word tokens per sentence in `text`.

    Returns 0.0 when `text` is not a string (e.g. NaN for a missing review) or
    when no sentences are found, so the result is always a float.
    """
    if not isinstance(text, str):
        return 0.0

    s = count_sentences(text)
    if s == 0:
        # Nothing to average over; also skips the word tokenization entirely.
        return 0.0

    return count_words(text) / s

# Add the words-per-sentence average as a new column (element-wise over review_text)
df['avg_words_per_sentence'] = df['review_text'].map(avg_words_per_sentence)

### Compute Lexical Diversity Score Per Review 
#### Reviews with higher diversity tend to use a richer vocabulary and be more detailed/informative.

In [24]:
# Lexical diversity (type-token ratio) = number of unique words / total number of words.
# NLTK's tokenizer splits text into individual tokens (words, punctuation, etc.).
# If output is close to 1 = very diverse vocabulary (most words are unique)
# If output is closer to 0 = lots of repetition in vocabulary

def lexical_diversity(text):
    """
    Return the type-token ratio of `text`: distinct words / total words.

    Only purely alphabetic tokens are counted, compared case-insensitively.
    Returns 0.0 when `text` is not a string (e.g. NaN for a missing review) or
    contains no alphabetic tokens, so the result is always a float.
    """
    if not isinstance(text, str):
        return 0.0

    # Keep alphabetic tokens only (drops numbers and punctuation), lowercased
    words = [w.lower() for w in word_tokenize(text) if w.isalpha()]
    if not words:
        return 0.0

    # set() removes duplicates, leaving the number of distinct words
    return len(set(words)) / len(words)

# Add the type-token ratio as a new column (element-wise over review_text)
df['lexical_diversity'] = df['review_text'].map(lexical_diversity)

### Determine whether a name (doesn't matter if it's a character or the author) is in the Review's Text

In [25]:
# Rich display (bare last expression) instead of print(): the frame renders as a
# table rather than as wrapped, truncated plain text.
df.head()

                            user_id                         review_id  \
0  8842281e1d1347389f2ab93d60773d4d  5e212a62bced17b4dbe41150e5bb9037   
1  8842281e1d1347389f2ab93d60773d4d  2ede853b14dc4583f96cf5d120af636f   
2  8842281e1d1347389f2ab93d60773d4d  022bb6daffa49adc27f6b20b6ebeb37d   
3  8842281e1d1347389f2ab93d60773d4d  0e317947e1fd341f573192111bb2921d   
4  8842281e1d1347389f2ab93d60773d4d  4276918357312212384ac6415ceb9159   

                                         review_text  rating  \
0  I haven't read a fun mystery book in a while a...       3   
1  A fun, fast paced science fiction thriller. I ...       3   
2  An amazing and unique creation: JJ Abrams and ...       4   
3  The Name of the Rose is a thrilling Dan Brown-...       3   
4  ** spoiler alert ** \n Hooked me equally as we...       3   

                       date_added  n_votes  contains_link  sentence_count  \
0  Mon Jul 24 02:48:17 -0700 2017        6          False               7   
1  Tue Nov 15 11:29:22

## Save to Substantiveness CSV for further processing

In [26]:
# Destination file for the review table, including the feature columns added to df so far
output_path = "datasets/processed/goodreads_reviews_with_nlp_features_substantiveness_v2.csv"

# index=False keeps the pandas row index out of the written file
df.to_csv(path_or_buf=output_path, index=False)

In [None]:
# Reload the saved feature table so the notebook can be resumed from this point
# without recomputing the NLTK features.
input_path_2 = "datasets/processed/goodreads_reviews_with_nlp_features_substantiveness_v2.csv"

# Bind the result to `df`: the next cell reads df['review_text']. Without the
# assignment the loaded frame was only displayed and then discarded.
df = pd.read_csv(input_path_2)

# Show a small preview rather than the whole frame
df.head()

In [None]:
# Named Entity Recognition with spaCy for people only
def mentions_person(text):
    """
    Returns 1 if the review mentions at least one person's name (spaCy PERSON entity),
    otherwise returns 0.

    Non-string values (e.g. the NaN pandas uses for a missing review) return 0
    without being sent through the spaCy pipeline, which would raise on them.
    """
    if not isinstance(text, str):
        return 0

    doc = nlp(text)
    return int(any(ent.label_ == "PERSON" for ent in doc.ents))

# Add the 0/1 person-mention flag as a new column (element-wise over review_text)
df['mentions_person'] = df['review_text'].map(mentions_person)