In [6]:
import pandas as pd
import os

# Raw Goodreads review dump this notebook is based on.
file_path_1 = 'datasets/raw/goodreads_reviews_mystery_thriller_crime.json'

# The genre is what is left of the file name once the directory, the extension
# and the 'goodreads_reviews_' marker are removed, e.g.
# "goodreads_reviews_mystery_thriller_crime.json" -> "mystery_thriller_crime".
base_name_1 = os.path.split(file_path_1)[1]
file_stem_1, _ext_1 = os.path.splitext(base_name_1)
genre_1 = file_stem_1.replace('goodreads_reviews_', '')

print("Extracted genre:", genre_1)

Extracted genre: mystery_thriller_crime


## Read in the Cleaned Data CSV

In [7]:
# Build the input path from genre_1 (extracted from file_path_1 above) instead
# of hard-coding the genre, so that pointing file_path_1 at another genre also
# switches the file loaded here. The save cell further down derives its output
# path from genre_1 the same way; a hard-coded path here would let the two drift apart.
input_path = f"datasets/processed/goodreads_reviews_{genre_1}_with_links_flag.csv"
df = pd.read_csv(input_path)

# Quick sanity check: row count followed by the first few rows.
print("Post-Data Clean row count:", len(df),"\n")
print(df.head())


Post-Data Clean row count: 1685571 

                            user_id                         review_id  \
0  8842281e1d1347389f2ab93d60773d4d  5e212a62bced17b4dbe41150e5bb9037   
1  8842281e1d1347389f2ab93d60773d4d  2ede853b14dc4583f96cf5d120af636f   
2  8842281e1d1347389f2ab93d60773d4d  022bb6daffa49adc27f6b20b6ebeb37d   
3  8842281e1d1347389f2ab93d60773d4d  0e317947e1fd341f573192111bb2921d   
4  8842281e1d1347389f2ab93d60773d4d  4276918357312212384ac6415ceb9159   

                                         review_text  rating  \
0  I haven't read a fun mystery book in a while a...       3   
1  A fun, fast paced science fiction thriller. I ...       3   
2  An amazing and unique creation: JJ Abrams and ...       4   
3  The Name of the Rose is a thrilling Dan Brown-...       3   
4  ** spoiler alert ** \n Hooked me equally as we...       3   

                       date_added  n_votes  contains_link  
0  Mon Jul 24 02:48:17 -0700 2017        6          False  
1  Tue Nov 15 11:29

## Determine which rows have a rating but no text review

In [8]:
# Determine which rows have a rating but no text review
def rating_but_no_review(text, rating):
    """
    Tell whether a single review carries a usable rating but no written text.

    A rating is usable when it is not missing and is greater than zero.
    The text counts as absent when it is missing, is not a string, or is
    empty once leading/trailing whitespace has been stripped.

    :param text: the value pulled from the 'review_text' column
    :param rating: the numeric rating
    :return: True for "rated but no text", False otherwise
    """
    # Guard 1: without a positive, non-missing rating the row never qualifies.
    if pd.isna(rating) or rating <= 0:
        return False

    # Guard 2: missing text, or anything that is not a string, counts as "no review".
    if pd.isna(text) or not isinstance(text, str):
        return True

    # A real string: it is a review only if something survives stripping.
    stripped = text.strip()
    return stripped == ''
    

In [14]:
# Clean the text column before testing it: missing values become '', every
# value is cast to str, and surrounding whitespace is removed.
# Note: this overwrites df['review_text'] in place, so the cleaned text is what
# any later cell (including a later save of df) will see.
normalized_text = df['review_text'].fillna('')
normalized_text = normalized_text.astype(str).str.strip()
df['review_text'] = normalized_text

# Row-wise boolean masks
has_rating = df['rating'].notna() & df['rating'].gt(0)  # rating present and positive
no_text = df['review_text'].eq('')                      # nothing left after stripping

# Flag the rows that have a rating but no written review
df['rating_no_review'] = has_rating & no_text


# Report how many rows were flagged and preview a few of them
print("Number of reviews with a rating but no text review:", df['rating_no_review'].sum())
print(df.loc[df['rating_no_review']].head())

Number of reviews with a rating but no text review: 291
                                user_id                         review_id  \
17045  fea9b0a54f57f9be780c4e8404b388fb  3bf140bf5aa19174db227f970e9f3fdf   
25011  acea1d6a9e2df9c268fc65d6816909df  4bf0d1942bcf8e21edae732fef4608f2   
27532  82806811a06d3f90defce2254845533d  2ee1326a399e7bc363804a5a97043898   
34832  e6d58522010659d7b1dba59eda9c9be6  ffe45791cb44946bb4e190fa9c2b8eb7   
64544  785d9ac97b87aebef15d20a6e3d11ae9  36877ae8faaa8158a7ca689a3b6a5b7d   

      review_text  rating                      date_added  n_votes  \
17045                   2  Sun Mar 10 16:49:45 -0700 2013        2   
25011                   3  Sun Nov 18 14:16:11 -0800 2012        0   
27532                   4  Wed Sep 30 10:16:13 -0700 2015        0   
34832                   4  Sun Aug 18 18:22:29 -0700 2013        0   
64544                   1  Sat Jun 28 09:40:06 -0700 2014       21   

       contains_link  rating_no_review  plagiarized_exact  


In [10]:
# Save the dataset, which now also carries the 'rating_no_review' flag column.
# The output name is derived from genre_1 so it follows the input file's genre.
output_path = f"datasets/processed/goodreads_reviews_{genre_1}_with_links_flag_and_no_reviews_flag.csv"
# index=False keeps the DataFrame index out of the CSV.
df.to_csv(output_path, index=False)
# The message names the flag column exactly as it was created ('rating_no_review')
# and closes the quote around it; the previous text had an unbalanced quote and
# a column name that does not exist in df.
print(f"Saved dataset with 'contains_link' AND 'rating_no_review' columns to: {output_path}")

Saved dataset with 'contains_link' column to: datasets/processed/goodreads_reviews_mystery_thriller_crime_with_links_flag_and_no_reviews_flag.csv
