# Load Dataset - Mysteries, Thrillers, & Crimes

In [1]:
pip install langdetect

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os
from langdetect import detect, DetectorFactory
# Fix langdetect's seed once, up front, so that repeated runs of the notebook
# classify the same review text the same way.
DetectorFactory.seed = 0  # make results deterministic

# Location of the raw Goodreads review file for this genre.
file_path_1 = 'datasets/raw/goodreads_reviews_mystery_thriller_crime.json'

# Derive the genre label from the file name: drop the directory, then the
# extension, then the shared "goodreads_reviews_" text.
# e.g. "goodreads_reviews_mystery_thriller_crime.json" -> "mystery_thriller_crime"
base_name_1 = os.path.basename(file_path_1)
_stem, _extension = os.path.splitext(base_name_1)
genre = _stem.replace('goodreads_reviews_', '')

print("Extracted genre:", genre)

Extracted genre: mystery_thriller_crime


In [3]:
# Columns kept for the cleaned dataset; every other field in the raw file is dropped.
required_columns = ['user_id', 'review_id','review_text', 'rating', 'date_added', 'n_votes']

# The raw file is read as JSON Lines (lines=True) and narrowed to the
# required columns in a single step.
df = pd.read_json(file_path_1, lines=True)[required_columns]

# Sanity check: preview the first rows, then report the starting size.
print(df.head())
print("Initial row count:", df.shape[0])

                            user_id                         review_id  \
0  8842281e1d1347389f2ab93d60773d4d  5e212a62bced17b4dbe41150e5bb9037   
1  8842281e1d1347389f2ab93d60773d4d  2ede853b14dc4583f96cf5d120af636f   
2  8842281e1d1347389f2ab93d60773d4d  8e4d61801907e591018bdc3442a9cf2b   
3  8842281e1d1347389f2ab93d60773d4d  022bb6daffa49adc27f6b20b6ebeb37d   
4  8842281e1d1347389f2ab93d60773d4d  0e317947e1fd341f573192111bb2921d   

                                         review_text  rating  \
0  I haven't read a fun mystery book in a while a...       3   
1  A fun, fast paced science fiction thriller. I ...       3   
2           http://www.telegraph.co.uk/culture/10...       0   
3  An amazing and unique creation: JJ Abrams and ...       4   
4  The Name of the Rose is a thrilling Dan Brown-...       3   

                       date_added  n_votes  
0  Mon Jul 24 02:48:17 -0700 2017        6  
1  Tue Nov 15 11:29:22 -0800 2016       22  
2  Tue Nov 01 11:09:18 -0700 2016        

# Clean Up Data

### Filter out rows where review_text is null, empty, or only whitespace. These rows contain no free text, so they are out of scope for this review analysis.


In [4]:
# A review is usable only if its text is present and is more than whitespace.
# Both conditions are combined into one boolean mask and applied once.
review_text = df['review_text']
has_text = review_text.notnull() & (review_text.str.strip() != '')
df = df[has_text]

### Each Entry must have: user_id, review_id, date_added

In [5]:
# Every review must carry these identifying fields; rows missing any one of
# them are removed from the dataset.
identifier_columns = ['user_id', 'review_id', 'date_added']
df = df.dropna(axis=0, how='any', subset=identifier_columns)

# Report the remaining row count to confirm the filter ran.
print("Pre-Data Clean row count:", df.shape[0])


Pre-Data Clean row count: 1848852


### Remove Duplicate Reviews -- Tech Error (same user_id, review_id, and review_text)

In [6]:
# Exact duplicate reviews (same user, same review id, same text) are treated as an
# error on the Goodreads side -- flag to tech; this is separate from review data quality.
# The first occurrence of each duplicate group is kept.
duplicate_key_columns = ['user_id', 'review_id', 'review_text']
df = df.drop_duplicates(subset=duplicate_key_columns, keep='first')

### Remove reviews whose text is not in English - out of scope for now, though supporting non-English reviews could be future work


In [7]:
# Remove Reviews that contain text that is not in English - Though tailoring to non-English reviews could be future work

def review_is_english(text):
    """
    Report whether a review is written in English.

    Only the first 200 characters are passed to langdetect's ``detect`` to
    keep detection fast on large datasets.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    bool
        True when the sampled text is detected as English ('en').
        False for non-string input (e.g. None or NaN), for empty or
        whitespace-only strings, and when detection raises an error.
    """
    # Nothing to classify: missing values and blank strings are never English.
    # The isinstance check also keeps NaN/None from hitting str methods.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        sample = text[:200]  # limit to first 200 chars
        return detect(sample) == 'en'
    except Exception:
        # Detection failed for this text (presumably it has no usable language
        # features, e.g. only a URL or digits); treat it as not English.
        # `Exception` rather than a bare `except` so that KeyboardInterrupt
        # still stops a long-running apply instead of being swallowed.
        return False
    
# Classify every review. This is the slow step: each row's text is run
# through review_is_english.
print("Detecting languages... (this may take a while for large datasets)")

# Boolean Series aligned with df: True where the review was detected as English.
mask_english = df['review_text'].apply(review_is_english)

# Keep only the English rows as the cleaned dataset.
df_clean = df.loc[mask_english]

print("Post-Data Clean row count:", df_clean.shape[0])

Detecting languages... (this may take a while for large datasets)
Post-Data Clean row count: 1685280


# Save Cleaned Data

In [8]:
# Save cleaned CSV with genre in filename
output_path = f"datasets/cleaned/goodreads_reviews_{genre}_clean.csv"

# to_csv does not create missing parent directories; make sure the output
# folder exists so a fresh checkout does not fail after the slow detection step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False: the DataFrame index is not written as a column.
df_clean.to_csv(output_path, index=False)

print(f"Saved cleaned CSV to: {output_path}")


Saved cleaned CSV to: datasets/cleaned/goodreads_reviews_mystery_thriller_crime_clean.csv
