In [1]:
import pandas as pd
import re
import os

file_path_1 = 'datasets/raw/goodreads_reviews_mystery_thriller_crime.json'

# Extract genre from the filename; ex: "goodreads_reviews_mystery_thriller_crime.json"
base_name_1 = os.path.basename(file_path_1)
genre_1 = os.path.splitext(base_name_1)[0].replace('goodreads_reviews_', '')

print("Extracted genre:", genre_1)

Extracted genre: mystery_thriller_crime


## Read in Cleaned Data CSV: 

In [2]:
df = pd.read_csv('datasets/cleaned/goodreads_reviews_mystery_thriller_crime_clean.csv')

print("Post-Data Clean row count:", len(df),"\n")
print(df.head())


Post-Data Clean row count: 1685280 

                            user_id                         review_id  \
0  8842281e1d1347389f2ab93d60773d4d  5e212a62bced17b4dbe41150e5bb9037   
1  8842281e1d1347389f2ab93d60773d4d  2ede853b14dc4583f96cf5d120af636f   
2  8842281e1d1347389f2ab93d60773d4d  022bb6daffa49adc27f6b20b6ebeb37d   
3  8842281e1d1347389f2ab93d60773d4d  0e317947e1fd341f573192111bb2921d   
4  8842281e1d1347389f2ab93d60773d4d  4276918357312212384ac6415ceb9159   

                                         review_text  rating  \
0  I haven't read a fun mystery book in a while a...       3   
1  A fun, fast paced science fiction thriller. I ...       3   
2  An amazing and unique creation: JJ Abrams and ...       4   
3  The Name of the Rose is a thrilling Dan Brown-...       3   
4  ** spoiler alert ** \n Hooked me equally as we...       3   

                       date_added  n_votes  
0  Mon Jul 24 02:48:17 -0700 2017        6  
1  Tue Nov 15 11:29:22 -0800 2016       22  
2  W

## Determine which rows have link in it - Add link data to Dataset

In [3]:
# Determine which rows have a review that contain a link. This could be potential spam or fraud links, and therefore, these reviews should be removed from the platform: 
def review_has_link(text):
    """
    This function determines whether the free-form text in the review contains a link within the text or not. 
    The function then flags whether or not it does, by adding a column to the original dataset containing boolean data regarding whether there "is_link" or not
    
    :param text: the string pulled from the 'review_text' column
    :return: 
    """
    if pd.isna(text) or not isinstance(text, str):
        return False
    
    link_patterns_regex = [
    r'http[s]?://[^\s]+',   # http:// or https://
    r'www\.[^\s]+',         # www.something
    r'\b[^\s]+\.com\b',     # something.com as a whole word
    r'\b[^\s]+\.org\b',     # something.org as a whole word
    r'\b[^\s]+\.net\b',     # something.net as a whole word
    ]
    
    # Combine all patterns into one OR pattern
    combined_pattern = "|".join(link_patterns_regex)
    
    # Search for any link-like pattern in the text of the review for each review
    return bool(re.search(combined_pattern, text, flags=re.IGNORECASE))
    

In [4]:
# Apply the function to create a new column
df['contains_link'] = df['review_text'].apply(review_has_link)

# Inspect how many rows contain links
print("Number of reviews containing links:", df['contains_link'].sum())
print(df[['review_text', 'contains_link']].head())

# Save the dataset with the new column
output_path = f"datasets/processed/goodreads_reviews_{genre_1}_with_links_flag.csv"
df.to_csv(output_path, index=False)
print(f"Saved dataset with 'contains_link' column to: {output_path}")

Number of reviews containing links: 51380
                                         review_text  contains_link
0  I haven't read a fun mystery book in a while a...          False
1  A fun, fast paced science fiction thriller. I ...          False
2  An amazing and unique creation: JJ Abrams and ...          False
3  The Name of the Rose is a thrilling Dan Brown-...          False
4  ** spoiler alert ** \n Hooked me equally as we...          False
Saved dataset with 'contains_link' column to: datasets/processed/goodreads_reviews_mystery_thriller_crime_with_links_flag.csv


## Print sample lines where there are links to confirm validity of flag 

In [5]:
# Show the first 5 rows that have links
print(df[df['contains_link']][['review_text', 'contains_link']].head())

                                           review_text  contains_link
305  FIVE-STAR CLARION REVIEW: "An Ordinary Tragedy...           True
403  Key to a Murder by Vicki Vass is a cozy myster...           True
404  Didn't see this one coming! \n The Corpse with...           True
409  A continuation of the Cait Morgan Mysteries, T...           True
416  Note: I received an Advanced Readers' Copy (AR...           True
