In [2]:
import html
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS

In [6]:
# Load the Amazon books metadata and review samples.
amzbks_metadata = pd.read_parquet('../amz_dataset/amazon_books_metadata_sample_20k.parquet')
amzbks_reviews = pd.read_parquet('../amz_dataset/amazon_books_reviews_sample_20k.parquet')

# Both frames carry a 'title' column; give each a distinct name before merging.
amzbks_metadata = amzbks_metadata.rename(columns={'title': 'book_title'})
amzbks_reviews = amzbks_reviews.rename(columns={'title': 'review_title'})

# Attach book metadata to every review (left join keeps all reviews), then
# keep only the first copy of each user + book + review-text combination.
amzbks = (
    amzbks_reviews
    .merge(amzbks_metadata, left_on='asin', right_on='parent_asin', how='left')
    .drop_duplicates(subset=['user_id', 'asin', 'text'], keep='first')
)

# Data preprocessing: drop rows with a null category_level_2_sub column
# (4092 rows, <1%) or a null review text column (111 rows, <1%).
amzbks = amzbks.dropna(subset=['category_level_2_sub', 'text'])

# Category level 2 is used as the genre.
# Sentiment analysis: is there a correlation between review sentiment and genre?
tqdm.pandas()
sia = SentimentIntensityAnalyzer()
amzbks['sentiment_score'] = amzbks['text'].progress_apply(
    lambda review: sia.polarity_scores(review)['compound']
)

# Per-genre mean, standard deviation and number of scored reviews.
genre_sentiment = (
    amzbks
    .groupby('category_level_2_sub')['sentiment_score']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)

100%|██████████| 715860/715860 [06:16<00:00, 1899.84it/s]


In [7]:
# Same sentiment summary (mean, std, count), grouped by the level-3 category instead.
lvl3_scores = amzbks.groupby('category_level_3_detail')['sentiment_score']
genre_sentiment_lvl3 = lvl3_scores.agg(['mean', 'std', 'count']).reset_index()

In [8]:
# Write the amzbks frame to CSV in the working directory, without the index column.
amzbks.to_csv(path_or_buf='amazon_books_full.csv', index=False)

In [9]:
# Which words or topics are most common in each genre?
# Stop words for the review word clouds: the wordcloud defaults plus
# 'book', 'br', 'S' and 'read' ('br' presumably left over from <br> markup).
stopwords = set(STOPWORDS) | {'book', 'br', 'S', 'read'}
def clean_text(text):
    """Normalise raw review/bio text into lowercase, letters-only words.

    Steps, in order:
      1. Decode HTML character references (``&amp;``, ``&quot;``, ``&nbsp;`` ...)
         so they do not survive as junk tokens such as "amp" or "quot".
      2. Replace HTML tags with a space.
      3. Replace every character that is not an ASCII letter or whitespace
         (digits, punctuation, symbols) with a space.
      4. Collapse whitespace runs, lowercase, and trim.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; may be empty.
    """
    # Decode entities first so escaped markup (e.g. "&lt;br&gt;") is seen as a
    # tag by the next step instead of leaking "lt", "br", "gt" into the output.
    text = html.unescape(text)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Build one word cloud per genre from the cleaned review text and save each
# image under word_clouds/.
os.makedirs('word_clouds', exist_ok=True)  # savefig does not create missing directories

for genre in amzbks['category_level_2_sub'].unique():
    review_txt = amzbks[amzbks['category_level_2_sub'] == genre]['text'].astype(str)
    text = ' '.join(review_txt.apply(clean_text))

    # WordCloud.generate raises ValueError when it has no words to draw, so
    # skip genres whose reviews are empty after cleaning.
    if not text.strip():
        continue

    wordcloud = WordCloud(width=800,
                          height=400,
                          background_color='white',
                          stopwords=stopwords).generate(text)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'{genre} - Most Common Words')
    # A '/' inside a genre name would be treated as a sub-directory of
    # word_clouds/; flatten it so the file always lands in the output folder.
    safe_genre = str(genre).replace('/', '_')
    fig.savefig(f'word_clouds/{safe_genre}_wordcloud.png')
    plt.close(fig)  # release the figure; many are created in this loop

In [9]:
# Exploring the author_about column for additional insights:
# one word cloud per genre built from the author bio text, saved under
# author_word_clouds/.
stopwords_author = set(STOPWORDS)
stopwords_author.update(['author', 'book', 'books', 'novel', 'novels', 'writer', 'wrote', 'published'])

os.makedirs('author_word_clouds', exist_ok=True)  # savefig does not create missing directories

for genre in amzbks['category_level_2_sub'].unique():
    # Get unique author_about texts for this genre (avoid duplicates from multiple reviews)
    author_texts = amzbks[amzbks['category_level_2_sub'] == genre]['author_about'].dropna().unique()

    # Clean and join
    text = ' '.join([clean_text(str(bio)) for bio in author_texts])

    # Skip genres with (almost) no bio text
    if len(text.strip()) < 10:
        continue

    wordcloud = WordCloud(width=800,
                          height=400,
                          background_color='white',
                          stopwords=stopwords_author,
                          max_words=100).generate(text)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'{genre} - Author Bio Common Words')
    # A '/' inside a genre name would be treated as a sub-directory of
    # author_word_clouds/; flatten it so the file always lands in the output folder.
    safe_genre = str(genre).replace('/', '_')
    fig.savefig(f'author_word_clouds/{safe_genre}_author_wordcloud.png')
    plt.close(fig)  # release the figure; many are created in this loop