In [23]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud

In [24]:
# Load the Amazon Books sample files: one parquet of book metadata, one of reviews.
metadata_path = '../amz_dataset/amazon_books_metadata_sample_20k.parquet'
reviews_path = '../amz_dataset/amazon_books_reviews_sample_20k.parquet'

amzbks_metadata = pd.read_parquet(metadata_path)
amzbks_reviews = pd.read_parquet(reviews_path)

In [25]:
# Combine the reviews with the book metadata.
# Each frame's 'title' column gets a distinct name first so the two stay
# distinguishable after the join.
amzbks_metadata = amzbks_metadata.rename(columns={'title': 'book_title'})
amzbks_reviews = amzbks_reviews.rename(columns={'title': 'review_title'})

# Left join: every review row is kept, with metadata columns attached where a match exists.
# NOTE(review): this matches the review's 'asin' against the metadata's 'parent_asin' --
# confirm these are the intended join keys for this dataset.
amzbks = amzbks_reviews.merge(
    amzbks_metadata,
    how='left',
    left_on='asin',
    right_on='parent_asin',
)

# Optional export of the merged frame (disabled):
# amzbks.to_csv('../amz_dataset_og/amazon_books_full.csv', index=False)

In [26]:
# Data preprocessing: keep only rows that have both a genre and a review text.
#   - rows with a null 'category_level_2_sub' column are removed (author's count: 4092, <1%)
#   - rows with a null 'text' column are removed (author's count: 111, <1%)
has_genre = amzbks['category_level_2_sub'].notna()
has_review_text = amzbks['text'].notna()

# .copy() makes the filtered frame independent of the merged one, so adding columns
# to it in later cells does not raise pandas' SettingWithCopyWarning.
amzbks = amzbks.loc[has_genre & has_review_text].copy()

In [None]:
# Sentiment analysis: is there a correlation between review sentiment and genre?
# 'category_level_2_sub' is used as the genre.

# Build the analyzer once and reuse it for every review; constructing a new
# SentimentIntensityAnalyzer per row repeats its setup work for each review.
sentiment_analyzer = SentimentIntensityAnalyzer()

# VADER's 'compound' value is the single overall score kept per review.
# assign() returns a new frame, so no column is written onto a filtered slice.
amzbks = amzbks.assign(
    sentiment_score=amzbks['text'].apply(
        lambda review_text: sentiment_analyzer.polarity_scores(review_text)['compound']
    )
)

# Per-genre summary: mean and spread of the score, plus how many reviews back it.
genre_sentiment = (
    amzbks.groupby('category_level_2_sub')['sentiment_score']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)

In [None]:
# Visualization 1: horizontal bar chart of average review sentiment by genre.
# genre_sentiment has a plain integer index (it was reset_index()-ed), so the genre
# column is made the index here; otherwise the bars are labelled with row numbers
# instead of genre names.
mean_sentiment_by_genre = (
    genre_sentiment
    .set_index('category_level_2_sub')['mean']
    .sort_values()
)

fig, ax = plt.subplots(figsize=(10, 6))
mean_sentiment_by_genre.plot(kind='barh', ax=ax)
ax.set_xlabel('Average Sentiment Score')
ax.set_ylabel('Genre')
ax.set_title('Average Review Sentiment by Genre')
fig.tight_layout()
plt.show()

In [None]:
# What words or topics are most common by genre?
# Renders one word cloud per genre and saves it as a PNG under ../eda/.
wordcloud_dir = Path('../eda')
# savefig does not create missing directories, so make sure the target exists.
wordcloud_dir.mkdir(parents=True, exist_ok=True)

# Genre names go into the file name; path separators and other characters that are
# reserved in file names are replaced so a genre cannot break or redirect the path.
unsafe_filename_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

for genre in amzbks['category_level_2_sub'].unique():
    genre_reviews = amzbks.loc[amzbks['category_level_2_sub'] == genre, 'text']
    text = ' '.join(genre_reviews.astype(str))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    # The title keeps the original genre name; only the file name is sanitised.
    ax.set_title(f'{genre} - Most Common Words')
    safe_genre = str(genre).translate(unsafe_filename_chars)
    fig.savefig(wordcloud_dir / f'{safe_genre}_wordcloud.png')
    # Close this figure explicitly so figures do not accumulate across the loop.
    plt.close(fig)

In [None]:
# author_about section