# Naive Bayes

In [6]:
import re
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import svm
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from sklearn.model_selection import train_test_split

# Input CSV locations on S3: one file of per-book metadata, one of per-review ratings.
# NOTE(review): the bucket name looks like it embeds an account identifier --
# confirm whether these paths should come from configuration instead.
books_file_path = "s3://sagemaker-studio-654654423427-qehmmj0rd4i/books_data.csv"
book_ratings_path = "s3://sagemaker-studio-654654423427-qehmmj0rd4i/Books_rating.csv"

In [7]:
# Load the book metadata (only the columns listed in `usecols`) and normalise
# the column names to snake_case.
books = pd.read_csv(books_file_path, usecols=["Title", "authors", "categories", "ratingsCount", "publishedDate"])
books.rename(columns={'Title': 'title', 'publishedDate': 'date_published', 'ratingsCount': 'ratings_count'}, inplace=True)

# Reduce every publication date to its first run of four consecutive digits.
# `str.extract` performs the regex search once per value in vectorised form and
# yields NaN for missing values or values without such a run, so those rows are
# removed by the `dropna()` below together with rows missing any other column.
books["date_published"] = books["date_published"].str.extract(r'(\d{4})', expand=False)
books = books.dropna()

# The extracted digits are still strings; convert them to integers.
books['date_published'] = books['date_published'].astype(int)
# Keep only the books whose ratings_count is at least 20.
books = books[books["ratings_count"] >= 20]
# Mapping from the raw review CSV headers to the snake_case names used below;
# its keys double as the list of columns to read.
ratings_column_names = {
    'User_id': 'user_id',
    'Title': 'title',
    "Price": "price",
    'review/score': 'rating',
    'review/text': 'review_text',
    'review/summary': 'review_summary',
}
# Read the reviews, rename the columns, and discard any row with a missing value.
book_ratings = (
    pd.read_csv(book_ratings_path, usecols=list(ratings_column_names))
    .rename(columns=ratings_column_names)
    .dropna()
)

# Per-title aggregates over every remaining review: the total of the ratings
# and the number of ratings, each as a one-column frame indexed by title.
ratings_by_title = book_ratings.groupby('title')['rating']
book_ratings_sum = ratings_by_title.sum().to_frame('book_ratings_sum')
book_ratings_count = ratings_by_title.size().to_frame('book_ratings_count')
# Attach every review to its book's metadata; only titles present in both
# frames survive the inner join.
merged_book_attributes = books.merge(book_ratings, how='inner', on='title')
# keep=False removes *all* rows of any (title, user_id) pair that occurs more
# than once, rather than keeping one representative.
merged_book_attributes = merged_book_attributes.drop_duplicates(subset=['title', 'user_id'], keep=False)

# Bring in the per-title rating total and rating count.
for per_title_stat in (book_ratings_sum, book_ratings_count):
    merged_book_attributes = merged_book_attributes.merge(per_title_stat, on='title', how='left')

# Leave-one-out average: the mean of the title's ratings excluding the rating
# of the current row. A title with a single rating gives 0/0 = NaN, which is
# replaced by the mean rating over all reviews.
other_ratings_total = merged_book_attributes['book_ratings_sum'] - merged_book_attributes['rating']
other_ratings_count = merged_book_attributes['book_ratings_count'] - 1
overall_mean_rating = book_ratings['rating'].mean()
merged_book_attributes["books_average_rating"] = (other_ratings_total / other_ratings_count).fillna(overall_mean_rating)
                                
def _polarity(text):
    """Return the TextBlob sentiment polarity of *text*."""
    return TextBlob(text).sentiment.polarity


# Score the full review text and the review summary separately.
merged_book_attributes = merged_book_attributes.assign(
    full_review_sentiment_score=merged_book_attributes['review_text'].apply(_polarity),
    review_summary_sentiment_score=merged_book_attributes['review_summary'].apply(_polarity),
)

# Keep only the model inputs plus the target, in a fixed column order; the raw
# text columns and the remaining metadata are dropped by this selection.
columns = ['price', 'full_review_sentiment_score', 'review_summary_sentiment_score', 'books_average_rating', 'rating']
merged_book_attributes = merged_book_attributes.reindex(columns=columns)

# Preview the first rows of the feature table.
merged_book_attributes.head()

Unnamed: 0,price,full_review_sentiment_score,review_summary_sentiment_score,books_average_rating,rating
0,16.93,0.130353,0.5,4.435897,4.0
1,16.93,0.186603,0.65,4.410256,5.0
2,16.93,0.169697,0.8,4.410256,5.0
3,16.93,0.173779,0.6,4.410256,5.0
4,16.93,0.126473,0.0,4.410256,5.0


In [8]:
# Convert the float ratings into five ordered classes so that a classifier can
# be trained on them: the intervals [0, 1], (1, 2], (2, 3], (3, 4], (4, 5] map
# to the labels 1..5 (include_lowest closes the first interval on the left).
bins = list(range(6))
labels = bins[1:]
merged_book_attributes['rating_category'] = pd.cut(
    merged_book_attributes['rating'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
# Features are every column except the raw rating and its class; the class is the target.
target_columns = ['rating', 'rating_category']
X = merged_book_attributes.drop(columns=target_columns)
y = merged_book_attributes['rating_category']

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Train a Gaussian Naive Bayes classifier on the scaled training features.
nb = GaussianNB()
nb.fit(X=X_train_scaled, y=y_train)


In [11]:
# Predict a rating class for every held-out row.
y_pred = nb.predict(X=X_test_scaled)

In [12]:
# Score the predictions. Besides the per-class report, the class labels are
# also compared numerically:
#   mse - mean squared difference between true and predicted label (goal is 0)
#   r2  - coefficient of determination (goal is a positive number, higher is better)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Per-class precision / recall / f1 first, then the two numeric scores.
print(classification_report(y_test, y_pred))
print(f'Mean Squared Error: {mse}')
print(f'R-squared Score: {r2}')

              precision    recall  f1-score   support

           1       0.40      0.42      0.41       536
           2       0.14      0.01      0.01       363
           3       0.23      0.06      0.10       689
           4       0.22      0.05      0.08      1527
           5       0.66      0.94      0.77      4837

    accuracy                           0.61      7952
   macro avg       0.33      0.29      0.27      7952
weighted avg       0.50      0.61      0.52      7952

Mean Squared Error: 1.6607142857142858
R-squared Score: -0.1596221102842803
