# SVM

In [1]:
import re
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import svm
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from textblob import TextBlob

# Both input CSVs live under the same S3 prefix; keep the prefix in one place
# so the two paths cannot drift apart.
_S3_DATA_PREFIX = 's3://sagemaker-studio-654654423427-qehmmj0rd4i'

# Book metadata file (read in the next cell).
books_file_path = _S3_DATA_PREFIX + '/books_data.csv'
# Per-review ratings file (read when building the feature table).
book_ratings_path = _S3_DATA_PREFIX + '/Books_rating.csv'


In [2]:
# Load the book metadata and normalise it in a single chain:
#   1. read only the columns this notebook uses and rename them to snake_case;
#   2. reduce the free-form publication date to its first 4-digit run (the year),
#      leaving NaN where no such run exists;
#   3. drop every row with a missing value in any remaining column
#      (this also removes the rows whose year could not be extracted);
#   4. store the year as an integer;
#   5. keep only books with at least 20 ratings.
# `.str.extract` is vectorised and evaluates the pattern once per row, instead of
# running the same regex twice inside a Python-level `apply`.
books = (
    pd.read_csv(
        books_file_path,
        usecols=["Title", "authors", "categories", "ratingsCount", "publishedDate"],
    )
    .rename(columns={'Title': 'title', 'publishedDate': 'date_published', 'ratingsCount': 'ratings_count'})
    .assign(date_published=lambda frame: frame["date_published"].str.extract(r'(\d{4})', expand=False))
    .dropna()
    .astype({'date_published': int})
    .loc[lambda frame: frame["ratings_count"] >= 20]
)

books.head(5)

Unnamed: 0,title,authors,date_published,categories,ratings_count
111,The Rabbi's Cat,['Joann Sfar'],2005,['Comics & Graphic Novels'],25.0
269,The Castle in the Attic,['Elizabeth Winthrop'],2012,['Juvenile Fiction'],21.0
368,Jean Paul Sartres No Exit and the Flies,['Jean-Paul Sartre'],1976,['Drama'],22.0
475,The Gods of Mars,['Edgar Rice Burroughs'],2020,['Fiction'],26.0
520,A Wise Woman,['Philippa Gregory'],1994,['Fiction'],21.0


In [3]:
# Load the individual reviews, rename the columns to snake_case and drop any
# review with a missing value in one of the selected columns.
book_ratings = (
    pd.read_csv(
        book_ratings_path,
        usecols=["User_id", "Title", "Price", "review/score", "review/text", "review/summary"],
    )
    .rename(columns={'User_id': 'user_id', 'Title': 'title', "Price": "price", 'review/score': 'rating', 'review/text': 'review_text', 'review/summary': 'review_summary'})
    .dropna()
)

# Per-title rating total and number of reviews, both taken from one groupby.
ratings_by_title = book_ratings.groupby('title')['rating']
book_ratings_sum = ratings_by_title.sum().to_frame('book_ratings_sum')
book_ratings_count = ratings_by_title.size().to_frame('book_ratings_count')

# Attach reviews to the filtered books. `keep=False` discards *every* row of a
# repeated (title, user_id) pair rather than keeping one of them.
# NOTE(review): the per-title sum/count above are computed before this
# de-duplication, so they still include the discarded rows - confirm intended.
reviews_with_stats = (
    books.merge(book_ratings, on='title', how='inner')
    .drop_duplicates(subset=['title', 'user_id'], keep=False)
    .merge(book_ratings_sum, on='title', how='left')
    .merge(book_ratings_count, on='title', how='left')
)

# Leave-one-out book average: the title's mean rating with the current review
# removed. For a title with a single review this is 0/0 (NaN), which is
# replaced by the mean rating over all loaded reviews.
leave_one_out_average = (
    (reviews_with_stats['book_ratings_sum'] - reviews_with_stats['rating'])
    / (reviews_with_stats['book_ratings_count'] - 1)
).fillna(book_ratings['rating'].mean())

# Final column order: the four model features followed by the target.
columns = ['price', 'full_review_sentiment_score', 'review_summary_sentiment_score', 'books_average_rating', 'rating']

# Derived columns are added with `assign` on the merged frame rather than by
# assigning into a column subset, which avoids pandas' SettingWithCopyWarning.
# The raw text columns are dropped by selecting only `columns` at the end.
merged_book_attributes = reviews_with_stats.assign(
    books_average_rating=leave_one_out_average,
    full_review_sentiment_score=reviews_with_stats['review_text'].apply(
        lambda review: TextBlob(review).sentiment.polarity
    ),
    review_summary_sentiment_score=reviews_with_stats['review_summary'].apply(
        lambda review_summary: TextBlob(review_summary).sentiment.polarity
    ),
).reindex(columns=columns)

merged_book_attributes.head()

Unnamed: 0,price,full_review_sentiment_score,review_summary_sentiment_score,books_average_rating,rating
0,16.93,0.130353,0.5,4.435897,4.0
1,16.93,0.186603,0.65,4.410256,5.0
2,16.93,0.169697,0.8,4.410256,5.0
3,16.93,0.173779,0.6,4.410256,5.0
4,16.93,0.126473,0.0,4.410256,5.0


In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
feature_names = ['price', 'full_review_sentiment_score', 'review_summary_sentiment_score', 'books_average_rating']
feature_frame = merged_book_attributes[feature_names]
target_frame = merged_book_attributes[['rating']]  # single-column frame

X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_frame,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Support-vector regressor with an RBF kernel, trained on the scaled features.
# The target is flattened to 1-D before being passed to `fit`.
train_targets = y_train.to_numpy().ravel()
clf = svm.SVR(kernel='rbf')
clf.fit(X_train_scaled, train_targets)

In [7]:
# Predict ratings for the held-out rows, using the test features scaled with
# the scaler fitted on the training split.
y_pred = clf.predict(X_test_scaled)

In [8]:
# Score the held-out predictions.
#   mse: mean squared error - the goal is 0.
#   r2:  coefficient of determination - positive, and the higher the better.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(
    f'Mean Squared Error: {mse}',
    f'R-squared Score: {r2}',
    sep='\n',
)

Mean Squared Error: 1.1232373474173065
R-squared Score: 0.21568034046148643
