## Part 3: BERT classification system for predicting ratings from reviews.

### Importing necessary package

In [5]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

# visualisation
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo

pyo.init_notebook_mode()  

Output hidden; open in https://colab.research.google.com to view.

### Loading the beauty dataset

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
df_beauty = pd.read_json('/content/drive/My Drive/Courses/AML/Assignment01/beauty_products_reviews.json', lines=True)
df_beauty.head(10)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
2,1,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,
3,5,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,
4,5,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,
5,5,False,"05 16, 2010",A24HQ2N7332W7W,B00006L9LC,{'Size:': ' 366'},Kindle Customer Joyce Wilson,"If you know the scent of Diva, you'll LOVE thi...",Diva is Heavenly,1273968000,,
6,5,False,"05 7, 2018",A2G90R2ZU6KU5D,B00006L9LC,{'Size:': ' Small'},Mike,Got this shampoo as a solution for my wife's d...,"Outstanding, no complains",1525651200,,
7,2,True,"05 7, 2018",A24W4W9E62FZP2,B00006L9LC,{'Size:': ' Small'},Reb,No change my scalp still itches like crazy. It...,No change my scalp still itches like crazy. It...,1525651200,,
8,1,True,"05 6, 2018",A7ID5H7FWLJHC,B00006L9LC,{'Size:': ' Small'},U. V.,Too expensive for such poor quality. There was...,Too expensive for such poor quality. There was...,1525564800,,
9,1,True,"05 6, 2018",AYKOSAJTP5AVS,B00006L9LC,{'Size:': ' Small'},Senthil Kumar M,"It dries my hair, doesnt help to reduce dandru...","Dries my hair, doesnt help to reduce dandruff....",1525564800,,


### Creating a new dataframe with reviews and rating

In [12]:
reviews_ratings_df = df_beauty[['reviewText', 'overall']]
reviews_ratings_df = reviews_ratings_df.rename(columns = {'reviewText': 'review', 'overall': 'rating'})
reviews_ratings_df

Unnamed: 0,review,rating
0,As advertised. Reasonably priced,5
1,Like the oder and the feel when I put it on my...,5
2,I bought this to smell nice after I shave. Wh...,1
3,HEY!! I am an Aqua Velva Man and absolutely lo...,5
4,If you ever want to feel pampered by a shampoo...,5
...,...,...
5264,I have genetic undereye darkness. Ive accepted...,5
5265,I absolutely love this eye gel.,5
5266,The eye gel is easy to apply and I use it morn...,5
5267,Ok this eye gel is good stuff.,5


In [13]:
reviews_ratings_df.describe(include = 'all')

Unnamed: 0,review,rating
count,5264,5269.0
unique,1363,
top,Great product,
freq,15,
mean,,4.771873
std,,0.743204
min,,1.0
25%,,5.0
50%,,5.0
75%,,5.0


In [14]:
rating_count = reviews_ratings_df['rating'].value_counts()
rating_count

5    4649
4     332
1     115
3     109
2      64
Name: rating, dtype: int64

In [35]:
fig = px.bar(rating_count, labels = {'index': 'rating', 'value' : 'count'}, title = 'Plotting Rating Count')

fig.update_traces(marker_line=dict(width=1, color='DarkSlateGray'))
fig.update_layout(showlegend= False)

fig.show(renderer='colab')

Almost 89% of the data has rating of 5. This implies that the data is heavily skewed towards positive ratings.

In [36]:
# Making a new numpy array of reviews and rating data

data = reviews_ratings_df[['review','rating']].to_numpy()
print(data.shape)
print(data)

(5269, 2)
[['As advertised. Reasonably priced' 5]
 ['Like the oder and the feel when I put it on my face.  I have tried other brands but the reviews from people I know they prefer the oder of this brand. Not hard on the face when dry.  Does not leave dry skin.'
  5]
 ['I bought this to smell nice after I shave.  When I put it on I smelled awful.  I am 19 and I smelled like a grandmother with too much perfume.'
  1]
 ...
 ['The eye gel is easy to apply and I use it morning and night. It is cool to the touch, and the dispenser is ingenious.'
  5]
 ['Ok this eye gel is good stuff.' 5]
 ['This is the first eye gel/cream that actually does what it said it was gonna do.'
  5]]


In [37]:
# splitting the dataset into training and test set (70-30)

data_train, data_test = train_test_split(data, test_size=0.3,random_state=25, shuffle=True)
print(data_train.shape, data_test.shape)
print(data_train)
print()
print(data_test)

(3688, 2) (1581, 2)
[["I'm giving this three stars because it does work but the results do not justify the price.\n\nI've been using this for over two months. Before I started I couldn't even get my lashes on a curler. They were that short. Within two and a half weeks I could see a difference. Within a month I could get my lashes on a curler.\n\nThis product did make my eyelids itchy. No irritation in my eye though when I accidentally got some in it.  I also noticed a weird looking line above my lahses that is lighter than the rest of my skin.\n\nI've used this as directed.  I did see improvement but I think there is a natural plateau to the length and amount of eyelashes that one's DNA will allow. I've hit mine.  Overall a good product but I can't see spending the money on it again (I got mine for 60% off and still can't justify it)."
  3]
 ['The shampoo is very watery, and I did not see too much of a difference in my dandruff.'
  2]
 ['I received my order on time, and the products we

In [38]:
# taking x as reviews and y as rating

x_train = data_train[:,0]
print(x_train.shape)
print(x_train)

y_train = data_train[:,1]
print(y_train.shape)
print(y_train)

x_test = data_test[:,0]
print(x_test.shape)
print(x_test)

y_test = data_test[:,1]
print(y_test.shape)
print(y_test)


(3688,)
["I'm giving this three stars because it does work but the results do not justify the price.\n\nI've been using this for over two months. Before I started I couldn't even get my lashes on a curler. They were that short. Within two and a half weeks I could see a difference. Within a month I could get my lashes on a curler.\n\nThis product did make my eyelids itchy. No irritation in my eye though when I accidentally got some in it.  I also noticed a weird looking line above my lahses that is lighter than the rest of my skin.\n\nI've used this as directed.  I did see improvement but I think there is a natural plateau to the length and amount of eyelashes that one's DNA will allow. I've hit mine.  Overall a good product but I can't see spending the money on it again (I got mine for 60% off and still can't justify it)."
 'The shampoo is very watery, and I did not see too much of a difference in my dandruff.'
 'I received my order on time, and the products were not broken which I was

In [40]:
# one-hot encoding of y 

enc = OneHotEncoder()
enc.fit(y_train.reshape(-1,1))
print(f"Categories: {enc.categories_}")
y_train_onehot = enc.transform(y_train.reshape(-1,1)).toarray()
print(y_train)
print(y_train_onehot)


print()


enc.fit(y_test.reshape(-1,1))
print(f"Categories: {enc.categories_}")
y_test_onehot = enc.transform(y_test.reshape(-1,1)).toarray()
print(y_test)
print(y_test_onehot)


Categories: [array([1, 2, 3, 4, 5], dtype=object)]
[3 2 4 ... 5 5 5]
[[0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]

Categories: [array([1, 2, 3, 4, 5], dtype=object)]
[5 5 5 ... 5 4 5]
[[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
