<a href="https://colab.research.google.com/github/knrakshitha/sentiment_analyzer_amazon_reviews/blob/main/Sentiment_Analysis_in_Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run the NVIDIA CLI to report the GPU (if any) attached to this runtime.
!nvidia-smi

In [None]:
#Installing packages

!pip uninstall tb-nightly tensorboardX tensorboard  #uninstall to avoid version clashes
!pip install tf-nightly #installing the latest version

#importing libraries
import tensorflow as tf
import os
import tensorflow_datasets as tfds
import datetime

# Load the TensorBoard notebook extension into this IPython session.
%load_ext tensorboard

In [None]:
import pkg_resources

# Print the distribution behind every entry point registered in the
# 'tensorboard_plugins' group.
for plugin in pkg_resources.iter_entry_points(group='tensorboard_plugins'):
  print(plugin.dist)


tensorboard 2.4.1
tensorboard-plugin-wit 1.8.0


In [None]:
# Check which TensorFlow version is actually loaded in this kernel.
print(tf.__version__)

2.5.0-dev20210308


In [None]:
# Load the Mobile Electronics subset of the Amazon US reviews corpus.
# with_info=True makes tfds.load return the dataset metadata alongside the splits.
dataset, info = tfds.load(
    name='amazon_us_reviews/Mobile_Electronics_v1_00',
    with_info=True,
)
# Everything below works from the 'train' split.
train_data = dataset['train']

In [None]:
# Display the info object returned by tfds.load(..., with_info=True) to study the dataset.
info

In [None]:
# Number of records available in the train split.
print(len(train_data))

104975


In [None]:
BUFFER_SIZE = 3000  # size of the shuffle buffer (how many records are mixed at a time)
BATCH_SIZE = 128  # number of records to process at a time
SEED = 42  # fixed shuffle seed so the record order is reproducible across kernel restarts

# Shuffle once and keep that order for every later pass (reshuffle_each_iteration=False).
# The validation/training sets are later carved out of this dataset with take()/skip(),
# and those only stay disjoint if every iteration yields the same order.
# Start from dataset['train'] rather than from train_data itself so that re-running
# this cell does not stack another shuffle on top of an already transformed dataset.
train_data = dataset['train'].shuffle(BUFFER_SIZE, seed=SEED, reshuffle_each_iteration=False)

In [None]:
# A tf.data dataset has to be iterated to see its contents; show the first two raw records.
for record in train_data.take(2):
  print(record)

{'data': {'customer_id': <tf.Tensor: shape=(), dtype=string, numpy=b'38089071'>, 'helpful_votes': <tf.Tensor: shape=(), dtype=int32, numpy=2>, 'marketplace': <tf.Tensor: shape=(), dtype=string, numpy=b'US'>, 'product_category': <tf.Tensor: shape=(), dtype=string, numpy=b'Mobile_Electronics'>, 'product_id': <tf.Tensor: shape=(), dtype=string, numpy=b'B00I5X59EG'>, 'product_parent': <tf.Tensor: shape=(), dtype=string, numpy=b'558314514'>, 'product_title': <tf.Tensor: shape=(), dtype=string, numpy=b'26 Watt Bluetooth Vibration Speaker with Built-in Mic, Battery, NFC, Touch Control, 360 Degree Resonance'>, 'review_body': <tf.Tensor: shape=(), dtype=string, numpy=b"This is a great resonance speaker.  It was easy to pair using either NFC or standard Bluetooth paring, and clear instructions for both are included.  As with all vibration speakers, the sound quality will depend on the surface it is placed on.  My favorite surface is my glass-covered desk, but it also works great on a wood desk a

In [None]:
# Print only the review text (as raw bytes) of the first ten records.
for record in train_data.take(10):
  print(record['data'].get('review_body').numpy())
  

b"This is a great resonance speaker.  It was easy to pair using either NFC or standard Bluetooth paring, and clear instructions for both are included.  As with all vibration speakers, the sound quality will depend on the surface it is placed on.  My favorite surface is my glass-covered desk, but it also works great on a wood desk and a wood chair.  Padded surfaces work but provide a muffled sound.  (It's fun to experiment.  On our padded dining room table, I got improved results by putting it on top of an upside-down Corelle dinner plate.)<br /><br />This is not surround-sound stereo, and there can be some distortion.  It is heavy but for portable presentations, it is a lot better than bulky speaker sets.  This turns your cell phone into a very useful speakerphone for group conversations.  It also works with your tablet when using Skype, Google Hangouts, or FaceTime.<br /><br />It is a bit pricey, but there are not many options if you are looking for something with a built-in microphon

In [None]:
tokenizer = tfds.deprecated.text.Tokenizer()

# Collect every distinct token that appears in any review body.
vocabulary_set = set()
for record in train_data:
  body = record['data'].get('review_body').numpy()
  vocabulary_set.update(tokenizer.tokenize(body))

vocabulary_size = len(vocabulary_set)

In [None]:
# Assign an integer id to every vocabulary token. The vocabulary is sorted first
# because the iteration order of a set of strings differs between Python processes
# (string hash randomization). Without a stable order the token ids would change on
# every kernel restart, and a saved model could not be reused with a rebuilt encoder.
encoder = tfds.deprecated.text.TokenTextEncoder(sorted(vocabulary_set))

In [None]:
def encode(text_tensor, label_tensor):
  """Encode one review and binarize its star rating.

  Must run eagerly (it calls .numpy()), so it is meant to be wrapped in tf.py_function.

  Args:
    text_tensor: eager tensor holding the review text.
    label_tensor: eager tensor holding the star rating.

  Returns:
    (encoded_text, label): the token ids produced by the module-level `encoder`,
    and 1 where the rating is greater than 3, otherwise 0.
  """
  raw_text = text_tensor.numpy()
  is_positive = label_tensor > 3
  return encoder.encode(raw_text), tf.where(is_positive, 1, 0)

def encode_map(tensor):
  """Dataset.map adapter: turn one raw record into an (encoded review, label) pair.

  Args:
    tensor: a dataset element whose 'data' entry holds 'review_body' and 'star_rating'.

  Returns:
    (encoded_review, label): int64 token ids with static shape [None] and an int32
    scalar label, both computed by `encode`.
  """
  data = tensor['data']
  reviews_text = data.get('review_body')
  label = data.get('star_rating')

  # `encode` needs eager values (.numpy()), so it has to run through tf.py_function.
  encoded_review, label = tf.py_function(encode, inp=[reviews_text, label], Tout=[tf.int64, tf.int32])

  # tf.py_function returns tensors with unknown static shape; declare them so that
  # downstream steps (padded_batch, Keras layers) can rely on a known rank.
  encoded_review.set_shape([None])
  label.set_shape([])

  return encoded_review, label

In [None]:
# Apply encode_map to every record, giving a dataset of (encoded review, label) pairs.
encoded_data = train_data.map(encode_map)

In [None]:
# Peek at two encoded examples: the token-id sequence followed by its binary label.
for token_ids, sentiment in encoded_data.take(2):
  print(token_ids)
  print(sentiment)

tf.Tensor(
[57313 32353 70985 29209 33125 35292 20014  4215 58923  9273 24104 15863
  4530 72541 47904 26492 15282 36745 55104 65951 19472 22983 68718 35064
 28135 17849 12892 31123 11517 17650  3397 48149 70682 44094 36949 10226
  3397 38517 44773 32353 33794 10226 62253 41485 38517 32353 70849 11659
 28054 55386  5541 44773 31473 53873 29209 10226 70985  9654 55386 55104
 70985  9654 62359 57523 58305 42185  5541 56754 70985 72241 48149 20014
 70377 66585  9273  2785 50389  7685 66985 16960 14566 56553 56336 52762
 16852  2417 39688  9150 44773 10226 11302 60279 69959 28964 35956  1334
 38332 25975 31421 31421 57313 32353 47331 27483 48149 44132 55104 65225
 55901 26285 70296 21543 20014 32353 61906  5541 22983 45922 62794 44773
 32353 70985  3573 68092 63783 25172 35292 30830 57313  7244 22676 63560
  8785 52132 70985 16887  7738  2909 22983 31730 51628 20014 31473 53873
 12892 22676   661  5151 15863 68255 48423 51358 47904 57728 31421 31421
 20014 32353 70985 20115 45219  5541 652

In [None]:
TAKE_SIZE = 10000  # number of leading records held out for validation

# Reviews have different lengths: pad each one with 0s up to the longest review in
# its batch ([None]); labels are scalars and need no padding (()).
padded_shapes = ([None], ())

# Training set: everything after the held-out records, reshuffled every epoch.
train_data = encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=padded_shapes)

# Validation set: the first TAKE_SIZE records.
test_data = encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=padded_shapes)

# Size of the id space the embedding layer must cover. Taken from the encoder so it
# includes the padding id 0 and the encoder's out-of-vocabulary bucket. Assigning it
# (instead of `+= 1`) keeps this cell idempotent when it is re-run.
vocabulary_size = encoder.vocab_size

In [None]:
# Show how often each label value occurs in the first two validation batches.
for token_ids, batch_labels in test_data.take(2):
  unique_labels, positions, occurrences = tf.unique_with_counts(batch_labels)
  print(occurrences.numpy())

[88 40]
[40 88]


In [None]:
#creating the model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocabulary_size, 128))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [None]:
# Checkpoint callback: with save_best_only=True the file is only overwritten when
# the monitored metric improves; verbose=1 logs each decision.
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath='/tmp/sentiment_analyzer.hdf5',
    save_best_only=True,
    verbose=1,
)

In [None]:
# from_logits=True: the loss is told to expect raw, un-squashed model outputs.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train for 4 epochs, validating on test_data after each one and letting the
# checkpoint callback save along the way.
history = model.fit(
    train_data,
    validation_data=test_data,
    epochs=4,
    callbacks=[checkpointer],
)

Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.29935, saving model to /tmp/sentiment_analyzer.hdf5
Epoch 2/4

Epoch 00002: val_loss did not improve from 0.29935
Epoch 3/4

Epoch 00003: val_loss improved from 0.29935 to 0.29134, saving model to /tmp/sentiment_analyzer.hdf5
Epoch 4/4


In [None]:
# Save the model in its current state to an HDF5 file.
# NOTE(review): the recorded `ls` output in this notebook lists
# /tmp/final_sentiment_analyzer.hdf5 (0 bytes), not this filename. That output
# looks stale; confirm which name is intended.
model.save('/tmp/final_sentiment_analysis.hdf5')

In [None]:
# List the .hdf5 files in /tmp (long format, oldest first) to check what was written.
!ls -alrt /tmp/*.hdf5

-rw-r--r-- 1 root root         0 Mar 11 06:58 /tmp/final_sentiment_analyzer.hdf5
-rw-r--r-- 1 root root 118634552 Mar 11 07:15 /tmp/sentiment_analyzer.hdf5
