# Sentiment Analysis using TensorFlow 2

The objective of this project is to understand how to process English sentences, apply NLP techniques, make a deep learning model understand the context of a sentence, and classify the sentiment that the sentence implies.

###### About the Dataset
IMDB reviews: the dataset is loaded from the `tensorflow_datasets` package.

# Importing the Modules

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

tf.random.set_seed(42)

import tensorflow_datasets as tfds
import warnings
warnings.filterwarnings('ignore')

# Loading the Dataset

In [2]:
# Load every split of the IMDB reviews dataset together with its metadata.
# as_supervised=True yields (review, label) pairs; with_info=True returns `info` as well.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
print(datasets.keys())



dict_keys(['test', 'train', 'unsupervised'])


In [3]:
# Read the number of examples of the two labelled splits from the dataset metadata.
train_size, test_size = (
    info.splits[split_name].num_examples for split_name in ("train", "test")
)

print(train_size, test_size)

25000 25000


# Exploring the Data

In [4]:
# Show the first two training reviews (first 100 characters each) with their labels.
# X_batch / y_batch stay bound after the loop and are reused by the preprocess check below.
for X_batch, y_batch in datasets["train"].batch(2).take(1):
    reviews, labels = X_batch.numpy(), y_batch.numpy()
    for review, label in zip(reviews, labels):
        snippet = review.decode("utf-8")[:100]
        verdict = "= Positive" if label else "= Negative"
        print("Review:", snippet, "...")
        print("Label:", label, verdict)
        print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside.  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things in ...
Label: 0 = Negative



# Defining the preprocess function

In [5]:
def preprocess(X_batch, y_batch):
    """Clean a batch of raw reviews and split each one into word tokens.

    Args:
        X_batch: batch of review strings.
        y_batch: labels of the batch; returned untouched.

    Returns:
        Tuple ``(tokens, y_batch)`` where ``tokens`` is a dense string tensor
        of words, with shorter reviews padded at the end with b"<pad>".
    """
    # Keep at most the first 300 units of every review (bytes, by tf.strings.substr's default).
    clipped = tf.strings.substr(X_batch, 0, 300)
    # Swap HTML line breaks such as <br> or <br /> for a single space.
    without_breaks = tf.strings.regex_replace(clipped, rb"<br\s*/?>", b" ")
    # Blank out every character that is not an ASCII letter or an apostrophe.
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    # Split on whitespace; reviews have different word counts, hence a RaggedTensor.
    tokens = tf.strings.split(letters_only)
    # Densify, filling the missing trailing positions with the b"<pad>" marker.
    return tokens.to_tensor(default_value=b"<pad>"), y_batch

In [6]:
# Quick check of preprocess on the X_batch / y_batch pair still bound from the exploration loop.
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

# Constructing the Vocabulary

In [7]:
from collections import Counter
vocabulary = Counter()

In [8]:
# Count how often each token occurs in the preprocessed training reviews.
# The counter is re-created here so that re-running this cell starts from zero
# instead of stacking a second pass of counts on top of the first one.
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(2).map(preprocess):
    # Each batch of two reviews is padded to the longer one, so b"<pad>" is counted too.
    for review in X_batch:
        vocabulary.update(review.numpy())

In [9]:
# Five most frequent tokens with their counts.
vocabulary.most_common(5)

[(b'<pad>', 63155),
 (b'the', 61137),
 (b'a', 38564),
 (b'of', 33983),
 (b'and', 33431)]

In [10]:
# Number of distinct tokens counted (the b"<pad>" filler is one of them).
len(vocabulary)

53893

# Truncating the Vocabulary

In [11]:
# There are more than 50,000 words in the vocabulary; keep only the 10,000 most common ones.
# b"<pad>" is pinned to position 0 explicitly: the Embedding layer is built with
# mask_zero=True, so ID 0 has to be the padding token no matter how its count
# compares with that of real words such as b"the".
vocab_size = 10000
pad_token = b"<pad>"
truncated_vocabulary = [pad_token] + [
    word for word, _ in vocabulary.most_common() if word != pad_token
][:vocab_size - 1]

# Creating a lookup table

In [12]:
# A model works on numbers, not words, so every word of truncated_vocabulary
# is paired with an integer ID equal to its position in that list.

# IDs 0 .. len(truncated_vocabulary) - 1, stored as int64.
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)

# The vocabulary words themselves, as a string tensor.
words = tf.constant(truncated_vocabulary)

# Initializer that feeds the (word -> ID) pairs into a lookup table.
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)


In [13]:
# Build the lookup table from the initializer. Words that are not in the
# vocabulary are assigned to one of the extra out-of-vocabulary (oov) buckets.

num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets=num_oov_buckets)

In [14]:
# Look up the IDs of a small hand-written sentence, passed as a batch of one.
sample_words = b"This movie was faaaaaantastic".split()
table.lookup(tf.constant([sample_words]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

###### The words “This”, “movie”, and “was” were found in the table, so their IDs are lower than 10,000, while the word “faaaaaantastic” was not found, so it was mapped to one of the out-of-vocabulary (oov) buckets, with an ID greater than or equal to 10,000.

# Creating the Final Train and Test sets

In [15]:
# Defining the function that encodes the words of a batch as integer IDs using the lookup table `table`.

def encode_words(X_batch, y_batch):
    """Replace every word in the batch with its ID from the lookup table.

    Args:
        X_batch: batch of word tokens to look up in `table`.
        y_batch: labels of the batch; returned untouched.

    Returns:
        Tuple ``(ids, y_batch)`` with the looked-up IDs in place of the words.
    """
    ids = table.lookup(X_batch)
    return ids, y_batch

In [16]:
# Training pipeline, step 1: cycle through the training split indefinitely,
# group the reviews into batches of 32 and run preprocess on every batch.
# NOTE(review): there is no shuffle() in this chain, so batches follow the dataset's own order — confirm this is intended.

train_set = (
    datasets["train"]
    .repeat()
    .batch(32)
    .map(preprocess)
)

In [17]:
# Final training pipeline: preprocess each batch of 32 reviews, encode the words
# as IDs, and prefetch one batch while the current one is being consumed.
# The chain is rebuilt from the raw split rather than from the previous value of
# train_set, so re-running this cell never applies encode_words to data that is
# already encoded.

train_set = (
    datasets["train"]
    .repeat()
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [18]:
# Same preprocessing and encoding for the test split, in batches of 10,000 reviews.

test_set = (
    datasets["test"]
    .batch(10000)
    .map(preprocess)
    .map(encode_words)
)

In [19]:
# Peek at one encoded training batch: the word IDs first, then the labels.
for X_batch, y_batch in train_set.take(1):
    print(X_batch, y_batch, sep="\n")

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


# Building the Model

In [20]:
embed_size = 128 #embedding size of each word 

In [21]:
# Embedding -> two stacked GRU layers -> one sigmoid unit for binary sentiment.
model = keras.models.Sequential([
    # One embedding row per possible ID: vocabulary words plus the oov buckets.
    # mask_zero=True makes the following layers ignore ID 0, which is the ID of
    # b"<pad>" only as long as it is the first entry of truncated_vocabulary.
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
               mask_zero=True,
               input_shape=[None]),
    # return_sequences=True so the second GRU receives the whole sequence.
    keras.layers.GRU(4, return_sequences=True),
    keras.layers.GRU(2),
    keras.layers.Dense(1, activation="sigmoid")
])

In [22]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as the metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Training and Testing the Model

In [23]:
import time
# Wall-clock timestamp taken before training starts; compared with `end` after fit().
start = time.time()

In [24]:
# train_set repeats forever, so steps_per_epoch is what bounds each epoch.
# The 32 here has to match the batch size used when train_set was built.
model.fit(train_set, steps_per_epoch=train_size // 32, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7eff555b0ac8>

In [25]:
end = time.time()

In [26]:
# Elapsed wall-clock seconds between the two timestamps.
# NOTE: start and end are taken in separate cells, so any pause between running them is included.
print("Time of execution:", end-start)

Time of execution: 281.828164100647


In [27]:
# Evaluate on the encoded test split; the result is [loss, accuracy], following the metrics given to compile().
model.evaluate(test_set)



[0.7501710653305054, 0.7394000291824341]