In [None]:
#importing numpy and name it as npy
import numpy as npy

# importing tensor flow library and importing dataset for the project
import tensorflow_datasets as ten_ds
import tensorflow as ten_fw

# Turn off the TFDS progress bars so dataset download/preparation does not
# flood the notebook output.
ten_ds.disable_progress_bar()


In [None]:
# Load the IMDB movie-review dataset through TFDS.
# `as_supervised=True` requests (text, label) pairs and `with_info=True`
# additionally returns the dataset metadata object.
dataset, info = ten_ds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
tr_data_set = dataset['train']
tst_data_set = dataset['test']

# Show the structure of one training element.
tr_data_set.element_spec

In [None]:
# Report how many examples each split contains.
# The sizes are read from the TFDS metadata returned by `load(...,
# with_info=True)` instead of iterating over every record: the lookup is
# instant, and it keeps reporting example counts even if this cell is re-run
# after the datasets have been batched (a manual loop would then count
# batches, not rows).
num_rows_train = info.splits['train'].num_examples
num_rows_test = info.splits['test'].num_examples

print("Number of rows in train dataset:", num_rows_train)
print("Number of rows in test dataset:", num_rows_test)


In [None]:
# Preview the first five (review text, sentiment label) pairs of the training split.
for review, sentiment in tr_data_set.take(5):
    print(review.numpy(), sentiment.numpy())
    


In [None]:
# Preview the first five (review text, sentiment label) pairs of the test split.
for review, sentiment in tst_data_set.take(5):
    print(review.numpy(), sentiment.numpy())


In [None]:
# Data-quality check: count the reviews whose text is empty once surrounding
# whitespace has been stripped.
def _is_blank(text, label):
    """Return a boolean tensor that is True when `text` is empty after stripping."""
    return ten_fw.strings.strip(text) == ""


empty_rows_train = tr_data_set.filter(_is_blank)
empty_rows_test = tst_data_set.filter(_is_blank)

# Count by iterating the filtered datasets; only the number of matches matters.
print("Number of empty rows in train dataset:", sum(1 for _ in empty_rows_train))
print("Number of empty rows in test dataset:", sum(1 for _ in empty_rows_test))


In [None]:
# Print a single (text, label) pair from the training split.
# `take(1)` yields the first element the dataset produces; nothing is sampled
# at random here.
for example, label in tr_data_set.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

In [None]:
# Input-pipeline hyperparameters.
# BUFFER_SIZE is passed to `shuffle()` in the pipeline cell below: it is the
# size of the shuffle buffer, i.e. how many elements are held in memory and
# sampled from when randomizing the order of the training examples. (It is not
# the prefetch size; prefetching is configured separately with AUTOTUNE.)
# BATCH_SIZE is the number of examples grouped into one batch by `batch()`.
# Each training step processes one batch, so larger batches give smoother
# gradient estimates at the cost of more memory per step.
BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [None]:
# Build the input pipelines.
# Training data is shuffled (so the model does not see the reviews in a fixed
# order), grouped into batches and prefetched; test data is only batched and
# prefetched.
# NOTE: this cell rebinds `tr_data_set` / `tst_data_set`, so running it a
# second time would batch the already-batched datasets. Run it once per kernel.
tr_data_set = (
    tr_data_set
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(ten_fw.data.AUTOTUNE)
)
tst_data_set = (
    tst_data_set
    .batch(BATCH_SIZE)
    .prefetch(ten_fw.data.AUTOTUNE)
)

In [None]:
# Show the first three texts and labels of one shuffled training batch.
for example, label in tr_data_set.take(1):
  print('texts: ', example[:3].numpy())
  print()
  print('labels: ', label[:3].numpy())

In [None]:
# Show the first three texts and labels of one batch of the TEST dataset
# (the test pipeline is batched but not shuffled).
# `example` keeps pointing at this batch after the loop and is reused by the
# encoding cells further down.
for example, label in tst_data_set.take(1):
  print('texts: ', example[:3].numpy())
  print()
  print('labels: ', label[:3].numpy())

In [None]:
# Text vectorization: turn raw review strings into sequences of integer token
# ids so they can be fed to the model. `max_tokens` caps the vocabulary at
# VOCAB_SIZE entries.
VOCAB_SIZE = 1000
encoder = ten_fw.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
# The vocabulary is learned from the training text only, so the labels are dropped.
encoder.adapt(tr_data_set.map(lambda review, _sentiment: review))

In [None]:
# Retrieve the learned vocabulary as a NumPy array (so it can be indexed with
# arrays of token ids) and display its first twenty entries.
vocab = npy.asarray(encoder.get_vocabulary())
vocab[:20]

In [None]:
# Encode the whole `example` batch (left over from the batch-preview loop
# above) into integer token ids, then keep the first three encoded rows.
encoded_example = encoder(example).numpy()[:3]
encoded_example

In [None]:
# For the first three examples, print the original text next to the text
# rebuilt from its token ids via the vocabulary, to show what the encoding keeps.
for row in range(3):
  original_text = example[row].numpy()
  decoded_text = " ".join(vocab[encoded_example[row]])
  print("Original: ", original_text)
  print("After encoded the original dataset: ", decoded_text)
  print()

In [None]:
# Single-layer bidirectional LSTM classifier, assembled one layer at a time.
model = ten_fw.keras.Sequential()
# Raw strings -> sequences of integer token ids.
model.add(encoder)
# Token ids -> trainable 64-dimensional vectors. mask_zero=True enables
# masking so that variable sequence lengths are handled.
model.add(ten_fw.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    mask_zero=True))
# Bidirectional LSTM: reads the sequence in both directions.
model.add(ten_fw.keras.layers.Bidirectional(ten_fw.keras.layers.LSTM(64)))
# Hidden dense layer with ReLU activation.
model.add(ten_fw.keras.layers.Dense(64, activation='relu'))
# Output layer: one unit with no activation.
model.add(ten_fw.keras.layers.Dense(1))

In [None]:
# Print, for every layer of the model, whether it reports support for masking.
# Masking means ignoring certain timesteps of the input sequence during
# computation, based on a mask tensor.
print(list(map(lambda layer: layer.supports_masking, model.layers)))

In [None]:
# Run the model on a single hand-written review, on its own (no padding).
# In notebook order the model has not been compiled or trained yet at this
# point, so the printed value only demonstrates the prediction call; it is not
# a meaningful sentiment score.

sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(npy.array([sample_text]))
print(predictions[0])

In [None]:
# Predict again with the sample review batched together with a much longer
# text, which forces the shorter review to be padded.
# Only the prediction for `sample_text` (index 0) is printed; because the
# embedding layer uses mask_zero=True, it is expected to match the value from
# the unpadded run above — compare the two outputs to confirm.

padding = "the " * 2000
predictions = model.predict(npy.array([sample_text, padding]))
print(predictions[0])

In [None]:
# Configure training.
# loss:      binary cross-entropy computed directly on logits
#            (from_logits=True), because the last Dense(1) layer has no
#            activation.
# optimizer: Adam with a learning rate of 1e-4.
# metrics:   binary accuracy with threshold=0.0. The model outputs raw logits,
#            where 0.0 is the decision boundary (probability 0.5). The plain
#            'accuracy' string would threshold the logits at 0.5 and therefore
#            misreport accuracy. The metric is named 'accuracy' so the history
#            keys ('accuracy' / 'val_accuracy') stay the same.
model.compile(
    loss=ten_fw.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=ten_fw.keras.optimizers.Adam(1e-4),
    metrics=[ten_fw.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')],
)

In [None]:
# Train for 10 epochs with the loss and optimizer configured in compile().
# After every epoch the model is validated on the test pipeline;
# validation_steps=30 limits each validation pass to 30 batches.
# The per-epoch metrics are kept in `history` for plotting.
history = model.fit(
    x=tr_data_set,
    epochs=10,
    validation_data=tst_data_set,
    validation_steps=30,
)

In [None]:
# Evaluate the trained model on the full test dataset.
# evaluate() returns the loss followed by the compiled metric (accuracy).
test_loss, test_acc = model.evaluate(tst_data_set)

# These numbers come from the test split, so label them as test results
# (they were previously printed as "Train ...").
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

In [None]:
# importing matplotlib library for the graph and the graphs will be plotted against Epochs and metric
import matplotlib.pyplot as mat_plt


def plot_graphs(history, metric):
  """Plot a training metric per epoch, plus its validation curve when available.

  Draws on the current matplotlib axes.

  Args:
    history: object with a `history` mapping from metric name to a per-epoch
      sequence of values (e.g. the object returned by `model.fit`).
    metric: name of the training metric to plot, e.g. 'accuracy' or 'loss'.
      The validation series is looked up under 'val_' + metric.

  Raises:
    KeyError: if `metric` itself is not present in `history.history`.
  """
  series = history.history
  val_metric = 'val_' + metric

  mat_plt.plot(series[metric])
  legend_labels = [metric]
  # A model trained without validation data has no 'val_*' series; skip that
  # curve instead of failing with a KeyError.
  if val_metric in series:
    mat_plt.plot(series[val_metric], '')
    legend_labels.append(val_metric)

  mat_plt.xlabel("Epochs")
  mat_plt.ylabel(metric)
  mat_plt.legend(legend_labels)

In [None]:
# Training curves of the first model, side by side:
# left panel  - training vs. validation accuracy (y-axis capped at 1),
# right panel - training vs. validation loss (y-axis starting at 0).
mat_plt.figure(figsize=(16, 8))

mat_plt.subplot(121)
plot_graphs(history, 'accuracy')
mat_plt.ylim(top=1)

mat_plt.subplot(122)
plot_graphs(history, 'loss')
mat_plt.ylim(bottom=0)

In [None]:
# Generate a prediction for the given input text using the trained model.
# The result is stored in `predictions` and not displayed by this cell.
# The final layer has no activation, so the value is a raw logit rather than
# a probability.
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(npy.array([sample_text]))

In [None]:
# Second model: two stacked bidirectional LSTM layers, assembled layer by layer.
model = ten_fw.keras.Sequential()
# encoder: converts text into integer sequences.
model.add(encoder)
# Embedding: maps the integer-encoded tokens to dense vectors of dimension 64,
# with masking enabled.
model.add(ten_fw.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True))
# First recurrent layer returns the full sequence so a second one can be stacked on it.
model.add(ten_fw.keras.layers.Bidirectional(ten_fw.keras.layers.LSTM(64,  return_sequences=True)))
model.add(ten_fw.keras.layers.Bidirectional(ten_fw.keras.layers.LSTM(32)))
# Hidden dense layer with ReLU activation.
model.add(ten_fw.keras.layers.Dense(64, activation='relu'))
# Dropout with rate 0.5: randomly drops 50% of the previous layer's units during training.
model.add(ten_fw.keras.layers.Dropout(0.5))
# Output layer: a single unit with no activation, used for the sentiment prediction.
model.add(ten_fw.keras.layers.Dense(1))

In [None]:
# Configure training: Adam optimizer with a learning rate of 1e-4 and binary
# cross-entropy computed on logits (the last Dense(1) layer has no activation).
# Accuracy uses BinaryAccuracy with threshold=0.0, because 0.0 is the decision
# boundary for raw logits; the plain 'accuracy' string would threshold them at
# 0.5 and misreport accuracy. The metric is named 'accuracy' so the history
# keys ('accuracy' / 'val_accuracy') stay the same.
model.compile(
    loss=ten_fw.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=ten_fw.keras.optimizers.Adam(1e-4),
    metrics=[ten_fw.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')],
)

In [None]:
# Train the stacked-LSTM model on the training dataset for 10 epochs.
# After each epoch it is validated on the test pipeline, limited to 30 batches
# by validation_steps. The training history is stored in `history`
# (replacing the history of the first model).
history = model.fit(
    x=tr_data_set,
    epochs=10,
    validation_data=tst_data_set,
    validation_steps=30,
)

In [None]:
# Evaluate the trained stacked-LSTM model on the test dataset and report the test loss and test accuracy.
# evaluate() runs over the whole test pipeline and returns the loss followed
# by the compiled metric (accuracy).
test_loss, test_acc = model.evaluate(tst_data_set)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

In [None]:
# Predict on a single negative sample review, without padding, using the
# trained stacked-LSTM model.
# The output layer has no activation, so the printed value is a raw logit:
# values below 0 lean towards the negative class and values above 0 towards
# the positive class.
sample_text = ('The movie was not good. The animation and the graphics '
               'were terrible. I would not recommend this movie.')
predictions = model.predict(npy.array([sample_text]))
print(predictions)

In [None]:
# Training curves of the stacked-LSTM model: accuracy in the left panel and
# loss in the right panel, each with its validation counterpart.
mat_plt.figure(figsize=(16, 6))
for panel_index, metric_name in enumerate(('accuracy', 'loss'), start=1):
  mat_plt.subplot(1, 2, panel_index)
  plot_graphs(history, metric_name)