<h4> <b> Importing Required Libraries </b></h4>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.datasets import imdb

<h4> <b> Defining Hyperparameters </b> </h4>

In [2]:
vocabulary_size = 20000     # 20k - Maximum number of unique words to consider
sentence_length = 1000      # Maximum length of each review (take only first 1000 words from the review)
dimension = 256             # Dimension of the embedding vector for each word

<h4><b> Loading the IMDB Dataset </b></h4>

In [3]:
# load the imdb data from imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data(max_word=vocabulary_size)

In [4]:
# Combining training and test sets for custom train-validation split
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

print(X.shape, y.shape)

(50000,) (50000,)


In [5]:
X[:5]

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [6]:
y[:5]

array([1, 0, 0, 1, 0])

In [7]:
# Just checking the maximum and minimus length of the review
max_length = max([len(x) for x in X])
min_length = min([len(x) for x in X])
print(max_length)
print(min_length)

2494
7


<h4><b> Padding Sequences </b></h4>

In [8]:
from tensorflow.keras.utils import pad_sequences

In [9]:
X = pad_sequences(X, padding='pre', maxlen=sentence_length)
X.shape

(50000, 1000)

<h4> <b> Defining and Building the RNN Model </b> </h4>

In [10]:
"""
    Model structure:
    - Embedding layer: Converts word tokens into dense vectors
    - SimpleRNN layer: Basic recurrent layer with ReLU activation
    - Dense output layer: Single neuron with sigmoid for binary classification
"""

model = Sequential()
model.add(Embedding(input_dim=vocabulary_size, output_dim=dimension))
model.add(SimpleRNN(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Model with expected input shape
model.build(input_shape=(None, sentence_length))

In [11]:
model.summary()

<h4> <b> Set Up Early Stopping and Check for availability of GPUs </b> </h4>

In [12]:
from tensorflow.keras.callbacks import EarlyStopping

In [13]:
# Stop early if validation loss does not improve in consecutive 5 iterations
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [14]:
# check if gpu is availabel for use
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available: 1


<h4> <b> Model Compilation </b> </h4>

In [15]:
# Using Adam optimizer and binary crossentropy for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy', metrics =['accuracy'])

<h4> <b> Model Training </b> </h4>

In [25]:
history = model.fit(
    np.array(X.tolist()),
    np.array(y.tolist()),
    epochs = 10,
    batch_size = 64,
    validation_split = 0.2
    # callbacks = [early_stopping_callback]
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 91ms/step - accuracy: 0.9329 - loss: 0.1631 - val_accuracy: 0.8399 - val_loss: 0.4031
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 89ms/step - accuracy: 0.9417 - loss: 0.1450 - val_accuracy: 0.8338 - val_loss: 0.4358
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 88ms/step - accuracy: 0.9542 - loss: 0.1151 - val_accuracy: 0.8386 - val_loss: 0.4613
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 91ms/step - accuracy: 0.9627 - loss: 0.0969 - val_accuracy: 0.8428 - val_loss: 0.5054
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 91ms/step - accuracy: 0.9664 - loss: 0.0834 - val_accuracy: 0.8446 - val_loss: 0.5044
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 90ms/step - accuracy: 0.9737 - loss: 0.0684 - val_accuracy: 0.7957 - val_loss: 0.5921
Epoch 7/10
[1m6

In [None]:
history_df = pd.DataFrame(history.history)
history_df.to_csv('./../files/training_history.csv', index=False)
history_df.head()

Unnamed: 0,accuracy,loss,val_accuracy,val_loss
0,0.92515,0.175104,0.8399,0.403118
1,0.934625,0.16134,0.8338,0.435813
2,0.95,0.125903,0.8386,0.461294
3,0.95735,0.106517,0.8428,0.505435
4,0.963175,0.092602,0.8446,0.504429


In [26]:
loss, accuracy = model.evaluate(X, y, verbose=1)
print(f"\nFinal Evaluation on Full Dataset:")
print(f"Loss     : {loss:.4f}")
print(f"Accuracy : {accuracy:.4f}")

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 24ms/step - accuracy: 0.9861 - loss: 0.0453

Final Evaluation on Full Dataset:
Loss     : 0.1567
Accuracy : 0.9580


In [None]:
model.save('./../models/imdb_rnn_model.h5')
model.save('./../models/imdb_rnn_model.keras')

