In [1]:
from sklearn.datasets import load_files

# Read the pre-processed IMDb reviews from disk. Both splits use the same
# settings: only the 'pos' and 'neg' categories, file contents decoded as
# UTF-8, and a seeded shuffle so the sample order is reproducible.
loadOptions = dict(categories=['pos', 'neg'],
                   shuffle=True,
                   load_content=True,
                   encoding='UTF-8',
                   random_state=42)
trainData = load_files('data/processed/train', **loadOptions)
testData = load_files('data/processed/test', **loadOptions)

# Unpack the review texts (x) and their labels (y) for each split.
xTrain, yTrain = trainData.data, trainData.target
xTest, yTest = testData.data, testData.target

In [3]:
from tensorflow.keras.layers import TextVectorization

# Upper bound on the vocabulary size; the embedding layer built later reuses
# this value as its input dimension.
tokens = 10000

# Text -> integer-sequence layer. Each review is mapped to word indices and
# brought to a fixed length of 500 (tokenization + padding).
v = TextVectorization(
    max_tokens=tokens,
    output_mode='int',
    output_sequence_length=500,
)

# Learn the vocabulary from the training texts only.
v.adapt(xTrain)

# Apply the fitted vectorizer to both splits.
xTrainTokens, xTestTokens = v(xTrain), v(xTest)

2025-05-16 01:01:16.176978: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2025-05-16 01:01:16.177190: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-05-16 01:01:16.177205: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-05-16 01:01:16.177240: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-16 01:01:16.177264: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

# Build the RNN: embedding -> bidirectional LSTM -> single sigmoid unit that
# outputs the probability of the positive class.
# `input_length` is intentionally not set on the Embedding layer: the sequence
# length is inferred from the input (the vectorizer emits 500 tokens per
# review), and the argument is deprecated in Keras 3.
model = Sequential()
model.add(Embedding(input_dim=tokens, output_dim=128))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

# Compile for binary classification, tracking accuracy and ROC AUC.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', 'AUC'])

# Train on the training set.
# NOTE(review): the test set is also used as validation data here, so the
# val_* metrics and the evaluation below are computed on the same samples.
# Consider holding out part of the training data (e.g. `validation_split`)
# if these curves are ever used for model selection or early stopping.
model.fit(xTrainTokens, yTrain, batch_size=64, epochs=5, validation_data=(xTestTokens, yTest))

# Evaluate on the held-out test set, then show the layer/parameter overview.
# The summary is printed after training so that the model is already built.
model.evaluate(xTestTokens, yTest)
model.summary()



Epoch 1/5


2025-05-16 01:01:26.689934: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 314ms/step - AUC: 0.7301 - accuracy: 0.6621 - loss: 0.5898 - val_AUC: 0.8885 - val_accuracy: 0.7660 - val_loss: 0.5049
Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 298ms/step - AUC: 0.9201 - accuracy: 0.8567 - loss: 0.3560 - val_AUC: 0.9067 - val_accuracy: 0.8037 - val_loss: 0.4287
Epoch 3/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 273ms/step - AUC: 0.9493 - accuracy: 0.8896 - loss: 0.2852 - val_AUC: 0.9225 - val_accuracy: 0.8523 - val_loss: 0.3931
Epoch 4/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 259ms/step - AUC: 0.9703 - accuracy: 0.9216 - loss: 0.2138 - val_AUC: 0.9234 - val_accuracy: 0.8528 - val_loss: 0.3740
Epoch 5/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 260ms/step - AUC: 0.9828 - accuracy: 0.9452 - loss: 0.1593 - val_AUC: 0.9205 - val_accuracy: 0.8516 - val_loss: 0.3977
[1m782/782[0m [32m━━━━

In [7]:
# Score a handful of hand-written reviews with the trained model.
sampleTexts = ["This movie was absolutely fantastic!",
               "I hated every minute of it.",
               "An emotional rollercoaster! The performances were heartfelt and the cinematography was stunning.",
               "I found the plot to be predictable and the characters lacked depth. Not a memorable film.",
               "It was an average experience—some scenes were entertaining, but the pacing was inconsistent.",
               "Brilliant direction and a captivating storyline! Easily one of the most compelling films this year.",
               "The soundtrack was incredible, but the acting left much to be desired.",
               "A beautifully shot movie with a script that kept me on the edge of my seat!",
               "An instant classic! The chemistry between the leads was palpable and the ending was perfect.",
               "The humor felt forced and the dialogue was awkward at times. Disappointing overall."]

# Vectorize and score all samples in a single batch instead of issuing one
# vectorizer call and one predict() call per text.
sampleTokens = v(sampleTexts)
predictions = model.predict(sampleTokens)

# enumerate() supplies the 1-based sample number directly; looking it up with
# list.index() would rescan the list each time and return the first match
# for duplicated texts.
for sampleNumber, (text, prediction) in enumerate(zip(sampleTexts, predictions), start=1):
    print(text)
    predLabel = prediction[0]  # the model has a single sigmoid output per sample
    print(f"Sample prediction {sampleNumber}: {predLabel:.4f} ({'neg' if predLabel < 0.5 else 'pos'})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 341ms/step
This movie was absolutely fantastic!
Sample prediction 1: 0.8725 (pos)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
I hated every minute of it.
Sample prediction 2: 0.2192 (neg)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
An emotional rollercoaster! The performances were heartfelt and the cinematography was stunning.
Sample prediction 3: 0.9647 (pos)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
I found the plot to be predictable and the characters lacked depth. Not a memorable film.
Sample prediction 4: 0.0656 (neg)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
It was an average experience—some scenes were entertaining, but the pacing was inconsistent.
Sample prediction 5: 0.4117 (neg)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Brilliant direction and a captivating storyline! Eas