# Imports and Preprocessing

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import imdb
import numpy as np
import json

In [2]:
# Load IMDb dataset
max_words = 10000  # Number of most frequent words to consider
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words)

In [3]:
print(f"Training data shape: {x_train.shape}, Testing data shape: {x_test.shape}")

Training data shape: (25000,), Testing data shape: (25000,)


In [4]:
# Pad the sequences to make sure all inputs are of the same length
max_sequence_length = 500 

x_train = pad_sequences(x_train, maxlen=max_sequence_length)
x_test = pad_sequences(x_test, maxlen=max_sequence_length)

# Check the shape of the data after padding
print(f"Padded Training data shape: {x_train.shape}, Padded Testing data shape: {x_test.shape}")

Padded Training data shape: (25000, 500), Padded Testing data shape: (25000, 500)


# Training and Evaluation

## FNN model

In [5]:
# Model architecture
model = models.Sequential([
    layers.Embedding(input_dim=max_words, output_dim=128),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')  # Sigmoid for binary classification (positive/negative)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summarize the model architecture
model.summary()

In [6]:
# Train the model
history = model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 17ms/step - accuracy: 0.5328 - loss: 0.6793 - val_accuracy: 0.7984 - val_loss: 0.4623
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 17ms/step - accuracy: 0.7995 - loss: 0.4404 - val_accuracy: 0.8254 - val_loss: 0.3820
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.8650 - loss: 0.3211 - val_accuracy: 0.8212 - val_loss: 0.3747
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.8709 - loss: 0.3013 - val_accuracy: 0.8800 - val_loss: 0.3026
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9010 - loss: 0.2519 - val_accuracy: 0.8780 - val_loss: 0.3010


In [7]:
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)
print(f"Test Accuracy: {test_acc:.4f}")

782/782 - 2s - 3ms/step - accuracy: 0.8683 - loss: 0.3119
Test Accuracy: 0.8683


In [94]:
# load imdb vocab for the tokens
word_index = imdb.get_word_index()
word_index = {k:v for k,v in word_index.items() if v<max_words}

In [83]:
# save tokenization mapping
with open('word_index.json', 'w') as f:
    json.dump(word_index, f, indent=4)

In [95]:
index_to_word = {v:k for k,v in word_index.items()}

def decode(enc):
    return ' '.join([index_to_word.get(i, '?') for i in enc if i!=0])

tests = x_test[0]
print(f"Exemple of negative review: '{decode(test)}'")
print(f"Real Sentiment: {'Positive' if y_test[0] > 0.5 else 'Negative'}")
pred = model.predict(test.reshape(1,-1))
print(f"Prediction: {'Positive' if pred > 0.5 else 'Negative'}")

Exemple of negative review: 'the as you world's is quite br mankind most that quest are chase to being quickly of little it time hell to plot br of something long put are of every place this consequence and of interplay storytelling being nasty not of you warren in is failed club i i of films pay so sequences and film okay uses to received and if time done for room sugar viewer as cartoon of gives to forgettable br be because many these of reflection sugar contained gives it wreck scene to more was two when had find as you another it of themselves probably who interplay storytelling if itself by br about 1950's films not would effects that her box to miike for if hero close seek end is very together movie of wheel got say kong sugar fred close bore there is playing lot of and pan place trilogy of lacks br of their time much this men as on it is telling program br silliness okay and to frustration at corner and she of sequences to political clearly in of drugs keep guy i i was throwing 

In [96]:
test = x_test[1]
print(f"Exemple of postive review: '{decode(test)}'")
print(f"Real Sentiment: {'Positive' if y_test[1] > 0.5 else 'Negative'}")
pred = model.predict(test.reshape(1,-1))
print(f"Prediction: {'Positive' if pred > 0.5 else 'Negative'}")

Exemple of postive review: 'the as you world's is quite br mankind most that quest are chase to being quickly of little it time hell to plot br of something long put are of every place this consequence and of interplay storytelling being nasty not of you warren in is failed club i i of films pay so sequences and film okay uses to received and if time done for room sugar viewer as cartoon of gives to forgettable br be because many these of reflection sugar contained gives it wreck scene to more was two when had find as you another it of themselves probably who interplay storytelling if itself by br about 1950's films not would effects that her box to miike for if hero close seek end is very together movie of wheel got say kong sugar fred close bore there is playing lot of and pan place trilogy of lacks br of their time much this men as on it is telling program br silliness okay and to frustration at corner and she of sequences to political clearly in of drugs keep guy i i was throwing r

In [49]:
import spacy
# !python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [110]:
def text_to_vect(text):
    # Or: tokens = text.lower().split()
    tokens = nlp(text.lower(), disable=["ner", "parser", "senter"])
    tokens = [t.text for t in tokens if not t.is_punct]
    seq = [word_index[word] for word in tokens if word in word_index]
    return np.array(seq)

In [113]:
reviews = ["This movie was fantastic! Masterpiece!!!! Best thing ever.",
           "this was a fucking disaster. shame on anyone who worked on this",
           "AMAZING"
          ]
seq = [text_to_vect(r) for r in reviews]
seq = pad_sequences(seq, maxlen=max_sequence_length)

predictions = model.predict(seq)
for rev, pred in zip(reviews, predictions):
    print(f"Review: {rev}\nPrediction: {'Positive' if pred > 0.5 else 'Negative'} ({pred})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Review: This movie was fantastic! Masterpiece!!!! Best thing ever.
Prediction: Negative ([0.31264302])

Review: this was a fucking disaster. shame on anyone who worked on this
Prediction: Negative ([0.30045286])

Review: AMAZING
Prediction: Negative ([0.29550177])



## RNN model

In [67]:
model2 = models.Sequential([
    layers.Embedding(input_dim=max_words, output_dim=128),
    layers.Bidirectional(layers.LSTM(64, return_sequences=False)),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

hist = model2.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=1)

test_loss, test_acc = model2.evaluate(x_test, y_test, verbose=2)
print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 365ms/step - accuracy: 0.6687 - loss: 0.5680 - val_accuracy: 0.8550 - val_loss: 0.3300
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m224s[0m 358ms/step - accuracy: 0.8900 - loss: 0.2871 - val_accuracy: 0.8676 - val_loss: 0.3380
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 329ms/step - accuracy: 0.8827 - loss: 0.2879 - val_accuracy: 0.8636 - val_loss: 0.3495
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 324ms/step - accuracy: 0.9497 - loss: 0.1470 - val_accuracy: 0.8576 - val_loss: 0.4152
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 328ms/step - accuracy: 0.9656 - loss: 0.1055 - val_accuracy: 0.8616 - val_loss: 0.3961
782/782 - 93s - 119ms/step - accuracy: 0.8619 - loss: 0.4021
Test Accuracy: 0.8619


In [112]:
reviews = ["This movie was fantastic! Masterpiece!!!! Best thing ever.",
           "this was a fucking disaster. shame on anyone who worked on this",
           "AMAZING"
          ]
seq = [text_to_vect(r) for r in reviews]
seq = pad_sequences(seq, maxlen=max_sequence_length)

predictions = model2.predict(seq)
for rev, pred in zip(reviews, predictions):
    print(f"Review: {rev}\nPrediction: {'Positive' if pred > 0.5 else 'Negative'} ({pred})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Review: This movie was fantastic! Masterpiece!!!! Best thing ever.
Prediction: Positive ([0.6410363])

Review: this was a fucking disaster. shame on anyone who worked on this
Prediction: Negative ([0.3016865])

Review: AMAZING
Prediction: Positive ([0.81673086])



In [114]:
model_dir = "../../models"
model.save(f'{model_dir}/rnn_imdb_sent_analyser.keras')