Exploring sentiment analysis using LSTMs

In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding
from tensorflow.keras.regularizers import l2
# Using IMDB dataset for sentiment analysis
from tensorflow.keras.datasets import imdb

In [71]:
max_words = 10000  # Number of unique words to keep
max_len = 100  # Maximum review length

# Load the dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_words)

In [72]:
# Decode sample review
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}
decoded_review = " ".join([reverse_word_index.get(i - 3, "?") for i in X_train[0]])

print(f"Sample review: {decoded_review}")
print(f"Label: {'Positive' if y_train[0] == 1 else 'Negative'}")

Sample review: ? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have don

In [73]:
# Pad sequences (LSTM expects a fixed input)
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

Possible overfitting: the significant gap between training and validation accuracy suggests model may be overfitting, trying dropout for regularization, early stopping, L2 norm, and reducing model complexity

In [74]:
# Build the model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len, input_shape=(max_len,)),  # Explicit input shape
    LSTM(64, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

model.save('sentiment_model.h5')

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 51ms/step - accuracy: 0.6413 - loss: 1.2078 - val_accuracy: 0.8136 - val_loss: 0.4709
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 52ms/step - accuracy: 0.8419 - loss: 0.4375 - val_accuracy: 0.8228 - val_loss: 0.4435
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 52ms/step - accuracy: 0.8722 - loss: 0.3758 - val_accuracy: 0.8398 - val_loss: 0.4176
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 50ms/step - accuracy: 0.8921 - loss: 0.3324 - val_accuracy: 0.8128 - val_loss: 0.4793
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 49ms/step - accuracy: 0.8929 - loss: 0.3367 - val_accuracy: 0.8312 - val_loss: 0.4343




In [85]:
# Test the model on the IMDB test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
print(f"Test Loss: {test_loss:.4f}")



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.8389 - loss: 0.4196
Test Accuracy: 83.77%
Test Loss: 0.4181


In [None]:
# Tokenize the custom text
texts = [
    "I had an amazing experience at the park, the weather was perfect!",
    "The movie was really boring and I almost fell asleep."
]

# Recreate the tokenizer with the same settings as during training
tokenizer = Tokenizer(num_words=max_words)

# Decode X_train indices back to raw text
decoded_reviews = [" ".join([reverse_word_index.get(i - 3, "?") for i in review]) for review in X_train]

# Fit the tokenizer on the decoded text
tokenizer.fit_on_texts(decoded_reviews)

# Now you can tokenize custom text the same way as during training
for text in texts:
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    # Get model prediction (output probability before sigmoid)
    raw_pred = model.predict(padded_sequence)
    
    # Convert raw prediction to sentiment
    sentiment = "Positive" if raw_pred > 0.5 else "Negative"
    confidence = raw_pred[0][0]

    # Print the results
    print(f"Text: {text}")
    print(f"Raw Prediction: {raw_pred}")
    print(f"Prediction (Sentiment): {sentiment} | Confidence: {confidence}")
    print("-" * 60)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Text: I had an amazing experience at the park, the weather was perfect!
Raw Prediction: [[0.866734]]
Prediction (Sentiment): Positive | Confidence: 0.8667340278625488
------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Text: The movie was really boring and I almost fell asleep.
Raw Prediction: [[0.40936]]
Prediction (Sentiment): Negative | Confidence: 0.40935999155044556
------------------------------------------------------------
