In [11]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re
import urllib.request

In [12]:
print("Loading IMDB dataset from internet...")
url = "https://huggingface.co/datasets/nocode-ai/imdb-movie-reviews/resolve/main/IMDB%20Dataset.csv"
filename = "data/IMDB_Dataset.csv"
urllib.request.urlretrieve(url, filename)

df = pd.read_csv(filename)

df_subset = df.sample(n=5000, random_state=42).reset_index(drop=True)

# Clean text function
def clean_text(text):
    """Clean and preprocess text"""
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    text = text.lower().strip()
    return text

# Preprocess data
print("Preprocessing data...")
df_subset['review_clean'] = df_subset['review'].apply(clean_text)

# Map sentiment to binary
sentiment_mapping = {'positive': 1, 'negative': 0}
df_subset['sentiment_binary'] = df_subset['sentiment'].map(sentiment_mapping)

# Remove any null values
df_subset = df_subset.dropna()

print(f"Dataset summary:")
print(f"Total reviews: {len(df_subset)}")
print(f"Positive reviews: {sum(df_subset['sentiment_binary'])}")
print(f"Negative reviews: {len(df_subset) - sum(df_subset['sentiment_binary'])}")

# Prepare features and labels
X = df_subset['review_clean'].values
y = df_subset['sentiment_binary'].values

# Tokenize text
print("Tokenizing text...")
vocab_size = 5000
max_length = 200

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

# Convert to sequences
sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(sequences, maxlen=max_length, truncating='post')

print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Sequence length: {max_length}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

Loading IMDB dataset from internet...
Preprocessing data...
Dataset summary:
Total reviews: 5000
Positive reviews: 2519
Negative reviews: 2481
Tokenizing text...
Vocabulary size: 54467
Sequence length: 200
Training samples: 4000
Test samples: 1000


In [13]:
# Build LSTM model
print("Building LSTM model...")
lstm_model = Sequential([
    Embedding(vocab_size, 64, input_length=max_length),
    LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("\nLSTM Model Architecture:")
lstm_model.summary()

# Train LSTM model
print("\nTraining LSTM model...")
lstm_history = lstm_model.fit(
    X_train, y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
    verbose=1
)

Building LSTM model...

LSTM Model Architecture:



Training LSTM model...
Epoch 1/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 79ms/step - accuracy: 0.5155 - loss: 0.6928 - val_accuracy: 0.5840 - val_loss: 0.6821
Epoch 2/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 79ms/step - accuracy: 0.6686 - loss: 0.6441 - val_accuracy: 0.7160 - val_loss: 0.5821
Epoch 3/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 77ms/step - accuracy: 0.8057 - loss: 0.4806 - val_accuracy: 0.7130 - val_loss: 0.5725
Epoch 4/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 77ms/step - accuracy: 0.8823 - loss: 0.3414 - val_accuracy: 0.7430 - val_loss: 0.5828
Epoch 5/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 75ms/step - accuracy: 0.9004 - loss: 0.2818 - val_accuracy: 0.7160 - val_loss: 0.6184


In [14]:
# Build Simple RNN model for comparison
print("\n" + "="*50)
print("Building Simple RNN model for comparison...")
from tensorflow.keras.layers import SimpleRNN

rnn_model = Sequential([
    Embedding(vocab_size, 64, input_length=max_length),
    SimpleRNN(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("\nSimple RNN Model Architecture:")
rnn_model.summary()

# Train RNN model
print("\nTraining Simple RNN model...")
rnn_history = rnn_model.fit(
    X_train, y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
    verbose=1
)


Building Simple RNN model for comparison...

Simple RNN Model Architecture:



Training Simple RNN model...
Epoch 1/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.5035 - loss: 0.7130 - val_accuracy: 0.5100 - val_loss: 0.6936
Epoch 2/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.5093 - loss: 0.6996 - val_accuracy: 0.5340 - val_loss: 0.6913
Epoch 3/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.5104 - loss: 0.7012 - val_accuracy: 0.5260 - val_loss: 0.6915
Epoch 4/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.5165 - loss: 0.6984 - val_accuracy: 0.5200 - val_loss: 0.6931
Epoch 5/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.5197 - loss: 0.6936 - val_accuracy: 0.5160 - val_loss: 0.6920


In [15]:
# Compare results
print("\n" + "="*50)
print("PERFORMANCE COMPARISON")
print("="*50)

# Evaluate both models
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test, y_test, verbose=0)
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test, y_test, verbose=0)

print(f"\nLSTM Model:")
print(f"  Test Accuracy: {lstm_accuracy:.4f}")
print(f"  Test Loss: {lstm_loss:.4f}")
print(f"  Final Training Accuracy: {lstm_history.history['accuracy'][-1]:.4f}")
print(f"  Final Validation Accuracy: {lstm_history.history['val_accuracy'][-1]:.4f}")

print(f"\nSimple RNN Model:")
print(f"  Test Accuracy: {rnn_accuracy:.4f}")
print(f"  Test Loss: {rnn_loss:.4f}")
print(f"  Final Training Accuracy: {rnn_history.history['accuracy'][-1]:.4f}")
print(f"  Final Validation Accuracy: {rnn_history.history['val_accuracy'][-1]:.4f}")

print(f"\nAccuracy Improvement: {((lstm_accuracy - rnn_accuracy) / rnn_accuracy * 100):+.1f}%")


PERFORMANCE COMPARISON

LSTM Model:
  Test Accuracy: 0.7160
  Test Loss: 0.6184
  Final Training Accuracy: 0.8947
  Final Validation Accuracy: 0.7160

Simple RNN Model:
  Test Accuracy: 0.5160
  Test Loss: 0.6920
  Final Training Accuracy: 0.5205
  Final Validation Accuracy: 0.5160

Accuracy Improvement: +38.8%


In [16]:

# Prediction functions for both models
def predict_sentiment_lstm(text):
    """Predict sentiment using LSTM model"""
    cleaned_text = clean_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(sequence, maxlen=max_length, truncating='post')
    prediction = lstm_model.predict(padded, verbose=0)[0][0]
    
    confidence = prediction if prediction > 0.5 else 1 - prediction
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    return f"{sentiment} (confidence: {confidence:.3f})"

def predict_sentiment_rnn(text):
    """Predict sentiment using Simple RNN model"""
    cleaned_text = clean_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(sequence, maxlen=max_length, truncating='post')
    prediction = rnn_model.predict(padded, verbose=0)[0][0]
    
    confidence = prediction if prediction > 0.5 else 1 - prediction
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    return f"{sentiment} (confidence: {confidence:.3f})"

# Test both models on the same examples
print("\n" + "="*50)
print("PREDICTION COMPARISON")
print("="*50)

# Test cases that highlight LSTM's advantages (long sequences, context changes)
comparison_cases = [
    "This movie is absolutely amazing and fantastic!",
    "This movie is terrible and awful",
    "The movie started very badly with poor acting but gradually improved and became absolutely amazing by the end",
    "Although the film had excellent cinematography and great visual effects, the confusing plot and terrible acting ruined the entire experience",
    "Despite having a slow and boring beginning, the second half was incredible with fantastic performances and brilliant storytelling",
    "The movie began with promise and good acting but became increasingly disappointing with poor direction and awful ending"
]

print(f"{'Review':<80} {'LSTM Prediction':<25} {'RNN Prediction':<25}")
print("-" * 130)

for text in comparison_cases:
    lstm_result = predict_sentiment_lstm(text)
    rnn_result = predict_sentiment_rnn(text)
    
    # Truncate text for display
    display_text = text[:77] + "..." if len(text) > 80 else text
    print(f"{display_text:<80} {lstm_result:<25} {rnn_result:<25}")

# Test on actual examples from dataset
print(f"\nTesting on actual dataset examples:")
print(f"{'Review':<80} {'Actual':<10} {'LSTM':<25} {'RNN':<25}")
print("-" * 140)

for i in range(3):
    actual_text = df_subset.iloc[i]['review'][:77] + "..."
    actual_sentiment = "Positive" if df_subset.iloc[i]['sentiment_binary'] == 1 else "Negative"
    lstm_pred = predict_sentiment_lstm(df_subset.iloc[i]['review'])
    rnn_pred = predict_sentiment_rnn(df_subset.iloc[i]['review'])
    
    print(f"{actual_text:<80} {actual_sentiment:<10} {lstm_pred:<25} {rnn_pred:<25}")

print(f"\n" + "="*50)
print("SUMMARY")
print("="*50)
print(f"LSTM shows superior performance due to:")
print(f"  ✓ Long-term memory (avoids vanishing gradient)")
print(f"  ✓ Better handling of long sequences")
print(f"  ✓ Context understanding across sentence")
print(f"  ✓ Forget gate mechanism")
print(f"\nSimple RNN struggles with:")
print(f"  ✗ Vanishing gradient in long sequences")
print(f"  ✗ Poor long-term memory")
print(f"  ✗ Context loss in complex sentences")

print(f"\nTraining completed!")
print(f"LSTM Final Accuracy: {lstm_history.history['accuracy'][-1]:.4f}")
print(f"RNN Final Accuracy: {rnn_history.history['accuracy'][-1]:.4f}")


PREDICTION COMPARISON
Review                                                                           LSTM Prediction           RNN Prediction           
----------------------------------------------------------------------------------------------------------------------------------
This movie is absolutely amazing and fantastic!                                  Positive (confidence: 0.922) Positive (confidence: 0.507)
This movie is terrible and awful                                                 Negative (confidence: 0.937) Negative (confidence: 0.509)
The movie started very badly with poor acting but gradually improved and beca... Negative (confidence: 0.926) Positive (confidence: 0.519)
Although the film had excellent cinematography and great visual effects, the ... Negative (confidence: 0.988) Positive (confidence: 0.516)
Despite having a slow and boring beginning, the second half was incredible wi... Positive (confidence: 0.936) Positive (confidence: 0.509)
The movie began wi