In [2]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Limit the vocabulary size (e.g., top 10,000 most frequent words)
vocab_size = 10000

# Load IMDB data
# train_data, test_data are sequences of word indices
(train_sequences, train_labels), (test_sequences, test_labels) = datasets.imdb.load_data(num_words=vocab_size)

print("Train samples:", len(train_sequences))
print("Test samples:", len(test_sequences))
print("Example sequence length:", len(train_sequences[0]))
print("Label example:", train_labels[0])  # 0 = negative, 1 = positive


Train samples: 25000
Test samples: 25000
Example sequence length: 218
Label example: 1


In [8]:
max_len = 250  # max number of words per review

train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
test_padded  = pad_sequences(test_sequences,  maxlen=max_len, padding='post', truncating='post')

print("Padded train shape:", train_padded.shape)  
print("Padded test shape:", test_padded.shape)    


Padded train shape: (25000, 250)
Padded test shape: (25000, 250)


In [9]:
embedding_dim = 128
max_len = 400      # whatever you used in pad_sequences
vocab_size = 10000 # same as in imdb.load_data(num_words=...)

model = models.Sequential([
    # Explicit input shape: sequence of length max_len
    layers.Input(shape=(max_len,), dtype='int32'),

    # Embedding: turns word indices into dense vectors
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),

    # LSTM layer
    layers.LSTM(64),

    # Optional dense layer for more capacity
    layers.Dense(32, activation='relu'),

    # Output: 1 unit with sigmoid for binary classification
    layers.Dense(1, activation='sigmoid')
])

model.summary()


In [10]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)


In [15]:
history = model.fit(
    train_padded,
    train_labels,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
    verbose=2
)


Epoch 1/10
157/157 - 42s - 265ms/step - accuracy: 0.9924 - loss: 0.0301 - val_accuracy: 0.8342 - val_loss: 0.6793
Epoch 2/10
157/157 - 42s - 265ms/step - accuracy: 0.9912 - loss: 0.0293 - val_accuracy: 0.8386 - val_loss: 0.7046
Epoch 3/10
157/157 - 42s - 265ms/step - accuracy: 0.9943 - loss: 0.0205 - val_accuracy: 0.7932 - val_loss: 0.8825
Epoch 4/10
157/157 - 41s - 258ms/step - accuracy: 0.9905 - loss: 0.0295 - val_accuracy: 0.8468 - val_loss: 0.8538
Epoch 5/10
157/157 - 42s - 266ms/step - accuracy: 0.9955 - loss: 0.0158 - val_accuracy: 0.8278 - val_loss: 0.8193
Epoch 6/10
157/157 - 41s - 259ms/step - accuracy: 0.9946 - loss: 0.0188 - val_accuracy: 0.8330 - val_loss: 0.7949
Epoch 7/10
157/157 - 43s - 271ms/step - accuracy: 0.9966 - loss: 0.0120 - val_accuracy: 0.8284 - val_loss: 0.8324
Epoch 8/10
157/157 - 48s - 303ms/step - accuracy: 0.9955 - loss: 0.0149 - val_accuracy: 0.8096 - val_loss: 0.8483
Epoch 9/10
157/157 - 47s - 299ms/step - accuracy: 0.9992 - loss: 0.0042 - val_accuracy: 

In [16]:
test_loss, test_acc = model.evaluate(test_padded, test_labels, verbose=0)
print(f"Test accuracy: {test_acc:.4f}")
print(f"Test loss: {test_loss:.4f}")


Test accuracy: 0.8275
Test loss: 1.1009
