In [7]:
pip install h5py

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tensorflow-datasets

Collecting tensorflow-datasets
  Obtaining dependency information for tensorflow-datasets from https://files.pythonhosted.org/packages/16/e0/657192dbc03636532ccbd5c90669d31a65187365b99ba685db36bb31dd67/tensorflow_datasets-4.9.9-py3-none-any.whl.metadata
  Downloading tensorflow_datasets-4.9.9-py3-none-any.whl.metadata (11 kB)
Collecting dm-tree (from tensorflow-datasets)
  Obtaining dependency information for dm-tree from https://files.pythonhosted.org/packages/35/3e/a46933e0157b0ac87619a754ce1a796b2afc6386fca7c11f95c010f40745/dm_tree-0.1.9-cp311-cp311-win_amd64.whl.metadata
  Downloading dm_tree-0.1.9-cp311-cp311-win_amd64.whl.metadata (2.5 kB)
Collecting etils[edc,enp,epath,epy,etree]>=1.9.1 (from tensorflow-datasets)
  Obtaining dependency information for etils[edc,enp,epath,epy,etree]>=1.9.1 from https://files.pythonhosted.org/packages/e7/98/87b5946356095738cb90a6df7b35ff69ac5750f6e783d5fbcc5cb3b6cbd7/etils-1.13.0-py3-none-any.whl.metadata
  Downloading etils-1.13.0-py3-none-any.wh

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

# 1. Import necessary libraries. (Done above)

# Load and preprocess the text dataset (IMDb)
# Pad sequences and prepare labels.
print("Loading and preparing IMDb dataset...")
(train_data, test_data), info = tfds.load(
    'imdb_reviews', split=[tfds.Split.TRAIN, tfds.Split.TEST], as_supervised=True, with_info=True
)

Loading and preparing IMDb dataset...


In [2]:
train_sentences = [sent.numpy().decode('utf8') for sent, _ in train_data]
train_labels_list = [label.numpy() for _, label in train_data]
test_sentences = [sent.numpy().decode('utf8') for sent, _ in test_data]
test_labels_list = [label.numpy() for _, label in test_data]

vocab_size = 10000
max_length = 100
embedding_dim = 64
oov_tok = '<OOV>'

tokenizer = keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)

train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

train_padded = keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

train_labels = np.array(train_labels_list)
test_labels = np.array(test_labels_list)
print("Data loaded, tokenized, and padded.")
print(f"Train data shape: {train_padded.shape}, Test data shape: {test_padded.shape}")

Data loaded, tokenized, and padded.
Train data shape: (25000, 100), Test data shape: (25000, 100)


In [3]:
# 4. Build an RNN model with Embedding and SimpleRNN layers.
print("\nBuilding SimpleRNN model...")
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.SimpleRNN(embedding_dim),
    keras.layers.Dense(1, activation='sigmoid')
])
model.summary()


Building SimpleRNN model...
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           640000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                8256      
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 648,321
Trainable params: 648,321
Non-trainable params: 0
_________________________________________________________________


In [4]:
# 5. Compile the model with loss and optimizer.
print("\nCompiling model...")
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# 6. Train the model on training data.
print("\nTraining model...")
epochs = 10
history = model.fit(train_padded, train_labels, epochs=epochs,
                    validation_data=(test_padded, test_labels), verbose=1)
print("Model training complete.")


Compiling model...

Training model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model training complete.


In [5]:
# 7. Evaluate the model on test data.
print("\nEvaluating model on test data...")
loss, accuracy = model.evaluate(test_padded, test_labels, verbose=0)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")


Evaluating model on test data...
Test Loss: 1.6121, Test Accuracy: 0.5925


In [6]:
# 8. Predict sentiment for new inputs.
print("\nPredicting sentiment for new inputs...")
def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = keras.preprocessing.sequence.pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    prediction = model.predict(padded_sequence, verbose=0)[0][0]
    sentiment = "Positive" if prediction >= 0.5 else "Negative"
    return sentiment, prediction

new_reviews = [
    "This film was a breathtaking masterpiece, truly exceptional!",
    "Utterly dreadful, a complete waste of two hours.",
    "It was decent, but nothing to write home about.",
]

for i, review in enumerate(new_reviews):
    sentiment, score = predict_sentiment(review)
    print(f"Review {i+1}: \"{review}\"\n  -> Sentiment: {sentiment} (Score: {score:.4f})\n")

print("Experiment complete.")


Predicting sentiment for new inputs...
Review 1: "This film was a breathtaking masterpiece, truly exceptional!"
  -> Sentiment: Positive (Score: 0.9127)

Review 2: "Utterly dreadful, a complete waste of two hours."
  -> Sentiment: Positive (Score: 0.5658)

Review 3: "It was decent, but nothing to write home about."
  -> Sentiment: Positive (Score: 0.5661)

Experiment complete.
