In [1]:
# Import needed libraries
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

In [2]:
# Load the IMDB review dataset, which ships already divided into train and test splits.
# num_words=max_words keeps only the most frequent `max_words` words.
max_words = 30000
max_len = 200  # NOTE(review): not referenced by any code visible in this part of the notebook

(train_sequence, train_labels), (test_sequence, test_labels) = keras.datasets.imdb.load_data(
    num_words=max_words
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
def preprocess(sequences, labels):
  """Return the sequences untouched together with the labels cast to int32.

  Args:
    sequences: passed through as-is (no padding or truncation happens here).
    labels: object exposing ``astype`` (e.g. a NumPy array).

  Returns:
    Tuple ``(sequences, labels_as_int32)``.
  """
  int_labels = labels.astype(np.int32)
  return sequences, int_labels

# Cast the labels of both splits to int32; the sequences themselves are returned unchanged.
train_sequence, train_labels = preprocess(train_sequence, train_labels)
test_sequence, test_labels = preprocess(test_sequence, test_labels)

In [4]:
# Word -> index vocabulary published alongside the IMDB dataset.
# NOTE: per the Keras docs, sequences returned by imdb.load_data are shifted by
# `index_from` (default 3) relative to this mapping, so the inverse lookup below
# needs that offset applied before it can decode the loaded sequences.
vocab = keras.datasets.imdb.get_word_index()
char_to_ind = vocab  # alias of the same dict; despite the name, the keys are words
ind_to_char = dict(zip(vocab.values(), vocab.keys()))  # inverse mapping: index -> word

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [5]:
# Python generator used as the source for tf.data.Dataset.from_generator.
def gen():
  """Yield ``(sequence, label)`` pairs from the training split, one example at a time."""
  yield from zip(train_sequence, train_labels)

In [6]:
# Wrap the Python generator in a tf.data.Dataset, then batch it with padded_batch so that
# the variable-length sequences in each batch of 32 are padded to a common length.
train_data = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int32),  # variable-length sequence
        tf.TensorSpec(shape=(), dtype=tf.int32),       # scalar label
    ),
).padded_batch(32)

# Uncomment to inspect the batch shapes:
# for sequence,label in train_data:
#   print(sequence.shape,label.shape)

In [7]:
# Rebuild the unbatched dataset: the previous cell left train_data already batched.
train_data = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int32),  # variable-length sequence
        tf.TensorSpec(shape=(), dtype=tf.int32),       # scalar label
    ),
)

# Alternatively, we can use bucketing: examples are grouped by sequence length so that
# each batch is made of sequences of similar length.
buckets = list(range(50, 501, 50))  # boundaries 50, 100, ..., 500
bucket_batch_size = [32] * (len(buckets) + 1)  # one batch size per bucket (boundaries + 1)
train_data = train_data.bucket_by_sequence_length(
    lambda tokens, _label: tf.shape(tokens)[0],  # element length = number of tokens
    bucket_boundaries=buckets,
    bucket_batch_sizes=bucket_batch_size,
)

In [8]:
# Build the binary sentiment classifier:
#   Embedding (index 0 is masked as padding) -> two stacked LSTMs -> one output unit.
# The output layer uses a sigmoid so the model emits a probability in [0, 1]. The
# string loss 'binary_crossentropy' (default from_logits=False) and the 'accuracy'
# metric both expect probabilities; a linear Dense(1) would feed them raw logits.
model = keras.Sequential([
    keras.layers.Embedding(max_words, 20, mask_zero=True),
    keras.layers.LSTM(12, return_sequences=True),  # full sequence feeds the next LSTM
    keras.layers.LSTM(15),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [9]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the metric.
# Passed by name, 'binary_crossentropy' uses its default from_logits=False, i.e. it
# expects the model output to be a probability.
model.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [10]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 20)          600000    
                                                                 
 lstm (LSTM)                 (None, None, 12)          1584      
                                                                 
 lstm_1 (LSTM)               (None, 15)                1680      
                                                                 
 dense (Dense)               (None, 1)                 16        
                                                                 
Total params: 603280 (2.30 MB)
Trainable params: 603280 (2.30 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# Fit the model on the training dataset.
# train_data is a tf.data.Dataset that already yields batches, so batch_size is not
# passed to fit (Keras documents that it must not be specified for dataset inputs).
# NOTE(review): train_data is not repeated, so it must be able to supply
# epochs * steps_per_epoch batches in total; presumably 5 * 150 fits within one pass
# over the training split - confirm before raising either value.
model.fit(train_data, epochs=5, verbose=1, steps_per_epoch=150)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7dbc5669ff10>