In [25]:
import keras
from keras import layers
from keras import ops

batch_size = 32

# The training and validation subsets are carved out of the same directory.
# They must share `validation_split` and `seed` so the two subsets are
# complementary, hence the single shared options dict.
split_options = dict(validation_split=0.2, seed=123, batch_size=batch_size)

train_ds = keras.utils.text_dataset_from_directory(
    '../../aclImdb/train/', subset="training", **split_options)

val_ds = keras.utils.text_dataset_from_directory(
    '../../aclImdb/train/', subset="validation", **split_options)

# The test set lives in its own directory and is used in full (no split).
test_ds = keras.utils.text_dataset_from_directory(
    '../../aclImdb/test/',
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [26]:
max_tokens = 20000  # upper bound on the vocabulary size
max_length = 200  # length of every integer sequence the layer emits

vectorization_layer = keras.layers.TextVectorization(
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=max_length,
)

# Build the vocabulary from the training texts only; labels are dropped first.
train_texts = train_ds.map(lambda text, label: text)
vectorization_layer.adapt(train_texts)


def vectorize_batch(texts, labels):
    """Replace a batch of raw strings by integer token ids; labels pass through."""
    return vectorization_layer(texts), labels


# Integer-encoded versions of the three datasets.
train_ds_int = train_ds.map(vectorize_batch)
val_ds_int = val_ds.map(vectorize_batch)
test_ds_int = test_ds.map(vectorize_batch)

In [27]:
# Sanity check: show the first encoded review of one batch and its label.
for x, y in train_ds_int.take(1):
    first_tokens, first_label = x[0], y[0]
    print(first_tokens)
    print(first_label)

tf.Tensor(
[ 1582  2643  4133     8     4  9746   678     8    16   543    21    25
   927   259     6    94   272     5    49   226    41   221  2138     4
  3644  2209     8  5133     1     2     1  3725     6   702    41   561
    21   288    34   312  7729  1307  9392    21     2  5741     5    25
  7880     8     2 13496     5     2   319    19    15    28  2388     9
  8661    46     5  1134    15    25  1312 13174    80  5528  7197  3720
    15    45     9   146   174     6    27  3205    16     2   257     5
   213     2  1349   131  1032     7  1676     8     2  5269     5  1980
     1    13    11   319  2709  2210     2  1275     5     2    20    15
    74    15  2097 17473  2422     6   161    46    87    28   841  1582
  2643  6213     7    78    34  5207     5     2   403  2313     5     2
 10814   596     6  2686   171     8     2  5269     5 16774     3     1
    13     9     7    78   235     2   115    20   475    66     8     2
   437    11   333    13    13    30    

In [28]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        sequence_length: Number of rows in the position-embedding table, i.e.
            the largest sequence length the layer can embed.
        input_dim: Number of rows in the token-embedding table (vocabulary size).
        output_dim: Dimensionality of both embeddings.
        **kwargs: Standard layer arguments (e.g. `name`, `dtype`) forwarded to
            `layers.Layer`.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        # Forward **kwargs so callers can set name/dtype and so the layer can
        # be rebuilt from the config returned by `get_config`.
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.token_embeddings = layers.Embedding(input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=output_dim)

    def call(self, inputs):
        """Embed `inputs` (token ids) and add the embedding of each position."""
        # Positions are 0..length-1 along the last axis of `inputs`.
        length = ops.shape(inputs)[-1]
        positions = ops.arange(start=0, stop=length, step=1)
        embedded_positions = self.position_embeddings(positions)
        embedded_tokens = self.token_embeddings(inputs)
        # The position embeddings broadcast over the leading axes of the tokens.
        return embedded_tokens + embedded_positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
            "output_dim": self.output_dim,
        })
        return config


In [29]:
class TransformerEncoder(layers.Layer):
    """Transformer encoder block: self-attention plus a feed-forward projection.

    Each of the two sub-layers is followed by a residual connection and a
    layer normalization.

    Args:
        embed_dim: Size of the last axis of the input and of the output; also
            passed as `key_dim` to the attention layer.
        dense_dim: Number of units in the hidden (ReLU) layer of the
            feed-forward projection.
        num_heads: Number of attention heads.
        **kwargs: Standard layer arguments (e.g. `name`, `dtype`) forwarded to
            `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        # Forward **kwargs so callers can set name/dtype and so the layer can
        # be rebuilt from the config returned by `get_config`.
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation='relu'),
             layers.Dense(embed_dim)])
        self.layernorm_1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        """Apply self-attention and the feed-forward projection to `inputs`."""
        # Self-attention: the same tensor is used as query and value.
        attention_output = self.attention(
            inputs, inputs)
        # First residual connection + normalization.
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        # Second residual connection + normalization.
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config
    

In [32]:
# Hyperparameters of the classifier.
vocab_size = max_tokens          # rows of the token-embedding table
sequence_length = max_length     # rows of the position-embedding table
embed_dim = 32                   # width of the embeddings / encoder output
num_heads = 2                    # attention heads in the encoder block
dense_dim = 32                   # hidden units of the encoder's feed-forward part

# Embedding -> one encoder block -> average over the sequence -> sigmoid output.
inputs = layers.Input(shape=(max_length,))
x = TokenAndPositionEmbedding(
    sequence_length=sequence_length,
    input_dim=vocab_size,
    output_dim=embed_dim,
)(inputs)
x = TransformerEncoder(
    embed_dim=embed_dim,
    dense_dim=dense_dim,
    num_heads=num_heads,
)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(rate=0.5)(x)
outputs = layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [33]:
# The datasets are already batched (the batch size was fixed when they were
# created), so `batch_size` is not passed here: Keras documents that it must
# not be specified when the input is a dataset.
history = model.fit(
    train_ds_int,
    epochs=5,
    validation_data=val_ds_int,
)


Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step - accuracy: 0.6804 - loss: 0.5688 - val_accuracy: 0.8656 - val_loss: 0.3108
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 21ms/step - accuracy: 0.9063 - loss: 0.2429 - val_accuracy: 0.8694 - val_loss: 0.3391
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.9476 - loss: 0.1503 - val_accuracy: 0.8596 - val_loss: 0.4830
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 32ms/step - accuracy: 0.9701 - loss: 0.0863 - val_accuracy: 0.8532 - val_loss: 0.6247
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 33ms/step - accuracy: 0.9862 - loss: 0.0471 - val_accuracy: 0.8538 - val_loss: 0.7047


In [34]:
# Score the trained model on the held-out test dataset. The displayed result
# is the loss followed by the metrics given to compile() (here: accuracy).
model.evaluate(test_ds_int)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.8175 - loss: 0.8927


[0.8746194243431091, 0.8207600116729736]