In [1]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers

import tensorflow_datasets as tfds

from datasets import load_dataset
from transformers import AutoTokenizer

tf.version.VERSION

'2.6.0'

In [2]:
dataset = tfds.load('imdb_reviews')
train_ds, test_ds = dataset['train'], dataset['test']

INFO:absl:No config specified, defaulting to first: imdb_reviews/plain_text
INFO:absl:Load dataset info from /home/kiddos/tensorflow_datasets/imdb_reviews/plain_text/1.0.0
INFO:absl:Field info.config_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.citation from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.splits from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.module_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Reusing dataset imdb_reviews (/home/kiddos/tensorflow_datasets/imdb_reviews/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset imdb_reviews for split None, from /home/kiddos/tensorflow_datasets/imdb_reviews/plain_text/1.0.0


In [3]:
train_ds.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=25000>

In [4]:
test_ds.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=25000>

In [5]:
def show_train_example():
    for entry in train_ds.take(3):
        print(entry['text'].numpy().decode('utf-8'))
        print()


show_train_example()

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.

I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was con

In [6]:
# Hyperparameters
BATCH_SIZE = 32
FF_SIZE = 32
NUM_HEADS = 2
BUFFER_SIZE = 20000
EMBEDDING_SIZE = 32
DROPOUT = 0.1
EPOCHS = 5

In [7]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [8]:
def tokenize(tensor):
    s = tensor.numpy().decode('utf-8')
    result = tokenizer(s, truncation=True, padding='max_length')
    return tf.constant(result['input_ids'])


def preprocess(entry):
    encoded = tf.py_function(tokenize, [entry['text']], Tout=tf.int32)
    return encoded, entry['label']


def show_encoded(ds):
    for example in encoded_ds.take(2):
        ids, label = example
        print(ids.numpy())
        print(ids.numpy().shape)
        print(label.numpy())


encoded_ds = train_ds.map(preprocess)
show_encoded(encoded_ds)

[  101  1188  1108  1126  7284  6434  2523   119  1790   112   189  1129
 19615  1181  1107  1118  4978 10065  1424  1137  1847  5621  5570   119
  2695  1132  1632  5681   117  1133  1142  1538  2566  1129  1147  4997
  1648  1107  1607   119  2431  1147  1632  3176  1180  1136  1894  3051
  1306  1142  2523   112   188  9944  9844   119  1188  2523  1110  1126
  1346  2551  4338  1646 11516  2727   119  1109  1211 18970  4429  1127
  1343  1165  1103  3132  1179  9283  1127  1543  1147  2740  1111  8011
  1116   119  3406 16752  4313  1777 18918  1691   185  8613  1183   117
  1105  1123 23563   118  1567  7033  1114 10065  1424  1108  1720  1133
   170 18970  6438 15244  1107   170  2523  1115  1108 25755  1104  1251
  1842  2764   119   146  1821  9333  1115  1175  1132  5558  1176  1142
   117 10677  1158  2811   112   188  1176  4978 10065  1424   112   188
  1363  1271   119   146  1180  3742  3465  1194  1122   119   102     0
     0     0     0     0     0     0     0     0   

In [9]:
encoded_train_ds = train_ds.shuffle(BUFFER_SIZE).map(preprocess).batch(BATCH_SIZE)
encoded_test_ds = test_ds.map(preprocess).batch(BATCH_SIZE)

In [10]:
def embedding(x):
    vocab_size = tokenizer.vocab_size
    max_len = tokenizer.model_max_length
    tok_embedding = layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_SIZE)
    pos_embedding = layers.Embedding(input_dim=max_len, output_dim=EMBEDDING_SIZE)
    positions = tf.range(start=0, limit=max_len, delta=1)
    positions = pos_embedding(positions)
    x = tok_embedding(x)
    return x + positions


def ffn(x):
    x = layers.Dense(FF_SIZE, activation="relu")(x)
    x = layers.Dense(EMBEDDING_SIZE)(x)
    return x


def transformer(x):
    m = layers.MultiHeadAttention(EMBEDDING_SIZE, 8)(x, x)
    d = layers.Dropout(DROPOUT)(m)
    n = layers.LayerNormalization(epsilon=1e-6)(x + d)
    x = ffn(n)
    x = layers.Dropout(DROPOUT)(x)
    return layers.LayerNormalization(epsilon=1e-6)(x + n)


def create_model():
    max_len = tokenizer.model_max_length
    inputs = layers.Input(shape=(max_len), dtype=tf.int32)
    x = embedding(inputs)
    x = transformer(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(DROPOUT)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(DROPOUT)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs=[inputs], outputs=[outputs])
    optimizer = keras.optimizers.Adam(1e-3)
    
    model.compile(optimizer=optimizer,
                  loss=keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=['accuracy'])
    return model


tf.keras.backend.clear_session()
model = create_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 512, 32)      927872      input_1[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 512, 32)      0           embedding[0][0]                  
__________________________________________________________________________________________________
multi_head_attention (MultiHead (None, 512, 32)      33568       tf.__operators__.add[0][0]       
                                                                 tf.__operators__.add[0][0]   

In [11]:
history = model.fit(
    encoded_train_ds,
    epochs=EPOCHS,
    validation_data=encoded_test_ds)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
