In [1]:
import io
import numpy as np
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

2023-04-19 11:20:49.219301: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Source archive for the IMDB movie-review sentiment dataset.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download into the current directory (no cache sub-folder) and unpack it.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir=".",
    cache_subdir="",
)

# The archive unpacks into an "aclImdb" folder next to the downloaded file.
dataset_dir = os.path.join(os.path.dirname(dataset), "aclImdb")
os.listdir(dataset_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


['README', 'imdbEr.txt', 'imdb.vocab', 'test', 'train']

In [4]:
# Path to the training split; list it to see which sub-folders it contains.
train_dir = os.path.join(dataset_dir, "train")
os.listdir(train_dir)

['labeledBow.feat',
 'urls_neg.txt',
 'neg',
 'pos',
 'urls_unsup.txt',
 'unsup',
 'unsupBow.feat',
 'urls_pos.txt']

In [5]:
# Drop the "unsup" folder from the training directory so that only the
# remaining class folders are picked up when the dataset is built.
remove_dir = os.path.join(train_dir, "unsup")
# Guard the removal so that re-running this cell (after the folder is already
# gone) does not raise FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [6]:
batch_size = 1024
seed = 123

# Options shared by both subsets. The training and validation calls must agree
# on `validation_split` and `seed` so that both are taken from the same split
# of the files.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

# Read from `train_dir` rather than a hard-coded "aclImdb/train" string so this
# cell always points at the directory that was downloaded and cleaned earlier.
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset="training", **split_options
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset="validation", **split_options
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


2023-04-19 11:43:52.256256: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-19 11:43:52.277157: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-19 11:43:52.277306: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-19 11:43:52.277745: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [7]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each dataset after its first pass and prefetch batches, letting
# tf.data choose the prefetch buffer size.
train_ds, val_ds = (
    split.cache().prefetch(buffer_size=AUTOTUNE) for split in (train_ds, val_ds)
)

In [8]:
# Custom standardization used by the text vectorization layers.
def custom_standardization(input_data):
    """Lowercase the text, replace '<br />' tags with a space, drop punctuation.

    Args:
        input_data: String tensor of raw review text.

    Returns:
        String tensor with the same content after the three cleaning steps.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = "[%s]" % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_pattern, "")
    return text

In [9]:
# Size of the vocabulary and number of tokens kept per review.
vocab_size = 10000
sequence_length = 100

# The text vectorization layer normalizes, splits, and maps strings to
# integers, using the custom standardization defined above.
# `output_sequence_length` is set because the reviews are not all the same
# length.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training text only (labels dropped).
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [10]:
# Length of each token's embedding vector.
embedding_dim = 16

# One trainable row of `embedding_dim` values per vocabulary index.
text_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    name="embedding",
)

In [11]:
# Input stage: raw strings -> token ids -> embedding vectors.
text_input = tf.keras.Sequential(
    layers=[vectorize_layer, text_embedding],
    name="text_input",
)

# Classification stage: average over the sequence, then two dense layers.
# The final Dense(1) has no activation, so it outputs a single logit.
classifier_head = tf.keras.Sequential(
    layers=[
        GlobalAveragePooling1D(),
        Dense(16, activation="relu"),
        Dense(1),
    ],
    name="classifier_head",
)

# Full model: input stage followed by the classifier head.
model = tf.keras.Sequential(layers=[text_input, classifier_head])

In [12]:
# TensorBoard callback configured to write its logs under the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [13]:
# The classifier head ends in Dense(1) without an activation, so the loss is
# told to expect raw logits.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=["accuracy"],
)

In [14]:
# Train the base model, evaluating on the validation split every epoch and
# logging through the TensorBoard callback.
model.fit(
    x=train_ds,
    epochs=15,
    validation_data=val_ds,
    callbacks=[tensorboard_callback],
)

Epoch 1/15


2023-04-19 13:02:57.323581: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f03a420e9d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-04-19 13:02:57.323599: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2023-04-19 13:02:57.326655: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.
2023-04-19 13:02:57.401794: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.


 1/20 [>.............................] - ETA: 25s - loss: 0.6932 - accuracy: 0.4961


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.


 3/20 [===>..........................] - ETA: 2s - loss: 0.6931 - accuracy: 0.5124


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.





You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.





You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.





You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.





You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.





You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.





You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.





You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.





You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f04a50a0e80>

In [17]:
# Vocabulary remapping: warm-start the embedding matrix for a larger vocabulary

In [15]:
# Snapshot the trained embedding matrix from the base model ...
embedding_layer_base = model.get_layer("text_input").get_layer("embedding")
embedding_weights_base = embedding_layer_base.get_weights()[0]

# ... and the vocabulary whose indices address that matrix's rows.
vocab_base = vectorize_layer.get_vocabulary()

In [16]:
# Larger vocabulary for the second vectorization layer; the sequence length
# stays the same as before.
vocab_size_new = 10200
sequence_length = 100

vectorize_layer_new = TextVectorization(
    max_tokens=vocab_size_new,
    standardize=custom_standardization,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Build the new vocabulary from the training text only (labels dropped).
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer_new.adapt(text_ds)

# Vocabulary learned by the new layer.
vocab_new = vectorize_layer_new.get_vocabulary()

In [18]:
# View the new vocabulary tokens that weren't in `vocab_base`.
# Use a set difference, not the symmetric difference (`^`): the latter would
# also list any tokens that exist only in `vocab_base`.
set(vocab_new) - set(vocab_base)

{'bullying',
 'bumps',
 'canvas',
 'carole',
 'chains',
 'chairman',
 'checks',
 'coarse',
 'competitive',
 'component',
 'compound',
 'confirm',
 'contemplate',
 'coping',
 'corporations',
 'costuming',
 'counterpart',
 'crop',
 'custody',
 'cyborgs',
 'daft',
 'danced',
 'daphne',
 'darkest',
 'davids',
 'december',
 'declared',
 'defence',
 'delve',
 'demonstration',
 'dense',
 'denver',
 'devilish',
 'devious',
 'dickinson',
 'digs',
 'directorwriter',
 'download',
 'effortless',
 'electricity',
 'elliot',
 'enlightenment',
 'erratic',
 'exceedingly',
 'eyeballs',
 'fearless',
 'fenton',
 'fiennes',
 'filter',
 'fireworks',
 'flipping',
 'float',
 'foggy',
 'forgivable',
 'framework',
 'fulllength',
 'funds',
 'gamut',
 'geeks',
 'glee',
 'goo',
 'gripe',
 'hardest',
 'harmony',
 'henchman',
 'heritage',
 'hg',
 'hi',
 'hightech',
 'homework',
 'houston',
 'howards',
 'hunger',
 'imho',
 'immigrants',
 'improvised',
 'impulse',
 'inch',
 'interpret',
 'intimidating',
 'iowa',
 'jaf

In [19]:
# Build the embedding matrix for the new vocabulary from the trained base
# embeddings; "uniform" is the initializer passed for embeddings that have no
# counterpart in the base vocabulary.
updated_embedding = tf.keras.utils.warmstart_embedding_matrix(
    new_vocabulary=vocab_new,
    base_vocabulary=vocab_base,
    base_embeddings=embedding_weights_base,
    new_embeddings_initializer="uniform",
)

# Wrap the remapped matrix in a `tf.Variable`.
updated_embedding_variable = tf.Variable(updated_embedding)

In [20]:
# Inspect the shape of the remapped embedding matrix.
updated_embedding_variable.shape

TensorShape([10200, 16])

In [21]:
# Embedding layer sized to the new vocabulary.
text_embedding_layer_new = Embedding(
    input_dim=vectorize_layer_new.vocabulary_size(),
    output_dim=embedding_dim,
    name="embedding",
)
# Build the layer first so its `embeddings` weight exists, then overwrite that
# weight with the warm-started matrix.
text_embedding_layer_new.build(input_shape=[None])
text_embedding_layer_new.embeddings.assign(updated_embedding)

# New input stage: new vectorizer followed by the warm-started embedding.
text_input_new = tf.keras.Sequential(
    layers=[vectorize_layer_new, text_embedding_layer_new],
    name="text_input_new",
)
text_input_new.summary()

# Check the shape of the updated weights: the first dimension should match
# the new vocabulary size.
text_input_new.get_layer("embedding").get_weights()[0].shape

Model: "text_input_new"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 100)              0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 100, 16)           163200    
                                                                 
Total params: 163,200
Trainable params: 163,200
Non-trainable params: 0
_________________________________________________________________


(10200, 16)

In [22]:
# Combine the new input stage with the existing `classifier_head` object, so
# the head is shared with the base model rather than re-created.
warm_started_model = tf.keras.Sequential(
    layers=[text_input_new, classifier_head]
)
warm_started_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_input_new (Sequential)  (None, 100, 16)          163200    
                                                                 
 classifier_head (Sequential  (None, 1)                289       
 )                                                               
                                                                 
Total params: 163,489
Trainable params: 163,489
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Check that a common token ("the") kept its trained embedding after the
# remapping: look up its index under each vectorizer, then compare the
# matching embedding rows element-wise.
base_vocab_index = vectorize_layer("the")[0]
new_vocab_index = vectorize_layer_new("the")[0]

embedding_layer_new = warm_started_model.get_layer("text_input_new").get_layer(
    "embedding"
)
print(
    embedding_layer_new(new_vocab_index)
    == embedding_weights_base[base_vocab_index]
)

tf.Tensor(
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True], shape=(16,), dtype=bool)


In [24]:
# Continue training with the warm-started model. The original cell compiled
# and fitted `model` again, which left `warm_started_model` (new vocabulary and
# remapped embeddings) untrained.
# The loss expects logits because the shared classifier head ends in Dense(1)
# without an activation.
warm_started_model.compile(
    optimizer="adam",
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
warm_started_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f04a42ea040>