In [1]:
import io
import os
import re
import string
import time

import tensorflow as tf
import tensorflow.keras as keras
from tensorboard.plugins import projector

2022-07-14 21:32:05.193082: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
def base_data_dir():
    """Return the dataset root: ``large_movie_review_dataset/aclImdb`` under the current directory."""
    path_parts = (os.curdir, "large_movie_review_dataset", "aclImdb")
    return os.path.join(*path_parts)

def train_data_dir():
    """Return the ``train`` subdirectory of the dataset root."""
    dataset_root = base_data_dir()
    return os.path.join(dataset_root, "train")

def test_data_dir():
    """Return the ``test`` subdirectory of the dataset root."""
    dataset_root = base_data_dir()
    return os.path.join(dataset_root, "test")

# Show what sits directly under the train directory, then the test directory.
for split_dir in (train_data_dir(), test_data_dir()):
    print(os.listdir(split_dir))

['pos', 'neg']
['pos', 'neg']


In [3]:
def configure_dataset(dataset):
    """Return ``dataset`` with ``cache()`` applied, followed by ``prefetch(tf.data.AUTOTUNE)``."""
    cached = dataset.cache()
    return cached.prefetch(tf.data.AUTOTUNE)

In [4]:
# Training data: everything under the train directory.
train_dataset = configure_dataset(
    keras.utils.text_dataset_from_directory(train_data_dir(), seed=42)
)

# The test directory is divided 60/40 with the same seed: the "validation"
# subset becomes the validation data and the "training" subset becomes the
# held-out test data.
validation_dataset = configure_dataset(
    keras.utils.text_dataset_from_directory(
        test_data_dir(),
        seed=42,
        validation_split=0.4,
        subset="validation",
    )
)
test_dataset = configure_dataset(
    keras.utils.text_dataset_from_directory(
        test_data_dir(),
        seed=42,
        validation_split=0.4,
        subset="training",
    )
)

Found 25000 files belonging to 2 classes.


2022-07-14 21:32:07.243607: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-14 21:32:07.648996: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22307 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:68:00.0, compute capability: 8.6


Found 25000 files belonging to 2 classes.
Using 10000 files for validation.
Found 25000 files belonging to 2 classes.
Using 15000 files for training.


In [5]:
# Print two (text, label) examples from each split, in the order
# train -> validation -> test. Each split is re-batched to size 2 and only
# the first batch is read.
for peeked_dataset in (train_dataset, validation_dataset, test_dataset):
    for text, label in peeked_dataset.unbatch().batch(2).take(1):
        print(text)
        print(label)

tf.Tensor(
[b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
 b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they get into complicat

2022-07-14 21:32:09.606221: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2022-07-14 21:32:09.641046: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2022-07-14 21:32:09.677461: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. I

In [6]:
def base_log_dir():
    """Return the log root: ``.tflogs`` under the current directory."""
    log_root = os.path.join(os.curdir, ".tflogs")
    return log_root

def get_tensorboard_cb(profile_batch=0):
    """Create a TensorBoard callback that logs to a fresh, timestamped run directory.

    The run directory is ``<base_log_dir()>/run_<YYYY_mm_dd_HH_MM_SS>``.
    As a side effect, a summary file writer for that directory is created
    and installed as the default writer.

    Args:
        profile_batch: Forwarded unchanged to ``keras.callbacks.TensorBoard``.

    Returns:
        The configured ``keras.callbacks.TensorBoard`` instance.
    """
    run_dir = os.path.join(base_log_dir(), time.strftime("run_%Y_%m_%d_%H_%M_%S"))
    tf.summary.create_file_writer(run_dir).set_as_default()
    return keras.callbacks.TensorBoard(log_dir=run_dir, profile_batch=profile_batch)

In [7]:
def custom_standardization(input_data):
    """Lowercase the text, replace ``<br />`` with a space, and delete ASCII punctuation.

    Args:
        input_data: String tensor to normalize.

    Returns:
        The normalized string tensor.
    """
    # Character class matching every character in string.punctuation.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

def create_model(dataset, vocab_size=10000, sequence_length=100, embedding_dim=16):
    """Build the (uncompiled) text classifier and fit its vocabulary on ``dataset``.

    Layer stack: text vectorization -> embedding -> global average pooling
    -> dense (ReLU) -> dense with one unit and no activation.

    Args:
        dataset: Dataset of two-element items; only the first element
            (the text) is used to adapt the vectorization layer.
        vocab_size: ``max_tokens`` of the vectorizer and the number of rows
            in the embedding table.
        sequence_length: Fixed output length of the vectorizer.
        embedding_dim: Embedding width, also used as the hidden dense width.

    Returns:
        A ``keras.models.Sequential`` model.
    """
    vectorizer = keras.layers.TextVectorization(
        name="text_vectorization",
        standardize=custom_standardization,
        output_mode="int",
        max_tokens=vocab_size,
        output_sequence_length=sequence_length,
    )

    # Drop the second element of every item so the vocabulary is learned
    # from the text alone.
    texts_only = dataset.map(lambda text, _label: text)
    vectorizer.adapt(texts_only)

    embedding = keras.layers.Embedding(vocab_size, embedding_dim, name="embedding")
    pooling = keras.layers.GlobalAveragePooling1D()
    hidden = keras.layers.Dense(embedding_dim, activation="relu")
    output = keras.layers.Dense(1)

    return keras.models.Sequential([vectorizer, embedding, pooling, hidden, output])
            
def train_model(model, train_dataset, validation_dataset, epochs=15):
    """Compile ``model`` for binary classification and fit it.

    The model is compiled with the Adam optimizer, binary cross-entropy
    computed from logits, and an accuracy metric. Training logs go to
    TensorBoard through a fresh callback from ``get_tensorboard_cb()``.

    Args:
        model: Keras model to compile and train; modified in place.
        train_dataset: Dataset passed to ``model.fit`` as training data.
        validation_dataset: Dataset passed to ``model.fit`` as
            ``validation_data``.
        epochs: Number of training epochs. Defaults to 15, the value that
            was previously hard-coded.

    Returns:
        None.
    """
    model.compile(
        optimizer="adam",
        # from_logits=True tells the loss to treat the model output as a raw logit.
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

    model.fit(
        train_dataset,
        validation_data=validation_dataset,
        epochs=epochs,
        callbacks=[get_tensorboard_cb()],
    )

In [8]:
# The vectorizer vocabulary is adapted on the training split inside create_model.
model = create_model(dataset=train_dataset)
train_model(
    model,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
)

Epoch 1/15
 23/782 [..............................] - ETA: 5s - loss: 0.6930 - accuracy: 0.5177

2022-07-14 21:32:15.979917: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [14]:
def embeddings_dir():
    """Return the ``embeddings`` subdirectory of the log root."""
    log_root = base_log_dir()
    return os.path.join(log_root, "embeddings")

# Export the trained embedding table and its vocabulary labels in the layout
# the TensorBoard Embedding Projector reads.
os.makedirs(embeddings_dir(), exist_ok=True)

# Entry 0 of the vocabulary and row 0 of the embedding table are dropped
# together (index 0 is the padding slot per the Keras TextVectorization
# docs), so row i of the tensor is labelled by line i of the metadata.
vocab = model.get_layer("text_vectorization").get_vocabulary()[1:]
# The adapted vocabulary can be smaller than the embedding table (fewer
# distinct tokens than max_tokens). Keep only rows that have a label so the
# tensor row count always matches the number of metadata lines.
weights = model.get_layer("embedding").get_weights()[0][1:len(vocab) + 1]
weights_var = tf.Variable(weights)
print(len(weights))
print(len(vocab))

vec_path = os.path.join(embeddings_dir(), "embedding.ckpt")
vocab_path = os.path.join(embeddings_dir(), "metadata.tsv")

# One token per line, in the same order as the embedding rows.
with io.open(vocab_path, "w", encoding="utf-8") as vocab_f:
    vocab_f.writelines(word + "\n" for word in vocab)

# The keyword name used here ("embedding") is the prefix of the tensor name
# referenced in the projector config below.
checkpoint = tf.train.Checkpoint(embedding=weights_var)
checkpoint.save(vec_path)


config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# metadata_path is given as the bare file name of the metadata file written
# above, which lives in the same directory handed to visualize_embeddings.
embedding.metadata_path = "metadata.tsv"
projector.visualize_embeddings(embeddings_dir(), config)

9999
9999
