# Data preparation

In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, TextVectorization

2024-08-29 22:00:19.696644: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

['imdbEr.txt', 'test', 'imdb.vocab', 'README', 'train']

In [6]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['urls_unsup.txt',
 'neg',
 'urls_pos.txt',
 'unsup',
 'urls_neg.txt',
 'pos',
 'unsupBow.feat',
 'labeledBow.feat']

In [7]:
shutil.rmtree(os.path.join(train_dir, 'unsup'))

## Create training and validation sets

In [8]:
batch_size=1024
seed=123

train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size, validation_split=0.2, subset='training', seed=seed
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size, validation_split=0.2, subset='validation', seed=seed
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [9]:
for text_batch, label_batch in train_ds.take(1):
  for i in range(5):
    print(label_batch[i].numpy(), text_batch.numpy()[i])

0 b"Oh My God! Please, for the love of all that is holy, Do Not Watch This Movie! It it 82 minutes of my life I will never get back. Sure, I could have stopped watching half way through. But I thought it might get better. It Didn't. Anyone who actually enjoyed this movie is one seriously sick and twisted individual. No wonder us Australians/New Zealanders have a terrible reputation when it comes to making movies. Everything about this movie is horrible, from the acting to the editing. I don't even normally write reviews on here, but in this case I'll make an exception. I only wish someone had of warned me before I hired this catastrophe"
1 b'This movie is SOOOO funny!!! The acting is WONDERFUL, the Ramones are sexy, the jokes are subtle, and the plot is just what every high schooler dreams of doing to his/her school. I absolutely loved the soundtrack as well as the carefully placed cynicism. If you like monty python, You will love this film. This movie is a tad bit "grease"esk (without

2024-08-29 22:03:06.394516: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## Autotune for performance

In [10]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding Layer
Define an embedding layer with 1000 words and 5 dimensions. The Embedding layer **trains** the word embedding on the fly via *backpropagation*. The initial weights are randomized.

In [11]:
embedding_layer = tf.keras.layers.Embedding(1000, 5)

This embedding layer will **convert** an integer to a 5D array.

In [12]:
result = embedding_layer(tf.constant([1]))
result.numpy()

array([[ 0.00117086,  0.01907944, -0.02387185,  0.03346392, -0.03524822]],
      dtype=float32)

For sequences, the embedding layer expects a **2D tensor** of shape `(samples, seq_length)`.  Suppose we have 2 sentences of length 3. The input is a 2 by 3 matrix. The output has shape `(2, 3, N)` where N is the embedding dimension.

In [13]:
result = embedding_layer(tf.constant([
    [0, 1, 2], # sentence 1
    [3, 4, 5]  # sentence 2
]))
result.shape

TensorShape([2, 3, 5])

In [14]:
result.numpy()

array([[[-0.02405632, -0.01018284, -0.03051524, -0.02742959,
         -0.03068431],
        [ 0.00117086,  0.01907944, -0.02387185,  0.03346392,
         -0.03524822],
        [ 0.01899647,  0.04343804,  0.03378962, -0.04803031,
          0.03657239]],

       [[-0.01285064, -0.00517671, -0.01517154,  0.01236069,
          0.04086245],
        [-0.01367376,  0.03471819,  0.01825732, -0.04489787,
         -0.01411694],
        [-0.04037186,  0.00299137, -0.00416181, -0.00955235,
         -0.01033883]]], dtype=float32)

# Text preprocessing

In [15]:
# Create a custom standardization function to strip HTML break tags '<br />'.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation), '')


# Vocabulary size and number of words in a sequence.
vocab_size = 10000
sequence_length = 100

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_ds = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

2024-08-29 22:03:08.380086: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Classification Model

In [16]:
embedding_dim=16

model = tf.keras.Sequential([
    # transforms the input text to a sequence of numbers
    vectorize_layer, 

    # turns positive integers (indexes) into dense vectors of fixed size.
    Embedding(vocab_size, embedding_dim, name="embedding"),
    
    # Remainder of model
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1)
])  

In [17]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])


In [18]:
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback])

Epoch 1/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 117ms/step - accuracy: 0.5037 - loss: 0.6917 - val_accuracy: 0.4886 - val_loss: 0.6842
Epoch 2/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 88ms/step - accuracy: 0.5037 - loss: 0.6811 - val_accuracy: 0.4886 - val_loss: 0.6694
Epoch 3/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 88ms/step - accuracy: 0.5039 - loss: 0.6642 - val_accuracy: 0.4892 - val_loss: 0.6479
Epoch 4/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 93ms/step - accuracy: 0.5097 - loss: 0.6400 - val_accuracy: 0.5204 - val_loss: 0.6198
Epoch 5/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 89ms/step - accuracy: 0.5642 - loss: 0.6087 - val_accuracy: 0.6034 - val_loss: 0.5871
Epoch 6/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 92ms/step - accuracy: 0.6455 - loss: 0.5725 - val_accuracy: 0.6636 - val_loss: 0.5530
Epoch 7/15
[1m20/20[0m [32m━━━

<keras.src.callbacks.history.History at 0x1342fb2c0>

Word embeddings are weights of the Embedding layer.

In [19]:
weights = model.get_layer('embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [20]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

Load this word embedding in the [Embedding Projector](http://projector.tensorflow.org/?_gl=1*12piodl*_ga*MTYwNzA5NzYxNy4xNzI0OTI3NTY5*_ga_W0YLR4190T*MTcyNDk0MzgwNC4yLjAuMTcyNDk0MzgwNC4wLjAuMA..) and search for "beautiful", here are the similar words that come up:

- best
- brilliant
- fun
- wonderful
- perfectly
- favorite
- great
- greatest
- solid
- surprised
- chan

Definitely not a bad way to train word embedding. But there is a better way using Word2Vec algorithm.