## Sentiment analysis with TextVectorization, GRU and masking

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import os

%load_ext tensorboard

In [None]:
(raw_train_set, raw_valid_set, raw_test_set), info = tfds.load(
    name='imdb_reviews',
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True,
    with_info=True
)

In [3]:
train_set = raw_train_set.shuffle(5000, seed=42).batch(32).prefetch(1)
valid_set = raw_valid_set.batch(32).prefetch(1)
test_set = raw_test_set.batch(32).prefetch(1)

In [4]:
for review, label in raw_train_set.take(4):
    print(review.numpy().decode('utf-8'))
    print('Label:', label.numpy())

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
Label: 0
I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development

2023-10-21 12:56:14.938174: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Tokenizing and vectorizing text at words level

In [5]:
vocab_size = 1000
embed_size = 128
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
text_vec_layer.adapt(train_set.map(lambda reviews, labels: reviews))

In [6]:
raw_txt = tf.constant([
    'Mann photographs the Alberta Rocky Mountains \
in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable\
performances as they always seem to do. ',
    'Mann photographs the Alberta Rocky'
])

In [7]:
embed = text_vec_layer.call(raw_txt).numpy()
txt = tf.Variable(embed, dtype=tf.int64)

In [8]:
txt

<tf.Variable 'Variable:0' shape=(2, 24) dtype=int64, numpy=
array([[  1,   1,   2,   1,   1,   1,   8,   4, 892,   1,   3,   1,   1,
          3,   1,   1, 193,   1,  15,  35, 204, 288,   6,  80],
       [  1,   1,   2,   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>

In [9]:
out = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)(txt)

In [10]:
out

<tf.Tensor: shape=(2, 24, 128), dtype=float32, numpy=
array([[[ 0.03722339, -0.04976665, -0.00990369, ..., -0.025424  ,
         -0.03561782,  0.0061574 ],
        [ 0.03722339, -0.04976665, -0.00990369, ..., -0.025424  ,
         -0.03561782,  0.0061574 ],
        [ 0.03500311,  0.03601236, -0.03146477, ...,  0.00944851,
          0.03138024,  0.01298708],
        ...,
        [-0.03836131, -0.04680197,  0.01024247, ..., -0.02890631,
          0.03405721, -0.04705429],
        [-0.03527834,  0.03068871,  0.01395898, ..., -0.0067153 ,
         -0.03649485,  0.03601872],
        [ 0.03476927,  0.01939769, -0.0440035 , ..., -0.0431875 ,
         -0.0413559 ,  0.002615  ]],

       [[ 0.03722339, -0.04976665, -0.00990369, ..., -0.025424  ,
         -0.03561782,  0.0061574 ],
        [ 0.03722339, -0.04976665, -0.00990369, ..., -0.025424  ,
         -0.03561782,  0.0061574 ],
        [ 0.03500311,  0.03601236, -0.03146477, ...,  0.00944851,
          0.03138024,  0.01298708],
        ...,


## This yields the whole sequence, keeping the 24 long sentence with embedding dimension (here 128) transformed to the GRU layer size dimensionality, e.g. 64 here
## The returned sequence dimension is the middle one here, 24

In [11]:
tf.keras.layers.GRU(64, return_sequences=True)(out)

2023-10-21 12:56:30.593490: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600


<tf.Tensor: shape=(2, 24, 64), dtype=float32, numpy=
array([[[-2.31193732e-02, -6.00226456e-03, -1.25218844e-02, ...,
         -1.88639499e-02,  5.60823688e-03,  1.02528976e-02],
        [-3.42716873e-02, -1.01342322e-02, -1.98849048e-02, ...,
         -2.65447143e-02,  6.25704229e-03,  1.21794287e-02],
        [-6.13229256e-03, -2.51790862e-05, -1.57100204e-02, ...,
         -7.11231586e-03, -1.67892314e-02, -3.92597355e-03],
        ...,
        [-7.57235277e-04, -1.07401498e-02,  1.17213195e-02, ...,
          1.99803594e-03, -2.06704997e-03,  1.36174494e-02],
        [-1.94050826e-03, -1.36491684e-02, -8.55845213e-03, ...,
          7.65117723e-03,  1.69179551e-02,  8.02480709e-03],
        [ 3.29882652e-03, -6.46380475e-03, -2.21510250e-02, ...,
          6.24734955e-03,  2.06992067e-02, -5.07170800e-03]],

       [[-2.31193732e-02, -6.00226456e-03, -1.25218844e-02, ...,
         -1.88639499e-02,  5.60823688e-03,  1.02528976e-02],
        [-3.42716873e-02, -1.01342322e-02, -1.9884

In [14]:
#out = tf.keras.layers.GRU(128)(out[tf.newaxis, ...])
out = tf.keras.layers.GRU(128)(out)

## Here the only remaining dimentions are batch and GRU size, the text length dimention is collapsed by the GRU, which is not returning sequences.

## If there were several GRUs then we'd need to return sequence (along with a mask) and only at the last GRU collapse the text.

In [15]:
out

<tf.Tensor: shape=(2, 128), dtype=float32, numpy=
array([[ 8.63684062e-03,  1.90944560e-02, -1.15395281e-02,
        -1.56724826e-02, -4.33316128e-03,  7.85726868e-03,
        -5.20819146e-03,  7.30504282e-03, -5.35266800e-03,
        -2.22881958e-02,  1.82193480e-02,  8.86726100e-03,
         1.17614493e-02, -2.75767059e-04, -1.84563652e-03,
        -1.27304569e-02,  1.07006393e-02,  1.07654342e-02,
         1.26036917e-04,  2.14793291e-02,  1.64245330e-02,
        -3.40776262e-03, -3.24496962e-02,  2.43742345e-03,
         7.30309775e-03, -1.82676669e-02, -3.01983841e-02,
        -7.01783644e-03, -1.35894131e-03, -9.84857790e-03,
        -1.68229137e-02, -1.41781550e-02,  8.87791254e-03,
        -6.22299453e-03, -1.04935337e-02, -2.86249891e-02,
         5.69162518e-03, -3.00332578e-03,  3.54943518e-03,
        -3.76814022e-03, -1.41054792e-02, -1.25163514e-02,
        -5.45388181e-03,  1.35139013e-02,  1.97317377e-02,
        -4.06170171e-03, -7.91178644e-03,  8.20245408e-03,
      

In [16]:
tf.keras.layers.Dense(1, activation='sigmoid')(out)

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.5019576 ],
       [0.50356525]], dtype=float32)>

## Masking properties
* To propagate mask further add supports_masking=True to layer
* To update mask one has to implement compute_mask() method
* For recurrent layers with return_sequence=True mask is propagated, with False it is not

## This model has poor performance due to 0-padding.

## It can be enhanced by using masking

In [17]:
embed_size = 128
tf.random.set_seed(42)
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(vocab_size, embed_size), # here by default return_sequence=False
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [18]:
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])

In [None]:
history = model.fit(train_set, validation_data=valid_set, epochs=2)

Epoch 1/2


2023-10-05 22:54:29.962123: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-10-05 22:54:30.017976: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fbceccb67a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-10-05 22:54:30.017996: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-10-05 22:54:30.024144: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-10-05 22:54:30.142468: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


132/704 [====>.........................] - ETA: 54s - loss: 0.6942 - accuracy: 0.5062

## Masking with a custom model

### To make a custom masking layer:

* Add mask argument to layer call() and use it in the layer
* To propagate mask set self.supports_masking=True in the constructor
* To update mask before propagating implement compute_mask() method


## Embedding layer makes masks automatically
## Otherwise one has to create masking with Masking layer or create it manually

In [None]:
inputs = tf.keras.layers.Input([], dtype=tf.string)
tokenized_input = text_vec_layer(inputs)
mask = tf.math.not_equal(tokenized_input, 0)

Z = tf.keras.layers.Embedding(vocab_size, embed_size)(tokenized_input)

# Here we pass mask in the call argument, not layer constructor
Z = tf.keras.layers.GRU(128, dropout=0.2)(Z, mask=mask)

# Output gets no mask because GRU is not returning sequence here and skips masked input
# In other cases we would need to pass the mask explicitly so it is used with loss
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(Z)

model = tf.keras.Model(inputs=[inputs], outputs=[outputs])

In [None]:
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])

In [None]:
from tensorboard.plugins import projector
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")

In [None]:
log_dir = './logs'

In [None]:
model.layers

In [None]:
# This is the mask function, suffix number is just 
# a part of layer name but'function': 'math.not_equal'
model.layers[3].get_config()

In [None]:
len(model.layers[2].get_weights())

In [None]:
model.layers[2].get_weights()[0].shape

In [None]:
model.layers[2].get_weights()[0]

In [None]:
# Skipping weights attached to token 0
weights = tf.Variable(model.layers[2].get_weights()[0][1:])

In [None]:
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, 'embedding.ckpt'))

In [None]:
config = projector.ProjectorConfig()
embedding = config.embeddings.add()

In [None]:
embedding.tensor_name = 'embedding/.ATTRIBUTES/VARIABLE_VALUE'

In [None]:
projector.visualize_embeddings(log_dir, config)

In [None]:
text_vec_layer.get_vocabulary()[192]

In [None]:
# %tensorboard --logdir ./logs

In [None]:
history = model.fit(train_set, validation_data=valid_set, epochs=20, callbacks=[tensorboard_callback])

## Ragged tensor is another way of representing data of varied length, skipping on tail paddings

## It is done in TextVectorization layer initialization

## Such a layer is used without masking in a model

## Note that this is different from sample weighting

## It may still be that ragged tensors can't serve as model targets, but this may change

In [None]:
text_vec_layer_ragged = tf.keras.layers.TextVectorization(max_tokens=vocab_size, ragged=True)

In [None]:
text_vec_layer_ragged.adapt(train_set.map(lambda reviews, labels: reviews))

In [None]:
text_vec_layer_ragged.call(raw_txt)