In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

### Padding Mask

In [2]:
def create_padding_mask(decoder_token_ids):
    """
    Build a mask marking which token positions are real (non-padding).

    A token id equal to 0 is treated as padding.

    Arguments:
        decoder_token_ids (matrix like): token ids, matrix of size (n, m)

    Returns:
        mask (tf.Tensor): float32 tensor of size (n, 1, m) holding 1.0 at
            non-padding positions and 0.0 at padding positions
    """
    is_padding = tf.math.equal(decoder_token_ids, 0)
    keep = 1. - tf.cast(is_padding, tf.float32)

    # Insert a singleton middle axis so the mask can later broadcast
    # against the attention logits when sequences are compared.
    return keep[:, tf.newaxis, :]

In [3]:
# Three sequences of length 5, right-padded with zeros.
x = tf.constant([
    [7., 6., 0., 0., 0.],
    [1., 2., 3., 0., 0.],
    [3., 0., 0., 0., 0.],
])
create_padding_mask(x)

<tf.Tensor: shape=(3, 1, 5), dtype=float32, numpy=
array([[[1., 1., 0., 0., 0.]],

       [[1., 1., 1., 0., 0.]],

       [[1., 0., 0., 0., 0.]]], dtype=float32)>

Adding a large negative number (-1e<sup>9</sup>, standing in for negative infinity) to the logits at the positions where the mask is 0 drives their softmax outputs to effectively zero, so the padded positions receive no weight.

In [6]:
# Build the (n, 1, m) padding mask for x: 1.0 on real tokens, 0.0 on padding
mask = create_padding_mask(x)

# Insert a singleton middle axis so x has the same rank as the mask
x_extended = x[:, tf.newaxis, :]

# Without masking, the padded zeros still receive a share of the softmax
print("Softmax of non-masked vectors:")
print(tf.keras.activations.softmax(x_extended).numpy())

# (1 - mask) is 1.0 exactly on padded positions, so those logits are shifted
# by -1.0e9 and their softmax output becomes 0; real tokens are shifted by 0
print("\nSoftmax of masked vectors:")
print(tf.keras.activations.softmax(x_extended + (1 - mask) * -1.0e9).numpy())

Softmax of non-masked vectors:
[[[7.2959948e-01 2.6840466e-01 6.6530862e-04 6.6530862e-04 6.6530862e-04]]

 [[8.4437378e-02 2.2952460e-01 6.2391251e-01 3.1062776e-02 3.1062776e-02]]

 [[8.3392531e-01 4.1518699e-02 4.1518696e-02 4.1518696e-02 4.1518696e-02]]]

Softmax of masked vectors:
[[[0.7310586  0.26894143 0.         0.         0.        ]]

 [[0.09003057 0.24472848 0.66524094 0.         0.        ]]

 [[1.         0.         0.         0.         0.        ]]]


### Look-Ahead Mask

In [11]:
def create_look_ahead_mask(sequence_length):
    """
    Build a look-ahead mask for a sequence of the given length.

    The lower triangle (diagonal included) is filled with 1.0 and every
    entry above the diagonal is set to -1.0e9.

    NOTE(review): the result is not a 0/1 mask like the one returned by
    create_padding_mask; confirm callers expect the -1.0e9 entries before
    applying a further (1 - mask) * -1.0e9 step to it.

    Arguments:
        sequence_length (int): matrix size

    Returns:
        mask (tf.Tensor): float32 tensor of size
            (1, sequence_length, sequence_length)
    """
    all_ones = tf.ones((1, sequence_length, sequence_length))

    # Keep the diagonal and everything below it; zero out the upper triangle.
    lower_triangle = tf.linalg.band_part(all_ones, -1, 0)

    # 1.0 exactly where the lower triangle was zeroed (the "future" positions).
    blocked = 1 - lower_triangle
    return lower_triangle + blocked * (-1.0e9)

In [12]:
# Only the sequence length (x.shape[1] == 3) is used below; the random values
# themselves never reach the mask.
# NOTE: this rebinds `x`, which earlier cells defined as the padded batch.
x = tf.random.uniform((1, 3))
create_look_ahead_mask(x.shape[1])

<tf.Tensor: shape=(1, 3, 3), dtype=float32, numpy=
array([[[ 1.e+00, -1.e+09, -1.e+09],
        [ 1.e+00,  1.e+00, -1.e+09],
        [ 1.e+00,  1.e+00,  1.e+00]]], dtype=float32)>