# Masking PAD symbols in attention weights

Our example dataset consists of 4 samples of various lengths, padded to the length of the longest one (10).

In [1]:
import torch

# Toy batch: 4 rows of 10 floats each, holding the consecutive values 10..49.
x = torch.arange(10, 50, dtype=torch.float).reshape(4, 10)
x

tensor([[10., 11., 12., 13., 14., 15., 16., 17., 18., 19.],
        [20., 21., 22., 23., 24., 25., 26., 27., 28., 29.],
        [30., 31., 32., 33., 34., 35., 36., 37., 38., 39.],
        [40., 41., 42., 43., 44., 45., 46., 47., 48., 49.]])

`xlen` contains the length of each sample, i.e. the number of non-PAD symbols in each row of `x`:

In [2]:
# Number of valid (non-PAD) symbols in each of the 4 rows of the batch.
# Built with the torch.tensor factory and an explicit integer dtype rather
# than the legacy torch.LongTensor type constructor; values and dtype
# (int64) are the same.
xlen = torch.tensor([4, 8, 1, 10], dtype=torch.long)
xlen

tensor([ 4,  8,  1, 10])

## Let's create a mask for the 'valid' symbols

In [3]:
# Read both dimensions of the padded batch at once (rows = samples, columns = positions).
sample_no, maxlen = x.shape

# Column indices 0..maxlen-1, repeated on every row so they can be compared with the lengths.
m = torch.arange(maxlen).expand(sample_no, maxlen)
m

tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [4]:
# Turn the lengths into a column and repeat each one across its row,
# giving a tensor with the same shape as x.
xlen_expand = xlen[:, None].expand_as(x)
xlen_expand

tensor([[ 4,  4,  4,  4,  4,  4,  4,  4,  4,  4],
        [ 8,  8,  8,  8,  8,  8,  8,  8,  8,  8],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [10, 10, 10, 10, 10, 10, 10, 10, 10, 10]])

In [5]:
# A position is valid when its column index is strictly smaller than the row's length.
mask = torch.lt(m, xlen_expand)
mask

tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.uint8)

In [6]:
# Pick out only the valid entries; the result is a flat 1-D tensor.
x.masked_select(mask)

tensor([10., 11., 12., 13., 20., 21., 22., 23., 24., 25., 26., 27., 30., 40.,
        41., 42., 43., 44., 45., 46., 47., 48., 49.])

In [7]:
# Shift only the valid entries by 100; the PAD positions keep their values.
# Note: this writes into x, so re-running the cell adds another 100.
x[mask] = x[mask] + 100
x

tensor([[110., 111., 112., 113.,  14.,  15.,  16.,  17.,  18.,  19.],
        [120., 121., 122., 123., 124., 125., 126., 127.,  28.,  29.],
        [130.,  31.,  32.,  33.,  34.,  35.,  36.,  37.,  38.,  39.],
        [140., 141., 142., 143., 144., 145., 146., 147., 148., 149.]])

## Inverse selection

In [8]:
# Negating the mask picks out the PAD positions instead of the valid ones.
x.masked_select(~mask)

tensor([14., 15., 16., 17., 18., 19., 28., 29., 31., 32., 33., 34., 35., 36.,
        37., 38., 39.])

## Now we want to compute softmax on the 'valid' elements

Setting the padded values to zero before calling softmax still gives them nonzero probabilities, since `exp(0) = 1`:

In [9]:
# Start again from fresh data, then overwrite the PAD positions with zeros in place.
x = torch.arange(10, 50, dtype=torch.float).reshape(4, 10)
x.masked_fill_(~mask, 0)
# Row-wise softmax: each sample is normalised over its own positions.
x.softmax(dim=1)

tensor([[3.2058e-02, 8.7144e-02, 2.3688e-01, 6.4391e-01, 1.4554e-06, 1.4554e-06,
         1.4554e-06, 1.4554e-06, 1.4554e-06, 1.4554e-06],
        [5.7661e-04, 1.5674e-03, 4.2606e-03, 1.1582e-02, 3.1482e-02, 8.5577e-02,
         2.3262e-01, 6.3233e-01, 1.1885e-12, 1.1885e-12],
        [1.0000e+00, 9.3576e-14, 9.3576e-14, 9.3576e-14, 9.3576e-14, 9.3576e-14,
         9.3576e-14, 9.3576e-14, 9.3576e-14, 9.3576e-14],
        [7.8013e-05, 2.1206e-04, 5.7645e-04, 1.5669e-03, 4.2594e-03, 1.1578e-02,
         3.1473e-02, 8.5552e-02, 2.3255e-01, 6.3215e-01]])

We want the padded positions to have exactly zero probability after softmax, so we set them to minus infinity beforehand (`exp(-inf) = 0`):

In [10]:
# Start again from fresh data, then overwrite the PAD positions with -inf in place.
x = torch.arange(10, 50, dtype=torch.float).reshape(4, 10)
x.masked_fill_(~mask, float('-inf'))
# Row-wise softmax: each sample is normalised over its own positions.
x.softmax(dim=1)

tensor([[0.0321, 0.0871, 0.2369, 0.6439, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0006, 0.0016, 0.0043, 0.0116, 0.0315, 0.0856, 0.2326, 0.6323, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0001, 0.0002, 0.0006, 0.0016, 0.0043, 0.0116, 0.0315, 0.0856, 0.2326,
         0.6321]])

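In practice, a large finite negative number works just as well, because its exponential underflows to zero. Unlike `-inf`, it also avoids NaNs when a row contains no valid element (such a row gets a uniform distribution instead):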

In [11]:
# Start again from fresh data, then overwrite the PAD positions with a large
# finite negative number in place.
x = torch.arange(10, 50, dtype=torch.float).reshape(4, 10)
x.masked_fill_(~mask, -100000)
# Row-wise softmax: each sample is normalised over its own positions.
x.softmax(dim=1)

tensor([[0.0321, 0.0871, 0.2369, 0.6439, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0006, 0.0016, 0.0043, 0.0116, 0.0315, 0.0856, 0.2326, 0.6323, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0001, 0.0002, 0.0006, 0.0016, 0.0043, 0.0116, 0.0315, 0.0856, 0.2326,
         0.6321]])