In [9]:
import torch

"""Print mask"""
def get_tgt_mask(size):
        # Generates a squeare matrix where the each row allows one word more to be seen
        mask = torch.tril(torch.ones(size, size) == 1) # Lower triangular matrix
        mask = mask.float()
        mask = mask.masked_fill(mask == 0, float('-inf')) # Convert zeros to -inf
        mask = mask.masked_fill(mask == 1, float(0.0)) # Convert ones to 0

        return mask
        


print(get_tgt_mask(33))
print(get_tgt_mask(33).shape)

tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([33, 33])


In [38]:
import torch.nn as nn
import math

"""Print positional encodings.
Shape = (batch_size, seq_len, d_model)
For every item in batch the positional encodings are identical: pos_encoding[1,:,:]=pos_encoding[2,:,:]
Each item in the positional encoding depends on the position of the sequence (multiplier inside cos/sin) and the position
in the dim_model dimension. Thus, every feature for the same base gets different value, and every seq gets unique values:
pos_encoding[1,1,1]!=pos_encoding[1,1,2] and pos_encoding[1,1,1]!=pos_encoding[1,2,1]
Formula is the same as in https://arxiv.org/pdf/1706.03762.pdf, log and exp in the formula below cancel each other.
Have been tested with P(0,0), P(0,1), P(1,0), P(1,1)

Dropout affects during train mode, but not in eval. Dropout needed, if using embeddings.
"""
class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout_p, max_len):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout_p)

        # Calculate positional encodings
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, dim_model, 2).float() * -(math.log(10000.0) / dim_model))
        pos_encoding = torch.zeros((max_len, dim_model))
        pos_encoding[:, 0::2] = torch.sin(position * div_term)
        pos_encoding[:, 1::2] = torch.cos(position * div_term)
        pos_encoding = pos_encoding.unsqueeze(0)

        self.register_buffer('pos_encoding', pos_encoding)

    def forward(self, token_embedding):
        # Expand the positional encoding to match the batch size
        pos_encoding = self.pos_encoding[:, :token_embedding.size(1)].expand(token_embedding.size(0), -1, -1)
        # Residual connection + pos encoding
        print("---")
        print(token_embedding[1,0,:])
        print(pos_encoding[1,0,:])
        print((token_embedding + pos_encoding)[1,0,:])
        print((self.dropout(token_embedding + pos_encoding))[1,0,:])
        return self.dropout(token_embedding + pos_encoding)
    
class Transformer(nn.Module):
    def __init__(self, d_model=84, nhead=2, num_encoder_layers=6, num_decoder_layers=6):
        super(Transformer, self).__init__()
        self.positional_encoder = PositionalEncoding(
            dim_model=d_model, dropout_p=0.1, max_len=34
        )
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                         batch_first=True)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, src, tgt, tgt_mask=None):
        """
        Parameters:
        src: Input matrix of shape (batch_size, seq_len, embedding_dim),
                seq_len=33, embedding_dim=84
        tgt: Target matrix of shape (batch_size, seq_len, embedding_dim)"""
        src = self.positional_encoder(src)
        print(src[1,0,:])
        return
        tgt = self.positional_encoder(tgt) #
        output = self.transformer(src, tgt, tgt_mask=tgt_mask)
        output = self.out(output)
        return output
    
    
    def get_tgt_mask(self, size) -> torch.tensor:
        # Generates a squeare matrix where the each row allows one word more to be seen
        mask = torch.tril(torch.ones(size, size) == 1) # Lower triangular matrix
        mask = mask.float()
        mask = mask.masked_fill(mask == 0, float('-inf')) # Convert zeros to -inf
        mask = mask.masked_fill(mask == 1, float(0.0)) # Convert ones to 0

        return mask
    


# Create a tensor with random values
tensor = torch.zeros((128, 34, 84))
positional_encoder_input = PositionalEncoding(dim_model=84, dropout_p=0.1, max_len=34)
result = positional_encoder_input(tensor)
print(result[1,0,:])
result = positional_encoder_input(tensor)
print(result[1,0,:])
net=Transformer()
net.eval()
net(tensor, None)

---
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.])
tensor([0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0.

In [41]:
import numpy as np
np.array([2])
np.array([3])

array([3])