### Transformers are complex. They have several parts that need to be implemented. Lets go step by step. Here we will use PyTorch, so let's import the libraries.

In [56]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
import math

### This first part will be creating our embeddings. Remember again the lesson on data representation. We need to create these embeddings for the transformer to encode. The embeddings are the the same size as the vocabulary that is passed in and uses the embedding dimensions to transform eaach word or word part into a number of dimensions (tensors)

In [57]:
class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embed_dims, requires_grad=False):
        super(EmbeddingLayer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dims)

    def forward(self, x):
        return self.embedding(x)

### Positional Encoding tells the transformer about the meaning and position of words in the input. We use a series of sine and cosine values. We use the y-values on the sine and cosine curves to find the corrosponding x-axis coordinate for each word.

In [58]:
class PositionalEncoding(nn.Module):

    def __init__(self, embed_dims, dropout, max_seq_length):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_seq_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dims, 2) * (-math.log(10000.0) / embed_dims))
        pe = torch.zeros(max_seq_length, 1, embed_dims)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

### Self Attention works by comparing word similarity for every word in the sentence. In the transformer, we calculate the similarity between the query and keys. Larger similarities indicate stronger similarity. We then use a softmax function to determine what percentage of each word should be used to encode the query.

In [59]:
class MultiheadAttention(nn.Module):
    def __init__(self, embed_dims, nhead, dropout=0.1):
        super(MultiheadAttention, self).__init__()
        
        assert embed_dims % nhead == 0, "embed_dims must be divisible by nhead"
        self.nhead = nhead
        self.head_dim = embed_dims // nhead
        self.embed_dims = embed_dims
        
        # Linear transformations for query, key, and value for each head
        self.q_linear = nn.Linear(embed_dims, embed_dims)
        self.k_linear = nn.Linear(embed_dims, embed_dims)
        self.v_linear = nn.Linear(embed_dims, embed_dims)
        
        # Final linear transformation after concatenating the heads
        self.out_linear = nn.Linear(embed_dims, embed_dims)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        
        # Linearly transform query, key, and value for each head
        query = self.q_linear(query).view(batch_size, -1, self.nhead, self.head_dim)
        key = self.k_linear(key).view(batch_size, -1, self.nhead, self.head_dim)
        value = self.v_linear(value).view(batch_size, -1, self.nhead, self.head_dim)
        
        # Transpose to make dimensions compatible for batch-wise matrix multiplication
        query = query.permute(0, 2, 1, 3)
        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        
        # Compute scaled dot-product attention for each head
        scale_factor = torch.sqrt(torch.tensor(self.head_dim, dtype=query.dtype))
        scores = torch.matmul(query, key.permute(0, 1, 3, 2)) / scale_factor
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        attention_weights = torch.nn.functional.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        
        # Apply attention to values
        context = torch.matmul(attention_weights, value)
        
        # Reshape and concatenate the outputs from different heads
        context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.embed_dims)
        
        # Apply a final linear layer
        output = self.out_linear(context)
        
        return output, attention_weights

In [60]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, embed_dims, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(embed_dims, d_ff)
        self.fc2 = nn.Linear(d_ff, embed_dims)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [61]:
class EncoderLayer(nn.Module):
    def __init__(self, embed_dims, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(embed_dims, num_heads)
        self.feed_forward = PositionWiseFeedForward(embed_dims, d_ff)
        self.norm1 = nn.LayerNorm(embed_dims)
        self.norm2 = nn.LayerNorm(embed_dims)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [62]:
class FeedForward(nn.Module):
    def __init__(self, embed_dims, dim_feedforward):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(embed_dims, dim_feedforward)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(dim_feedforward, embed_dims)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return x

In [63]:
class DecoderLayer(nn.Module):
    def __init__(self, embed_dims, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(embed_dims, num_heads)
        self.cross_attn = MultiheadAttention(embed_dims, num_heads)
        self.feed_forward = FeedForward(embed_dims, d_ff)
        self.norm1 = nn.LayerNorm(embed_dims)
        self.norm2 = nn.LayerNorm(embed_dims)
        self.norm3 = nn.LayerNorm(embed_dims)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [64]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dims, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, embed_dims)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, embed_dims)
        self.positional_encoding = PositionalEncoding(embed_dims, dropout, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(embed_dims, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(embed_dims, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(embed_dims, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output


In [72]:
src_vocab_size = 5000
tgt_vocab_size = 5000
embed_dims = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, embed_dims, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Generate random sample data
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

# Assuming src_data and tgt_data are NumPy arrays or lists of lists
src_data = torch.tensor(src_data, dtype=torch.long)  # Convert to PyTorch tensor
tgt_data = torch.tensor(tgt_data, dtype=torch.long)  # Convert to PyTorch tensor

# Assuming you have your Transformer model defined as 'transformer'
optimizer = torch.optim.Adam(transformer.parameters())
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(100):
    optimizer.zero_grad()
    output = transformer(src_data, tgt_data[:, :-1])
    loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
    loss.backward()
    optimizer.step()


  src_data = torch.tensor(src_data, dtype=torch.long)  # Convert to PyTorch tensor
  tgt_data = torch.tensor(tgt_data, dtype=torch.long)  # Convert to PyTorch tensor


TypeError: dropout(): argument 'input' (position 1) must be Tensor, not tuple