## Transformer Architecture in PyTorch

### Imports

In [10]:
import torch
import torch.nn as nn

# Build PyTorch's reference encoder-decoder Transformer and print its module tree.
# NOTE(review): the variable name is misspelled ("tranformer"); it is kept
# unchanged here because other cells may refer to it by this name.
tranformer_model = nn.Transformer(
    # dimensionality of the model inputs
    d_model=512,
    # number of attention heads
    nhead=8,
    # number of encoder layers
    num_encoder_layers=6,
    # number of decoder layers
    num_decoder_layers=6,
)

# Show the nested layer structure of the model.
print(tranformer_model)

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        ... [output truncated]



### Embedding and Positional Encoding

In [12]:
import math #for sine and cosine functions and square root

class InputEmbeddings(nn.Module):
    """Embedding lookup whose output is multiplied by sqrt(d_model).

    Attributes:
        vocab_size: Number of rows in the embedding table.
        d_model: Width of each embedding vector.
        embedding: The underlying ``nn.Embedding(vocab_size, d_model)`` table.
    """

    def __init__(self, vocab_size, d_model):
        """Create the embedding table and remember both sizes."""
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=d_model,
        )

    def forward(self, x):
        """Look up the embeddings for ``x`` and scale them by sqrt(d_model).

        Args:
            x: Tensor of indices into the embedding table.

        Returns:
            The looked-up vectors (one extra trailing dimension of size
            ``d_model``) multiplied by ``math.sqrt(self.d_model)``.
        """
        # Scaling by sqrt(d_model) is the standard practice for Transformer inputs.
        scale = math.sqrt(self.d_model)
        token_vectors = self.embedding(x)
        return token_vectors * scale

In [15]:
# Instantiate the embedding layer and push a small example batch through it.
embedding_layer = InputEmbeddings(vocab_size=10_000, d_model=512)
embedded_output = embedding_layer(
    torch.tensor(
        [
            [1, 2, 3, 4],
            [5, 6, 7, 8],
        ]
    )
)  # example input: 2 sequences of 4 token ids each

# Inspect the scaled embeddings and confirm their shape.
print(embedded_output)
print(embedded_output.shape)  # should be (2, 4, 512) - (batch_size, sequence_length, d_model)

tensor([[[ 15.3231, -26.9085,  17.9909,  ...,  -1.6260,  -6.2211,   7.4548],
         [-31.8864,  22.6316, -31.2517,  ...,  41.4395,  31.4917,  -1.4392],
         [-13.8347,  21.0569, -21.4930,  ...,  31.8133,  -2.1015, -14.7802],
         [ 11.0359,   9.9550, -50.8254,  ...,   7.0253,  -8.0004,  13.0305]],

        [[ 25.3953,  14.7083,  38.2098,  ...,  -4.5308,  -1.2147,   2.8883],
         [-34.0946,  11.9953,  -1.5688,  ...,  14.2127, -20.3298, -33.7656],
         [ 19.7240,  32.0493, -27.7060,  ...,  -6.8598, -33.6846, -11.5936],
         [ 29.2914, -13.9668, -16.3890,  ...,  13.3196,   5.1478, -14.3283]]],
       grad_fn=<MulBackward0>)
torch.Size([2, 4, 512])
