## 0. Import the libraries and set a manual seed

In [49]:
import torch
import math
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
# Fixed seed so all our results will be the same
torch.manual_seed(1)

<torch._C.Generator at 0x1c6d9ec2470>

## 1. Perform Tokenization

Tokenization is different from simply building your vocabulary. This involves putting a numerical representation for each "word" or token within the NLP problem. It's also very possible that one "word" doesn't really mean one token. To simplify things, a very straightforward and naive way to tokenize something is to map it to an index where the lookup value is the "token" itself.

In [50]:
vocabulary_lookup = {
    "A": 0,
    "B": 1,
    "C": 2
}

## 2. Create Embeddings Representation for Tokens

We can use `torch`'s `Embedding` module to define a learnable set of parameters that will output the embeddings of a given token input. In this case, the first argument will be the model's sequence length and the second argument will be the size of the embeddings themselves.

In [51]:
embeddings = torch.nn.Embedding(3, 5)

## 3. Create an Embedding Instance for our Token

* `lookup_tensor` will be our container of tokens as input to produce these embeddings. Notice that we still make it a tensor to be fed to the next step.
* These tokens are then fed to the `forward()` method of the `Embedding` instance to come up with our initial embeddings.

In [52]:
lookup_tensor = torch.tensor([vocabulary_lookup["A"], vocabulary_lookup["B"], vocabulary_lookup["C"]])

lookup_tensor

tensor([0, 1, 2])

In [53]:
initial_embeddings = embeddings(lookup_tensor)

initial_embeddings

tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519],
        [-0.1661, -1.5228,  0.3817, -1.0276, -0.5631],
        [-0.8923, -0.0583, -0.1955, -0.9656,  0.4224]],
       grad_fn=<EmbeddingBackward0>)

## 4. Create Positional Embeddings

In [54]:
max_len = 3
d_model = 5

In [55]:
def gen_pe(max_length, d_model, n):

  # generate an empty matrix for the positional encodings (pe)
  pe = np.zeros(max_length*d_model).reshape(max_length, d_model)

  # for each position
  for k in np.arange(max_length):

    # for each dimension
    for i in np.arange(d_model//2):

      # calculate the internal value for sin and cos
      theta = k / (n ** ((2*i)/d_model))

      # even dims: sin
      pe[k, 2*i] = math.sin(theta)

      # odd dims: cos
      pe[k, 2*i+1] = math.cos(theta)

  return pe

positional_embeddings = gen_pe(max_len, d_model, 1000)

positional_embeddings = torch.tensor(positional_embeddings, dtype=torch.float)

positional_embeddings

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000],
        [ 0.8415,  0.5403,  0.0631,  0.9980,  0.0000],
        [ 0.9093, -0.4161,  0.1259,  0.9920,  0.0000]])

## 5. Create Input Embeddings by Adding Initial + Positional

In [56]:
input_embeddings = torch.add(initial_embeddings, positional_embeddings)

input_embeddings

tensor([[ 0.6614,  1.2669,  0.0617,  1.6213, -0.4519],
        [ 0.6753, -0.9825,  0.4447, -0.0296, -0.5631],
        [ 0.0170, -0.4744, -0.0697,  0.0264,  0.4224]], grad_fn=<AddBackward0>)

## Exercise: Completing a Transformer Module

1. Create a class called `TransformerModule` with the following attributes:

* Input should be the `max_length` of the model (i.e. how much can it accept) and `d_model` (i.e. how long are the embeddings.
* The forward function should take in `x` where `x` is similar to `lookup_tensor`
* Complete the Transformer module by solving for the actual attention matrix and multiplying it to a parameter `V`. `V` can be created by declaring a `Parameter` module (see example below).

2. Test the `forward` method of the model by passing in a token sequence with some defined length.

You can use `torch.transpose` [https://pytorch.org/docs/stable/generated/torch.transpose.html](https://pytorch.org/docs/stable/generated/torch.transpose.html) for transposing a matrix.

In [57]:
m = 5
n = 5

In [58]:
window_length = m
embedding_dims = n
k = np.expand_dims(np.arange(0, window_length), axis=1)
positional_embedding = np.zeros((window_length,embedding_dims)) + k
tensor_values = np.arange(embedding_dims) // 2
div = 10000**(tensor_values/embedding_dims)
positional_embedding = positional_embedding/div
positional_embedding[:,0::2] = np.sin(positional_embedding[:,0::2])
positional_embedding[:,1::2] = np.cos(positional_embedding[:,1::2])
positional_embedding

array([[ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ],
       [ 0.84147098,  0.54030231,  0.15782664,  0.98746684,  0.02511622],
       [ 0.90929743, -0.41614684,  0.31169715,  0.9501815 ,  0.0502166 ],
       [ 0.14112001, -0.9899925 ,  0.45775455,  0.88907861,  0.07528529],
       [-0.7568025 , -0.65364362,  0.59233773,  0.80568978,  0.10030649]])

In [77]:
# a V trainable parameter component with dimensionality m and n
torch.manual_seed(1)

class TransformerModule(nn.Module):
    """ Transformer block: communication followed by computation """
    def __init__(self,m, n, positional_embedding):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        self.emb_dim = n
        self.sequence_length = m
        self.trainable_parameters = torch.nn.Parameter(torch.randn(self.sequence_length, self.emb_dim))
        self.token_embedding = torch.nn.Embedding(self.sequence_length,self.emb_dim)
        self.pe = torch.Tensor(positional_embedding)
        
    def forward(self, x):
        x = self.token_embedding(x)
        query = x + self.pe
        # x = query @ query.T
        x = query @ query.transpose(0,1)
        x /= self.emb_dim ** 0.5
        x =  F.softmax(x, dim=1)
        x = x @ self.trainable_parameters
        return x
    
torch.set_printoptions(precision=2, threshold=2)
model = TransformerModule(m, n, positional_embedding)
x = torch.tensor([1,2,3,3,2])
model(x)




tensor([[0.75, 0.11, 0.09, 0.04, 0.02],
        [0.11, 0.48, 0.02, 0.02, 0.38],
        [0.28, 0.06, 0.45, 0.20, 0.01],
        [0.22, 0.11, 0.38, 0.25, 0.05],
        [0.01, 0.16, 0.00, 0.00, 0.83]], grad_fn=<SoftmaxBackward0>)


tensor([[-1.05, -0.59, -0.66, -1.37,  0.09],
        [-0.45, -0.18, -1.36,  0.48,  0.54],
        [ 0.26,  0.06, -0.17, -1.12,  0.24],
        [ 0.19,  0.21, -0.24, -0.83,  0.41],
        [-0.15,  0.60, -1.42,  2.10,  0.10]], grad_fn=<MmBackward0>)