# <font color = 'blue'> Self Attention Encoder

# <font color = 'blue'> Import Libraries
As always, let's import all the required modules and set the random seeds for reproducability.

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

import torchtext 
from torchtext.datasets import Multi30k
from torchtext.vocab import vocab

import numpy as np
from collections import Counter, OrderedDict

import random

import pandas as pd
from pathlib import Path

In [3]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
torchtext.__version__, torch.__version__, torch.cuda.is_available(), spacy.__version__

('0.11.0', '1.10.0', True, '3.2.1')

# <font color = 'blue'>  Preparing the Data



In [5]:
data_folder = Path('/home/harpreet/Insync/google_drive_shaannoor/Data/NLP')
project_folder = Path('/home/harpreet/Insync/google_drive_harpreet/Research/NLP/pytorch-seq2seq')

In [6]:
torchtext.__version__, torch.__version__, torch.cuda.is_available(), spacy.__version__

('0.11.0', '1.10.0', True, '3.2.1')

We'll then create our tokenizers as before.

## <font color = 'blue'> Load tokenized data

In [7]:
df_train = pd.read_pickle(project_folder/'df_train_en_de.pickel')

In [8]:
df_train

Unnamed: 0,source_tokens,target_tokens,source_tokens_reverse
0,"[zwei, junge, weiße, männer, sind, i, m, freie...","[two, young, ,, white, males, are, outside, ne...","[., büsche, vieler, nähe, der, in, freien, m, ..."
1,"[mehrere, männer, mit, schutzhelmen, bedienen,...","[several, men, in, hard, hats, are, operating,...","[., antriebsradsystem, ein, bedienen, schutzhe..."
2,"[ein, kleines, mädchen, klettert, in, ein, spi...","[a, little, girl, climbing, into, a, wooden, p...","[., holz, aus, spielhaus, ein, in, klettert, m..."
3,"[ein, mann, in, einem, blauen, hemd, steht, au...","[a, man, in, a, blue, shirt, is, standing, on,...","[., fenster, ein, putzt, und, leiter, einer, a..."
4,"[zwei, männer, stehen, am, herd, und, bereiten...","[two, men, are, at, the, stove, preparing, foo...","[., zu, essen, bereiten, und, herd, am, stehen..."
...,...,...,...
28995,"[., wand, verschnörkelten, einer, hinter, schr...","[a, woman, behind, a, scrolled, wall, is, writ...","[eine, frau, schreibt, hinter, einer, verschnö..."
28996,"[., kletterwand, einer, an, übt, bergsteiger, ...","[a, rock, climber, practices, on, a, rock, cli...","[ein, bergsteiger, übt, an, einer, kletterwand..."
28997,"[., hauses, einem, vor, straße, einer, auf, ar...","[two, male, construction, workers, are, workin...","[zwei, bauarbeiter, arbeiten, auf, einer, stra..."
28998,"[., fassade, einer, vor, wagen, einem, mit, ju...","[an, elderly, man, sits, outside, a, storefron...","[ein, älterer, mann, sitzt, mit, einem, jungen..."


## <font color = 'blue'> Small subset of data

In [9]:
df_train_small= df_train[0:4]

In [10]:
df_train_small

Unnamed: 0,source_tokens,target_tokens,source_tokens_reverse
0,"[zwei, junge, weiße, männer, sind, i, m, freie...","[two, young, ,, white, males, are, outside, ne...","[., büsche, vieler, nähe, der, in, freien, m, ..."
1,"[mehrere, männer, mit, schutzhelmen, bedienen,...","[several, men, in, hard, hats, are, operating,...","[., antriebsradsystem, ein, bedienen, schutzhe..."
2,"[ein, kleines, mädchen, klettert, in, ein, spi...","[a, little, girl, climbing, into, a, wooden, p...","[., holz, aus, spielhaus, ein, in, klettert, m..."
3,"[ein, mann, in, einem, blauen, hemd, steht, au...","[a, man, in, a, blue, shirt, is, standing, on,...","[., fenster, ein, putzt, und, leiter, einer, a..."


In [11]:
df_train_small= df_train_small.drop(columns=['source_tokens_reverse'])

In [12]:
df_train_small

Unnamed: 0,source_tokens,target_tokens
0,"[zwei, junge, weiße, männer, sind, i, m, freie...","[two, young, ,, white, males, are, outside, ne..."
1,"[mehrere, männer, mit, schutzhelmen, bedienen,...","[several, men, in, hard, hats, are, operating,..."
2,"[ein, kleines, mädchen, klettert, in, ein, spi...","[a, little, girl, climbing, into, a, wooden, p..."
3,"[ein, mann, in, einem, blauen, hemd, steht, au...","[a, man, in, a, blue, shirt, is, standing, on,..."


## <font color = 'blue'> Build Vocab

In [13]:
def create_vocab(text, min_freq, specials):
    my_counter = Counter()
    for line in text:
       my_counter.update(line)
    my_vocab = vocab(my_counter, min_freq=min_freq)
    for i, special in enumerate(specials):
        my_vocab.insert_token(special, i)
    my_vocab.set_default_index(0)
    return my_vocab

Create source vocab, We will add four special tokens - ```['<unk>', '<BOS>', '<EOS>', '<PAD>']```

### <font color = 'blue'> Source Vocab

In [14]:
source_vocab = create_vocab(df_train_small['source_tokens'], 1, ['<unk>', '<BOS>', '<EOS>', '<PAD>'])

In [15]:
len(source_vocab)

41

In [16]:
pd.DataFrame(source_vocab.get_stoi().items(), columns=['tokens', 'index']).sort_values(by = ['index'])[0:10]

Unnamed: 0,tokens,index
28,<unk>,0
34,<BOS>,1
23,<EOS>,2
19,<PAD>,3
17,zwei,4
27,junge,5
14,weiße,6
12,männer,7
16,sind,8
7,i,9


In [17]:
# check index of unknown word - it should be zero
source_vocab['abracdabra']

0

### <font color = 'blue'> Target Vocab

In [18]:
target_vocab = create_vocab(df_train_small['target_tokens'], 1, ['<unk>', '<BOS>', '<EOS>', '<PAD>'])

In [19]:
len(target_vocab)

40

## <font color = 'blue'> Create Dataset and Dataloader

In [20]:
class EngGerman(Dataset):
    def __init__(self, X1, X2):
        self.X1 = X1
        self.X2 = X2
        
    def __len__(self):
        return len(self.X1)
    
    def __getitem__(self, indices):
        return (self.X1.iloc[indices] , self.X2.iloc[indices]) 

In [21]:
trainset = EngGerman(df_train_small['source_tokens'], df_train['target_tokens'])

In [22]:
trainset[0]

(['zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'i',
  'm',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.'],
 ['two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.'])

In [23]:
trainset[1]

(['mehrere',
  'männer',
  'mit',
  'schutzhelmen',
  'bedienen',
  'ein',
  'antriebsradsystem',
  '.'],
 ['several',
  'men',
  'in',
  'hard',
  'hats',
  'are',
  'operating',
  'a',
  'giant',
  'pulley',
  'system',
  '.'])

In [24]:
trainset[2]

(['ein',
  'kleines',
  'mädchen',
  'klettert',
  'in',
  'ein',
  'spielhaus',
  'aus',
  'holz',
  '.'],
 ['a', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse', '.'])

In [25]:
trainset[3]

(['ein',
  'mann',
  'in',
  'einem',
  'blauen',
  'hemd',
  'steht',
  'auf',
  'einer',
  'leiter',
  'und',
  'putzt',
  'ein',
  'fenster',
  '.'],
 ['a',
  'man',
  'in',
  'a',
  'blue',
  'shirt',
  'is',
  'standing',
  'on',
  'a',
  'ladder',
  'cleaning',
  'a',
  'window',
  '.'])

<font color = 'green'> **Function to replace words woth their index. Alaso add tokens BOS and EOS for beginning and end of sentences**

In [26]:
def text_transform (my_vocab, text):
     text_numerical = [my_vocab[token] for token in text]
     return torch.tensor([my_vocab['<BOS>']] + text_numerical + [my_vocab['<EOS>']])
     #return list(my_vocab['<BOS>']) + text_numerical + list(my_vocab['<EOS>'])

In [27]:
text = trainset[0][1]
print(text)
text_transform(target_vocab, text)

['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


tensor([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  2])

In [28]:
text = trainset[1][1]
print(text)
text_transform(target_vocab, text)

['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']


tensor([ 1, 15, 16, 17, 18, 19,  9, 20, 21, 22, 23, 24, 14,  2])

<font color = 'green'> Create a function that will be use by dataloaders to group obsevations. We will first use transform function to add eos and bos tokens and replace words with indexes. Finally we will add pad tokens for smaller sentences in a batch.

In [29]:
def collate_batch(batch):
   source_list, target_list = [], []
   for (source_text, target_text) in batch:
        source_transform = text_transform(source_vocab, source_text)
        source_list.append(source_transform)
        target_transform =text_transform(target_vocab, target_text)
        target_list.append(target_transform)
        
   source_pad = pad_sequence(source_list, padding_value=3.0, batch_first = True)
   target_pad = pad_sequence(target_list, padding_value=3.0, batch_first = True)
   #print(source_list)
   return (source_pad, target_pad)

In [30]:
batch_size = 2

train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collate_batch)

In [31]:
for i, (source, target) in enumerate(train_loader):
  print('batch number:' ,i)
  print(source)

batch number: 0
tensor([[ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,  2],
        [ 1, 18,  7, 19, 20, 21, 22, 23, 17,  2,  3,  3,  3,  3,  3,  3]])
batch number: 1
tensor([[ 1, 22, 30, 12, 31, 32, 33, 34, 35, 36, 37, 38, 39, 22, 40, 17,  2],
        [ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2,  3,  3,  3,  3,  3]])


In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
device

device(type='cuda')

In [33]:
src = source.clone()
print(src)
print(src.shape)

tensor([[ 1, 22, 30, 12, 31, 32, 33, 34, 35, 36, 37, 38, 39, 22, 40, 17,  2],
        [ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2,  3,  3,  3,  3,  3]])
torch.Size([2, 17])


In [34]:
trg= target.clone
print(src)
print(src.shape)

tensor([[ 1, 22, 30, 12, 31, 32, 33, 34, 35, 36, 37, 38, 39, 22, 40, 17,  2],
        [ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2,  3,  3,  3,  3,  3]])
torch.Size([2, 17])


# <font color = 'blue'> Building the Model

## <font color = 'blue'> Encoder

Similar to the ConvSeq2Seq model, the Transformer's encoder does not attempt to compress the entire source sentence, $X = (x_1, ... ,x_n)$, into a single context vector, $z$. Instead it produces a sequence of context vectors, $Z = (z_1, ... , z_n)$. So, if our input sequence was 5 tokens long we would have $Z = (z_1, z_2, z_3, z_4, z_5)$. Why do we call this a sequence of context vectors and not a sequence of hidden states? A hidden state at time $t$ in an RNN has only seen tokens $x_t$ and all the tokens before it. However, each context vector here has seen all tokens at all positions within the input sequence.

![](assets/transformer-encoder.png)

### <font color = 'blue'>  **Embeddings**

1. First, the tokens are passed through a standard embedding layer. 
2. the token embeddings are multiplied by a scaling factor which is $\sqrt{d_{model}}$, where $d_{model}$ is the hidden dimension size, `hid_dim`.This supposedly reduces variance in the embeddings and the model is difficult to train reliably without this scaling factor. 
3.Next, the position of the token within the sequence (starting with the first token, the `<sos>` (start of sequence) token, in position 0) is passed through another embedding layer called a *positional embedding layer*. This is a standard embedding layer where the input is not the token itself but teh position. FOr our small example, the position embedding has a "vocabulary" size of 20, which means our model can accept sentences up to 20 tokens long. This can be increased if we want to handle longer sentences.<font color = 'red'> **Checked the fixed static embeddings used in the original paper** </font>

4. Next, the token and positional embeddings are elementwise summed together to get a vector which contains information about the token and also its position with in the sequence.  <font color = 'red'>*Dropout is then applied to the combined embeddings.**>/font>

5. The combined embeddings are then passed through $N$ *encoder layers* to get $Z$, which is then used by the decoder. We will use 2 layers for our smal example.    
   

#### <font color = 'blue'>  Step 1 Token embedding

In [35]:
src.device

device(type='cpu')

In [36]:
src = src.to(device)
src.device

device(type='cuda', index=0)

In [37]:
input_dim = len(source_vocab)
hid_dim = 8
torch.manual_seed(0)
token_embedding_layer = nn.Embedding(input_dim, hid_dim)

In [38]:
token_embedding_layer.to(device)

Embedding(41, 8)

In [39]:
print(f'{token_embedding_layer.weight[0:5]}')

tensor([[-1.1258, -1.1524, -0.2506, -0.4339,  0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081,  0.1198,  1.2377,  1.1168, -0.2473],
        [-1.3527, -1.6959,  0.5667,  0.7935,  0.5988, -1.5551, -0.3414,  1.8530],
        [ 0.7502, -0.5855, -0.1734,  0.1835,  1.3894,  1.5863,  0.9463, -0.8437],
        [-0.6136,  0.0316, -0.4927,  0.2484,  0.4397,  0.1124,  0.6408,  0.4412]],
       device='cuda:0', grad_fn=<SliceBackward0>)


In [40]:
src[0][0]
# batch_size, src_len

tensor(1, device='cuda:0')

In [41]:
src_embedding = token_embedding_layer(src)

In [42]:
src_embedding.shape
# batch_size, seq_len, hid_dim

torch.Size([2, 17, 8])

In [43]:
src_embedding[0][0]

tensor([ 0.3223, -1.2633,  0.3500,  0.3081,  0.1198,  1.2377,  1.1168, -0.2473],
       device='cuda:0', grad_fn=<SelectBackward0>)

#### <font color = 'blue'>  Step 2 - scale output of embedding
<font color = 'red'> **NOT UNDERSTOOD**</font>

In [44]:
torch.var(src_embedding [0][0])

tensor(0.6142, device='cuda:0', grad_fn=<VarBackward0>)

In [45]:
scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
scale

tensor([2.8284], device='cuda:0')

In [46]:
src_embedding_scaled = src_embedding*scale

In [47]:
torch.var(src_embedding_scaled[0][0])

tensor(4.9136, device='cuda:0', grad_fn=<VarBackward0>)

#### <font color = 'blue'>  Step 3 Positional Embedding

In [48]:
max_length = 20
position_embedding_layer = nn.Embedding(max_length, hid_dim)

In [49]:
position_embedding_layer.to(device)

Embedding(20, 8)

In [50]:
position_embedding_layer.weight[0:5]

tensor([[-0.5627, -0.8328, -1.3955, -0.3993, -0.3099, -0.0561,  0.5174, -1.5962],
        [ 0.3570, -2.2975, -0.8711, -1.6740,  0.5631, -1.4351,  0.7194, -1.3707],
        [ 0.3221, -0.1016,  0.2060,  1.2168,  1.2359, -0.1002,  2.1364,  0.0700],
        [ 0.4990,  0.0565,  0.4061, -1.7384,  1.1901,  2.6352,  0.2284,  0.3241],
        [-1.1154,  2.1914,  0.1158,  0.7773, -1.0921, -0.0611, -1.4928, -1.7644]],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [51]:
batch_size = src_embedding_scaled.shape[0]
src_len = src_embedding_scaled.shape[1]

In [52]:
print(batch_size)
print(src_len)

2
17


In [53]:
position = torch.arange(0, src_len)
print(position)
print(position.shape)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16])
torch.Size([17])


In [54]:
position = position.unsqueeze(0)
print(position.shape)

torch.Size([1, 17])


In [55]:
position = position.repeat(batch_size,1)
position = position.to(device)
# [batch_size, seq_len]

In [56]:
position.shape

torch.Size([2, 17])

In [57]:
position

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]],
       device='cuda:0')

In [58]:
position_embedding = position_embedding_layer(position)

In [59]:
position_embedding.shape
# [ batch_size, seq_len, hid_dim]

torch.Size([2, 17, 8])

In [60]:
encoder_input = position_embedding + src_embedding

In [61]:
encoder_input[0][0]

tensor([-0.2404, -2.0961, -1.0456, -0.0912, -0.1901,  1.1816,  1.6342, -1.8435],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [62]:
encoder_input_dropout_layer = nn.Dropout(p=0.1)

In [63]:
encoder_input_after_dropout = encoder_input_dropout_layer(encoder_input)

In [64]:
encoder_input_after_dropout[0][0]
# [batch_size, seq_len, hid_dim]

tensor([-0.2672, -2.3290, -1.1617, -0.1013, -0.2112,  0.0000,  1.8158, -2.0484],
       device='cuda:0', grad_fn=<SelectBackward0>)

### <font color = 'blue'>  <font size =5> **Encoder Layer**

The encoder layers are where all of the "meat" of the encoder is contained. We first pass the source sentence and its mask into the *multi-head attention layer*, then perform dropout on it, apply a residual connection and pass it through a [Layer Normalization](https://arxiv.org/abs/1607.06450) layer. We then pass it through a *position-wise feedforward* layer and then, again, apply dropout, a residual connection and then layer normalization to get the output of this layer which is fed into the next layer. The parameters are not shared between layers. 

The mutli head attention layer is used by the encoder layer to attend to the source sentence, i.e. it is calculating and applying attention over itself instead of another sequence, hence we call it *self attention*.

[This](https://mlexplained.com/2018/01/13/weight-normalization-and-layer-normalization-explained-normalization-in-deep-learning-part-2/) article goes into more detail about layer normalization, but the gist is that it normalizes the values of the features, i.e. across the hidden dimension, so each feature has a mean of 0 and a standard deviation of 1. This allows neural networks with a larger number of layers, like the Transformer, to be trained easier.

#### <font color = 'blue'> **Sublayer - Attention**

**Mutli Head Attention Layer**

One of the key, novel concepts introduced by the Transformer paper is the *multi-head attention layer*. 

![](assets/transformer-attention.png)

Attention can be though of as *queries*, *keys* and *values* - where the query is used with the key to get an attention vector (usually the output of a *softmax* operation and has all values between 0 and 1 which sum to 1) which is then used to get a weighted sum of the values.

The Transformer uses *scaled dot-product attention*, where the query and key are combined by taking the dot product between them, then applying the softmax operation and scaling by $d_k$ before finally then multiplying by the value. $d_k$ is the *head dimension*, `head_dim`, which we will shortly explain further.

$$ \text{Attention}(Q, K, V) = \text{Softmax} \big( \frac{QK^T}{\sqrt{d_k}} \big)V $$ 

This is similar to standard *dot product attention* but is scaled by $d_k$, which the paper states is used to stop the results of the dot products growing large, causing gradients to become too small.

However, the scaled dot-product attention isn't simply applied to the queries, keys and values. Instead of doing a single attention application the queries, keys and values have their `hid_dim` split into $h$ *heads* and the scaled dot-product attention is calculated over all heads in parallel. This means instead of paying attention to one concept per attention application, we pay attention to $h$. We then re-combine the heads into their `hid_dim` shape, thus each `hid_dim` is potentially paying attention to $h$ different concepts.

$$ \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1,...,\text{head}_h)W^O $$

$$\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) $$

$W^O$ is the linear layer applied at the end of the multi-head attention layer, `fc`. $W^Q, W^K, W^V$ are the linear layers `fc_q`, `fc_k` and `fc_v`.

Walking through the module, first we calculate $QW^Q$, $KW^K$ and $VW^V$ with the linear layers, `fc_q`, `fc_k` and `fc_v`, to give us `Q`, `K` and `V`. Next, we split the `hid_dim` of the query, key and value into `n_heads` using `.view` and correctly permute them so they can be multiplied together. We then calculate the `energy` (the un-normalized attention) by multiplying `Q` and `K` together and scaling it by the square root of `head_dim`, which is calulated as `hid_dim // n_heads`. We then mask the energy so we do not pay attention over any elements of the sequeuence we shouldn't, then apply the softmax and dropout. We then apply the attention to the value heads, `V`, before combining the `n_heads` together. Finally, we multiply this $W^O$, represented by `fc_o`. 

##### <font color = 'blue'>**Linear Transformation of embeddings to generate Queries, Keys and values**</font>

In [65]:
out_hid_dim = 8
hid_dim = 8
fc_q = nn.Linear(hid_dim, out_hid_dim).to(device)
fc_k = nn.Linear(hid_dim, out_hid_dim).to(device)
fc_v = nn.Linear(hid_dim, out_hid_dim).to(device)
fc_o = nn.Linear(out_hid_dim, hid_dim).to(device)

In [66]:
fc_q.weight.shape

torch.Size([8, 8])

In [67]:
fc_q.bias.shape

torch.Size([8])

##### <font color = 'blue'> **Generate queries, keys and values**

In [68]:
Q = fc_q(encoder_input_after_dropout)
K = fc_k(encoder_input_after_dropout)
V = fc_v(encoder_input_after_dropout)

In [69]:
Q.shape
#[batch_size, query_len, hid_dim]

torch.Size([2, 17, 8])

In [70]:
n_heads = 2
head_dim = out_hid_dim // n_heads
print(head_dim)

4


In [71]:
assert out_hid_dim % n_heads == 0

In [72]:
Q = Q.view(batch_size, -1, n_heads, head_dim)
K = K.view(batch_size, -1, n_heads, head_dim)
V = V.view(batch_size, -1, n_heads, head_dim)

In [73]:
Q.shape
#[batch_size, query_len, n_heads, head_dim]

torch.Size([2, 17, 2, 4])

In [74]:
Q = Q.permute(0, 2, 1, 3)
K = K.permute(0, 2, 1, 3)
V = V.permute(0, 2, 1, 3)

In [75]:
Q.shape
#[batch_size,num_heads, query_len, head_dim ]

torch.Size([2, 2, 17, 4])

In [76]:
K.shape
#[batch_size,num_heads, key_len, head_dim ]

torch.Size([2, 2, 17, 4])

##### <font color = 'blue'> **Scaled dot product of Queries and Keys**</font>

In [77]:
scale = torch.sqrt(torch.FloatTensor([head_dim])).to(device)

In [78]:
energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / scale

In [79]:
energy.shape
#[batch_size, num_heads, query_len, key_len]

torch.Size([2, 2, 17, 17])

##### <font color = 'blue'> **Apply mask to output of Q, K dot product**</font><br>
<font color = 'green'>**We do not want tokens to pay attention to pad tokens**</font>

In [80]:
mask_input = source.clone()
mask_input

tensor([[ 1, 22, 30, 12, 31, 32, 33, 34, 35, 36, 37, 38, 39, 22, 40, 17,  2],
        [ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2,  3,  3,  3,  3,  3]])

In [81]:
SRC_PAD_IDX = source_vocab['<PAD>']
SRC_PAD_IDX

3

In [82]:
src_mask = (mask_input!= SRC_PAD_IDX )
src_mask

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True, False, False, False, False, False]])

In [83]:
src_mask.shape

torch.Size([2, 17])

In [84]:
src_mask = src_mask.unsqueeze(1).unsqueeze(2)

In [85]:
src_mask.shape

torch.Size([2, 1, 1, 17])

In [86]:
src_mask = src_mask.to(device)

In [87]:
energy_masked = energy.masked_fill(src_mask == 0, -1e10)

<font color = 'red'>**Print energy values for second sentence**</font>

In [92]:
print(energy_masked[1,1,1,15].data, energy_masked[1,1,1,16].data)
      
#[batch_size, num_heads, query_len, key_len]

tensor(-1.0000e+10, device='cuda:0') tensor(-1.0000e+10, device='cuda:0')


In [93]:
print(energy_masked[1,0,5,15].data, energy_masked[1,0,5,16].data)
      
#[batch_size, num_heads, query_len, key_len]

tensor(-1.0000e+10, device='cuda:0') tensor(-1.0000e+10, device='cuda:0')


<font size = 4, color = 'red'> **NOT UNDERSTOOD**</font><br>
<font color = 'green'> **We are not ignoring pad tokens completely - we are ignoring pad tokens in keys and not queries**

In [94]:
print(energy_masked[1,0,15,1].data, energy_masked[1,0,16,1].data)      
#[batch_size, num_heads, query_len, key_len]

tensor(0.0861, device='cuda:0') tensor(-0.9556, device='cuda:0')


<font color = 'red'>**Print energy values for first sentence**</font>

In [95]:
print(energy_masked[0,0,5,15].data, energy_masked[0,1,5,16].data, 
      energy_masked[0,0,15,5].data, energy_masked[0,1,16,5].data)
#[batch_size, num_heads, query_len, key_len]

tensor(-0.7212, device='cuda:0') tensor(-0.6631, device='cuda:0') tensor(0.2438, device='cuda:0') tensor(2.6890, device='cuda:0')


##### <font color = 'blue'>**Apply softmax to convert QV dot product to probabilities**</font><br>

In [96]:
attention_prob = torch.softmax(energy_masked, dim = -1)                 
#attention_prob = [batch size, n heads, query len, key len]

In [97]:
print(attention_prob [1,1,1,15].data, attention_prob [1,1,1,16].data)
#[batch_size, num_heads, query_len, key_len]

tensor(0., device='cuda:0') tensor(0., device='cuda:0')


In [99]:
print(attention_prob[1, 0,1, :])

tensor([0.0125, 0.0131, 0.2209, 0.1472, 0.0973, 0.0100, 0.0705, 0.0569, 0.0548,
        0.0793, 0.1370, 0.1005, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SliceBackward0>)


In [100]:
attention_prob[0, 0,1, :].sum()

tensor(1., device='cuda:0', grad_fn=<SumBackward0>)

##### <font color = 'blue'>**Apply dropout layer to attention probabilities**</font><br>
<font color = 'red'>**NOT UNDERSTOOD- why apply dropout here (probs will not sum to 1)**</font><br>
<font color = 'green'>**Quotes from paper --We apply dropout [33] to the output of each sub-layer, before it is added to the sub-layer input and normalized.**</font>


In [101]:
att_enc_dropout =  nn.Dropout(p=0.0)

In [102]:
attention_prob_after_dropout = att_enc_dropout(attention_prob)

In [103]:
print(attention_prob_after_dropout [0, 0,1, :])

tensor([0.0069, 0.0031, 0.0177, 0.0442, 0.2607, 0.0039, 0.2280, 0.0592, 0.0332,
        0.0391, 0.0719, 0.0232, 0.1246, 0.0076, 0.0227, 0.0091, 0.0450],
       device='cuda:0', grad_fn=<SliceBackward0>)


In [104]:
attention_prob_after_dropout[0, 0,1, :].sum()

tensor(1., device='cuda:0', grad_fn=<SumBackward0>)

<font size = 3, color = 'red'>**Probs do not sum to 1, sometimes these are greater than one and sometimes these are less than one**</font>

##### <font size = 4, color = 'blue'>**Encoder Self Attention Output**</font><br>
<font size = 3, color = 'green'>**Final vectors are wighted sum of values. This gives us the final embeddings afer considering the context words. These represent the contextualized embeddings for the tokens**</font><br>

In [105]:
V.shape
# [batch_size, num_heads, value_len, head_dim]

torch.Size([2, 2, 17, 4])

In [106]:
attention_prob_after_dropout.shape
#[batch_size, num_heads, query_len, key_len]

torch.Size([2, 2, 17, 17])

<font size = 3, color = 'green'>**NOTE: key_len will be same as value_len**.<br>
- Query comes from focal word (sentence), keys and values are from context.
- In self attention both focal words and context are based on same sentence and hence same length.
- In encoder-decoder attention of machine translation - focal word is target sentence and context word comes from source language. Hence queries are generated from target language. Whereas keys and values are generated from source language. We are trying to find which focal word in target language should pay attention to which words in source language.

<font size = 3, color = 'red'>**Not Understood** - Since keys and values both capture context, why can we not use same matrix for Values and Keys i.e fc_k = fc_v. This is exacyly what we did in seq2seq paper with attention (without self attention). The source vectors were used both as keys and values.

In [107]:
# We can do this batch multiplication of the matrices of shape 
# [query_len, key_len] and [value_len, head_dim] as key_len = value_len
encoder_contextulaized_embeddings = torch.matmul(attention_prob_after_dropout, V)
#[batch_size, num_heads, query_len, head_dim]

In [108]:
encoder_contextulaized_embeddings.shape
# [batch_size, number_of_heads, query_len, head_dim]

torch.Size([2, 2, 17, 4])

In [109]:
encoder_contextulaized_embeddings = encoder_contextulaized_embeddings.permute(0, 2, 1, 3)
# [batch_size, query_len, number_of_heads, head_dim]

In [110]:
encoder_contextulaized_embeddings.shape

torch.Size([2, 17, 2, 4])

In [111]:
encoder_contextulaized_embeddings = encoder_contextulaized_embeddings.view(batch_size, -1, out_hid_dim)

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

We cannot use .view because the tensor is no londer contiguous. We can use reshape which will create a copy and make a contiguous tensor.

In [112]:
encoder_contextulaized_embeddings = encoder_contextulaized_embeddings.reshape(batch_size, -1, out_hid_dim)

In [113]:
encoder_contextulaized_embeddings.shape
#[batch_size, seq_len, out_hid_dim]

torch.Size([2, 17, 8])

<font size = 3, color = 'green'>We need to project the final values to have same shape as  input embedding. To accomplish this we will use fc_o linear layer we created earlier.

In [114]:
encoder_contextulaized_embeddings = fc_o(encoder_contextulaized_embeddings)

In [115]:
encoder_contextulaized_embeddings.shape
#[batch_size, seq_len, hid_dim]

torch.Size([2, 17, 8])

In [116]:
encoder_contextulaized_embeddings[0,0,:]

tensor([-0.3837, -0.2911,  0.0199, -0.3197, -0.1372, -0.0581,  0.0435, -0.0631],
       device='cuda:0', grad_fn=<SliceBackward0>)

#### <font color ='blue'>**Implementing MultiHeadAttention using Pytorch Layer**
Limitation: hid_dim == output_hid_dim.
    

In [117]:
multihead_attnetion_layer = torch.nn.MultiheadAttention(embed_dim=hid_dim, num_heads=n_heads, dropout=0.0, 
                                                        bias=True, add_bias_kv=False, 
                                                        add_zero_attn=False, kdim=None, 
                                                        vdim=None, batch_first=True, 
                                                        device=device, dtype=None)

In [118]:
for name, parameter in multihead_attnetion_layer.named_parameters():
    print(name, parameter.data.shape)

in_proj_weight torch.Size([24, 8])
in_proj_bias torch.Size([24])
out_proj.weight torch.Size([8, 8])
out_proj.bias torch.Size([8])


<font size = 3, color = 'green'>It combines Q, Kand V into one metrics. To compare the results, we need to make sure that initial merices are the same

In [119]:
multihead_attnetion_layer.in_proj_weight.data = torch.concat((fc_q.weight.data,fc_k.weight.data,
                                                             fc_v.weight.data))

In [120]:
multihead_attnetion_layer.in_proj_weight.data.shape

torch.Size([24, 8])

In [121]:
multihead_attnetion_layer.in_proj_bias.data = torch.concat((fc_q.bias.data,fc_k.bias.data,
                                                             fc_v.bias.data))

In [122]:
multihead_attnetion_layer.in_proj_bias.shape

torch.Size([24])

In [123]:
multihead_attnetion_layer.out_proj.weight.data = fc_o.weight.data

In [124]:
multihead_attnetion_layer.out_proj.bias.data = fc_o.bias.data

In [125]:
src_mask = (mask_input== SRC_PAD_IDX )
src_mask=src_mask.to(device)

In [126]:
src_mask.shape

torch.Size([2, 17])

In [127]:
src_mask

tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False, False, False,
         False, False,  True,  True,  True,  True,  True]], device='cuda:0')

In [128]:
output, attn_prob =multihead_attnetion_layer (query=encoder_input_after_dropout, 
                                              key= encoder_input_after_dropout,
                                              value= encoder_input_after_dropout, 
                                              key_padding_mask=src_mask, 
                                              need_weights=True, 
                                              attn_mask=None)

In [129]:
output[1,12,:]

tensor([-0.1993, -0.2112,  0.1667, -0.5761, -0.4725, -0.0408,  0.1955,  0.1250],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [130]:
encoder_contextulaized_embeddings[1,12,:]

tensor([-0.1993, -0.2112,  0.1667, -0.5761, -0.4725, -0.0408,  0.1955,  0.1250],
       device='cuda:0', grad_fn=<SliceBackward0>)