# <font color = 'blue'> 6 - Attention Encoder-Decoder

# <font color = 'blue'> Import Libraries
As always, let's import all the required modules and set the random seeds for reproducability.

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

import torchtext 
from torchtext.datasets import Multi30k
from torchtext.vocab import vocab


import numpy as np
from collections import Counter, OrderedDict

import random
import pandas as pd
from pathlib import Path

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
torchtext.__version__, torch.__version__, torch.cuda.is_available()

('0.11.0', '1.10.0', True)

# <font color = 'blue'>  Preparing the Data



In [6]:
data_folder = Path('/home/harpreet/Insync/google_drive_shaannoor/Data/NLP')
project_folder = Path('/home/harpreet/Insync/google_drive_harpreet/Research/NLP/pytorch-seq2seq')

We'll then create our tokenizers as before.

## <font color = 'blue'> Load tokenized data

In [7]:
df_train = pd.read_pickle(project_folder/'df_train_en_de.pickel')

In [8]:
df_train

Unnamed: 0,source_tokens,target_tokens,source_tokens_reverse
0,"[zwei, junge, weiße, männer, sind, i, m, freie...","[two, young, ,, white, males, are, outside, ne...","[., büsche, vieler, nähe, der, in, freien, m, ..."
1,"[mehrere, männer, mit, schutzhelmen, bedienen,...","[several, men, in, hard, hats, are, operating,...","[., antriebsradsystem, ein, bedienen, schutzhe..."
2,"[ein, kleines, mädchen, klettert, in, ein, spi...","[a, little, girl, climbing, into, a, wooden, p...","[., holz, aus, spielhaus, ein, in, klettert, m..."
3,"[ein, mann, in, einem, blauen, hemd, steht, au...","[a, man, in, a, blue, shirt, is, standing, on,...","[., fenster, ein, putzt, und, leiter, einer, a..."
4,"[zwei, männer, stehen, am, herd, und, bereiten...","[two, men, are, at, the, stove, preparing, foo...","[., zu, essen, bereiten, und, herd, am, stehen..."
...,...,...,...
28995,"[., wand, verschnörkelten, einer, hinter, schr...","[a, woman, behind, a, scrolled, wall, is, writ...","[eine, frau, schreibt, hinter, einer, verschnö..."
28996,"[., kletterwand, einer, an, übt, bergsteiger, ...","[a, rock, climber, practices, on, a, rock, cli...","[ein, bergsteiger, übt, an, einer, kletterwand..."
28997,"[., hauses, einem, vor, straße, einer, auf, ar...","[two, male, construction, workers, are, workin...","[zwei, bauarbeiter, arbeiten, auf, einer, stra..."
28998,"[., fassade, einer, vor, wagen, einem, mit, ju...","[an, elderly, man, sits, outside, a, storefron...","[ein, älterer, mann, sitzt, mit, einem, jungen..."


## <font color = 'blue'> Small subset of data

In [9]:
df_train_small= df_train[0:4]

In [10]:
df_train_small

Unnamed: 0,source_tokens,target_tokens,source_tokens_reverse
0,"[zwei, junge, weiße, männer, sind, i, m, freie...","[two, young, ,, white, males, are, outside, ne...","[., büsche, vieler, nähe, der, in, freien, m, ..."
1,"[mehrere, männer, mit, schutzhelmen, bedienen,...","[several, men, in, hard, hats, are, operating,...","[., antriebsradsystem, ein, bedienen, schutzhe..."
2,"[ein, kleines, mädchen, klettert, in, ein, spi...","[a, little, girl, climbing, into, a, wooden, p...","[., holz, aus, spielhaus, ein, in, klettert, m..."
3,"[ein, mann, in, einem, blauen, hemd, steht, au...","[a, man, in, a, blue, shirt, is, standing, on,...","[., fenster, ein, putzt, und, leiter, einer, a..."


In [11]:
df_train_small= df_train_small.drop(columns=['source_tokens_reverse'])

In [12]:
df_train_small

Unnamed: 0,source_tokens,target_tokens
0,"[zwei, junge, weiße, männer, sind, i, m, freie...","[two, young, ,, white, males, are, outside, ne..."
1,"[mehrere, männer, mit, schutzhelmen, bedienen,...","[several, men, in, hard, hats, are, operating,..."
2,"[ein, kleines, mädchen, klettert, in, ein, spi...","[a, little, girl, climbing, into, a, wooden, p..."
3,"[ein, mann, in, einem, blauen, hemd, steht, au...","[a, man, in, a, blue, shirt, is, standing, on,..."


## <font color = 'blue'> Build Vocab

In [13]:
def create_vocab(text, min_freq, specials):
    my_counter = Counter()
    for line in text:
       my_counter.update(line)
    my_vocab = vocab(my_counter, min_freq=min_freq)
    for i, special in enumerate(specials):
        my_vocab.insert_token(special, i)
    my_vocab.set_default_index(0)
    return my_vocab

Create source vocab, We will add four special tokens - ```['<unk>', '<BOS>', '<EOS>', '<PAD>']```

### <font color = 'blue'> Source Vocab

In [14]:
source_vocab = create_vocab(df_train_small['source_tokens'], 1, ['<unk>', '<BOS>', '<EOS>', '<PAD>'])

In [15]:
len(source_vocab)

41

In [16]:
pd.DataFrame(source_vocab.get_stoi().items(), columns=['tokens', 'index']).sort_values(by = ['index'])[0:10]

Unnamed: 0,tokens,index
28,<unk>,0
34,<BOS>,1
23,<EOS>,2
19,<PAD>,3
17,zwei,4
27,junge,5
14,weiße,6
12,männer,7
16,sind,8
7,i,9


In [17]:
# check index of unknown word - it should be zero
source_vocab['abracdabra']

0

### <font color = 'blue'> Target Vocab

In [18]:
target_vocab = create_vocab(df_train_small['target_tokens'], 1, ['<unk>', '<BOS>', '<EOS>', '<PAD>'])

In [19]:
len(target_vocab)

40

## <font color = 'blue'> Create Dataset and Dataloader

In [20]:
class EngGerman(Dataset):
    def __init__(self, X1, X2):
        self.X1 = X1
        self.X2 = X2
        
    def __len__(self):
        return len(self.X1)
    
    def __getitem__(self, indices):
        return (self.X1.iloc[indices] , self.X2.iloc[indices]) 

In [21]:
trainset = EngGerman(df_train_small['source_tokens'], df_train['target_tokens'])

In [22]:
trainset[0]

(['zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'i',
  'm',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.'],
 ['two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.'])

In [23]:
trainset[1]

(['mehrere',
  'männer',
  'mit',
  'schutzhelmen',
  'bedienen',
  'ein',
  'antriebsradsystem',
  '.'],
 ['several',
  'men',
  'in',
  'hard',
  'hats',
  'are',
  'operating',
  'a',
  'giant',
  'pulley',
  'system',
  '.'])

In [24]:
trainset[2]

(['ein',
  'kleines',
  'mädchen',
  'klettert',
  'in',
  'ein',
  'spielhaus',
  'aus',
  'holz',
  '.'],
 ['a', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse', '.'])

In [25]:
trainset[3]

(['ein',
  'mann',
  'in',
  'einem',
  'blauen',
  'hemd',
  'steht',
  'auf',
  'einer',
  'leiter',
  'und',
  'putzt',
  'ein',
  'fenster',
  '.'],
 ['a',
  'man',
  'in',
  'a',
  'blue',
  'shirt',
  'is',
  'standing',
  'on',
  'a',
  'ladder',
  'cleaning',
  'a',
  'window',
  '.'])

<font color = 'green'> **Function to replace words woth their index. Alaso add tokens BOS and EOS for beginning and end of sentences**

In [26]:
def text_transform (my_vocab, text):
     text_numerical = [my_vocab[token] for token in text]
     return torch.tensor([my_vocab['<BOS>']] + text_numerical + [my_vocab['<EOS>']])
     #return list(my_vocab['<BOS>']) + text_numerical + list(my_vocab['<EOS>'])

In [27]:
text = trainset[0][1]
print(text)
text_transform(target_vocab, text)

['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


tensor([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  2])

In [28]:
text = trainset[1][1]
print(text)
text_transform(target_vocab, text)

['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']


tensor([ 1, 15, 16, 17, 18, 19,  9, 20, 21, 22, 23, 24, 14,  2])

<font color = 'green'> Create a function that will be use by dataloaders to group obsevations. We will first use transform function to add eos and bos tokens and replace words with indexes. Finally we will add pad tokens for smaller sentences in a batch.

In [29]:
def collate_batch(batch):
   source_list, target_list = [], []
   for (source_text, target_text) in batch:
        source_transform = text_transform(source_vocab, source_text)
        source_list.append(source_transform)
        target_transform =text_transform(target_vocab, target_text)
        target_list.append(target_transform)
        
   source_pad = pad_sequence(source_list, padding_value=3.0, batch_first = True)
   target_pad = pad_sequence(target_list, padding_value=3.0, batch_first = True)
   #print(source_list)
   return (source_pad, target_pad)

In [30]:
batch_size = 2

train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collate_batch)

In [31]:
torch.manual_seed(456)
for i, (source, target) in enumerate(train_loader):
   
  print('batch number:' ,i)
  print('source')  
  print(source)
  print('target')  
  print(target)

batch number: 0
source
tensor([[ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,  2,  3],
        [ 1, 22, 30, 12, 31, 32, 33, 34, 35, 36, 37, 38, 39, 22, 40, 17,  2]])
target
tensor([[ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  2,  3,  3,  3,  3],
        [ 1, 21, 31, 17, 21, 32, 33, 34, 35, 36, 21, 37, 38, 21, 39, 14,  2]])
batch number: 1
source
tensor([[ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2],
        [ 1, 18,  7, 19, 20, 21, 22, 23, 17,  2,  3,  3]])
target
tensor([[ 1, 21, 25, 26, 27, 28, 21, 29, 30, 14,  2,  3,  3,  3],
        [ 1, 15, 16, 17, 18, 19,  9, 20, 21, 22, 23, 24, 14,  2]])


In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
device

device(type='cuda')

In [33]:
src = source.clone()
print(src)
print(src.shape)

tensor([[ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2],
        [ 1, 18,  7, 19, 20, 21, 22, 23, 17,  2,  3,  3]])
torch.Size([2, 12])


In [34]:
trg= target.clone
print(src)
print(src.shape)

tensor([[ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2],
        [ 1, 18,  7, 19, 20, 21, 22, 23, 17,  2,  3,  3]])
torch.Size([2, 12])


# <font color = 'blue'>  **Embeddings**

In [35]:
src = source.clone().to(device)
trg = target.clone().to(device)

In [36]:
src

tensor([[ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2],
        [ 1, 18,  7, 19, 20, 21, 22, 23, 17,  2,  3,  3]], device='cuda:0')

In [37]:
trg

tensor([[ 1, 21, 25, 26, 27, 28, 21, 29, 30, 14,  2,  3,  3,  3],
        [ 1, 15, 16, 17, 18, 19,  9, 20, 21, 22, 23, 24, 14,  2]],
       device='cuda:0')

In [39]:
hid_dim = 8
torch.manual_seed(0)
src_token_embedding_layer = nn.Embedding(len(source_vocab), hid_dim).to(device)
trg_token_embedding_layer = nn.Embedding(len(target_vocab), hid_dim).to(device)

In [40]:
src_embedding = src_token_embedding_layer(src)
trg_embedding = trg_token_embedding_layer(trg)

In [41]:
scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

In [42]:
src_embedding_scaled = src_embedding * scale
trg_embedding_scaled = trg_embedding * scale

In [43]:
print(src_embedding_scaled.shape)
trg_embedding_scaled.shape
#[Batch_size, seq_len, hid_dim]

torch.Size([2, 12, 8])


torch.Size([2, 14, 8])

In [44]:
src_position = torch.arange(src.shape[1]).view(1,-1).repeat(src.shape[0], 1).to(device)
trg_position = torch.arange(trg.shape[1]).view(1,-1).repeat(trg.shape[0], 1).to(device)

In [45]:
print(src_position)
print(trg_position)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]], device='cuda:0')
tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]],
       device='cuda:0')


In [46]:
max_length = 20
torch.manual_seed(0)
src_position_embedding_layer = nn.Embedding(max_length, hid_dim).to(device)
trg_position_embedding_layer = nn.Embedding(max_length, hid_dim).to(device)

In [47]:
src_position_embedding = src_position_embedding_layer(src_position)
trg_position_embedding = src_position_embedding_layer(trg_position)

In [48]:
print(src_position_embedding.shape)
print(trg_position_embedding.shape)
#[Batch_size, seq_len, hid_dim]

torch.Size([2, 12, 8])
torch.Size([2, 14, 8])


In [49]:
torch.manual_seed(0)
encoder_input_dropout = nn.Dropout(p=0.1)
decoder_input_dropout = nn.Dropout(p=0.1)

In [50]:
encoder_input_after_dropout = encoder_input_dropout(src_position_embedding + src_embedding_scaled)
decoder_input_after_dropout = decoder_input_dropout(trg_position_embedding + trg_embedding_scaled)

In [51]:
encoder_input_after_dropout.shape

torch.Size([2, 12, 8])

In [52]:
decoder_input_after_dropout.shape

torch.Size([2, 14, 8])

# <font color = 'blue'> Encoder-Decoder Attention

<font color = 'green'> We will use the encoder and decoder input embeddings for encoder decoder attention. In teh complete model encoder output and decoder self attention output will be teh inputs to encoder-decoder attention

## <font color = 'blue'> Encoder-Decoder attention using nn.MultiheadAttention

<font color = 'green'> Now we will use ```torch.nn.MultiheadAttention``` from pytorch. The limittaions is that the hid_dim = out_hid_dim. Threfore hid_dim must be divisible by num_heads. In encoder hid_dim = 8, out_hid_dim =9 and num_heads =3. To keep things simpler for decoder, we will use hid_dim = out_hid_dim = 8. We will use num_heads = 2.

In [53]:
enc_dec_attnetion_layer = torch.nn.MultiheadAttention(embed_dim=hid_dim, num_heads=2, 
                                                        dropout=0.0, 
                                                        bias=True, add_bias_kv=False, 
                                                        add_zero_attn=False, kdim=None, 
                                                        vdim=None, batch_first=True, 
                                                        device=device, dtype=None)

In [55]:
for name, param in enc_dec_attnetion_layer.named_parameters():
    print(name, param)

in_proj_weight Parameter containing:
tensor([[ 0.3763,  0.2778, -0.4188, -0.2694,  0.3997, -0.3953, -0.0130,  0.2440],
        [-0.0283, -0.1411, -0.3400,  0.3003, -0.2023, -0.0332, -0.2203,  0.1427],
        [ 0.3561, -0.2301,  0.2985, -0.2054, -0.4158, -0.2433, -0.1752, -0.0005],
        [ 0.2562, -0.1277,  0.1224,  0.3388,  0.3865, -0.1923, -0.1799, -0.2100],
        [ 0.0457,  0.4024,  0.0755,  0.0069,  0.0560, -0.3103, -0.3071,  0.2443],
        [ 0.2494,  0.2909, -0.3317, -0.2078,  0.1968, -0.2872,  0.1729, -0.1140],
        [ 0.0898, -0.2116, -0.1195,  0.0113, -0.3267, -0.2988, -0.2241,  0.2493],
        [ 0.0304, -0.0666,  0.0263, -0.0798, -0.3771, -0.0504, -0.3882,  0.3941],
        [-0.2509,  0.3624, -0.4034, -0.2466, -0.4142,  0.3088, -0.0350,  0.1387],
        [-0.4063, -0.2667, -0.3524,  0.0971,  0.2161,  0.0450, -0.4052,  0.3519],
        [-0.3766,  0.3414,  0.0444,  0.2520,  0.1527, -0.3914,  0.2546, -0.2493],
        [-0.2519, -0.2988, -0.2055,  0.3408,  0.1452, -0.2501

<font size = 3, color = 'green'>**As we can see that this layers has two weight metrices. It combines $W^q, W^k and W^v$ into one matrix in_proj and $W^o$ is out_proj matrix.**

In [56]:
for name, param in enc_dec_attnetion_layer.named_parameters():
    print(name, param.shape)

in_proj_weight torch.Size([24, 8])
in_proj_bias torch.Size([24])
out_proj.weight torch.Size([8, 8])
out_proj.bias torch.Size([8])


In [57]:
src

tensor([[ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2],
        [ 1, 18,  7, 19, 20, 21, 22, 23, 17,  2,  3,  3]], device='cuda:0')

In [58]:
SRC_PAD_IDX = target_vocab['<PAD>']
SRC_PAD_IDX

3

<font color = 'green'> The layer apply mask where mask values are True. In manual implementation we applied mask where values are False. Hence now we will use ```trg_mask = (trg==TRG_PAD_IDX)``` instead of ```trg_mask = (trg!=TRG_PAD_IDX)```

In [59]:
src_pad_mask = (src==SRC_PAD_IDX)


In [60]:
src_pad_mask.shape

torch.Size([2, 12])

In [61]:
decoder_contextulaized_embeddings, decoder_self_attention_probs = enc_dec_attnetion_layer(
                                                     query=decoder_input_after_dropout, 
                                                     key= encoder_input_after_dropout,
                                                     value= encoder_input_after_dropout, 
                                                     key_padding_mask=src_pad_mask, 
                                                     need_weights=True, 
                                                     attn_mask=None)

In [62]:
decoder_contextulaized_embeddings.shape

torch.Size([2, 14, 8])

In [70]:
decoder_contextulaized_embeddings[1, 0, :]
#[batch_size, query_len, hid_dim]

tensor([ 1.1555, -4.0577, -5.0036,  2.1264,  2.8235, -0.7168,  2.4416, -1.2822],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [68]:
decoder_self_attention_probs.shape
#[batch_size, query_len, key_len]

torch.Size([2, 14, 12])

In [71]:
decoder_self_attention_probs[1,1,:]

tensor([1.3277e-02, 3.1367e-04, 1.3624e-04, 1.8799e-01, 1.6538e-05, 2.9840e-01,
        6.8699e-05, 3.1859e-03, 2.8897e-05, 4.9659e-01, 0.0000e+00, 0.0000e+00],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [72]:
decoder_self_attention_probs[1,0,:]

tensor([1.2413e-04, 1.1850e-04, 5.3116e-01, 1.3379e-03, 4.7481e-03, 7.4040e-02,
        6.1360e-02, 3.6862e-03, 3.2327e-01, 1.5939e-04, 0.0000e+00, 0.0000e+00],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [73]:
decoder_self_attention_probs[1,13,:]

tensor([6.9798e-03, 1.8109e-03, 4.8646e-01, 1.3657e-01, 2.4348e-01, 2.1808e-05,
        8.8134e-02, 2.6209e-03, 1.8633e-03, 3.2055e-02, 0.0000e+00, 0.0000e+00],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [74]:
decoder_self_attention_probs[0,13,:]

tensor([9.5464e-03, 1.2221e-03, 6.9953e-04, 1.1335e-02, 4.8589e-02, 4.3403e-01,
        1.2982e-04, 1.1177e-04, 8.8061e-06, 2.8275e-04, 6.0540e-04, 4.9344e-01],
       device='cuda:0', grad_fn=<SliceBackward0>)

### <font color = 'blue'> Method 1 - Self Attention- Without Using nn.MultiheadAttention

#### <font color = 'blue'> Manually initialize Weights and biases 

<font color = 'green'> To compare results, we will use the weights and biases from nn.MultiHeadAttention layer we used earlier.

In [75]:
out_hid_dim = 8
hid_dim = 8
fc_q = nn.Linear(hid_dim, out_hid_dim).to(device)
fc_k = nn.Linear(hid_dim, out_hid_dim).to(device)
fc_v = nn.Linear(hid_dim, out_hid_dim).to(device)
fc_o = nn.Linear(out_hid_dim, hid_dim).to(device)

In [77]:
fc_q.weight.data = enc_dec_attnetion_layer.in_proj_weight.data[0:8,:]
fc_q.bias.data = enc_dec_attnetion_layer.in_proj_bias.data[0:8]
print(fc_q.weight.data)
print(fc_q.bias.data)

tensor([[ 0.3763,  0.2778, -0.4188, -0.2694,  0.3997, -0.3953, -0.0130,  0.2440],
        [-0.0283, -0.1411, -0.3400,  0.3003, -0.2023, -0.0332, -0.2203,  0.1427],
        [ 0.3561, -0.2301,  0.2985, -0.2054, -0.4158, -0.2433, -0.1752, -0.0005],
        [ 0.2562, -0.1277,  0.1224,  0.3388,  0.3865, -0.1923, -0.1799, -0.2100],
        [ 0.0457,  0.4024,  0.0755,  0.0069,  0.0560, -0.3103, -0.3071,  0.2443],
        [ 0.2494,  0.2909, -0.3317, -0.2078,  0.1968, -0.2872,  0.1729, -0.1140],
        [ 0.0898, -0.2116, -0.1195,  0.0113, -0.3267, -0.2988, -0.2241,  0.2493],
        [ 0.0304, -0.0666,  0.0263, -0.0798, -0.3771, -0.0504, -0.3882,  0.3941]],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')


In [78]:
fc_k.weight.data = enc_dec_attnetion_layer.in_proj_weight.data[8:16,:]
fc_k.bias.data = enc_dec_attnetion_layer.in_proj_bias.data[8:16]
print(fc_k.weight.data)
print(fc_k.bias.data)

tensor([[-0.2509,  0.3624, -0.4034, -0.2466, -0.4142,  0.3088, -0.0350,  0.1387],
        [-0.4063, -0.2667, -0.3524,  0.0971,  0.2161,  0.0450, -0.4052,  0.3519],
        [-0.3766,  0.3414,  0.0444,  0.2520,  0.1527, -0.3914,  0.2546, -0.2493],
        [-0.2519, -0.2988, -0.2055,  0.3408,  0.1452, -0.2501,  0.3149,  0.3838],
        [ 0.0048, -0.1765,  0.2560, -0.0106, -0.2261,  0.1781,  0.0745,  0.2857],
        [-0.0765,  0.2502,  0.2493,  0.0801, -0.1353, -0.1036,  0.1332,  0.2318],
        [ 0.4233,  0.3586, -0.2676, -0.2466, -0.1674,  0.3708, -0.0955,  0.0112],
        [-0.1462, -0.2138, -0.1292,  0.3666,  0.3280,  0.2984, -0.1094, -0.2439]],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')


In [79]:
fc_v.weight.data = enc_dec_attnetion_layer.in_proj_weight.data[16:,:]
fc_v.bias.data = enc_dec_attnetion_layer.in_proj_bias.data[16:]
print(fc_v.weight.data)
print(fc_v.bias.data)

tensor([[ 0.4032,  0.2661, -0.0876, -0.4138, -0.1970,  0.0984,  0.3258,  0.0302],
        [ 0.0678, -0.0233, -0.3942, -0.3782,  0.0286, -0.1126,  0.0703,  0.4220],
        [-0.3806,  0.1810,  0.1914,  0.2954,  0.0498, -0.3115,  0.3695,  0.1538],
        [-0.4105,  0.2072,  0.1600, -0.1192,  0.1370,  0.2916, -0.2625, -0.3464],
        [ 0.4321,  0.1024,  0.0788,  0.2816, -0.0996, -0.4232,  0.3214,  0.1239],
        [-0.3858, -0.3703,  0.3832, -0.1266, -0.1693,  0.1821,  0.1343, -0.1489],
        [ 0.1633,  0.1020,  0.2901, -0.3135, -0.2798,  0.2624,  0.0754, -0.0908],
        [-0.1181, -0.3744,  0.3527,  0.2048, -0.3104, -0.2376,  0.2671, -0.3707]],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')


In [80]:
fc_o.weight.data = enc_dec_attnetion_layer.out_proj.weight.data
fc_o.bias.data = enc_dec_attnetion_layer.out_proj.bias.data
print(fc_o.weight.data)
print(fc_o.bias.data)

tensor([[-0.3398,  0.1933,  0.2581,  0.2190,  0.1178, -0.0953, -0.0964,  0.0481],
        [-0.1671,  0.1613, -0.3391,  0.1987, -0.2557,  0.2815, -0.2839, -0.3338],
        [ 0.3252,  0.2891, -0.2454,  0.1039, -0.0680,  0.1597, -0.1212, -0.1963],
        [-0.1580, -0.2998, -0.0987, -0.1872,  0.0870,  0.2023,  0.2119, -0.2741],
        [-0.1120, -0.3063, -0.0368, -0.2990,  0.1855,  0.1990, -0.2176,  0.3324],
        [-0.1558, -0.0212, -0.0969,  0.1274, -0.2628,  0.1756,  0.1212,  0.2329],
        [-0.2689, -0.0370,  0.0462, -0.2464, -0.0516, -0.2242,  0.0455, -0.0850],
        [-0.0727, -0.0087, -0.1862,  0.0398, -0.0506, -0.2583,  0.1757, -0.1860]],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')


#### <font color = 'blue'> **Generate queries, keys and values**

<font size =3, color = 'green'> Here the queries will be generated based on decoder input whereas keys and values are generated based on encoder input. Target should pay attention to words in source.

In [81]:
Q = fc_q(decoder_input_after_dropout )
K = fc_k(encoder_input_after_dropout )
V = fc_v(encoder_input_after_dropout )

In [82]:
n_heads = 2
head_dim = out_hid_dim // n_heads
print(head_dim)

4


In [83]:
assert out_hid_dim % n_heads == 0

In [84]:
Q = Q.view(batch_size, -1, n_heads, head_dim)
K = K.view(batch_size, -1, n_heads, head_dim)
V = V.view(batch_size, -1, n_heads, head_dim)

In [85]:
Q.shape
#[batch_size, query_len, n_heads, head_dim]

torch.Size([2, 14, 2, 4])

In [86]:
Q = Q.permute(0, 2, 1, 3)
K = K.permute(0, 2, 1, 3)
V = V.permute(0, 2, 1, 3)

In [87]:
Q.shape
#[batch_size,num_heads, query_len, head_dim ]

torch.Size([2, 2, 14, 4])

In [88]:
K.shape
#[batch_size,num_heads, key_len, head_dim ]

torch.Size([2, 2, 12, 4])

#### <font color = 'blue'> **Scaled dot product of Queries and Keys**</font>

In [89]:
scale = torch.sqrt(torch.FloatTensor([head_dim])).to(device)

In [90]:
energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / scale

In [91]:
energy.shape
#[batch_size, num_heads, query_len, key_len]

torch.Size([2, 2, 14, 12])

#### <font color = 'blue'> **Apply mask to output of Q, K dot product**</font><br>
<font color = 'green'>**We do not want tokens to pay attention to pad tokens**</font>

In [92]:
mask_input = source.clone()
mask_input

tensor([[ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2],
        [ 1, 18,  7, 19, 20, 21, 22, 23, 17,  2,  3,  3]])

In [93]:
SRC_PAD_IDX = source_vocab['<PAD>']
SRC_PAD_IDX

3

In [94]:
src_pad_mask = (mask_input!= SRC_PAD_IDX )
src_pad_mask

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         False, False]])

In [95]:
src_pad_mask.shape

torch.Size([2, 12])

In [96]:
src_pad_mask = src_pad_mask.unsqueeze(1).unsqueeze(2)

In [97]:
src_pad_mask.shape

torch.Size([2, 1, 1, 12])

In [98]:
src_pad_mask = src_pad_mask.to(device)

In [99]:
energy_masked = energy.masked_fill(src_pad_mask == 0, -1e10)

In [100]:
energy_masked.shape

torch.Size([2, 2, 14, 12])

#### <font color = 'blue'>**Apply softmax to convert QV dot product to probabilities**</font><br>

In [101]:
attention_prob = torch.softmax(energy_masked, dim = -1)                 
#attention_prob = [batch size, n heads, query len, key len]

In [103]:
print(attention_prob [1,1,1,10].data, attention_prob [1,1,1,11].data)
#[batch_size, num_heads, query_len, key_len]

tensor(0., device='cuda:0') tensor(0., device='cuda:0')


In [105]:
print(attention_prob[1, 0,1, :])

tensor([1.6849e-04, 6.3200e-06, 2.7247e-04, 1.9907e-07, 2.3001e-05, 7.1706e-10,
        2.3023e-09, 6.3591e-03, 2.4307e-10, 9.9317e-01, 0.0000e+00, 0.0000e+00],
       device='cuda:0', grad_fn=<SliceBackward0>)


In [106]:
attention_prob[0, 0,1, :].sum()

tensor(1.0000, device='cuda:0', grad_fn=<SumBackward0>)

#### <font color = 'blue'>**Apply dropout layer to attention probabilities**</font><br>

In [107]:
att_dropout =  nn.Dropout(p=0.0)

In [108]:
attention_prob_after_dropout= att_dropout(attention_prob)

In [110]:
print(attention_prob_after_dropout [1, 0,1, :])

tensor([1.6849e-04, 6.3200e-06, 2.7247e-04, 1.9907e-07, 2.3001e-05, 7.1706e-10,
        2.3023e-09, 6.3591e-03, 2.4307e-10, 9.9317e-01, 0.0000e+00, 0.0000e+00],
       device='cuda:0', grad_fn=<SliceBackward0>)


In [111]:
attention_prob_after_dropout[0, 0,1, :].sum()

tensor(1.0000, device='cuda:0', grad_fn=<SumBackward0>)

<font size = 3, color = 'red'>**Probs do not sum to 1, sometimes these are greater than one and sometimes these are less than one**</font>

In [112]:
attention_prob_after_dropout.shape

torch.Size([2, 2, 14, 12])

In [113]:
attention_prob_after_dropout_mean = torch.mean(attention_prob_after_dropout, dim = 1)

#### <font size = 4, color = 'blue'>**Self Attention Output**</font><br>

In [114]:
V.shape
# [batch_size, num_heads, value_len, head_dim]

torch.Size([2, 2, 12, 4])

In [115]:
attention_prob_after_dropout.shape
#[batch_size, num_heads, query_len, key_len]

torch.Size([2, 2, 14, 12])

In [116]:
decoder_contextulaized_embeddings_m1 = torch.matmul(attention_prob_after_dropout, V)
#[batch_size, num_heads, query_len, head_dim]

In [117]:
decoder_contextulaized_embeddings_m1.shape
# [batch_size, number_of_heads, query_len, head_dim]

torch.Size([2, 2, 14, 4])

In [118]:
decoder_contextulaized_embeddings_m1 = decoder_contextulaized_embeddings_m1.permute(0, 2, 1, 3)
# [batch_size, query_len, number_of_heads, head_dim]

In [119]:
decoder_contextulaized_embeddings_m1.shape

torch.Size([2, 14, 2, 4])

In [120]:
decoder_contextulaized_embeddings_m1 = decoder_contextulaized_embeddings_m1.reshape(batch_size, -1, out_hid_dim)

In [121]:
decoder_contextulaized_embeddings_m1.shape
#[batch_size, seq_len, out_hid_dim]

torch.Size([2, 14, 8])

<font size = 3, color = 'green'>We need to project the final values to have same shape as  input embedding. To accomplish this we will use fc_o linear layer we created earlier.

In [122]:
decoder_contextulaized_embeddings_m1 = fc_o(decoder_contextulaized_embeddings_m1)

In [123]:
decoder_contextulaized_embeddings_m1.shape
#[batch_size, seq_len, hid_dim]

torch.Size([2, 14, 8])

#### <font size = 4, color = 'blue'>**Compare results with PyTorch Layer**</font><br>

In [124]:
decoder_contextulaized_embeddings_m1[1,9,:]

tensor([ 1.7619, -1.3666, -3.5892,  0.2569,  4.2740,  2.1638, -0.2373, -2.3390],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [125]:
decoder_contextulaized_embeddings[1,9,:]

tensor([ 1.7619, -1.3666, -3.5892,  0.2569,  4.2740,  2.1638, -0.2373, -2.3390],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [126]:
decoder_self_attention_probs[0,5,:]

tensor([3.7179e-02, 4.1577e-04, 2.3379e-02, 1.2573e-01, 2.5657e-04, 7.4071e-03,
        4.0060e-03, 1.4204e-03, 1.7374e-05, 1.6990e-05, 3.6617e-01, 4.3400e-01],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [127]:
attention_prob_after_dropout_mean[0,5,:]

tensor([3.7179e-02, 4.1577e-04, 2.3379e-02, 1.2573e-01, 2.5657e-04, 7.4071e-03,
        4.0060e-03, 1.4204e-03, 1.7374e-05, 1.6990e-05, 3.6617e-01, 4.3400e-01],
       device='cuda:0', grad_fn=<SliceBackward0>)

### <font color = 'blue'> Method 2 - Self Attention- Without Using nn.MultiheadAttention

#### <font color = 'blue'> Manually initialize Weights and biases 

In [129]:
Wq = enc_dec_attnetion_layer.in_proj_weight.data[0:8,:]
bq = enc_dec_attnetion_layer.in_proj_bias.data[0:8]
print(Wq)
print(bq)

tensor([[ 0.3763,  0.2778, -0.4188, -0.2694,  0.3997, -0.3953, -0.0130,  0.2440],
        [-0.0283, -0.1411, -0.3400,  0.3003, -0.2023, -0.0332, -0.2203,  0.1427],
        [ 0.3561, -0.2301,  0.2985, -0.2054, -0.4158, -0.2433, -0.1752, -0.0005],
        [ 0.2562, -0.1277,  0.1224,  0.3388,  0.3865, -0.1923, -0.1799, -0.2100],
        [ 0.0457,  0.4024,  0.0755,  0.0069,  0.0560, -0.3103, -0.3071,  0.2443],
        [ 0.2494,  0.2909, -0.3317, -0.2078,  0.1968, -0.2872,  0.1729, -0.1140],
        [ 0.0898, -0.2116, -0.1195,  0.0113, -0.3267, -0.2988, -0.2241,  0.2493],
        [ 0.0304, -0.0666,  0.0263, -0.0798, -0.3771, -0.0504, -0.3882,  0.3941]],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')


In [130]:
Wk = enc_dec_attnetion_layer.in_proj_weight.data[8:16,:]
bk = enc_dec_attnetion_layer.in_proj_bias.data[8:16]
print(Wk)
print(bk)

tensor([[-0.2509,  0.3624, -0.4034, -0.2466, -0.4142,  0.3088, -0.0350,  0.1387],
        [-0.4063, -0.2667, -0.3524,  0.0971,  0.2161,  0.0450, -0.4052,  0.3519],
        [-0.3766,  0.3414,  0.0444,  0.2520,  0.1527, -0.3914,  0.2546, -0.2493],
        [-0.2519, -0.2988, -0.2055,  0.3408,  0.1452, -0.2501,  0.3149,  0.3838],
        [ 0.0048, -0.1765,  0.2560, -0.0106, -0.2261,  0.1781,  0.0745,  0.2857],
        [-0.0765,  0.2502,  0.2493,  0.0801, -0.1353, -0.1036,  0.1332,  0.2318],
        [ 0.4233,  0.3586, -0.2676, -0.2466, -0.1674,  0.3708, -0.0955,  0.0112],
        [-0.1462, -0.2138, -0.1292,  0.3666,  0.3280,  0.2984, -0.1094, -0.2439]],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')


In [131]:
Wv = enc_dec_attnetion_layer.in_proj_weight.data[16:,:]
bv = enc_dec_attnetion_layer.in_proj_bias.data[16:]
print(Wv)
print(bv)

tensor([[ 0.4032,  0.2661, -0.0876, -0.4138, -0.1970,  0.0984,  0.3258,  0.0302],
        [ 0.0678, -0.0233, -0.3942, -0.3782,  0.0286, -0.1126,  0.0703,  0.4220],
        [-0.3806,  0.1810,  0.1914,  0.2954,  0.0498, -0.3115,  0.3695,  0.1538],
        [-0.4105,  0.2072,  0.1600, -0.1192,  0.1370,  0.2916, -0.2625, -0.3464],
        [ 0.4321,  0.1024,  0.0788,  0.2816, -0.0996, -0.4232,  0.3214,  0.1239],
        [-0.3858, -0.3703,  0.3832, -0.1266, -0.1693,  0.1821,  0.1343, -0.1489],
        [ 0.1633,  0.1020,  0.2901, -0.3135, -0.2798,  0.2624,  0.0754, -0.0908],
        [-0.1181, -0.3744,  0.3527,  0.2048, -0.3104, -0.2376,  0.2671, -0.3707]],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')


In [132]:
Wo = enc_dec_attnetion_layer.out_proj.weight.data
bo = enc_dec_attnetion_layer.out_proj.bias.data
print(Wo)
print(bo)

tensor([[-0.3398,  0.1933,  0.2581,  0.2190,  0.1178, -0.0953, -0.0964,  0.0481],
        [-0.1671,  0.1613, -0.3391,  0.1987, -0.2557,  0.2815, -0.2839, -0.3338],
        [ 0.3252,  0.2891, -0.2454,  0.1039, -0.0680,  0.1597, -0.1212, -0.1963],
        [-0.1580, -0.2998, -0.0987, -0.1872,  0.0870,  0.2023,  0.2119, -0.2741],
        [-0.1120, -0.3063, -0.0368, -0.2990,  0.1855,  0.1990, -0.2176,  0.3324],
        [-0.1558, -0.0212, -0.0969,  0.1274, -0.2628,  0.1756,  0.1212,  0.2329],
        [-0.2689, -0.0370,  0.0462, -0.2464, -0.0516, -0.2242,  0.0455, -0.0850],
        [-0.0727, -0.0087, -0.1862,  0.0398, -0.0506, -0.2583,  0.1757, -0.1860]],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')


#### <font color = 'blue'> **Generate queries, keys and values**

<font size =3, color = 'green'> Here the queries will be generated based on decoder input whereas keys and values are generated based on encoder input. Target should pay attention to words in source.

In [133]:
Q2 = decoder_input_after_dropout.matmul(Wq.T)  + bq
K2 = encoder_input_after_dropout.matmul(Wk.T)  + bk
V2 = encoder_input_after_dropout.matmul(Wv.T)  + bv

In [134]:
Q2.shape

torch.Size([2, 14, 8])

In [135]:
n_heads = 2
head_dim = out_hid_dim // n_heads
print(head_dim)

4


In [136]:
assert out_hid_dim % n_heads == 0

In [137]:
Q2 = Q2.view(batch_size, -1, n_heads, head_dim)
K2 = K2.view(batch_size, -1, n_heads, head_dim)
V2 = V2.view(batch_size, -1, n_heads, head_dim)

In [138]:
Q2.shape
#[batch_size, query_len, n_heads, head_dim]

torch.Size([2, 14, 2, 4])

In [139]:
Q2 = Q2.permute(0, 2, 1, 3)
K2 = K2.permute(0, 2, 1, 3)
V2 = V2.permute(0, 2, 1, 3)

In [140]:
Q2.shape
#[batch_size,num_heads, query_len, head_dim ]

torch.Size([2, 2, 14, 4])

In [141]:
K2.shape
#[batch_size,num_heads, key_len, head_dim ]

torch.Size([2, 2, 12, 4])

In [142]:
Q[1,1,5,:]

tensor([ 2.6609,  1.3068, -0.7045,  0.5076], device='cuda:0',
       grad_fn=<SliceBackward0>)

In [143]:
Q2[1,1,5,:]

tensor([ 2.6609,  1.3068, -0.7045,  0.5076], device='cuda:0',
       grad_fn=<SliceBackward0>)

In [144]:
K[1,1,5,:]

tensor([-2.1791, -1.3406,  1.1957,  1.0938], device='cuda:0',
       grad_fn=<SliceBackward0>)

In [145]:
K2[1,1,5,:]

tensor([-2.1791, -1.3406,  1.1957,  1.0938], device='cuda:0',
       grad_fn=<SliceBackward0>)

In [146]:
V[1,1,5,:]

tensor([-0.5218, -0.3120,  0.7041,  1.2420], device='cuda:0',
       grad_fn=<SliceBackward0>)

In [147]:
V2[1,1,5,:]

tensor([-0.5218, -0.3120,  0.7041,  1.2420], device='cuda:0',
       grad_fn=<SliceBackward0>)

#### <font color = 'blue'> **Scaled dot product of Queries and Keys**</font>

In [148]:
scale = torch.sqrt(torch.FloatTensor([head_dim])).to(device)

In [149]:
energy2 = torch.matmul(Q2, K2.permute(0, 1, 3, 2)) / scale

In [150]:
energy2.shape
#[batch_size, num_heads, query_len, key_len]

torch.Size([2, 2, 14, 12])

In [151]:
energy[1,1,5,:]

tensor([ 0.3080,  1.7470,  7.0328, -2.4411,  2.0112, -3.9187, -0.0461,  2.8605,
        -0.3626,  6.3214, -1.4837, -3.2299], device='cuda:0',
       grad_fn=<SliceBackward0>)

In [152]:
energy2[1,1,5,:]

tensor([ 0.3080,  1.7470,  7.0328, -2.4411,  2.0112, -3.9187, -0.0461,  2.8605,
        -0.3626,  6.3214, -1.4837, -3.2299], device='cuda:0',
       grad_fn=<SliceBackward0>)

#### <font color = 'blue'> **Apply mask to output of Q, K dot product**</font><br>
<font color = 'green'>**We do not want tokens to pay attention to pad tokens**</font>

In [155]:
mask_input2 = source.clone()
mask_input2

tensor([[ 1, 22, 24, 25, 26, 12, 22, 27, 28, 29, 17,  2],
        [ 1, 18,  7, 19, 20, 21, 22, 23, 17,  2,  3,  3]])

In [156]:
SRC_PAD_IDX = source_vocab['<PAD>']
SRC_PAD_IDX

3

In [157]:
src_pad_mask2 = (mask_input2!= SRC_PAD_IDX )
src_pad_mask2

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         False, False]])

In [159]:
src_pad_mask2.shape

torch.Size([2, 12])

In [160]:
src_pad_mask2 = src_pad_mask2.unsqueeze(1).unsqueeze(2)

In [161]:
src_pad_mask2.shape

torch.Size([2, 1, 1, 12])

In [164]:
src_pad_mask2 = src_pad_mask2.to(device)

In [165]:
energy_masked2 = energy2.masked_fill(src_pad_mask2 == 0, -1e10)

In [166]:
energy_masked2.shape

torch.Size([2, 2, 14, 12])

In [167]:
energy_masked[1,1,5,:]

tensor([ 3.0805e-01,  1.7470e+00,  7.0328e+00, -2.4411e+00,  2.0112e+00,
        -3.9187e+00, -4.6059e-02,  2.8605e+00, -3.6260e-01,  6.3214e+00,
        -1.0000e+10, -1.0000e+10], device='cuda:0', grad_fn=<SliceBackward0>)

In [168]:
energy_masked2[1,1,5,:]

tensor([ 3.0805e-01,  1.7470e+00,  7.0328e+00, -2.4411e+00,  2.0112e+00,
        -3.9187e+00, -4.6059e-02,  2.8605e+00, -3.6260e-01,  6.3214e+00,
        -1.0000e+10, -1.0000e+10], device='cuda:0', grad_fn=<SliceBackward0>)

#### <font color = 'blue'>**Apply softmax to convert QV dot product to probabilities**</font><br>

In [169]:
attention_prob2 = torch.softmax(energy_masked2, dim = -1)                 
#attention_prob = [batch size, n heads, query len, key len]

In [173]:
print(attention_prob2[1,1,1,10].data, attention_prob2[1,1,1,11].data)
#[batch_size, num_heads, query_len, key_len]

tensor(0., device='cuda:0') tensor(0., device='cuda:0')


In [174]:
print(attention_prob2[1, 0,1, :])

tensor([1.6849e-04, 6.3200e-06, 2.7247e-04, 1.9907e-07, 2.3001e-05, 7.1706e-10,
        2.3023e-09, 6.3591e-03, 2.4307e-10, 9.9317e-01, 0.0000e+00, 0.0000e+00],
       device='cuda:0', grad_fn=<SliceBackward0>)


In [175]:
attention_prob2[0, 0,1, :].sum()

tensor(1.0000, device='cuda:0', grad_fn=<SumBackward0>)

#### <font color = 'blue'>**Apply dropout layer to attention probabilities**</font><br>
<font color = 'red'>**NOT UNDERSTOOD- why apply dropout here (probs will not sum to 1)**</font><br>
<font color = 'green'>**Quotes from paper --We apply dropout [33] to the output of each sub-layer, before it is added to the sub-layer input and normalized.**</font>


In [176]:
att_dropout2 =  nn.Dropout(p=0.0)

In [177]:
attention_prob_after_dropout2 = att_dropout2(attention_prob2)

In [178]:
print(attention_prob_after_dropout2[0, 0,1, :])

tensor([2.9166e-10, 1.6071e-03, 1.7196e-07, 8.2590e-12, 3.9455e-07, 1.9409e-07,
        7.3817e-03, 4.6745e-06, 9.8184e-01, 8.8270e-03, 3.3416e-04, 3.4758e-08],
       device='cuda:0', grad_fn=<SliceBackward0>)


In [179]:
attention_prob_after_dropout2[0, 0,1, :].sum()

tensor(1.0000, device='cuda:0', grad_fn=<SumBackward0>)

<font size = 3, color = 'red'>**Probs do not sum to 1, sometimes these are greater than one and sometimes these are less than one**</font>

In [180]:
attention_prob_after_dropout2.shape

torch.Size([2, 2, 14, 12])

In [181]:
attention_prob_after_dropout_mean2 = torch.mean(attention_prob_after_dropout2, dim = 1)

#### <font size = 4, color = 'blue'>**Self Attention Output**</font><br>

In [182]:
V2.shape
# [batch_size, num_heads, value_len, head_dim]

torch.Size([2, 2, 12, 4])

In [183]:
attention_prob_after_dropout2.shape
#[batch_size, num_heads, query_len, key_len]

torch.Size([2, 2, 14, 12])

In [184]:
decoder_contextulaized_embeddings_m2 = torch.matmul(attention_prob_after_dropout2, V2)
#[batch_size, num_heads, query_len, head_dim]

In [185]:
decoder_contextulaized_embeddings_m2.shape
# [batch_size, number_of_heads, query_len, head_dim]

torch.Size([2, 2, 14, 4])

In [186]:
decoder_contextulaized_embeddings_m2 = decoder_contextulaized_embeddings_m2.permute(0, 2, 1, 3)
# [batch_size, query_len, number_of_heads, head_dim]

In [187]:
decoder_contextulaized_embeddings_m2.shape

torch.Size([2, 14, 2, 4])

In [188]:
decoder_contextulaized_embeddings_m2 = decoder_contextulaized_embeddings_m2.reshape(batch_size, -1, out_hid_dim)

In [189]:
decoder_contextulaized_embeddings_m2.shape
#[batch_size, seq_len, out_hid_dim]

torch.Size([2, 14, 8])

<font size = 3, color = 'green'>We need to project the final values to have same shape as  input embedding. To accomplish this we will use fc_o linear layer we created earlier.

In [190]:
decoder_contextulaized_embeddings_m2 = decoder_contextulaized_embeddings_m2.matmul(Wo.T)  + bo

In [191]:
decoder_contextulaized_embeddings_m2.shape
#[batch_size, seq_len, hid_dim]

torch.Size([2, 14, 8])

#### <font size = 4, color = 'blue'>**Compare results with PyTorch Layer and method1**</font><br>

In [192]:
decoder_contextulaized_embeddings_m2[0,9,:]

tensor([-0.8930, -0.0810, -0.0353,  0.3941,  0.7499,  0.8224, -0.1756, -0.2079],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [196]:
decoder_contextulaized_embeddings_m1[0,9,:]

tensor([-0.8930, -0.0810, -0.0353,  0.3941,  0.7499,  0.8224, -0.1756, -0.2079],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [193]:
decoder_contextulaized_embeddings[0,9,:]

tensor([-0.8930, -0.0810, -0.0353,  0.3941,  0.7499,  0.8224, -0.1756, -0.2079],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [201]:
attention_prob_after_dropout_mean2[0,9,:]

tensor([0.0390, 0.0418, 0.1368, 0.0167, 0.2915, 0.1070, 0.0087, 0.0115, 0.1452,
        0.1808, 0.0137, 0.0074], device='cuda:0', grad_fn=<SliceBackward0>)

In [202]:
attention_prob_after_dropout_mean[0,9,:]

tensor([0.0390, 0.0418, 0.1368, 0.0167, 0.2915, 0.1070, 0.0087, 0.0115, 0.1452,
        0.1808, 0.0137, 0.0074], device='cuda:0', grad_fn=<SliceBackward0>)

In [203]:
decoder_self_attention_probs[0,9,:]

tensor([0.0390, 0.0418, 0.1368, 0.0167, 0.2915, 0.1070, 0.0087, 0.0115, 0.1452,
        0.1808, 0.0137, 0.0074], device='cuda:0', grad_fn=<SliceBackward0>)