# <font color ='blue'> 1 -  Neural Machine Translation - Attention





# <font color ='blue'> Import Libraries



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext 
from torchtext.datasets import Multi30k
from torchtext.vocab import vocab
#from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np
from collections import Counter, OrderedDict

import random
import math
import time
import pandas as pd
from pathlib import Path
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
from pathlib import Path
data_folder = Path('/home/harpreet/Insync/google_drive_shaannoor/Data/NLP')
project_folder = Path('/home/harpreet/Insync/google_drive_harpreet/Research/NLP/pytorch-seq2seq')

In [None]:
#!pip install -U spacy

In [None]:
import torch, torchtext, spacy
torchtext.__version__, torch.__version__, torch.cuda.is_available(), spacy.__version__

('0.11.0', '1.10.0', True, '3.2.4')

# <font color ='blue'> Set Seeds

In [None]:
import random
import numpy as np

# One seed shared by every random number generator used in this notebook,
# so that Restart & Run All reproduces the same numbers.
SEED = 1234

random.seed(SEED)                 # Python's `random` module
np.random.seed(SEED)              # NumPy global generator
torch.manual_seed(SEED)           # PyTorch generator
torch.cuda.manual_seed_all(SEED)  # every CUDA device; silently ignored without CUDA

# cuDNN: request deterministic kernels and keep the auto-tuner off, since the
# auto-tuner may pick different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# <font color = 'blue'> Load Tokenized Data

<font color ='green'>
Next, we download and load the train, validation and test data. 

The dataset we'll be using is the [Multi30k dataset](https://github.com/multi30k/dataset). This is a dataset with ~30,000 parallel English, German and French sentences, each with ~12 words per sentence. 

The data was tokenized in Tutorial 1, so here we simply load the tokenized data. To keep this walkthrough small, we use only the first two observations.

In [None]:
df_train = pd.read_pickle(project_folder/'df_train_en_de.pickel')[0:2]

In [None]:
df_train

Unnamed: 0,source_tokens,target_tokens,source_tokens_reverse
0,"[zwei, junge, weiße, männer, sind, i, m, freie...","[two, young, ,, white, males, are, outside, ne...","[., büsche, vieler, nähe, der, in, freien, m, ..."
1,"[mehrere, männer, mit, schutzhelmen, bedienen,...","[several, men, in, hard, hats, are, operating,...","[., antriebsradsystem, ein, bedienen, schutzhe..."


# <font color = 'blue'>  Build Vocab

In [None]:
def create_vocab(text, min_freq, specials):
    """Build a torchtext vocabulary from tokenized sentences.

    Args:
        text: iterable of token sequences (e.g. a pandas Series of lists).
        min_freq: minimum count a token needs in order to be kept; forwarded
            to ``vocab``.
        specials: special tokens; the i-th one is inserted at index ``i``.

    Returns:
        The vocabulary, with index 0 set as the default index returned for
        tokens that are not in the vocabulary.
    """
    # Count tokens in first-seen order across all sentences.
    token_counts = Counter(token for sentence in text for token in sentence)
    built_vocab = vocab(token_counts, min_freq=min_freq)
    for position, special_token in enumerate(specials):
        built_vocab.insert_token(special_token, position)
    # Out-of-vocabulary lookups fall back to index 0.
    built_vocab.set_default_index(0)
    return built_vocab

In [None]:
source_vocab = create_vocab(df_train['source_tokens'], 1, ['<unk>', '<BOS>', '<EOS>', '<PAD>'])

In [None]:
len(source_vocab)

24

In [None]:
pd.DataFrame(source_vocab.get_stoi().items(), columns=['tokens', 'index']).sort_values(by = ['index'])

Unnamed: 0,tokens,index
3,<unk>,0
22,<BOS>,1
8,<EOS>,2
10,<PAD>,3
16,zwei,4
4,junge,5
15,weiße,6
17,männer,7
13,sind,8
19,i,9


In [None]:
# check index of unknown word - it should be zero
source_vocab['abracdabra']

0

In [None]:
target_vocab = create_vocab(df_train['target_tokens'], 1, ['<unk>', '<BOS>', '<EOS>', '<PAD>'])

In [None]:
len(target_vocab)

25

In [None]:
pd.DataFrame(target_vocab.get_stoi().items(), columns=['tokens', 'index']).sort_values(by = ['index'])

Unnamed: 0,tokens,index
22,<unk>,0
10,<BOS>,1
11,<EOS>,2
12,<PAD>,3
16,two,4
14,young,5
15,",",6
23,white,7
17,males,8
20,are,9


# <font color = 'blue'>  Create Dataset and Dataloader

In [None]:
class EngGerman(Dataset):
    """Paired dataset of tokenized source and target sentences.

    Args:
        X1: source sentences; must support ``len()`` and ``.iloc`` indexing
            (e.g. a pandas Series of token lists).
        X2: target sentences, aligned row by row with ``X1``.

    Raises:
        ValueError: if ``X1`` and ``X2`` do not have the same length.
    """

    def __init__(self, X1, X2):
        # A length mismatch would otherwise surface only later, as an
        # out-of-range lookup or as silently dropped rows.
        if len(X1) != len(X2):
            raise ValueError(
                f"X1 and X2 must have the same length, got {len(X1)} and {len(X2)}"
            )
        self.X1 = X1
        self.X2 = X2

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.X1)

    def __getitem__(self, indices):
        """Return the ``(source, target)`` pair at positional index ``indices``."""
        return (self.X1.iloc[indices], self.X2.iloc[indices])

In [None]:
trainset = EngGerman(df_train['source_tokens'], df_train['target_tokens'])

In [None]:
trainset.__getitem__(0)

(['zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'i',
  'm',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.'],
 ['two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.'])

In [None]:
len(trainset)

2

In [None]:
def text_transform(my_vocab, text):
    """Convert a tokenized sentence into a tensor of vocabulary indices.

    The sequence is wrapped in the ``<BOS>`` / ``<EOS>`` indices of
    ``my_vocab`` itself, so the result is correct for any vocabulary rather
    than only for vocabularies whose special tokens share the source
    vocabulary's indices.

    Args:
        my_vocab: token-to-index mapping supporting ``my_vocab[token]``; it
            must contain ``'<BOS>'`` and ``'<EOS>'``.
        text: iterable of string tokens.

    Returns:
        1-D ``torch.Tensor`` holding ``<BOS>``, the token indices, ``<EOS>``.
    """
    text_numerical = [my_vocab[token] for token in text]
    return torch.tensor([my_vocab['<BOS>']] + text_numerical + [my_vocab['<EOS>']])

In [None]:
text = trainset.__getitem__(1)[1]
print(text)
text_transform(target_vocab, text)

['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']


tensor([ 1, 15, 16, 17, 18, 19,  9, 20, 21, 22, 23, 24, 14,  2])

In [None]:
def collate_batch(batch):
    """Turn a list of (source_tokens, target_tokens) pairs into padded tensors.

    Each sentence is converted to indices with its own vocabulary via
    ``text_transform``; every sentence in the batch is then padded to the
    length of the longest one using that vocabulary's ``<PAD>`` index.

    Args:
        batch: list of ``(source_text, target_text)`` token-list pairs.

    Returns:
        ``(source_pad, target_pad)``. ``pad_sequence`` is called with its
        default ``batch_first=False``, so each tensor is
        ``[max_len, batch_size]``.
    """
    source_list, target_list = [], []
    for source_text, target_text in batch:
        source_list.append(text_transform(source_vocab, source_text))
        target_list.append(text_transform(target_vocab, target_text))

    # Look the padding index up instead of hard-coding 3, so the function
    # stays correct if the order of the special tokens ever changes.
    source_pad = pad_sequence(source_list, padding_value=source_vocab['<PAD>'])
    target_pad = pad_sequence(target_list, padding_value=target_vocab['<PAD>'])
    return (source_pad, target_pad)

In [None]:
batch_size = 2

train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collate_batch)

In [None]:
for source, target in train_loader:
  print(source)
  print(target)
  break

tensor([[ 1,  1],
        [ 4, 18],
        [ 5,  7],
        [ 6, 19],
        [ 7, 20],
        [ 8, 21],
        [ 9, 22],
        [10, 23],
        [11, 17],
        [12,  2],
        [13,  3],
        [14,  3],
        [15,  3],
        [16,  3],
        [17,  3],
        [ 2,  3]])
tensor([[ 1,  1],
        [ 4, 15],
        [ 5, 16],
        [ 6, 17],
        [ 7, 18],
        [ 8, 19],
        [ 9,  9],
        [10, 20],
        [11, 21],
        [12, 22],
        [13, 23],
        [14, 24],
        [ 2, 14],
        [ 3,  2]])


# <font color = 'blue'>  Breakdown of forward loop

## <font color = 'blue'>  Get Source and Target sentences

In [None]:
source, target = next(iter(train_loader))

In [None]:
print(source)

tensor([[ 1,  1],
        [18,  4],
        [ 7,  5],
        [19,  6],
        [20,  7],
        [21,  8],
        [22,  9],
        [23, 10],
        [17, 11],
        [ 2, 12],
        [ 3, 13],
        [ 3, 14],
        [ 3, 15],
        [ 3, 16],
        [ 3, 17],
        [ 3,  2]])


In [None]:
print(target)

tensor([[ 1,  1],
        [15,  4],
        [16,  5],
        [17,  6],
        [18,  7],
        [19,  8],
        [ 9,  9],
        [20, 10],
        [21, 11],
        [22, 12],
        [23, 13],
        [24, 14],
        [14,  2],
        [ 2,  3]])


In [None]:
batch_size = target.shape[1]
print(batch_size)

2


In [None]:
trg_len = target.shape[0]
print(trg_len)

14


In [None]:
# tensors to store decoder output
# shape of outputs target length X batch size X vocab size
outputs = torch.zeros(trg_len, batch_size, len(target_vocab))
print(outputs.shape)

torch.Size([14, 2, 25])


## <font color = 'blue'>  Forward Loop

### <font color = 'blue'> Encoder

<font color = 'green'> 
First, we'll build the encoder. Similar to the previous model, we only use a single layer GRU, however we now use a *bidirectional RNN*. With a bidirectional RNN, we have two RNNs in each layer. A *forward RNN* going over the embedded sentence from left to right (shown below in green), and a *backward RNN* going over the embedded sentence from right to left (teal). All we need to do in code is set `bidirectional = True` and then pass the embedded sentence to the RNN as before. 

![](assets/seq2seq8.png)

We now have:

$$\begin{align*}
h_t^\rightarrow &= \text{EncoderGRU}^\rightarrow(e(x_t^\rightarrow),h_{t-1}^\rightarrow)\\
h_t^\leftarrow &= \text{EncoderGRU}^\leftarrow(e(x_t^\leftarrow),h_{t-1}^\leftarrow)
\end{align*}$$

Where $x_0^\rightarrow = \text{<sos>}, x_1^\rightarrow = \text{guten}$ and $x_0^\leftarrow = \text{<eos>}, x_1^\leftarrow = \text{morgen}$.

As before, we only pass an input (`embedded`) to the RNN, which tells PyTorch to initialize both the forward and backward initial hidden states ($h_0^\rightarrow$ and $h_0^\leftarrow$, respectively) to a tensor of all zeros. We'll also get two context vectors, one from the forward RNN after it has seen the final word in the sentence, $z^\rightarrow=h_T^\rightarrow$, and one from the backward RNN after it has seen the first word in the sentence, $z^\leftarrow=h_T^\leftarrow$.

The RNN returns `outputs` and `hidden`. 

`outputs` is of size **[src len, batch size, hid dim * num directions]** where the first `hid_dim` elements in the third axis are the hidden states from the top layer forward RNN, and the last `hid_dim` elements are hidden states from the top layer backward RNN. We can think of the third axis as being the forward and backward hidden states concatenated together, i.e. $h_1 = [h_1^\rightarrow; h_{T}^\leftarrow]$, $h_2 = [h_2^\rightarrow; h_{T-1}^\leftarrow]$ and we can denote all encoder hidden states (forward and backwards concatenated together) as $H=\{ h_1, h_2, ..., h_T\}$.

`hidden` is of size **[n layers * num directions, batch size, hid dim]**, where **[-2, :, :]** gives the top layer forward RNN hidden state after the final time-step (i.e. after it has seen the last word in the sentence) and **[-1, :, :]** gives the top layer backward RNN hidden state after the final time-step (i.e. after it has seen the first word in the sentence).

<font color = 'red'> 
    
As the decoder is not bidirectional, it only needs a single context vector, $z$, to use as its initial hidden state, $s_0$, and we currently have two, a forward and a backward one ($z^\rightarrow=h_T^\rightarrow$ and $z^\leftarrow=h_T^\leftarrow$, respectively). We solve this by concatenating the two context vectors together, passing them through a linear layer, $g$, and applying the $\tanh$ activation function. 

$$z=\tanh(g(h_T^\rightarrow, h_T^\leftarrow)) = \tanh(g(z^\rightarrow, z^\leftarrow)) = s_0$$

**Note**: this is actually a deviation from the paper. Instead, they feed only the first backward RNN hidden state through a linear layer to get the context vector/decoder initial hidden state. This doesn't seem to make sense to me, so we have changed it.

<font color = 'green'>
    
As we want our model to look back over the whole of the source sentence we return `outputs`, the stacked forward and backward hidden states for every token in the source sentence. We also return `hidden`, which acts as our initial hidden state in the decoder.

#### <font color = 'blue'> Encoder Embedding

In [None]:
enc_emb_dim = 5
encoder_embedding_layer = nn.Embedding(num_embeddings=len(source_vocab), 
                                 embedding_dim= enc_emb_dim)

In [None]:
encoder_embedding_layer.weight.shape
# source_vocab, enc_emb_dim

torch.Size([24, 5])

In [None]:
enc_emb = encoder_embedding_layer(source)
# source_len, batch_size, emb_dim

In [None]:
enc_emb.shape

torch.Size([16, 2, 5])

#### <font color = 'blue'> Encoder RNN

In [None]:
torch.manual_seed(0)
enc_hid_dim = 3
enc_GRU = nn.GRU(input_size = enc_emb_dim,
               hidden_size= enc_hid_dim,
                num_layers = 1,
               bidirectional = True)

In [None]:
# Only the input is passed, so PyTorch initializes the forward and backward hidden states to zeros.
enc_output, enc_hidden = enc_GRU(enc_emb)
# enc_output shape: [seq_len, batch_size, num_directions * hid_dim] -> top-layer hidden state at every time step
# enc_hidden shape: [num_layers * num_directions, batch_size, hid_dim] -> final hidden state of each layer and direction

In [None]:
enc_output.shape

torch.Size([16, 2, 6])

In [None]:
enc_output

tensor([[[-0.0241, -0.1899, -0.4941, -0.5887, -0.2889,  0.4260],
         [-0.0241, -0.1899, -0.4941, -0.6006, -0.2299,  0.5388]],

        [[-0.3531,  0.2073, -0.8475, -0.8305, -0.3723,  0.5454],
         [-0.0076, -0.2125, -0.6873, -0.8130, -0.2598,  0.7757]],

        [[ 0.1478,  0.1244, -0.8890, -0.6792, -0.1753,  0.5427],
         [ 0.2419, -0.3288, -0.7076, -0.8286, -0.3152,  0.7875]],

        [[-0.2404, -0.2158, -0.8336, -0.0195, -0.1701,  0.3666],
         [-0.3493,  0.3574, -0.8172, -0.8455, -0.1867,  0.5185]],

        [[ 0.1781, -0.2872, -0.8431,  0.2039,  0.0345,  0.2389],
         [ 0.1690,  0.2405, -0.8675, -0.7999,  0.1489,  0.5296]],

        [[ 0.2413, -0.5135, -0.6952, -0.4797, -0.1070,  0.3505],
         [ 0.0551, -0.0207, -0.6502, -0.5356,  0.2243,  0.3612]],

        [[ 0.1597, -0.3344, -0.7434, -0.7862, -0.2958,  0.0423],
         [ 0.1516, -0.1400, -0.6689, -0.2205,  0.2618,  0.1126]],

        [[ 0.0111, -0.4020, -0.3248, -0.8204, -0.2746, -0.0323],
         [ 

In [None]:
enc_hidden.shape

torch.Size([2, 2, 3])

In [None]:
enc_hidden

tensor([[[-0.2991, -0.5858, -0.5938],
         [-0.0599, -0.0271,  0.1323]],

        [[-0.5887, -0.2889,  0.4260],
         [-0.6006, -0.2299,  0.5388]]], grad_fn=<StackBackward0>)

In [None]:
enc_hidden[-2,:,:] # last seq from forward direction 

tensor([[-0.2991, -0.5858, -0.5938],
        [-0.0599, -0.0271,  0.1323]], grad_fn=<SliceBackward0>)

In [None]:
enc_hidden[-2,:,:].shape
# batch_size, hidden_dim

torch.Size([2, 3])

In [None]:
enc_hidden[-1,:,:] # last seq from backward direction 

tensor([[-0.5887, -0.2889,  0.4260],
        [-0.6006, -0.2299,  0.5388]], grad_fn=<SliceBackward0>)

#### <font color = 'blue'> Encoder feed forward

<font color = 'green'> We will now take the final hidden state from each direction, concatenate the two, and pass the result through a linear layer followed by a tanh activation. The output is used as the initial hidden state for the decoder.

In [None]:
enc_hidden_last_concat = torch.cat((enc_hidden[-2,:,:], enc_hidden[-1,:,:]), dim = 1)
# batch_size, 2* enc_hidden_dim

In [None]:
enc_hidden_last_concat

tensor([[-0.2991, -0.5858, -0.5938, -0.5887, -0.2889,  0.4260],
        [-0.0599, -0.0271,  0.1323, -0.6006, -0.2299,  0.5388]],
       grad_fn=<CatBackward0>)

In [None]:
enc_hidden_last_concat.shape

torch.Size([2, 6])

In [None]:
dec_hid_dim = 3
enc_dec_hidden_layer = nn.Linear(in_features=2*enc_hid_dim, out_features=dec_hid_dim)

dec_hidden_initial = torch.tanh(enc_dec_hidden_layer(enc_hidden_last_concat))

In [None]:
dec_hidden_initial.shape
# batch_size, dec_hid_dim

torch.Size([2, 3])

In [None]:
#dec_hidden_initial = dec_hidden_initial.unsqueeze(0)
#num_direrctions * num_layers, batch_size, dec_hid_dim

In [None]:
#dec_hidden_initial.shape

### <font color = 'blue'> Decoder

<font color = 'green'>
    
In the decoder, we start off by feeding the BOS token. We then use the predicted word as the next input, and so on.<br>    
Decoder Steps
- the initial decoder hidden state is set to be the `context` vector, $s_0 = z = h_T$
- we use a batch of `<sos>` tokens as the first `input`, $y_1$
- we then decode within a loop:
  - inserting the input token $y_t$, previous hidden state, $s_{t-1}$, and all encoder outputs, $H$, into the decoder
  - receiving a prediction, $\hat{y}_{t+1}$, and a new hidden state, $s_t$
  - we then decide if we are going to teacher force or not, setting the next input as appropriate

<font color = 'green'> We will first create a placeholder for outputs which will be updated in a for loop

In [None]:
target.shape

torch.Size([14, 2])

In [None]:
target

tensor([[ 1,  1],
        [15,  4],
        [16,  5],
        [17,  6],
        [18,  7],
        [19,  8],
        [ 9,  9],
        [20, 10],
        [21, 11],
        [22, 12],
        [23, 13],
        [24, 14],
        [14,  2],
        [ 2,  3]])

In [None]:
tar_len = target.shape[0]
batch_size = target.shape[1]
tar_vocab_size = len(target_vocab)
outputs = torch.zeros(size = (tar_len, batch_size, tar_vocab_size ))

In [None]:
outputs.shape

torch.Size([14, 2, 25])

#### <font color = 'blue'> Calculation First Word of decoder

<font color = 'green'> The input is the first token of every sentence in the batch, which corresponds to the BOS token.

In [None]:
input_dec1 = target[0, :]
# batch_size

print(input_dec1.shape)
input_dec1

torch.Size([2])


tensor([1, 1])

In [None]:
input_dec1_reshaped = input_dec1.unsqueeze(0)
input_dec1_reshaped.shape # 1, batch size

torch.Size([1, 2])

##### <font color = 'blue'> Decoder Embedding

In [None]:
dec_emb_dim = 5
dec_emb_layer = nn.Embedding(num_embeddings=len(target_vocab), embedding_dim=dec_emb_dim)

In [None]:
dec_embedded1 = dec_emb_layer(input_dec1_reshaped)

In [None]:
dec_embedded1.shape
#[1, batch_size, dec_emb_dim]

torch.Size([1, 2, 5])

##### <font color = 'blue'> Decoder Attention

![](assets/seq2seq_attention.png)

![](assets/seq2seq_attention_score.png)


<font color = 'red'> **The above figure is taken from Lena Voita's NLP course. Its notation differs from the repo's: in the repo, $h$ is used for the source language (encoder states) and $s$ is used for the target language (decoder states). We will use the repo's notation.**

<font color = 'green'> For the first word, decoder state is the initial decoder state i.e. hidden state from the encoder. This paper has used the Multi-Layer Perceptron to calculate the score.

<font color = 'green'> The multilayer perceptron has two linear layers. The encoder (source) hidden state and the decoder hidden state are concatenated and passed through the first layer, after which we apply a non-linearity (tanh). Finally, the output of the non-linearity is passed through the second linear layer. The output of the first layer has dimension dec_hid_dim. It does not have to be dec_hid_dim; it could be any value, but dec_hid_dim was used in the paper. The output of the second layer has dimension 1. This number represents how much attention word $t$ in the decoder should pay to word $k$ in the encoder.<font color ='red'> **The second linear layer does not have any bias term**.  I could not find this detail in the paper, but this is how it was implemented in this repo.


Graphically, this looks something like below. This is for calculating the very first attention vector, where $s_{t-1} = s_0 = z$. The green/teal blocks represent the hidden states from both the forward and backward RNNs, and the attention computation is all done within the pink block.

![](assets/seq2seq9.png)

In [None]:
torch.manual_seed(0)
attention_layer1 = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim) # the output does not have to be dec_hid_dim
attention_layer2 = nn.Linear(dec_hid_dim, 1, bias = False) # the input does not have to be dec_hid_dim 

In [None]:
s0 = dec_hidden_initial.clone()

In [None]:
s0.shape
# batch_size, dec_hid_dim

torch.Size([2, 3])

In [None]:
enc_output.shape
# seq_len, batch_size, 2*enc_hid_dim

torch.Size([16, 2, 6])

In [None]:
h0 = enc_output[0,:,:]

In [None]:
h0.shape
#batch_size, 2*enc_hid_dim

torch.Size([2, 6])

In [None]:
# Concatenate in the same order as the batched computation further down
# (encoder state first, then decoder state). `attention_layer1` has a separate
# weight for every input position, so a different order would produce a score
# that does not match row 0 of the batched scores.
attention_input = torch.cat((h0, s0), dim =1)
# batch_size, 2*enc_hid_dim + dec_hid_dim

In [None]:
attention_input.shape

torch.Size([2, 9])

In [None]:
attention_output_layer1 = torch.tanh(attention_layer1(attention_input))

In [None]:
attention_output_layer1.shape
#batch_size, dec_hidden_dim

torch.Size([2, 3])

In [None]:
attention_output_layer2 = attention_layer2(attention_output_layer1 )

In [None]:
attention_output_layer2.shape
#batch_size, 1

torch.Size([2, 1])

In [None]:
attention_output_layer2

tensor([[0.0090],
        [0.0447]], grad_fn=<MmBackward0>)

<font color = 'green'> This gives us the score for the pair (s0, h0). We need to calculate the score of s0 with every encoder state hk, so we need to pass all the pairs (s0, h0), (s0, h1), ..., (s0, hk) through the two linear layers. We can do this in one step by repeating s0 once per source position (src_len times), concatenating it with the encoder outputs, and passing the result through the two linear layers.

In [None]:
enc_output.shape
# seq_len, batch_size, 2* enc_hid_dim

torch.Size([16, 2, 6])

In [None]:
src_len = enc_output.shape[0]
batch_size = enc_output.shape[1]

In [None]:
s0.shape

torch.Size([2, 3])

In [None]:
s0 = s0.unsqueeze(0)
s0.shape
#1, batch_size, dec_hid_dim

torch.Size([1, 2, 3])

In [None]:
s0 = s0.repeat(src_len, 1, 1)

In [None]:
s0.shape
#src_len, batch_size, dec_hid_dim

torch.Size([16, 2, 3])

In [None]:
attention_input_all = torch.cat(( enc_output, s0), dim = 2)
#src_len, batch_size, 2* enc_hid_dim + dec_hid_dim
attention_input_all.shape

torch.Size([16, 2, 9])

In [None]:
attention_output_layer1_all = torch.tanh(attention_layer1(attention_input_all))
attention_output_layer1_all.shape
#src_len, batch_size, dec_hid_dim

torch.Size([16, 2, 3])

In [None]:
attention_output_layer2_all = attention_layer2(attention_output_layer1_all )
attention_output_layer2_all.shape
#src_len, batch_size, 1

torch.Size([16, 2, 1])

In [None]:
attention_output_layer2_all = attention_output_layer2_all.squeeze(2)

In [None]:
attention_output_layer2_all.shape
#src_len, batch_size

torch.Size([16, 2])

In [None]:
attention_output_layer2_all

tensor([[ 0.0525,  0.0399],
        [ 0.0358,  0.0485],
        [ 0.0617,  0.0889],
        [-0.0816,  0.0357],
        [-0.0597,  0.0751],
        [ 0.0827,  0.0397],
        [ 0.1918,  0.0347],
        [ 0.1889, -0.0279],
        [ 0.1430, -0.0607],
        [ 0.1714,  0.1194],
        [ 0.1497,  0.0929],
        [ 0.1502,  0.1717],
        [ 0.1482,  0.1657],
        [ 0.1435,  0.1212],
        [ 0.1329,  0.1289],
        [ 0.1039,  0.1560]], grad_fn=<SqueezeBackward1>)

In [None]:
import torch.nn.functional as F
attention_prob0_all = F.softmax(attention_output_layer2_all, dim=0)

In [None]:
attention_prob0_all

tensor([[0.0594, 0.0601],
        [0.0584, 0.0606],
        [0.0599, 0.0631],
        [0.0519, 0.0599],
        [0.0531, 0.0623],
        [0.0612, 0.0601],
        [0.0682, 0.0598],
        [0.0680, 0.0562],
        [0.0650, 0.0544],
        [0.0669, 0.0651],
        [0.0654, 0.0634],
        [0.0655, 0.0686],
        [0.0653, 0.0682],
        [0.0650, 0.0652],
        [0.0643, 0.0657],
        [0.0625, 0.0675]], grad_fn=<SoftmaxBackward0>)

In [None]:
attention_prob0_all 
#src_len, batch_size

tensor([[0.0594, 0.0601],
        [0.0584, 0.0606],
        [0.0599, 0.0631],
        [0.0519, 0.0599],
        [0.0531, 0.0623],
        [0.0612, 0.0601],
        [0.0682, 0.0598],
        [0.0680, 0.0562],
        [0.0650, 0.0544],
        [0.0669, 0.0651],
        [0.0654, 0.0634],
        [0.0655, 0.0686],
        [0.0653, 0.0682],
        [0.0650, 0.0652],
        [0.0643, 0.0657],
        [0.0625, 0.0675]], grad_fn=<SoftmaxBackward0>)

In [None]:
attention_prob0_all.sum(dim = 0)

tensor([1., 1.], grad_fn=<SumBackward1>)

##### <font color = 'blue'> Decoder GRU (RNN)

The image below shows decoding the first word in an example translation.

![](assets/seq2seq10.png)

The green/teal blocks show the forward/backward encoder RNNs which output $H$, the red block shows the context vector, $z = h_T = \tanh(g(h^\rightarrow_T,h^\leftarrow_T)) = \tanh(g(z^\rightarrow, z^\leftarrow)) = s_0$, the blue block shows the decoder RNN which outputs $s_t$, the purple block shows the linear layer, $f$, which outputs $\hat{y}_{t+1}$ and the orange block shows the calculation of the weighted sum over $H$ by $a_t$ and outputs $w_t$. Not shown is the calculation of $a_t$.

In [None]:
enc_output.shape
# seq_len, batch_size, 2* hid_dim

torch.Size([16, 2, 6])

In [None]:
attention_prob0_all.shape
# Seq_len, batch_size


torch.Size([16, 2])

In [None]:
enc_output = enc_output.permute(1,0,2)
# batch_size, seq_len, 2* hid_dim

In [None]:
enc_output.shape

torch.Size([2, 16, 6])

In [None]:
attention_prob0_all = attention_prob0_all.permute(1,0).unsqueeze(1)

In [None]:
attention_prob0_all.shape
# batchsize, 1, seq_len

torch.Size([2, 1, 16])

In [None]:
w0 = torch.matmul(attention_prob0_all, enc_output)

In [None]:
w0.shape
#batch_size, 1, 2*enc_hid_dim

torch.Size([2, 1, 6])

In [None]:
w0 = w0.permute(1,0,2)
#1, batch_size, 2* enc_hid_dim 

In [None]:
w0.shape

torch.Size([1, 2, 6])

In [None]:
dec_embedded1.shape
#[1, batch_size, dec_emb_dim]

torch.Size([1, 2, 5])

In [None]:
decoder_rnn_input = torch.cat((dec_embedded1, w0), dim = 2)
decoder_rnn_input.shape

torch.Size([1, 2, 11])

In [None]:
# The decoder GRU consumes the embedded input token concatenated with the
# attention-weighted context vector w0, hence the input size below.
# Its hidden size must be dec_hid_dim (not enc_hid_dim): the initial hidden
# state `dec_hidden_initial` and `decoder_linear_layer` are both sized with
# dec_hid_dim. The two values happen to be equal (3) in this notebook.
decoder_rnn = torch.nn.GRU(input_size=dec_emb_dim + 2 * enc_hid_dim,
                           hidden_size=dec_hid_dim,
                           num_layers=1,
                           bidirectional=False,
                           batch_first=False)
                  

In [None]:
 decoder_rnn_input.shape
 # seq_len, batch_size, dec_emb_dim + 2 * enc_hid_dim

torch.Size([1, 2, 11])

In [None]:
dec_hidden_initial.unsqueeze(0).shape
# num_dierctions * num_layers, batch_size, dec_hid_dim

torch.Size([1, 2, 3])

In [None]:
decoder_rnn_output, decoder_rnn_hidden = decoder_rnn(decoder_rnn_input,
                                                     dec_hidden_initial.unsqueeze(0))

In [None]:
decoder_rnn_output.shape
# seq_len, batch_size, hidden_dim

torch.Size([1, 2, 3])

In [None]:
print(dec_embedded1.shape, decoder_rnn_output.shape, w0.shape)

torch.Size([1, 2, 5]) torch.Size([1, 2, 3]) torch.Size([1, 2, 6])


In [None]:
decoder_linear_layer = nn.Linear(in_features= dec_hid_dim + dec_emb_dim + 2*enc_hid_dim,
                                 out_features=len(target_vocab))

In [None]:
final_layer_input = torch.cat((dec_embedded1.squeeze(0) , decoder_rnn_output.squeeze(0) , w0.squeeze(0) ), dim =1)

In [None]:
final_layer_input.shape

torch.Size([2, 14])

In [None]:
predicted_output = decoder_linear_layer(final_layer_input)

In [None]:
predicted_output.shape

torch.Size([2, 25])

In [None]:
outputs_updated = outputs.clone()

In [None]:
outputs[0:3, :, 0:5]

tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]])

In [None]:
outputs_updated[1] = predicted_output

In [None]:
outputs_updated[0:3, :, 0:5]

tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[-0.4730,  0.3017,  0.2717,  0.5852, -0.5424],
         [-0.3806,  0.2242,  0.2834,  0.6534, -0.5980]],

        [[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<SliceBackward0>)

In [None]:
teacher_forcing_ratio = 0.5

In [None]:
# NOTE(review): torch.manual_seed seeds PyTorch's generator only. The draw
# below comes from Python's `random` module, whose state was set by
# random.seed(SEED) near the top of the notebook, so this call does not by
# itself make the draw reproducible. Consider random.seed(...) here if a
# fixed outcome is intended.
torch.manual_seed(4)
# Teacher forcing is applied to this step when the uniform draw falls below the ratio.
teacher_force = random.random() < teacher_forcing_ratio
teacher_force

False

In [None]:
#get the highest predicted token from our predictions
top1 = predicted_output.argmax(1) 

#if teacher forcing, use actual next token as next input
#if not, use predicted token
input = target[1] if teacher_force else top1
input

tensor([18, 20])

In [None]:
print(top1)
print(target[1])

tensor([18, 20])
tensor([15,  4])


<font size = 4, color = 'red'> Repeat the decoder steps in a for loop and update the output for all the tokens. Let us assume that we have updated the output. Next, we will calculate the loss.

### <font color = 'blue'> Loss Calculation

As stated before, our decoder loop starts at 1, not 0. This means the 0th element of our `outputs` tensor remains all zeros. So our `trg` and `outputs` look something like:

$$\begin{align*}
\text{trg} = [<sos>, &y_1, y_2, y_3, <eos>]\\
\text{outputs} = [0, &\hat{y}_1, \hat{y}_2, \hat{y}_3, <eos>]
\end{align*}$$

Here, when we calculate the loss, we cut off the first element of each tensor to get:

$$\begin{align*}
\text{trg} = [&y_1, y_2, y_3, <eos>]\\
\text{outputs} = [&\hat{y}_1, \hat{y}_2, \hat{y}_3, <eos>]
\end{align*}$$

In [None]:
print(target.shape, outputs_updated.shape)

torch.Size([14, 2]) torch.Size([14, 2, 25])


In [None]:
# Drop the first time step (<BOS> in target, the untouched all-zero row in
# outputs) and flatten the time and batch axes into one for CrossEntropyLoss.
# NOTE(review): both names are overwritten in place, so this cell is not
# idempotent - re-running it without first re-creating `target` and
# `outputs_updated` fails on the 2-D / 3-D slicing.
target = target[1:,:].view(-1)
outputs_updated = outputs_updated[1:,:,:].view(-1, len(target_vocab))
print(target.shape, outputs_updated.shape)

torch.Size([26]) torch.Size([26, 25])


In [None]:
TRG_PAD_IDX = target_vocab['<PAD>']
TRG_PAD_IDX

3

In [None]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
loss = criterion(outputs_updated, target)

In [None]:
loss

tensor(3.2565, grad_fn=<NllLossBackward0>)