# <font color = 'blue'>1 - Sequence to Sequence Learning with Neural Networks

# <font color = 'blue'>Import Libraries



In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
from pathlib import Path
data_folder = Path('/home/harpreet/Insync/google_drive_shaannoor/Data/NLP')
#data_folder = Path('/content/drive/MyDrive/Data/NLP')

In [4]:
#!pip install -U spacy

In [3]:
import torchtext, torch, spacy
torchtext.__version__, torch.__version__, torch.cuda.is_available(), spacy.__version__

('0.11.0', '1.10.0', True, '3.2.4')

# <font color = 'blue'> Set Seeds

In [4]:
import random
import numpy as np
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# <font color = 'blue'> Load Data and Tokenize

In [7]:
#!python -m spacy download en_core_web_sm

In [8]:
#!python -m spacy download de_core_news_sm

In [5]:
nlp_de = spacy.load('de_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')

<font color ='green'>
Next, we create the tokenizer functions. These can be passed to torchtext and will take in the sentence as a string and return the sentence as a list of tokens.

<font color = 'red'>**In the paper we are implementing, they find it beneficial to reverse the order of the input which they believe "introduces many short term dependencies in the data that make the optimization problem much easier".We copy this by reversing the German sentence after it has been transformed into a list of tokens.**</font>

<font color ='green'> Next, we download and load the train, validation and test data. 
The dataset we'll be using is the [Multi30k dataset](https://github.com/multi30k/dataset). This is a dataset with ~30,000 parallel English, German and French sentences, each with ~12 words per sentence. 

In [10]:
#!mkdir /home/harpreet/Insync/google_drive_shaannoor/Data/NLP/Multi30k

In [9]:
# for this example we are choosing only first two rows
import pandas as pd
from torchtext.datasets import Multi30k
for split in ['train', 'valid', 'test']:
    df[split] = pd.DataFrame(Multi30k(root= data_folder/'Multi30k', 
                                      split=split,language_pair=('de', 'en')))[0:2]

In [10]:
pd.set_option('display.max_colwidth', 300)

In [11]:
pd.set_option('display.max_rows', 15)

In [12]:
df['train']

Unnamed: 0,0,1
0,Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.\n,"Two young, White males are outside near many bushes.\n"
1,Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.\n,Several men in hard hats are operating a giant pulley system.\n


In [14]:
def tokenizer(data_array, spacy_model, reverse = False):
    token_list =[]
    disabled = spacy_model.select_pipes(disable= ['tok2vec', 'tagger', 
                                                  'parser', 'attribute_ruler', 
                                                  'lemmatizer', 'ner'])
    for doc in nlp_en.pipe(data_array, batch_size=1000, n_process=-1):
        tokens = [token.text.lower() for token in doc if token.text not in ['\n']] 
        if reverse:
            tokens.reverse()
        token_list.append(tokens)
    disabled.restore()
    return token_list                         

In [15]:
for split in ['train', 'valid', 'test']:
    df[split]['source_tokens'] = tokenizer(df[split].iloc[:,0].values, 
                                           nlp_de, reverse = True)
    df[split]['target_tokens'] = tokenizer(df[split].iloc[:,1].values, nlp_de)
    df[split] = df[split][['source_tokens', 'target_tokens']]

In [16]:
df['train']

Unnamed: 0,source_tokens,target_tokens
0,"[., büsche, vieler, nähe, der, in, freien, m, i, sind, männer, weiße, junge, zwei]","[two, young, ,, white, males, are, outside, near, many, bushes, .]"
1,"[., antriebsradsystem, ein, bedienen, schutzhelmen, mit, männer, mehrere]","[several, men, in, hard, hats, are, operating, a, giant, pulley, system, .]"


In [18]:
print(df['train']['source_tokens'][0])

['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'm', 'i', 'sind', 'männer', 'weiße', 'junge', 'zwei']


# <font color = 'blue'> Build Vocab

In [27]:
from collections import Counter
from torchtext.vocab import vocab
def create_vocab(text, min_freq, specials):
    my_counter = Counter()
    for line in text:
       my_counter.update(line)
    my_vocab = vocab(my_counter, min_freq=min_freq)
    for i, special in enumerate(specials):
        my_vocab.insert_token(special, i)
    my_vocab.set_default_index(0)
    return my_vocab

In [28]:
source_vocab = create_vocab(text = df['train']['source_tokens'], 
                            min_freq=1, 
                            specials=['<unk>', '<BOS>', '<EOS>', '<PAD>'])

In [29]:
len(source_vocab)

24

In [30]:
pd.set_option('display.max_rows', 30)

In [31]:
pd.DataFrame(source_vocab.get_stoi().items(), columns=['tokens', 'index']).sort_values(by = ['index'])

Unnamed: 0,tokens,index
8,<unk>,0
20,<BOS>,1
10,<EOS>,2
15,<PAD>,3
12,.,4
18,büsche,5
21,vieler,6
16,nähe,7
19,der,8
11,in,9


In [32]:
# check index of unknown word - it should be zero
source_vocab['abracdabra']

0

In [33]:
target_vocab = create_vocab(df['train']['target_tokens'], 1, ['<unk>', '<BOS>', '<EOS>', '<PAD>'])

In [34]:
len(target_vocab)

25

In [35]:
pd.DataFrame(target_vocab.get_stoi().items(), columns=['tokens', 'index']).sort_values(by = ['index'])

Unnamed: 0,tokens,index
22,<unk>,0
10,<BOS>,1
11,<EOS>,2
12,<PAD>,3
16,two,4
14,young,5
15,",",6
23,white,7
17,males,8
20,are,9


# <font color = 'blue'> Create Dataset and Dataloader

In [37]:
from torch.utils.data import Dataset
class EngGerman(Dataset):
    def __init__(self, X1, X2):
        self.X1 = X1
        self.X2 = X2
        
    def __len__(self):
        return len(self.X1)
    
    def __getitem__(self, indices):
        return (self.X1.iloc[indices] , self.X2.iloc[indices]) 

In [38]:
trainset = EngGerman(df['train']['source_tokens'], df['train']['target_tokens'])
testset = EngGerman(df['test']['source_tokens'], df['test']['target_tokens'])
validset = EngGerman(df['valid']['source_tokens'], df['valid']['target_tokens'])

In [39]:
trainset.__getitem__(0)

(['.',
  'büsche',
  'vieler',
  'nähe',
  'der',
  'in',
  'freien',
  'm',
  'i',
  'sind',
  'männer',
  'weiße',
  'junge',
  'zwei'],
 ['two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.'])

In [41]:
len(trainset), len(testset), len(validset)

(2, 2, 2)

<font color = 'green'> Convert words into indices, add index of <BOS> in the beginning and add index of <EOS> at the end.

In [42]:
def text_transform (my_vocab, text):
     text_numerical = [my_vocab[token] for token in text]
     return torch.tensor([source_vocab['<BOS>']] + text_numerical + [source_vocab['<EOS>']])
     #return list(source_vocab['<BOS>']) + text_numerical + list(source_vocab['<EOS>'])

In [44]:
text = trainset.__getitem__(1)[1]
print(text)
text_transform(target_vocab, text)

['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']


tensor([ 1, 15, 16, 17, 18, 19,  9, 20, 21, 22, 23, 24, 14,  2])

<font color ='green'> Add  padding to batch of sentences using pad_sequence

In [47]:
seq= [torch.tensor([1,2,4]), torch.tensor([3,4,5,6,7])]

In [50]:
from torch.nn.utils.rnn import pad_sequence
padded_seq = pad_sequence(seq,batch_first=True,padding_value = -10)
padded_seq

tensor([[  1,   2,   4, -10, -10],
        [  3,   4,   5,   6,   7]])

In [51]:
padded_seq.shape # batch, seq len

torch.Size([2, 5])

In [52]:
padded_seq = pad_sequence(seq,batch_first=False,padding_value = -10)

In [53]:
padded_seq

tensor([[  1,   3],
        [  2,   4],
        [  4,   5],
        [-10,   6],
        [-10,   7]])

In [54]:
padded_seq.shape # seq_len, batch_size

torch.Size([5, 2])

<font color = 'green'> Creare a function to specify transformations for a batch

In [58]:
def collate_batch(batch):
   source_list, target_list = [], []
   for (source_text, target_text) in batch:
        source_transform = text_transform(source_vocab, source_text)
        source_list.append(source_transform)
        target_transform =text_transform(target_vocab, target_text)
        target_list.append(target_transform)
        
   source_pad = pad_sequence(source_list, padding_value=3.0)
   target_pad = pad_sequence(target_list, padding_value=3.0)
   #print(source_list)
   return (source_pad, target_pad)

In [60]:
from torch.utils.data import DataLoader
batch_size = 2

train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collate_batch)

In [61]:
for source, target in train_loader:
  print(source)
  print(target)
  break

tensor([[ 1,  1],
        [ 4,  4],
        [ 5, 18],
        [ 6, 19],
        [ 7, 20],
        [ 8, 21],
        [ 9, 22],
        [10, 14],
        [11, 23],
        [12,  2],
        [13,  3],
        [14,  3],
        [15,  3],
        [16,  3],
        [17,  3],
        [ 2,  3]])
tensor([[ 1,  1],
        [ 4, 15],
        [ 5, 16],
        [ 6, 17],
        [ 7, 18],
        [ 8, 19],
        [ 9,  9],
        [10, 20],
        [11, 21],
        [12, 22],
        [13, 23],
        [14, 24],
        [ 2, 14],
        [ 3,  2]])


In [62]:
batch_size = 2

train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collate_batch)
valid_loader = DataLoader(validset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collate_batch)
test_loader = DataLoader(testset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collate_batch)

# <font color = 'blue'> Breakdown of forward loop

## <font color = 'blue'> Get Source and Target sentences

In [63]:
source, target = next(iter(train_loader))

In [64]:
source.shape
# source_len, batch_size

torch.Size([16, 2])

In [65]:
print(source)

tensor([[ 1,  1],
        [ 4,  4],
        [18,  5],
        [19,  6],
        [20,  7],
        [21,  8],
        [22,  9],
        [14, 10],
        [23, 11],
        [ 2, 12],
        [ 3, 13],
        [ 3, 14],
        [ 3, 15],
        [ 3, 16],
        [ 3, 17],
        [ 3,  2]])


In [66]:
print(target)

tensor([[ 1,  1],
        [15,  4],
        [16,  5],
        [17,  6],
        [18,  7],
        [19,  8],
        [ 9,  9],
        [20, 10],
        [21, 11],
        [22, 12],
        [23, 13],
        [24, 14],
        [14,  2],
        [ 2,  3]])


In [67]:
batch_size = target.shape[1]
print(batch_size)

2


In [68]:
trg_len = target.shape[0]
print(trg_len)

14


In [69]:
# tensors to store decoder output
# shape of outputs target length X batch size X vocab size
outputs = torch.zeros(trg_len, batch_size, len(target_vocab))
print(outputs.shape)

torch.Size([14, 2, 25])


## <font color = 'blue'> Encoder Loop
<font color = 'green'> So our encoder looks something like this: 

![](assets/seq2seq2.png)

<font color = 'green'> 
We will first pass the input through an embedding layer. Initially, the word is represented by a vector of size of vocab (one hot encoding). We do a matrix multiplication of onehot vector with the embedding matrix. The dimension of the matrix will be (vocab_size , emn_dim). So this will convert the word vector from vocab_size to emd_dim size. The embedding matrix is a learnable parameter. We do not create one-hot vectors because multiplying one-hot vector with matrix is equivalent to choosing the row where the value of vector was one. For example Let use assume that vocab size is 6 and emb_dim is 3. One hot vector for fourth word in vocab is is [0,0,0,1,0,0] and we do a matrix multiplication of this vector wih matrix of size(6,3)we will essentially just pick 4th row from the matrix. hat is why the input to out model are indexes of the words. 


nn.Embedding layer will initilaize the weight matrix randomly. The matrix will be updated during backward pass.

In [71]:
import torch.nn as nn
input_dim = len(source_vocab) # each words is represented by vector of vocab size
emb_dim = 5 # we get a vector which will represent the word with a vector of size 5 - usually this in range from 100 to 500.

source_embedding = nn.Embedding(input_dim, emb_dim, padding_idx=3)

### <font color = 'blue'>  Embedding Layer

In [72]:
source_embedding

Embedding(24, 5, padding_idx=3)

In [73]:
source_embedding.weight

Parameter containing:
tensor([[-0.3144, -0.4963,  0.1235, -0.1510, -1.3892],
        [-0.6208, -0.0242, -1.0402,  1.4635, -0.7477],
        [ 1.7456, -1.2779, -1.0257,  1.7473,  0.8090],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9717, -0.5150,  1.4255,  0.7987, -2.5273],
        [ 1.4778, -0.1696, -0.9919, -1.4569,  0.2563],
        [-0.4030,  0.4195,  0.7667,  0.0190,  0.0220],
        [ 1.1532, -0.3393,  0.1559,  0.8966, -0.2968],
        [-0.6857, -0.0496, -1.2485, -0.8509, -0.7690],
        [-1.5606, -0.5309,  0.2178, -0.2833, -0.5660],
        [ 0.3566, -0.4535, -0.2971, -1.5380, -1.0248],
        [-0.3781,  0.3910,  0.5158, -1.0042,  0.9860],
        [ 1.1334,  0.8504,  1.0534,  0.3692, -0.0552],
        [-0.6125,  0.7500, -0.7346,  0.4622,  1.1759],
        [ 0.2145,  0.5362,  0.1365, -2.3332,  1.5308],
        [ 0.2680,  0.4505, -0.2725, -1.7399,  0.1299],
        [-0.5630, -0.2829,  0.0731, -1.3880, -0.2678],
        [-0.1254, -1.5038, -0.3287,  0.4360

In [74]:
embedding_output = source_embedding(source)

In [75]:
source
# source_len, batch_size

tensor([[ 1,  1],
        [ 4,  4],
        [18,  5],
        [19,  6],
        [20,  7],
        [21,  8],
        [22,  9],
        [14, 10],
        [23, 11],
        [ 2, 12],
        [ 3, 13],
        [ 3, 14],
        [ 3, 15],
        [ 3, 16],
        [ 3, 17],
        [ 3,  2]])

<font color = 'green'> 
We can see below that embedding_output is simply the corresponding rows from embedding matrix. In a way we can say that a word in a matrxi represent a vector for word which the network will learn. Usually the matrix is initialized randomly but we can use pretrained embeddings like word2vec to initilaize the matrix as well.

In [76]:
embedding_output

tensor([[[-0.6208, -0.0242, -1.0402,  1.4635, -0.7477],
         [-0.6208, -0.0242, -1.0402,  1.4635, -0.7477]],

        [[ 0.9717, -0.5150,  1.4255,  0.7987, -2.5273],
         [ 0.9717, -0.5150,  1.4255,  0.7987, -2.5273]],

        [[-1.2222, -0.2746, -0.3450, -0.7162,  0.5781],
         [ 1.4778, -0.1696, -0.9919, -1.4569,  0.2563]],

        [[ 0.3805, -1.4538, -2.6740,  1.5984,  0.8021],
         [-0.4030,  0.4195,  0.7667,  0.0190,  0.0220]],

        [[-0.3511, -0.0670, -0.0534, -0.8315, -0.2632],
         [ 1.1532, -0.3393,  0.1559,  0.8966, -0.2968]],

        [[-0.5432, -1.6406,  0.9295,  1.2907,  0.2612],
         [-0.6857, -0.0496, -1.2485, -0.8509, -0.7690]],

        [[-0.5862, -1.5105, -2.0155,  0.6964, -0.6676],
         [-1.5606, -0.5309,  0.2178, -0.2833, -0.5660]],

        [[ 0.2145,  0.5362,  0.1365, -2.3332,  1.5308],
         [ 0.3566, -0.4535, -0.2971, -1.5380, -1.0248]],

        [[-0.8424,  0.5289, -0.5447,  0.8097,  1.1226],
         [-0.3781,  0.3910,  0.5

In [77]:
embedding_output.shape # sequence_len , batch , embed_dim

torch.Size([16, 2, 5])

<font color = 'green'> 
We will pass the embedding_output through a dropout layer. The dropout layer has no learnable parameters.

### <font color = 'blue'>  Dropout Layer

In [82]:
encoder_dropout = nn.Dropout(p=0.5)

In [83]:
encoder_dropout_output = encoder_dropout(embedding_output)

As we can see the dropout layer randomly makes 50% of neurons zero and divided everything else by 0.5 i.e multiplies by 2.

In [84]:
encoder_dropout_output

tensor([[[-1.2415, -0.0483, -0.0000,  2.9269, -1.4953],
         [-0.0000, -0.0000, -2.0805,  0.0000, -1.4953]],

        [[ 0.0000, -0.0000,  2.8511,  1.5974, -0.0000],
         [ 1.9434, -1.0300,  2.8511,  0.0000, -0.0000]],

        [[-0.0000, -0.5493, -0.6900, -0.0000,  1.1562],
         [ 2.9556, -0.3392, -1.9837, -2.9138,  0.5126]],

        [[ 0.0000, -0.0000, -5.3480,  3.1967,  1.6043],
         [-0.8061,  0.8391,  1.5334,  0.0381,  0.0000]],

        [[-0.7023, -0.1340, -0.0000, -1.6631, -0.0000],
         [ 2.3064, -0.6787,  0.3118,  0.0000, -0.5936]],

        [[-0.0000, -0.0000,  0.0000,  0.0000,  0.5224],
         [-0.0000, -0.0993, -0.0000, -1.7017, -0.0000]],

        [[-0.0000, -0.0000, -0.0000,  1.3927, -0.0000],
         [-0.0000, -0.0000,  0.4355, -0.5667, -0.0000]],

        [[ 0.4289,  1.0724,  0.2730, -4.6663,  3.0616],
         [ 0.0000, -0.9070, -0.0000, -0.0000, -2.0497]],

        [[-1.6849,  0.0000, -0.0000,  0.0000,  0.0000],
         [-0.0000,  0.7820,  0.0

### <font color = 'blue'> LSTM Layer

<img src ="https://drive.google.com/uc?export=view&id=1R9q3bdONSRy57UjhXyznOSpq-1o5brmD" width = 500 >

<font size =4, color = 'green'> nn.LSTM - Parameters
<br>
<font size =4, color = 'green'>nn.LSTM( input_size, hidden_size, num_layers, non-linearity, bias, dropout, bidirectional, batch_first)

<font color = 'green'>
    
- input_size – The number of expected features in the input  (embedding size)

- hidden_size – The number of features in the hidden state h

- num_layers – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two RNNs together to form a stacked RNN, with the second RNN taking in outputs of the first RNN and computing the final results. Default: 1

- bias – If False, then the layer does not use bias weights b_ih and b_hh. Default: True

- batch_first – If True, then the input and output tensors are provided as (batch, seq, feature). Default: False. Recommended to chnage this to batch_first.

- dropout – If non-zero, introduces a Dropout layer on the outputs of each RNN layer except the last layer, with dropout probability equal to dropout. Default: 0

- bidirectional – If True, becomes a bidirectional RNN. Default: False

<font size =4, color = 'green'> LSTM layer needs three inputs -

<font color = 'green'> 
1. input  (if batch_first = True then then the shape should be - Batch size, Sequence Length , input size
 <br><br> In our case - sequence length will be number of tokens in a sentence and input size will be size of embedding.  <br><br> 

2. h0 (initial hidden state) - (num_layers * num_directions, batch size, hidden_size)
3. c0 (initial hidden state) - (num_layers * num_directions, batch size, hidden_size)
<br><br>  By deafult both h0 and c0 are initilaized as tensors of zeros. 

  batch_first does not apply to h0 and c0

<font size =4, color = 'green'>Output from LSTM layers (three outputs) 
    
<font color = 'green'> 
    
1. output of shape (seq_len, batch, num_directions * hidden_size): tensor containing the output features (h_t) from the last layer of the RNN, for each t.
<br><br> if batch_first = True <br><br> 
Batch size X Sequence Length X num_directions * Hidden size <br><br> 

2. (h_n and c_n) of shape (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t = seq_len.
    

<font size =4, color = 'green'> Shape of weight metrices
    
<font color = 'green'> 
    
- LSTM.weight_ih_l[k] – the learnable input-hidden weights of the k-th layer, of shape ( **4*hidden_size**, input_size) for k = 0. Otherwise, the shape is (**4*hidden_size**, num_directions * hidden_size) <br><br>

- LSTM.weight_hh_l[k] – the learnable hidden-hidden weights of the k-th layer, of shape (**4*hidden_size**, hidden_size) <br><br>

- LSTM.bias_ih_l[k] – the learnable input-hidden bias of the k-th layer, of shape (**4*hidden_size**) <br><br>

- LSTM.bias_hh_l[k] – the learnable hiddn-hidden bias of the k-th layer, of shape (**4*hidden_size**) <br><br>

 

In [78]:
hid_dim = 3 # size of vector of each word from LSTM layer
n_layers = 2
encoder_lstm_layer = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = 0.3)

In [79]:
encoder_lstm_layer

LSTM(5, 3, num_layers=2, dropout=0.3)

In [80]:
for name, param in encoder_lstm_layer.named_parameters():
    print(name, param.data.shape)

weight_ih_l0 torch.Size([12, 5])
weight_hh_l0 torch.Size([12, 3])
bias_ih_l0 torch.Size([12])
bias_hh_l0 torch.Size([12])
weight_ih_l1 torch.Size([12, 3])
weight_hh_l1 torch.Size([12, 3])
bias_ih_l1 torch.Size([12])
bias_hh_l1 torch.Size([12])


In [85]:
encoder_dropout_output.shape 
# [src len, batch size, emb dim]

torch.Size([16, 2, 5])

In [86]:
h0 = torch.zeros(2,2,3) # layers , batch_size, hidden_size
c0 = torch.zeros(2,2,3) # layers , batch_size, hidden_size

In [87]:
# encoder_dropout_output = [src len, batch size, emb dim]

encoder_outputs, (encoder_hidden, encoder_cell) = encoder_lstm_layer(encoder_dropout_output, (h0, c0))

#outputs = [src len, batch size, hid dim ] -->  all sequences from last layer
#hidden = [n layers , batch size, hid dim] --> last sequence from all the layers
#cell = [n layers , batch size, hid dim] --> last sequence from all the layer

#outputs are always from the top hidden layer

In [88]:
encoder_outputs.shape, encoder_hidden.shape, encoder_cell.shape

(torch.Size([16, 2, 3]), torch.Size([2, 2, 3]), torch.Size([2, 2, 3]))

## <font color ='blue'> Decoder

<font color ='green'>
    
Next, we'll build our decoder, which will also be a 2-layer (4 in the paper) LSTM.

![](assets/seq2seq3.png)

The `Decoder` class does a single step of decoding, i.e. it ouputs single token per time-step. The first layer will receive a hidden and cell state from the previous time-step, $(s_{t-1}^1, c_{t-1}^1)$, and feeds it through the LSTM with the current embedded token, $y_t$, to produce a new hidden and cell state, $(s_t^1, c_t^1)$. The subsequent layers will use the hidden state from the layer below, $s_t^{l-1}$, and the previous hidden and cell states from their layer, $(s_{t-1}^l, c_{t-1}^l)$. This provides equations very similar to those in the encoder.

$$\begin{align*}
(s_t^1, c_t^1) = \text{DecoderLSTM}^1(d(y_t), (s_{t-1}^1, c_{t-1}^1))\\
(s_t^2, c_t^2) = \text{DecoderLSTM}^2(s_t^1, (s_{t-1}^2, c_{t-1}^2))
\end{align*}$$

Remember that the initial hidden and cell states to our decoder are our context vectors, which are the final hidden and cell states of our encoder from the same layer, i.e. $(s_0^l,c_0^l)=z^l=(h_T^l,c_T^l)$.



In [92]:
#first input to the decoder is the <sos> tokens
input_de = target[0,:]

In [95]:
input_de.shape
# batch_size

torch.Size([2])

In [96]:
input_de
# batch_size

tensor([1, 1])

In [97]:
input_de = input_de.unsqueeze(0)
input_de
# 1, batch_sise
# 1 here represents the sequence length

tensor([[1, 1]])

In [98]:
input_de.shape # [1, batch size] - since we have  one word at a time seq length is 1

torch.Size([1, 2])

In [100]:
output_dim = len(target_vocab)
emb_dim = 5
n_layers = 2
hid_dim=3

decoder_embedding = nn.Embedding(output_dim, emb_dim)
decoder_dropout = nn.Dropout(p=0.5)
decoder_lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = 0.3)

In [101]:
dec_output1, (dec_hidden1, dec_cell1)= decoder_lstm(
                          decoder_dropout(decoder_embedding(input_de)),
                          (encoder_hidden, encoder_cell))

In [103]:
print(dec_output1.shape) # seq_len, batch_size, hidden_dim
print(dec_hidden1.shape) # num_layers, batch_size, hidden_dim
print(dec_cell1.shape)  # num_layers, batch_size, hidden_dim

torch.Size([1, 2, 3])
torch.Size([2, 2, 3])
torch.Size([2, 2, 3])


<font color ='green'>
For deocder we are passing one word at a time. We are intrested in the output from the last layer. We can use the dec_output for this OR we can extract last  element of hidden cell

In [104]:
dec_output1 

tensor([[[ 0.1206,  0.2369, -0.1888],
         [ 0.1018,  0.2435, -0.1666]]], grad_fn=<StackBackward0>)

In [105]:
dec_hidden1[-1,:,:]

tensor([[ 0.1206,  0.2369, -0.1888],
        [ 0.1018,  0.2435, -0.1666]], grad_fn=<SliceBackward0>)

<font color ='green'>
Since we need to estimate probbaility we will need to change the dimension of the token back to length of the vocab size. We can then use the softmax to get probbailities. This will give us the predicted probabilities. The prediction will be the word corresponding to maximum probbaility. Since we know the actual word we can then calculate the loss. The actual values wil be one hot encoding and predictd values will be probbailities.

<font color ='green'>
We will use the nn.linear layer to change the output from shape (2,3) to shape (2,len(target_vocab)). The nn.Linear layer will initilaize a weight matrix of the shape (vocab_size, hid_dim). It then does the follwoing transformation : $y = xW^T + b$

In [106]:
decoder_linear = nn.Linear(in_features=3, out_features=len(target_vocab))

In [107]:
for name, param in decoder_linear.named_parameters():
    print(name, param.data.shape)

weight torch.Size([25, 3])
bias torch.Size([25])


In [108]:
dec_output1.shape

torch.Size([1, 2, 3])

Linear layer will need two dimenional input. We will squeeze out the redundant dimension

In [109]:
dec_output1 = dec_output1.squeeze(0)

In [110]:
dec_output1.shape

torch.Size([2, 3])

<font color ='green'>
So the shape is (2,3). We will multiply it with a matrix of (3, 25) (we are taking the transpose of the weight matrix to get out put of 2, 25 ---> so word in each batch is converted into vector of vocab size

In [111]:
prediction1 = decoder_linear(dec_output1)

In [112]:
prediction1

tensor([[ 0.3964, -0.1926,  0.3159,  0.4407, -0.6102,  0.0711, -0.2346, -0.1060,
          0.4961, -0.1705, -0.3661,  0.1079,  0.0828, -0.2413, -0.1515, -0.5287,
          0.5211, -0.4508,  0.3556, -0.4582, -0.2168, -0.2761,  0.6901, -0.1194,
         -0.2217],
        [ 0.4069, -0.2067,  0.3201,  0.4436, -0.6200,  0.0686, -0.2229, -0.1199,
          0.4930, -0.1723, -0.3642,  0.0936,  0.0753, -0.2529, -0.1455, -0.5467,
          0.5041, -0.4407,  0.3378, -0.4463, -0.2275, -0.2581,  0.6775, -0.1331,
         -0.2200]], grad_fn=<AddmmBackward0>)

In [113]:
prediction1.shape

torch.Size([2, 25])

<font color ='green'> Update the output based on prediction. Once we update output for all the words, we can use that to calculate batch loss

In [131]:
outputs[1] = prediction1

In [132]:
outputs.shape
# seq_len, batch_size, vocab

torch.Size([14, 2, 25])

In [134]:
outputs[0:3]

tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 0.3964, -0.1926,  0.3159,  0.4407, -0.6102,  0.0711, -0.2346,
          -0.1060,  0.4961, -0.1705, -0.3661,  0.1079,  0.0828, -0.2413,
          -0.1515, -0.5287,  0.5211, -0.4508,  0.3556, -0.4582, -0.2168,
          -0.2761,  0.6901, -0.1194, -0.2217],
         [ 0.4069, -0.2067,  0.3201,  0.4436, -0.6200,  0.0686, -0.2229,
          -0.1199,  0.4930, -0.1723, -0.3642,  0.0936,  0.0753, -0.2529,
          -0.1455, -0.5467,  0.5041, -0.4407,  0.3378,

<font color ='green'>
    
So the predicted word is the word corresponding to index of max value. As we can see below , in our example the word corresponding to index 22 is the predicted word for both the sentences in the batch. The actual word correspond to index 15 and 4 for two sentences. We can use the predicted words as an input for next sequence or we can use the actual word as input to the next sequence. We will then repeat the whole process of the decoder section.
Then we will get the second predicted word. We will continue till we reach the last word in the sequence.

In [140]:
top1 = prediction1.argmax(1) 
top1

tensor([22, 22])

In [141]:
actual1 = target[1]
actual1

tensor([15,  4])

In [142]:
teacher_forcing_ratio = 0.5
teacher_forcing = random.random() < teacher_forcing_ratio

In [143]:
teacher_forcing

False

<font color ='green'>
During training - we will randomly sometimes used the actual word (teacher forcing) and sometimes use the predicted word as input to next sequence. If we use teacher_forcing_ratio =1 then we will always use the actual word as input to next sequence. During inference we will set teacher_forcing_ratio = 0.

<font color ='green'>
Final note - We will ignore the loss corresponding to PAD tokens.

In [144]:
if teacher_forcing:
    input_de = actual1
else:
    input_de = top1

In [145]:
input_de

tensor([22, 22])