### RNN

Equations of RNN
1. $ \mathbf{h_t} = \tanh{(\mathbf{U h_{t-1}}+\mathbf{W x_t})}$
2. $\mathbf{y_t} = softmax(\mathbf{Vh_t})$

In [1]:
import torch
import numpy as np
import torch.nn as nn
from copy import deepcopy

class RNN(nn.Module):
    """Simple recurrent network assembled from ``nn.Linear`` layers.

    Each step computes ``h_t = tanh(U h_{t-1} + W x_t)`` and emits
    ``V h_t``.  The emitted value is the raw linear projection; no softmax
    is applied inside this module.
    """

    def __init__(self, input_size, hidden_size, out_size) -> None:
        """Create the layers.

        Args:
            input_size: Size of the last dimension of every input vector.
            hidden_size: Size of the recurrent hidden state.
            out_size: Size of the last dimension of every emitted output.
        """
        # nn.Module.__init__ takes no positional arguments; the previous
        # `super().__init__(RNN, self).__init__()` raised TypeError.
        super().__init__()
        self.hidden_size = hidden_size
        self.prev_hidden = nn.Linear(hidden_size, hidden_size)  # U
        self.curr_input = nn.Linear(input_size, hidden_size)  # W
        # Not used by forward(); retained so the attribute and its
        # state_dict keys remain available.
        self.curr_hidden = nn.Linear(hidden_size, hidden_size)
        self.out_layer = nn.Linear(hidden_size, out_size)  # V

    def forward(self, input_vector):
        """Run the recurrence over the first dimension of ``input_vector``.

        Args:
            input_vector: Iterable of per-step inputs whose last dimension is
                ``input_size`` (for example a ``(steps, input_size)`` tensor).

        Returns:
            The per-step outputs stacked along a new first dimension.  The
            result stays attached to the autograd graph.  For an empty input
            a ``(0, out_size)`` tensor is returned.
        """
        reference = self.prev_hidden.weight
        # Start from zeros that follow the module's dtype and device.
        hidden = reference.new_zeros(self.hidden_size)
        outputs = []
        for vector in input_vector:
            hidden = torch.tanh(self.prev_hidden(hidden) + self.curr_input(vector))
            outputs.append(self.out_layer(hidden))
        if not outputs:
            return reference.new_zeros((0, self.out_layer.out_features))
        # torch.stack keeps gradients; a tolist()/torch.tensor round trip
        # would turn the outputs into constants.
        return torch.stack(outputs)

# LSTM

## Equations:-

### forget gate
1. forget gate mask = $\mathbf{f_t} = \sigma({\mathbf{U_f\cdot h_{t-1} + W_f\cdot x_t}})$
2. forget context = $\mathbf{k_t = c_{t-1}\odot f_t}$
### actual content
3. actual info to extract = $\mathbf{g_t} = \tanh(\mathbf{U_g\cdot h_{t-1} + W_g\cdot x_t})$
### add gate
4. add gate mask = $\mathbf{i_t} = \sigma(\mathbf{U_i\cdot h_{t-1} + W_i\cdot x_t})$
5. added context  = $\mathbf{j_t} = \mathbf{g_t \odot i_t}$
### current context
6. current context = $\mathbf{c_t} = \mathbf{j_t+k_t}$
### output gate
7. output mask = $\mathbf{o_t} = \sigma(\mathbf{U_o\cdot h_{t-1}+W_o\cdot x_t})$
### current hidden state
8. current hidden state = $\mathbf{h_t} = \mathbf{o_t}\odot \tanh(\mathbf{c_t})$


In [None]:
import torch
import torch.nn as nn
from copy import deepcopy
class LSTM(nn.Module):
    """LSTM built from explicit per-gate ``nn.Linear`` layers.

    Every gate is the sum of one linear layer applied to the previous hidden
    state and one applied to the current input, followed by a sigmoid (forget,
    add and output gates) or a tanh (candidate content).  After each step the
    hidden state is projected back to ``input_size`` by ``final_layer``.
    """

    def __init__(self, input_size, hidden_size):
        """Create the gate layers.

        Args:
            input_size: Size of the last dimension of every input vector;
                also the size of every emitted output.
            hidden_size: Size of the hidden state and of the cell context.
        """
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Forget gate (U_f, W_f).
        self.forget_gate_mask_h = nn.Linear(hidden_size, hidden_size)
        self.forget_gate_mask_i = nn.Linear(input_size, hidden_size)

        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

        # Candidate content (U_g, W_g).
        self.actual_h = nn.Linear(hidden_size, hidden_size)
        self.actual_i = nn.Linear(input_size, hidden_size)

        # Add gate (U_i, W_i).
        self.add_gate_mask_h = nn.Linear(hidden_size, hidden_size)
        self.add_gate_mask_i = nn.Linear(input_size, hidden_size)

        # Output gate (U_o, W_o).
        self.out_gate_mask_h = nn.Linear(hidden_size, hidden_size)
        self.out_gate_mask_i = nn.Linear(input_size, hidden_size)

        self.final_layer = nn.Linear(hidden_size, input_size)

    def forward(self, input_vector):
        """Run the LSTM over the first dimension of ``input_vector``.

        Args:
            input_vector: Iterable of per-step inputs whose last dimension is
                ``input_size`` (for example a ``(steps, input_size)`` tensor).

        Returns:
            The per-step projections of the hidden state, stacked along a new
            first dimension and still attached to the autograd graph.  For an
            empty input a ``(0, input_size)`` tensor is returned.
        """
        reference = self.final_layer.weight
        # Zero initial states that follow the module's dtype and device.
        hidden = reference.new_zeros(self.hidden_size)
        context = reference.new_zeros(self.hidden_size)
        outputs = []

        for vector in input_vector:
            f_t = self.sigmoid(self.forget_gate_mask_h(hidden) + self.forget_gate_mask_i(vector))
            k_t = context * f_t  # part of the old context that is kept

            g_t = self.tanh(self.actual_h(hidden) + self.actual_i(vector))
            i_t = self.sigmoid(self.add_gate_mask_h(hidden) + self.add_gate_mask_i(vector))
            j_t = g_t * i_t  # new content admitted by the add gate

            context = j_t + k_t

            o_t = self.sigmoid(self.out_gate_mask_h(hidden) + self.out_gate_mask_i(vector))
            hidden = o_t * self.tanh(context)
            outputs.append(self.final_layer(hidden))

        if not outputs:
            return reference.new_zeros((0, self.final_layer.out_features))
        # torch.stack keeps gradients; a tolist()/torch.tensor round trip
        # would turn the outputs into constants.
        return torch.stack(outputs)
            

# Encoder decoder for Neural Machine translation

### encoder 
* we use LSTM as encoder


### decoder
1. $\mathbf{c} = \mathbf{h_n^e}$
2. $\mathbf{h_0^d} = \mathbf{c}$
3. $\mathbf{h_t^d} = \tanh(\mathbf{W_y}\hat{y}_{t-1} + \mathbf{W_h h_{t-1}^{d}} + \mathbf{W_c c})$
4. $\mathbf{z_t} = ffn(\mathbf{h_t^d})$
5. $y_t = softmax(\mathbf{z_t})$
6. $\hat{y}_t = argmax_{w\in V}P(w|x,y_1,...,y_{t-1})$


In [None]:
class Encoder(nn.Module):
    """LSTM encoder that returns only the hidden state after the last step.

    Every gate is the sum of one linear layer applied to the previous hidden
    state and one applied to the current input, followed by a sigmoid (forget,
    add and output gates) or a tanh (candidate content).
    """

    def __init__(self, input_size, hidden_size):
        """Create the gate layers.

        Args:
            input_size: Size of the last dimension of every input vector.
            hidden_size: Size of the hidden state and of the cell context.
        """
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Forget gate.
        self.forget_gate_mask_h = nn.Linear(hidden_size, hidden_size)
        self.forget_gate_mask_i = nn.Linear(input_size, hidden_size)

        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

        # Candidate content.
        self.actual_h = nn.Linear(hidden_size, hidden_size)
        self.actual_i = nn.Linear(input_size, hidden_size)

        # Add gate.
        self.add_gate_mask_h = nn.Linear(hidden_size, hidden_size)
        self.add_gate_mask_i = nn.Linear(input_size, hidden_size)

        # Output gate.
        self.out_gate_mask_h = nn.Linear(hidden_size, hidden_size)
        self.out_gate_mask_i = nn.Linear(input_size, hidden_size)

    def forward(self, input_vector):
        """Encode a sequence.

        Args:
            input_vector: Iterable of per-step inputs whose last dimension is
                ``input_size`` (for example a ``(steps, input_size)`` tensor).

        Returns:
            The hidden state after the final step.  For an empty input this
            is the all-zero initial state of shape ``(hidden_size,)``.
        """
        reference = self.forget_gate_mask_h.weight
        # Build the initial states from a parameter so they follow the
        # module's dtype/device; plain torch.zeros is always float32 on CPU
        # and breaks after .double() or .to(device).
        hidden = reference.new_zeros(self.hidden_size)
        context = reference.new_zeros(self.hidden_size)

        for vector in input_vector:
            f_t = self.sigmoid(self.forget_gate_mask_h(hidden) + self.forget_gate_mask_i(vector))
            k_t = context * f_t  # part of the old context that is kept

            g_t = self.tanh(self.actual_h(hidden) + self.actual_i(vector))
            i_t = self.sigmoid(self.add_gate_mask_h(hidden) + self.add_gate_mask_i(vector))
            j_t = g_t * i_t  # new content admitted by the add gate

            context = j_t + k_t

            o_t = self.sigmoid(self.out_gate_mask_h(hidden) + self.out_gate_mask_i(vector))
            hidden = o_t * self.tanh(context)

        return hidden
    
class Decoder(nn.Module):
    """Recurrent decoder conditioned on one fixed context vector.

    Each step computes
    ``h_t = tanh(W_y y_{t-1} + W_h h_{t-1} + W_c c)``, projects ``h_t`` to
    vocabulary scores, takes the arg-max token and emits that token's entry
    from ``vocab_dict``.
    """

    def __init__(self, input_size, hidden_size, vocab_size):
        """Create the layers.

        Args:
            input_size: Size of a token embedding.
            hidden_size: Size of the hidden state and of the context vector.
            vocab_size: Number of scores produced per step.
        """
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.vocab_size = vocab_size
        self.hidden_layer_y = nn.Linear(input_size, hidden_size)
        self.hidden_layer_h = nn.Linear(hidden_size, hidden_size)
        self.hidden_layer_c = nn.Linear(hidden_size, hidden_size)
        # Normalise over the vocabulary axis explicitly; nn.Softmax() without
        # `dim` relies on a deprecated implicit choice.
        self.softmax = nn.Softmax(dim=-1)
        self.tanh = nn.Tanh()
        self.output_layer = nn.Linear(hidden_size, vocab_size)

    @staticmethod
    def _lookup(vocab_dict, token_ids):
        """Return the ``vocab_dict`` entries selected by ``token_ids``."""
        if token_ids.dim() == 0:
            return vocab_dict[int(token_ids)]
        if isinstance(vocab_dict, torch.Tensor):
            return vocab_dict[token_ids]
        rows = [vocab_dict[int(index)] for index in token_ids.flatten()]
        return torch.stack(rows).reshape(*token_ids.shape, -1)

    def _step(self, prev_token, hidden, context, vocab_dict):
        """Advance one step; return the new hidden state and emitted entry."""
        hidden = self.tanh(
            self.hidden_layer_y(prev_token)
            + self.hidden_layer_h(hidden)
            + self.hidden_layer_c(context)
        )
        z_t = self.softmax(self.output_layer(hidden))
        # dim=-1 works for a single sequence (1-D) as well as a batch (2-D);
        # dim=1 raised IndexError on 1-D scores.
        y_t = torch.argmax(z_t, dim=-1)
        return hidden, self._lookup(vocab_dict, y_t)

    def forward(self, context, vocab_dict, input_vector=None, max_len=100):
        """Decode with teacher forcing or greedily.

        Args:
            context: Context vector whose last dimension is ``hidden_size``;
                it is fed through ``hidden_layer_c`` at every step.
            vocab_dict: Indexable collection of token embeddings (tensors).
                Entry ``0`` is used as the start token and entry ``-1`` as
                the end token.
            input_vector: Optional sequence of embeddings.  When given, every
                element except the last is fed as the previous token (teacher
                forcing).  When ``None``, decoding is greedy.
            max_len: Upper bound on greedy decoding steps, so decoding stops
                even if the end token is never predicted.

        Returns:
            The emitted embeddings stacked along a new first dimension, or an
            empty tensor if no step was taken.

        NOTE(review): the emitted rows are selected by ``argmax``, so no
        gradient reaches the decoder parameters through the return value.
        NOTE(review): the hidden state starts from zeros, not from the
        context vector -- confirm this is intended.
        """
        hidden = self.hidden_layer_h.weight.new_zeros(self.hidden_size)
        result = []
        # `if input_vector:` is ambiguous for a tensor with several elements,
        # so test against None explicitly.
        if input_vector is not None:
            # during training teacher forcing
            for vector in input_vector[:-1]:
                hidden, out = self._step(vector, hidden, context, vocab_dict)
                result.append(out)
        else:
            # during testing
            start_token = vocab_dict[0]
            end_token = vocab_dict[-1]
            out = start_token
            # Reduce the element-wise comparison to one flag per sequence;
            # `while out != end_token` needed the truth value of a tensor.
            done = (out == end_token).all(dim=-1)
            steps = 0
            while not bool(done.all()) and steps < max_len:
                hidden, out = self._step(out, hidden, context, vocab_dict)
                result.append(out)
                done = done | (out == end_token).all(dim=-1)
                steps += 1
        if not result:
            return torch.empty(0)
        return torch.stack(result)
    
class NMT_Model(nn.Module):
    """Encoder-decoder wrapper that feeds the encoder's result to the decoder."""

    def __init__(self, input_size, hidden_size, vocab_dict):
        """Build an ``Encoder`` and a ``Decoder`` sharing the given sizes.

        Args:
            input_size: Forwarded to both sub-modules as ``input_size``.
            hidden_size: Forwarded to both sub-modules as ``hidden_size``.
            vocab_dict: Stored and handed to the decoder on every call; its
                length becomes the decoder's ``vocab_size``.
        """
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.vocab_dict = vocab_dict
        self.encoder = Encoder(input_size=input_size, hidden_size=hidden_size)
        self.decoder = Decoder(
            input_size=input_size,
            hidden_size=hidden_size,
            vocab_size=len(vocab_dict),
        )

    def forward(self, encoder_input, decoder_input=None):
        """Encode ``encoder_input`` and decode from the encoder's return value.

        Args:
            encoder_input: Passed unchanged to the encoder.
            decoder_input: Passed unchanged to the decoder as its third
                argument (``None`` by default).

        Returns:
            Whatever the decoder returns.
        """
        encoded = self.encoder(encoder_input)
        return self.decoder(encoded, self.vocab_dict, decoder_input)


# Attention Mechanism
### encoder 
* we use LSTM as encoder


### decoder
1. $\mathbf{c} = \mathbf{h_n^e}$
2. $\mathbf{h_0^d} = \mathbf{c}$
3. $\mathbf{h_t^d} = \tanh(\mathbf{W_y}\hat{y}_{t-1} + \mathbf{W_h h_{t-1}^{d}} + \mathbf{W_c c_t})$
4. $\mathbf{c_t} = \sum_{j}{\alpha_{tj}\mathbf{h}_j^e}$
5. $\alpha_{tj} = softmax(\mathbf{h_{t-1}^d}\cdot \mathbf{h_j^e})\text{ } \forall j\in e$
6. $\mathbf{z_t} = ffn(\mathbf{h_t^d})$
7. $y_t = softmax(\mathbf{z_t})$
8. $\hat{y}_t = argmax_{w\in V}P(w|x,y_1,...,y_{t-1})$

In [None]:
class Encoder(nn.Module):
    """LSTM encoder that returns the hidden state of every step.

    Every gate is the sum of one linear layer applied to the previous hidden
    state and one applied to the current input, followed by a sigmoid (forget,
    add and output gates) or a tanh (candidate content).
    """

    def __init__(self, input_size, hidden_size):
        """Create the gate layers.

        Args:
            input_size: Size of the last dimension of every input vector.
            hidden_size: Size of the hidden state and of the cell context.
        """
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Forget gate.
        self.forget_gate_mask_h = nn.Linear(hidden_size, hidden_size)
        self.forget_gate_mask_i = nn.Linear(input_size, hidden_size)

        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

        # Candidate content.
        self.actual_h = nn.Linear(hidden_size, hidden_size)
        self.actual_i = nn.Linear(input_size, hidden_size)

        # Add gate.
        self.add_gate_mask_h = nn.Linear(hidden_size, hidden_size)
        self.add_gate_mask_i = nn.Linear(input_size, hidden_size)

        # Output gate.
        self.out_gate_mask_h = nn.Linear(hidden_size, hidden_size)
        self.out_gate_mask_i = nn.Linear(input_size, hidden_size)

    def forward(self, input_vector):
        """Encode a sequence and keep every intermediate hidden state.

        Args:
            input_vector: Iterable of per-step inputs whose last dimension is
                ``input_size`` (for example a ``(steps, input_size)`` tensor).

        Returns:
            A list with one hidden-state tensor per step, in step order.  The
            tensors stay attached to the autograd graph.  The list is empty
            for an empty input.
        """
        reference = self.forget_gate_mask_h.weight
        # Zero initial states that follow the module's dtype and device.
        hidden = reference.new_zeros(self.hidden_size)
        context = reference.new_zeros(self.hidden_size)
        hidden_encoders = []
        for vector in input_vector:
            f_t = self.sigmoid(self.forget_gate_mask_h(hidden) + self.forget_gate_mask_i(vector))
            k_t = context * f_t  # part of the old context that is kept

            g_t = self.tanh(self.actual_h(hidden) + self.actual_i(vector))
            i_t = self.sigmoid(self.add_gate_mask_h(hidden) + self.add_gate_mask_i(vector))
            j_t = g_t * i_t  # new content admitted by the add gate

            context = j_t + k_t

            o_t = self.sigmoid(self.out_gate_mask_h(hidden) + self.out_gate_mask_i(vector))
            hidden = o_t * self.tanh(context)
            # Keep the tensor itself: converting it with tolist() produced
            # plain Python lists, which cut the autograd graph and cannot be
            # used with tensor operations such as torch.dot.
            hidden_encoders.append(hidden)

        return hidden_encoders
    
class Decoder(nn.Module):
    """Recurrent decoder with dot-product attention over encoder states.

    Before each step a context vector is built as the softmax-weighted sum of
    the encoder states, scored by their dot product with the previous decoder
    hidden state.  The step then computes
    ``h_t = tanh(W_y y_{t-1} + W_h h_{t-1} + W_c c_t)``, projects ``h_t`` to
    vocabulary scores, takes the arg-max token and emits that token's entry
    from ``vocab_dict``.
    """

    def __init__(self, input_size, hidden_size, vocab_size):
        """Create the layers.

        Args:
            input_size: Size of a token embedding.
            hidden_size: Size of the decoder and encoder hidden states.
            vocab_size: Number of scores produced per step.
        """
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.vocab_size = vocab_size
        self.hidden_layer_y = nn.Linear(input_size, hidden_size)
        self.hidden_layer_h = nn.Linear(hidden_size, hidden_size)
        self.hidden_layer_c = nn.Linear(hidden_size, hidden_size)
        # Normalise over the vocabulary axis explicitly; nn.Softmax() without
        # `dim` relies on a deprecated implicit choice.
        self.softmax = nn.Softmax(dim=-1)
        self.tanh = nn.Tanh()
        self.output_layer = nn.Linear(hidden_size, vocab_size)

    @staticmethod
    def _lookup(vocab_dict, token_ids):
        """Return the ``vocab_dict`` entries selected by ``token_ids``."""
        if token_ids.dim() == 0:
            return vocab_dict[int(token_ids)]
        if isinstance(vocab_dict, torch.Tensor):
            return vocab_dict[token_ids]
        rows = [vocab_dict[int(index)] for index in token_ids.flatten()]
        return torch.stack(rows).reshape(*token_ids.shape, -1)

    def _encoder_states(self, hidden_encoders):
        """Return the encoder states as one tensor with steps on dim 0."""
        if isinstance(hidden_encoders, torch.Tensor):
            states = hidden_encoders
        else:
            reference = self.hidden_layer_h.weight
            # as_tensor leaves matching tensors untouched (gradients are
            # kept) and converts plain Python lists.
            rows = [
                torch.as_tensor(state, dtype=reference.dtype, device=reference.device)
                for state in hidden_encoders
            ]
            states = torch.stack(rows) if rows else reference.new_zeros((0, self.hidden_size))
        if states.shape[0] == 0:
            raise ValueError("hidden_encoders must contain at least one encoder state")
        return states

    @staticmethod
    def _attend(states, hidden):
        """Return the attention-weighted sum of ``states`` for ``hidden``."""
        scores = (states * hidden).sum(dim=-1)  # dot product per encoder step
        # Normalise across encoder steps; torch.softmax requires `dim`.
        alpha = torch.softmax(scores, dim=0)
        return (alpha.unsqueeze(-1) * states).sum(dim=0)

    def _step(self, prev_token, hidden, states, vocab_dict):
        """Advance one step; return the new hidden state and emitted entry."""
        # The context is computed from the hidden state *before* the update.
        context = self._attend(states, hidden)
        hidden = self.tanh(
            self.hidden_layer_y(prev_token)
            + self.hidden_layer_h(hidden)
            + self.hidden_layer_c(context)
        )
        z_t = self.softmax(self.output_layer(hidden))
        # dim=-1 works for a single sequence (1-D) as well as a batch (2-D);
        # dim=1 raised IndexError on 1-D scores.
        y_t = torch.argmax(z_t, dim=-1)
        return hidden, self._lookup(vocab_dict, y_t)

    def forward(self, hidden_encoders, vocab_dict, input_vector=None, max_len=100):
        """Decode with teacher forcing or greedily.

        Args:
            hidden_encoders: Encoder hidden states, one per source step, as a
                tensor with steps on the first dimension, a sequence of
                tensors, or a sequence of nested lists of numbers.  The last
                dimension must be ``hidden_size``.
            vocab_dict: Indexable collection of token embeddings (tensors).
                Entry ``0`` is used as the start token and entry ``-1`` as
                the end token.
            input_vector: Optional sequence of embeddings.  When given, every
                element except the last is fed as the previous token (teacher
                forcing).  When ``None``, decoding is greedy.
            max_len: Upper bound on greedy decoding steps, so decoding stops
                even if the end token is never predicted.

        Returns:
            The emitted embeddings stacked along a new first dimension, or an
            empty tensor if no step was taken.

        Raises:
            ValueError: If ``hidden_encoders`` contains no states.

        NOTE(review): the emitted rows are selected by ``argmax``, so no
        gradient reaches the model parameters through the return value.
        NOTE(review): the hidden state starts from zeros, not from the last
        encoder state -- confirm this is intended.
        """
        states = self._encoder_states(hidden_encoders)
        hidden = self.hidden_layer_h.weight.new_zeros(self.hidden_size)
        result = []
        # `if input_vector:` is ambiguous for a tensor with several elements,
        # so test against None explicitly.
        if input_vector is not None:
            # during training teacher forcing
            for vector in input_vector[:-1]:
                hidden, out = self._step(vector, hidden, states, vocab_dict)
                result.append(out)
        else:
            # during testing
            start_token = vocab_dict[0]
            end_token = vocab_dict[-1]
            out = start_token
            # Reduce the element-wise comparison to one flag per sequence;
            # `while out != end_token` needed the truth value of a tensor.
            done = (out == end_token).all(dim=-1)
            steps = 0
            while not bool(done.all()) and steps < max_len:
                hidden, out = self._step(out, hidden, states, vocab_dict)
                result.append(out)
                done = done | (out == end_token).all(dim=-1)
                steps += 1
        if not result:
            return torch.empty(0)
        return torch.stack(result)
    
class NMT_Model(nn.Module):
    """Encoder-decoder wrapper that feeds the encoder's result to the decoder."""

    def __init__(self, input_size, hidden_size, vocab_dict):
        """Build an ``Encoder`` and a ``Decoder`` sharing the given sizes.

        Args:
            input_size: Forwarded to both sub-modules as ``input_size``.
            hidden_size: Forwarded to both sub-modules as ``hidden_size``.
            vocab_dict: Stored and handed to the decoder on every call; its
                length becomes the decoder's ``vocab_size``.
        """
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.vocab_dict = vocab_dict
        self.encoder = Encoder(input_size=input_size, hidden_size=hidden_size)
        self.decoder = Decoder(
            input_size=input_size,
            hidden_size=hidden_size,
            vocab_size=len(vocab_dict),
        )

    def forward(self, encoder_input, decoder_input=None):
        """Encode ``encoder_input`` and decode from the encoder's return value.

        Args:
            encoder_input: Passed unchanged to the encoder.
            decoder_input: Passed unchanged to the decoder as its third
                argument (``None`` by default).

        Returns:
            Whatever the decoder returns.
        """
        encoder_states = self.encoder(encoder_input)
        return self.decoder(encoder_states, self.vocab_dict, decoder_input)
