In [2]:
import numpy as np
import matplotlib.pyplot as plt

import torch

from torchinfo import summary

# RNN

## Single layer RNN

In [3]:
class SingleRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SingleRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.Wxh = torch.nn.Linear(input_size, hidden_size)
        self.Whh = torch.nn.Linear(hidden_size, hidden_size)
        self.Why = torch.nn.Linear(hidden_size, output_size)
        self.tanh = torch.nn.Tanh()
        self.softmax = torch.nn.Softmax(dim=1)
        
    def forward(self, x):
        hidden = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
        hidden = self.tanh(self.Wxh(x) + self.Whh(hidden))
        output = self.Why(hidden)
        output = self.softmax(output[:, -1])
        return output
    
sRNN = SingleRNN(64, 13, 35)
summary(sRNN, input_size=(256, 64))

Layer (type:depth-idx)                   Output Shape              Param #
SingleRNN                                [1, 35]                   --
├─Linear: 1-1                            [256, 13]                 845
├─Linear: 1-2                            [1, 256, 13]              182
├─Tanh: 1-3                              [1, 256, 13]              --
├─Linear: 1-4                            [1, 256, 35]              490
├─Softmax: 1-5                           [1, 35]                   --
Total params: 1,517
Trainable params: 1,517
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.22
Input size (MB): 0.07
Forward/backward pass size (MB): 0.12
Params size (MB): 0.01
Estimated Total Size (MB): 0.20

In [4]:
class SingleRNN2(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SingleRNN2, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size 

        self.Wxh = torch.nn.Parameter(torch.randn(input_size, hidden_size).float())
        self.Whh = torch.nn.Parameter(torch.randn(hidden_size, hidden_size).float())
        self.Why = torch.nn.Parameter(torch.randn(hidden_size, output_size).float())
        self.bh = torch.nn.Parameter(torch.zeros(1, hidden_size).float())
        self.by = torch.nn.Parameter(torch.zeros(1, output_size).float())

        self.tanh = torch.nn.Tanh()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        hidden = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)

        u = torch.matmul(x, self.Wxh) + torch.matmul(hidden, self.Whh) + self.bh
        hidden = self.tanh(u)
        output = torch.matmul(hidden, self.Why) + self.by
        output = self.softmax(output)
        return output[:,-1]
        
sRNN2 = SingleRNN2(64, 13, 35)
summary(sRNN2, input_size=(256, 64))
    

Layer (type:depth-idx)                   Output Shape              Param #
SingleRNN2                               [1, 35]                   1,504
├─Tanh: 1-1                              [1, 256, 13]              --
├─Softmax: 1-2                           [1, 256, 35]              --
Total params: 1,504
Trainable params: 1,504
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0
Input size (MB): 0.07
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.07

## Multi layer RNN

In [5]:
class MultiRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, layer_num, output_size):
        super(MultiRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.layer_num = layer_num
        self.output_size = output_size

        self.Wxh = torch.nn.Linear(input_size, hidden_size)
        self.Whh = torch.nn.Linear(hidden_size, hidden_size)
        self.Why = torch.nn.Linear(hidden_size, output_size)
        self.tanh = torch.nn.Tanh()
        self.softmax = torch.nn.Softmax(dim=1)
        
    def forward(self, x):
        hidden = torch.zeros(self.layer_num, x.size(0), self.hidden_size).to(x.device)
        hidden = self.tanh(self.Wxh(x) + self.Whh(hidden))
        output = self.Why(hidden)
        output = self.softmax(output[:, -1])
        return output
    
mRNN = MultiRNN(64, 13, 3, 35)
summary(mRNN, input_size=(256, 64))

Layer (type:depth-idx)                   Output Shape              Param #
MultiRNN                                 [3, 35]                   --
├─Linear: 1-1                            [256, 13]                 845
├─Linear: 1-2                            [3, 256, 13]              182
├─Tanh: 1-3                              [3, 256, 13]              --
├─Linear: 1-4                            [3, 256, 35]              490
├─Softmax: 1-5                           [3, 35]                   --
Total params: 1,517
Trainable params: 1,517
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.22
Input size (MB): 0.07
Forward/backward pass size (MB): 0.32
Params size (MB): 0.01
Estimated Total Size (MB): 0.39

In [6]:
class MultiRNN2(torch.nn.Module):
    def __init__(self, input_size, hidden_size, layer_num, output_size):
        super(MultiRNN2, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.layer_num = layer_num
        self.output_size = output_size 

        self.Wxh = torch.nn.Parameter(torch.randn(input_size, hidden_size).float())
        self.Whh = torch.nn.Parameter(torch.randn(hidden_size, hidden_size).float())
        self.Why = torch.nn.Parameter(torch.randn(hidden_size, output_size).float())
        self.bh = torch.nn.Parameter(torch.zeros(1, hidden_size).float())
        self.by = torch.nn.Parameter(torch.zeros(1, output_size).float())

        self.tanh = torch.nn.Tanh()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        hidden = torch.zeros(self.layer_num, x.size(0), self.hidden_size).to(x.device)

        u = torch.matmul(x, self.Wxh) + torch.matmul(hidden, self.Whh) + self.bh
        hidden = self.tanh(u)
        output = torch.matmul(hidden, self.Why) + self.by
        output = self.softmax(output)
        return output[:,-1]
        
mRNN2 = MultiRNN2(64, 13, 3, 35)
summary(mRNN2, input_size=(256, 64))
    

Layer (type:depth-idx)                   Output Shape              Param #
MultiRNN2                                [3, 35]                   1,504
├─Tanh: 1-1                              [3, 256, 13]              --
├─Softmax: 1-2                           [3, 256, 35]              --
Total params: 1,504
Trainable params: 1,504
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0
Input size (MB): 0.07
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.07

# LSTM

In [7]:
# one layer LSTM without batch
class LSTM(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.Wf = torch.nn.Parameter(torch.randn(hidden_size, hidden_size + input_size).float())
        self.Wi = torch.nn.Parameter(torch.randn(hidden_size, hidden_size + input_size).float())
        self.Wo = torch.nn.Parameter(torch.randn(hidden_size, hidden_size + input_size).float())
        self.Wc = torch.nn.Parameter(torch.randn(hidden_size, hidden_size + input_size).float())
        self.bf = torch.nn.Parameter(torch.zeros(hidden_size, 1).float())
        self.bi = torch.nn.Parameter(torch.zeros(hidden_size, 1).float())
        self.bo = torch.nn.Parameter(torch.zeros(hidden_size, 1).float())
        self.bc = torch.nn.Parameter(torch.zeros(hidden_size, 1).float())

        self.tanh = torch.nn.Tanh()
        self.softmax = torch.nn.Softmax(dim=1)
        self.sigmoid = torch.nn.Sigmoid()
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        hidden = torch.zeros(x.size(0), self.hidden_size).to(x.device)
        cell = torch.zeros(x.size(0), self.hidden_size).to(x.device)
        outputs = []
        for i in range(len(x)):
            xt = x[i].reshape(-1, 1)
            u = torch.cat((hidden, xt.T), 1).T
            gf = self.sigmoid(torch.matmul(self.Wf, u) + self.bf)
            cell1 = gf * cell
            gi = self.sigmoid(torch.matmul(self.Wi, u) + self.bi)
            cell = cell1 + gi * self.tanh(torch.matmul(self.Wc, u) + self.bc)
            go = self.sigmoid(torch.matmul(self.Wo, u) + self.bo)
            hidden = go * self.tanh(cell)
            outputs.append(hidden)
        outputs = torch.stack(outputs)
        outputs = self.fc(outputs)
        return outputs
    
lstm = LSTM(64, 13, 35)  
summary(lstm, input_size=(1, 64))


Layer (type:depth-idx)                   Output Shape              Param #
LSTM                                     [1, 13, 35]               4,056
├─Sigmoid: 1-1                           [13, 1]                   --
├─Sigmoid: 1-2                           [13, 1]                   --
├─Tanh: 1-3                              [13, 1]                   --
├─Sigmoid: 1-4                           [13, 1]                   --
├─Tanh: 1-5                              [13, 13]                  --
├─Linear: 1-6                            [1, 13, 35]               490
Total params: 4,546
Trainable params: 4,546
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.01

# GRU

In [26]:
# one layer GRU without batch

class GRU(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.Wr = torch.nn.Parameter(torch.randn(hidden_size, hidden_size + input_size).float())
        self.Wu = torch.nn.Parameter(torch.randn(hidden_size, hidden_size + input_size).float())
        self.Wc = torch.nn.Parameter(torch.randn(hidden_size, hidden_size + input_size).float())
        self.br = torch.nn.Parameter(torch.zeros(hidden_size, 1).float())
        self.bu = torch.nn.Parameter(torch.zeros(hidden_size, 1).float())
        self.bc = torch.nn.Parameter(torch.zeros(hidden_size, 1).float())

        self.tanh = torch.nn.Tanh()
        self.softmax = torch.nn.Softmax(dim=1)
        self.sigmoid = torch.nn.Sigmoid()
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        hidden = torch.zeros(x.size(0), self.hidden_size).to(x.device)
        outputs = []
        for i in range(len(x)):
            xt = x[i].reshape(-1,1)
            u = torch.cat((hidden, xt.T), 1).T
            gu = self.sigmoid(torch.matmul(self.Wu, u) + self.bu)
            gr = self.sigmoid(torch.matmul(self.Wr, u) + self.br)
            r_hidden = gr * hidden.T
            u_tilde = torch.cat((r_hidden, xt), 0)
            hidden_tilde = self.tanh(torch.matmul(self.Wc, u_tilde) + self.bc)
            hidden = gu * hidden_tilde + (1 - gu) * hidden
            outputs.append(hidden)
        outputs = torch.stack(outputs)
        outputs = self.fc(outputs)
        return outputs
    
gru = GRU(64, 13, 35)
summary(gru, input_size=(1, 64))

Layer (type:depth-idx)                   Output Shape              Param #
GRU                                      [1, 13, 35]               3,042
├─Sigmoid: 1-1                           [13, 1]                   --
├─Sigmoid: 1-2                           [13, 1]                   --
├─Tanh: 1-3                              [13, 1]                   --
├─Linear: 1-4                            [1, 13, 35]               490
Total params: 3,532
Trainable params: 3,532
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.01

# Attention Mechanism

In [39]:
# signle scaled dot-product attention
class ScaledDotProductAttention(torch.nn.Module):
    def __init__(self, input_size, masked = False):
        super(ScaledDotProductAttention, self).__init__()
        self.input_size = input_size
        self.masked = masked
        
        self.Q = torch.nn.Linear(input_size, input_size) # batch_size, seq_len, input_size
        self.K = torch.nn.Linear(input_size, input_size)
        self.V = torch.nn.Linear(input_size, input_size)

        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x): # x: batch_size, seq_len, input_size
        Q = self.Q(x)
        K = self.K(x)
        V = self.V(x)

        score = torch.bmm(Q, K.transpose(1, 2))
        score = score / torch.sqrt(torch.tensor(K.size(1)).float())

        mask = torch.randn(score.size()).to(score.device)
        if self.masked:
            score = score.masked_fill(mask == 0, -1e9)
        attention = self.softmax(score)

        output = torch.bmm(attention, V)
        return output
    
att = ScaledDotProductAttention(64)
summary(att, input_size=(256, 64, 64))



Layer (type:depth-idx)                   Output Shape              Param #
ScaledDotProductAttention                [256, 64, 64]             --
├─Linear: 1-1                            [256, 64, 64]             4,160
├─Linear: 1-2                            [256, 64, 64]             4,160
├─Linear: 1-3                            [256, 64, 64]             4,160
├─Softmax: 1-4                           [256, 64, 64]             --
Total params: 12,480
Trainable params: 12,480
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 3.19
Input size (MB): 4.19
Forward/backward pass size (MB): 25.17
Params size (MB): 0.05
Estimated Total Size (MB): 29.41

In [None]:
# multi-head attention

        
        