In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook

from argparse import Namespace
import os
import collections
from collections import Counter

import string

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

### Save parameters

In [2]:
def save_torch_to_csv(torch_data,filename):
    """Write a tensor to a CSV file by way of a pandas DataFrame.

    Args:
        torch_data (torch.Tensor): tensor to save. It is detached from the
            autograd graph and moved to host memory before conversion.
        filename (str): destination path, passed straight to DataFrame.to_csv.
    """
    # .cpu() is a no-op for tensors already on the CPU, but Tensor.numpy()
    # refuses tensors that live on another device, so move them first.
    array = torch_data.detach().cpu().numpy()
    pd.DataFrame(array).to_csv(filename)

### Set random seeds

In [3]:
def seed_everything(seed):
    """Seed the global NumPy and PyTorch random generators and report the seed.

    Args:
        seed (int): value handed to both np.random.seed and torch.manual_seed.
    """
    # Same two seeding calls as before, applied in the same order.
    for set_seed in (np.random.seed, torch.manual_seed):
        set_seed(seed)
    print(f"seed: {seed}")
    
seed_everything(42)

seed: 42


# 1. SurnameGenerationModel

In [4]:
class SurnameGenerationModel(nn.Module):
    """Character-level sequence model: Embedding -> GRU -> Linear.

    For every position of the input sequence the model emits one score per
    vocabulary item, i.e. a prediction over the next character.
    """

    def __init__(self, 
                 embedding_dim, 
                 num_embeddings,
                 rnn_hidden_size, 
                 batch_first = True, 
                 padding_idx = 0,
                 dropout_p   = 0.5
                ):
        """
        Args:
            For embedding
            embedding_dim (int)  : The size of the character embeddings
            num_embeddings (int) : The number of characters to embed
                                   len(surname_vocab)
            padding_idx (int)    : The index for the tensor padding
            
            For RNN
            rnn_hidden_size (int): The size of the RNN's hidden state
            batch_first (bool)   : Informs whether the input tensors will 
                                   have batch or the sequence on the 0th dimension
            For dropout
            dropout_p (float)    : The probability of zeroing activations using 
                                   the dropout method. Higher means more likely to zero.
            
        """
        super(SurnameGenerationModel, self).__init__()

        self.emb = nn.Embedding(num_embeddings = num_embeddings,
                                embedding_dim  = embedding_dim,
                                padding_idx    = padding_idx)
        self.rnn = nn.GRU(input_size   = embedding_dim,
                          hidden_size  = rnn_hidden_size,
                          batch_first  = batch_first)
        self.fc  = nn.Linear(in_features  = rnn_hidden_size,
                             out_features = num_embeddings)
        # forward() reads this attribute, so it has to be stored here.
        self._dropout_p = dropout_p
        
    def forward(self, x_in, apply_softmax=False):
        """The forward pass of the model
        
        Args:
            x_in (torch.Tensor): integer character indices.
                x_in.shape should be (batch, seq_len) when batch_first=True,
                or (seq_len, batch) when batch_first=False
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. Its two leading dimensions match x_in and
            the last one is num_embeddings, e.g. (batch, seq_len, num_embeddings)
        """
        x_embedded = self.emb(x_in)
        # nn.GRU returns (output, h_n); only the per-step outputs are needed.
        y_out, _ = self.rnn(x_embedded)

        # Flatten the two leading dimensions so the linear layer sees one row
        # per (sample, time step). The names assume batch_first=True; with
        # batch_first=False the two are swapped, but the reshape below is
        # undone symmetrically so the result is still correct.
        batch_size, seq_size, feat_size = y_out.shape
        y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)

        # F.dropout defaults to training=True, so pass the module's mode
        # explicitly; otherwise dropout stays active after model.eval().
        y_out = self.fc(F.dropout(y_out, 
                                  p        = self._dropout_p,
                                  training = self.training))
                         
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
            
        # Restore the original leading dimensions.
        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_size, new_feat_size)
            
        return y_out

# 2. The Embedding layer

### The x input

In [5]:
# Build a small batch of random token indices to push through each layer.
seed_everything(42)
batch_size     = 2
length_of_text = 7
# Integer indices drawn from [0, 20), shaped (batch_size, length_of_text).
my_input = torch.randint(0,20,[batch_size, length_of_text])
my_input

seed: 42


tensor([[ 2,  7, 16, 14,  6, 15,  0],
        [ 4, 10, 13, 18, 14, 10, 14]])

### The nn.Embedding layer
#### More details [Embedding](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Frankenstein/Embedding_layer.ipynb), [Embedding+CNN](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/AGNews/NewsClassifier.ipynb)

In [6]:
# the number of embeddings (the number of vocabulary items)
n_tokens_in_vocabulary = 20
# size of the embeddings (embedding dimension)
dimension_embedding    = 4
# With padding_idx=0, index 0 is treated as padding: every input position
# holding that index is mapped to an all-zero embedding vector.
padding_idx            = 0

In [7]:
# Re-seed right before construction so the randomly initialised embedding
# weights are the same on every run.
seed_everything(42)
my_emb = nn.Embedding(num_embeddings = n_tokens_in_vocabulary, 
                      embedding_dim  = dimension_embedding,
                      padding_idx    = padding_idx)  

seed: 42


In [8]:
### The output of the nn.Embedding layer (i.e., the input of the LSTM layer)
x_in = my_emb(my_input)

In [9]:
# Report the shapes going into and out of the embedding layer, then show the
# embedded tensor that is handed to the next layer.
print("For the nn.Embedding Layer")
print("Shape of input: ", my_input.shape)
print("Shape of output: ", x_in.shape)
# Prints the output shape a second time (same value as the line above).
print(x_in.shape)
print("-"*60)
print("x_in, the input to the next step")
print(x_in)

For the nn.Embedding Layer
Shape of input:  torch.Size([2, 7])
Shape of output:  torch.Size([2, 7, 4])
torch.Size([2, 7, 4])
------------------------------------------------------------
x_in, the input to the next step
tensor([[[-0.7521,  1.6487, -0.3925, -1.4036],
         [-0.2316,  0.0418, -0.2516,  0.8599],
         [ 1.4451,  0.8564,  2.2181,  0.5232],
         [-1.4032,  0.0360, -0.0635,  0.6756],
         [ 1.2791,  1.2964,  0.6105,  1.3347],
         [-0.0978,  1.8446, -1.1845,  1.3835],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 1.6423, -0.1596, -0.4974,  0.4396],
         [-1.5576,  0.9956, -0.8798, -0.6011],
         [-0.4880,  1.1914, -0.8140, -0.7360],
         [-0.1722,  0.5238,  0.0566,  0.4263],
         [-1.4032,  0.0360, -0.0635,  0.6756],
         [-1.5576,  0.9956, -0.8798, -0.6011],
         [-1.4032,  0.0360, -0.0635,  0.6756]]], grad_fn=<EmbeddingBackward0>)


# 3.  The LSTM layer
## [torch.nn.LSTM](https://pytorch.org/docs/2.1/generated/torch.nn.LSTM.html#lstm)
### As shown above, the output of the embedding layer is a tensor with shape [2, 7, 4]. 
### In this context, the three dimensions could be 
- batch_first = True: [batch_size, sequence_size, feature_size]
- batch_first = False: [sequence_size, batch_size, feature_size]

###  The meanings of the three sizes are as follows:
#### - sequence_size: The length of the sequence, representing the number of time steps in the input sequence. In NLP, this typically corresponds to the number of words (or other basic units) in the input sentence or document. Each time step usually represents one word (or word vector). In this case, sequence_size = 7, meaning that there are seven words in each sentence input, or there are seven characters in each word input. 
#### - batch_size: The size of the batch, indicating the number of samples processed simultaneously during a single training or inference process. A larger batch size can utilize parallel computation to improve efficiency. In NLP, batch size often corresponds to processing multiple sentences or documents in a single input. In this case, batch_size = 2, meaning that there are 2 sentence inputs, or 2 word inputs. 
#### - feature_size: The input dimension, representing the number of input features at each time step. In NLP, this typically corresponds to the dimensionality of word embeddings. For example, if each word is represented using 300-dimensional word vectors, then the input_size would be 300. In this case, feature_size = 4, meaning that the embedding dimension is 4, and each word in a sentence, or each character in a word, is represented by a vector of length 4. 

In [10]:
# Unpack the three dimensions of the embedded batch. This rebinds batch_size
# and defines seq_size / feat_size, which later cells rely on.
print("Shape of x_in(output of the embedding layer):", x_in.size())
batch_size, seq_size, feat_size = x_in.size()
print("batch_size:", batch_size)
print("seq_size:", seq_size)
print("feat_size:", feat_size)

Shape of x_in(output of the embedding layer): torch.Size([2, 7, 4])
batch_size: 2
seq_size: 7
feat_size: 4


In [11]:
# The LSTM built in the next cell uses batch_first=True, which matches the
# layout of x_in: (batch_size, seq_size, feat_size), with the batch on dim 0.
print("Note that batch_first=True, i.e, the first dimension is the batch_size=2")
print("Shape:", x_in.shape)
x_in

Note that batch_first=False, i.e, the first dimension is the batch_size=2
Shape: torch.Size([2, 7, 4])


tensor([[[-0.7521,  1.6487, -0.3925, -1.4036],
         [-0.2316,  0.0418, -0.2516,  0.8599],
         [ 1.4451,  0.8564,  2.2181,  0.5232],
         [-1.4032,  0.0360, -0.0635,  0.6756],
         [ 1.2791,  1.2964,  0.6105,  1.3347],
         [-0.0978,  1.8446, -1.1845,  1.3835],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 1.6423, -0.1596, -0.4974,  0.4396],
         [-1.5576,  0.9956, -0.8798, -0.6011],
         [-0.4880,  1.1914, -0.8140, -0.7360],
         [-0.1722,  0.5238,  0.0566,  0.4263],
         [-1.4032,  0.0360, -0.0635,  0.6756],
         [-1.5576,  0.9956, -0.8798, -0.6011],
         [-1.4032,  0.0360, -0.0635,  0.6756]]], grad_fn=<EmbeddingBackward0>)

In [12]:
# Size of the LSTM hidden (and cell) state.
rnn_hidden_size = 3
# Re-seed so the randomly initialised LSTM weights are reproducible.
seed_everything(42)
# input_size matches the embedding dimension; batch_first=True matches the
# (batch, seq, feature) layout of x_in.
my_LSTM = nn.LSTM(input_size   = dimension_embedding,
                  hidden_size  = rnn_hidden_size,
                  batch_first  = True)

seed: 42


In [13]:
# Inspect the LSTM's learnable parameters as (name, tensor) pairs.
list(my_LSTM.named_parameters())

[('weight_ih_l0',
  Parameter containing:
  tensor([[ 0.4414,  0.4792, -0.1353,  0.5304],
          [-0.1265,  0.1165, -0.2811,  0.3391],
          [ 0.5090, -0.4236,  0.5018,  0.1081],
          [ 0.4266,  0.0782,  0.2784, -0.0815],
          [ 0.4451,  0.0853, -0.2695,  0.1472],
          [-0.2660, -0.0677, -0.2345,  0.3830],
          [-0.4557, -0.2662, -0.1630, -0.3471],
          [ 0.0545, -0.5702,  0.5214, -0.4904],
          [ 0.4457,  0.0961, -0.1875,  0.3568],
          [ 0.0900,  0.4665,  0.0631, -0.1821],
          [ 0.1551, -0.1566,  0.2430,  0.5155],
          [ 0.3337, -0.2524,  0.3333,  0.1033]], requires_grad=True)),
 ('weight_hh_l0',
  Parameter containing:
  tensor([[ 0.2932, -0.3519, -0.5715],
          [-0.2231, -0.4428,  0.4737],
          [ 0.1663,  0.2391,  0.1826],
          [-0.0100,  0.4518, -0.4102],
          [ 0.0364, -0.3941,  0.1780],
          [-0.1988,  0.1769, -0.1203],
          [ 0.4788, -0.3422, -0.3443],
          [-0.3444,  0.5193,  0.1924],
     

In [14]:
# nn.LSTM stores each of its parameters as four gate blocks stacked along
# dim 0, in the order input (i), forget (f), cell candidate (g), output (o).
# Every block has hidden_size rows, so the slice bounds are derived from the
# module rather than hard-coded for hidden_size == 3.
lstm_params = list(my_LSTM.named_parameters())
gate_size   = my_LSTM.hidden_size

# Each of W_i / W_h / b_i / b_h is a (name, parameter) pair; [1] is the tensor.

# Input-to-hidden weights.
W_i  = lstm_params[0]
W_ii = W_i[1][0 * gate_size:1 * gate_size]
W_if = W_i[1][1 * gate_size:2 * gate_size]
W_ig = W_i[1][2 * gate_size:3 * gate_size]
W_io = W_i[1][3 * gate_size:4 * gate_size]

# Hidden-to-hidden weights.
W_h  = lstm_params[1]
W_hi = W_h[1][0 * gate_size:1 * gate_size]
W_hf = W_h[1][1 * gate_size:2 * gate_size]
W_hg = W_h[1][2 * gate_size:3 * gate_size]
W_ho = W_h[1][3 * gate_size:4 * gate_size]


# First bias vector (paired with the input-to-hidden weights in the loop below).
b_i  = lstm_params[2]
b_ii = b_i[1][0 * gate_size:1 * gate_size]
b_if = b_i[1][1 * gate_size:2 * gate_size]
b_ig = b_i[1][2 * gate_size:3 * gate_size]
b_io = b_i[1][3 * gate_size:4 * gate_size]


# Second bias vector (paired with the hidden-to-hidden weights in the loop below).
b_h  = lstm_params[3]
b_hi = b_h[1][0 * gate_size:1 * gate_size]
b_hf = b_h[1][1 * gate_size:2 * gate_size]
b_hg = b_h[1][2 * gate_size:3 * gate_size]
b_ho = b_h[1][3 * gate_size:4 * gate_size]

In [15]:
# Zero initial hidden state, shaped (1, batch_size, rnn_hidden_size).
# The leading 1 presumably stands for the single unidirectional layer of
# my_LSTM (default num_layers) — confirm against the nn.LSTM docs.
initial_hidden = torch.zeros((1, batch_size, rnn_hidden_size))
initial_hidden

tensor([[[0., 0., 0.],
         [0., 0., 0.]]])

In [16]:
# Zero initial cell state, with the same shape as initial_hidden.
initial_cell = torch.zeros((1, batch_size, rnn_hidden_size))
initial_cell

tensor([[[0., 0., 0.],
         [0., 0., 0.]]])

In [17]:
### The (h_0, c_0) tuple "(initial_hidden, initial_cell)" can be omitted 
### because both h_0 and c_0 default to zeros if not provided.
### `output` holds the hidden state at every time step; `hc_final` is the
### (h_n, c_n) pair after the last step.
output, hc_final = my_LSTM(x_in, (initial_hidden,initial_cell))
output

tensor([[[ 0.2113, -0.0331, -0.0089],
         [ 0.0287, -0.1838,  0.0847],
         [-0.4075, -0.1287,  0.3579],
         [ 0.0038, -0.1198,  0.0980],
         [-0.4373, -0.2892,  0.2922],
         [-0.4714, -0.2028,  0.1304],
         [-0.1669, -0.1361,  0.1724]],

        [[-0.2407, -0.1090,  0.2486],
         [ 0.1700, -0.0607,  0.0455],
         [ 0.2134, -0.0807,  0.0412],
         [ 0.0218, -0.1701,  0.1125],
         [ 0.1341, -0.1551,  0.0690],
         [ 0.2758, -0.0733,  0.0209],
         [ 0.1731, -0.1654,  0.0398]]], grad_fn=<TransposeBackward0>)

In [18]:
# Re-compute the LSTM by hand, one time step at a time, using the gate
# parameters sliced out of my_LSTM, so the result can be compared with the
# module's own output.
manual_output = []

# h starts as (1, batch, hidden) and becomes (batch, hidden) after the first
# step because of the squeeze(0) calls; c keeps its leading dimension of 1
# through broadcasting against the initial cell state.
h = initial_hidden
c = initial_cell
for t in range(seq_size):
    # Slice out time step t for the whole batch: (batch, feat_size).
    x_t = x_in[:,t,:]
    print("Time Step ", t)
    print("i(input)")
    # Input gate: sigmoid(x_t W_ii^T + b_ii + h W_hi^T + b_hi).
    i = torch.sigmoid(torch.matmul(x_t, W_ii.T)+b_ii+torch.matmul(h, W_hi.T)+b_hi).squeeze(0)
    print(i)
    print("f(forget)")
    # Forget gate: sigmoid(x_t W_if^T + b_if + h W_hf^T + b_hf).
    f = torch.sigmoid(torch.matmul(x_t, W_if.T)+b_if + torch.matmul(h, W_hf.T)+b_hf).squeeze(0)
    print(f)
    print("g(cell)")
    # Candidate cell values: tanh(x_t W_ig^T + b_ig + h W_hg^T + b_hg).
    g = torch.matmul(x_t, W_ig.T)+b_ig + torch.matmul(h, W_hg.T)+b_hg
    g = torch.tanh(g).squeeze(0)
    print(g)
    print("o(output)")
    # Output gate: sigmoid(x_t W_io^T + b_io + h W_ho^T + b_ho).
    o = torch.sigmoid(torch.matmul(x_t, W_io.T)+b_io+torch.matmul(h, W_ho.T)+b_ho).squeeze(0)
    print(o)
    print("c(cell state)")
    # New cell state: keep part of the old state, add the gated candidate.
    c = f * c + i * g
    print(c)
    print("h(hidden state)")
    # New hidden state: output gate applied to the squashed cell state.
    h = o * torch.tanh(c)
    h = h.squeeze(0)
    print(h)
    # Collect the hidden state of every step; stacked in a later cell.
    manual_output.append(h)
    print("-"*100)

# State after the last time step, to compare with hc_final.
manual_h_final = h
manual_c_final = c

Time Step  0
i(input)
tensor([[0.6561, 0.3989, 0.2135],
        [0.8615, 0.4331, 0.6957]], grad_fn=<SqueezeBackward1>)
f(forget)
tensor([[0.2325, 0.2856, 0.4908],
        [0.3791, 0.5730, 0.5447]], grad_fn=<SqueezeBackward1>)
g(cell)
tensor([[ 0.4629, -0.7871, -0.2415],
        [-0.6241, -0.6966,  0.8679]], grad_fn=<SqueezeBackward1>)
o(output)
tensor([[0.7171, 0.1089, 0.1723],
        [0.4900, 0.3721, 0.4606]], grad_fn=<SqueezeBackward1>)
c(cell state)
tensor([[[ 0.3037, -0.3139, -0.0515],
         [-0.5377, -0.3017,  0.6038]]], grad_fn=<AddBackward0>)
h(hidden state)
tensor([[ 0.2113, -0.0331, -0.0089],
        [-0.2407, -0.1090,  0.2486]], grad_fn=<SqueezeBackward1>)
----------------------------------------------------------------------------------------------------
Time Step  1
i(input)
tensor([[0.7967, 0.5070, 0.4956],
        [0.5731, 0.5608, 0.1658]], grad_fn=<SqueezeBackward1>)
f(forget)
tensor([[0.2216, 0.3757, 0.6727],
        [0.1258, 0.2683, 0.6552]], grad_fn=<SqueezeBackwa

In [19]:
# Stack the per-step hidden states into (seq, batch, hidden), then permute to
# (batch, seq, hidden) so the layout matches the batch_first LSTM output.
# NOTE: this rebinds manual_output from a list to a tensor; re-run the loop
# cell above before re-running this one.
manual_output = torch.stack(manual_output).permute(1,0,2)
manual_output

tensor([[[ 0.2113, -0.0331, -0.0089],
         [ 0.0287, -0.1838,  0.0847],
         [-0.4075, -0.1287,  0.3579],
         [ 0.0038, -0.1198,  0.0980],
         [-0.4373, -0.2892,  0.2922],
         [-0.4714, -0.2028,  0.1304],
         [-0.1669, -0.1361,  0.1724]],

        [[-0.2407, -0.1090,  0.2486],
         [ 0.1700, -0.0607,  0.0455],
         [ 0.2134, -0.0807,  0.0412],
         [ 0.0218, -0.1701,  0.1125],
         [ 0.1341, -0.1551,  0.0690],
         [ 0.2758, -0.0733,  0.0209],
         [ 0.1731, -0.1654,  0.0398]]], grad_fn=<PermuteBackward0>)

In [20]:
# Output of my_LSTM, shown for comparison with manual_output.
output

tensor([[[ 0.2113, -0.0331, -0.0089],
         [ 0.0287, -0.1838,  0.0847],
         [-0.4075, -0.1287,  0.3579],
         [ 0.0038, -0.1198,  0.0980],
         [-0.4373, -0.2892,  0.2922],
         [-0.4714, -0.2028,  0.1304],
         [-0.1669, -0.1361,  0.1724]],

        [[-0.2407, -0.1090,  0.2486],
         [ 0.1700, -0.0607,  0.0455],
         [ 0.2134, -0.0807,  0.0412],
         [ 0.0218, -0.1701,  0.1125],
         [ 0.1341, -0.1551,  0.0690],
         [ 0.2758, -0.0733,  0.0209],
         [ 0.1731, -0.1654,  0.0398]]], grad_fn=<TransposeBackward0>)

In [21]:
# Final hidden state from the manual loop.
manual_h_final

tensor([[-0.1669, -0.1361,  0.1724],
        [ 0.1731, -0.1654,  0.0398]], grad_fn=<SqueezeBackward1>)

In [22]:
# Final cell state from the manual loop.
manual_c_final

tensor([[[-0.2965, -0.6367,  0.5053],
         [ 0.4401, -0.5866,  0.1724]]], grad_fn=<AddBackward0>)

In [23]:
# (h_n, c_n) returned by my_LSTM, for comparison with the manual final states.
hc_final

(tensor([[[-0.1669, -0.1361,  0.1724],
          [ 0.1731, -0.1654,  0.0398]]], grad_fn=<StackBackward0>),
 tensor([[[-0.2965, -0.6367,  0.5053],
          [ 0.4401, -0.5866,  0.1724]]], grad_fn=<StackBackward0>))

# 5. The Linear layers

In [24]:
# Flatten (batch, seq, hidden) into (batch * seq, hidden): one row per
# (sample, time step), the same reshape SurnameGenerationModel.forward does
# before its linear layer.
batch_size, seq_size, feat_size = output.shape
y_out = output.contiguous().view(batch_size * seq_size, feat_size)
y_out

tensor([[ 0.2113, -0.0331, -0.0089],
        [ 0.0287, -0.1838,  0.0847],
        [-0.4075, -0.1287,  0.3579],
        [ 0.0038, -0.1198,  0.0980],
        [-0.4373, -0.2892,  0.2922],
        [-0.4714, -0.2028,  0.1304],
        [-0.1669, -0.1361,  0.1724],
        [-0.2407, -0.1090,  0.2486],
        [ 0.1700, -0.0607,  0.0455],
        [ 0.2134, -0.0807,  0.0412],
        [ 0.0218, -0.1701,  0.1125],
        [ 0.1341, -0.1551,  0.0690],
        [ 0.2758, -0.0733,  0.0209],
        [ 0.1731, -0.1654,  0.0398]], grad_fn=<ViewBackward0>)

![image.png](attachment:image.png)

In [25]:
# Re-seed so the linear layer's initial weights are reproducible.
seed_everything(42)
# NOTE(review): SurnameGenerationModel sizes this layer's output as
# num_embeddings (the vocabulary size), whereas this demo uses seq_size (7).
# Confirm that is intended for the walkthrough.
fc = nn.Linear(in_features   = rnn_hidden_size, 
               out_features  = seq_size)

seed: 42


In [26]:
# Inspect the linear layer's weight and bias.
list(fc.named_parameters())

[('weight',
  Parameter containing:
  tensor([[ 0.4414,  0.4792, -0.1353],
          [ 0.5304, -0.1265,  0.1165],
          [-0.2811,  0.3391,  0.5090],
          [-0.4236,  0.5018,  0.1081],
          [ 0.4266,  0.0782,  0.2784],
          [-0.0815,  0.4451,  0.0853],
          [-0.2695,  0.1472, -0.2660]], requires_grad=True)),
 ('bias',
  Parameter containing:
  tensor([-0.0677, -0.2345,  0.3830, -0.4557, -0.2662, -0.1630, -0.3471],
         requires_grad=True))]

In [27]:
# The linear layer is applied to the 3-D LSTM output directly (it acts on the
# last dimension), not to the flattened y_out computed earlier; this rebinds
# y_out to a (batch, seq, out_features) tensor.
y_out = fc(output)
y_out

tensor([[[ 0.0109, -0.1193,  0.3078, -0.5628, -0.1811, -0.1957, -0.4066],
         [-0.1546, -0.1862,  0.3557, -0.5510, -0.2447, -0.2400, -0.4045],
         [-0.3577, -0.3926,  0.6361, -0.3091, -0.3504, -0.1566, -0.3515],
         [-0.1367, -0.2059,  0.3912, -0.5069, -0.2466, -0.2083, -0.3919],
         [-0.4388, -0.3958,  0.5566, -0.3841, -0.3940, -0.2312, -0.3496],
         [-0.3906, -0.4437,  0.5131, -0.3437, -0.4468, -0.2037, -0.2846],
         [-0.2299, -0.2857,  0.4715, -0.4347, -0.3000, -0.1953, -0.3680]],

        [[-0.2598, -0.3194,  0.5402, -0.3816, -0.3082, -0.1707, -0.3644],
         [-0.0279, -0.1314,  0.3378, -0.5533, -0.1857, -0.2000, -0.4140],
         [-0.0178, -0.1063,  0.3166, -0.5822, -0.1700, -0.2128, -0.4275],
         [-0.1548, -0.1883,  0.3764, -0.5382, -0.2388, -0.2309, -0.4080],
         [-0.0921, -0.1357,  0.3278, -0.5829, -0.2019, -0.2371, -0.4245],
         [ 0.0160, -0.0765,  0.2913, -0.6071, -0.1485, -0.2164, -0.4378],
         [-0.0760, -0.1171,  0.2985,

![image-3.png](attachment:image-3.png)

In [28]:
# Reshape to (batch, seq, new_feat_size). y_out already has that shape here
# because fc was applied to the 3-D tensor, so this view leaves it unchanged;
# it mirrors the final step of SurnameGenerationModel.forward.
new_feat_size = y_out.shape[-1]
y_out = y_out.view(batch_size, seq_size, new_feat_size)
y_out

tensor([[[ 0.0109, -0.1193,  0.3078, -0.5628, -0.1811, -0.1957, -0.4066],
         [-0.1546, -0.1862,  0.3557, -0.5510, -0.2447, -0.2400, -0.4045],
         [-0.3577, -0.3926,  0.6361, -0.3091, -0.3504, -0.1566, -0.3515],
         [-0.1367, -0.2059,  0.3912, -0.5069, -0.2466, -0.2083, -0.3919],
         [-0.4388, -0.3958,  0.5566, -0.3841, -0.3940, -0.2312, -0.3496],
         [-0.3906, -0.4437,  0.5131, -0.3437, -0.4468, -0.2037, -0.2846],
         [-0.2299, -0.2857,  0.4715, -0.4347, -0.3000, -0.1953, -0.3680]],

        [[-0.2598, -0.3194,  0.5402, -0.3816, -0.3082, -0.1707, -0.3644],
         [-0.0279, -0.1314,  0.3378, -0.5533, -0.1857, -0.2000, -0.4140],
         [-0.0178, -0.1063,  0.3166, -0.5822, -0.1700, -0.2128, -0.4275],
         [-0.1548, -0.1883,  0.3764, -0.5382, -0.2388, -0.2309, -0.4080],
         [-0.0921, -0.1357,  0.3278, -0.5829, -0.2019, -0.2371, -0.4245],
         [ 0.0160, -0.0765,  0.2913, -0.6071, -0.1485, -0.2164, -0.4378],
         [-0.0760, -0.1171,  0.2985,

In [29]:
# dim=1 is the sequence axis of the (batch, seq, features) tensor, so this
# returns, for each feature column, the maximum over time steps and the time
# step at which it occurs.
# NOTE(review): if the goal is the highest-scoring output unit at each time
# step, the reduction would need dim=2 — confirm intent.
y_out.max(dim=1)

torch.return_types.max(
values=tensor([[ 0.0109, -0.1193,  0.6361, -0.3091, -0.1811, -0.1566, -0.2846],
        [ 0.0160, -0.0765,  0.5402, -0.3816, -0.1485, -0.1707, -0.3644]],
       grad_fn=<MaxBackward0>),
indices=tensor([[0, 0, 2, 2, 0, 2, 5],
        [5, 5, 0, 0, 5, 0, 0]]))

![image-2.png](attachment:image-2.png)