In [1]:
import torch
import torch.nn as nn
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import numpy as np

# Text representation 

## Vectorization
Convert text to indices vector

In [2]:
# Init tokenizer
tokenizer = get_tokenizer('basic_english')

# using yield for large corpus
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)


def vectorize(text, vocab, sequence_length=5):
    tokens = tokenizer(text)
    tokens = [vocab[token] for token in tokens]

    # padding, if len(text) > sequence_lenght -> num_pad < 0
    num_pads = sequence_length - len(tokens)
    tokens = tokens[:sequence_length] + [vocab["<pad>"]]*num_pads
    
    return torch.tensor(tokens, dtype=torch.long)

In [3]:
# sample data
sample1 = 'We are learning AI'
sample2 = 'AI is a CS topic'
data = [sample1, sample2]

# create vocab size
vocab_size = 8
vocab = build_vocab_from_iterator(yield_tokens(data),
                                  max_tokens=vocab_size,
                                  specials=["<unk>", 
                                            "<pad>"])
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'learning': 7,
 'is': 6,
 'are': 4,
 'cs': 5,
 'a': 3,
 'ai': 2,
 '<pad>': 1,
 '<unk>': 0}

In [4]:
SEQUENCE_LENGTH = 5
v1 = vectorize(sample1, vocab, sequence_length=SEQUENCE_LENGTH)
v2 = vectorize(sample2, vocab, sequence_length=SEQUENCE_LENGTH)
v1, v2

(tensor([0, 4, 7, 2, 1]), tensor([2, 6, 3, 5, 0]))

## Embedding
Convert indices vector to tensor using embedding layer (trainable embedding matrix)

In [9]:
# Set random seed for PyTorch
torch.manual_seed(42)

embed_dim = 4
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
print(embedding.weight.shape)

torch.Size([8, 4])


In [53]:
e1 = embedding(v1)
e2 = embedding(v2)
e1.shape, e2.shape

(torch.Size([5, 4]), torch.Size([5, 4]))

# RNN_layer

In [45]:
import torchinfo

sequence_lenth = 5
embed_dim = 4
hidden_dim = 3
rnn = nn.RNN(embed_dim, hidden_dim, num_layers=2, batch_first=True)
# input = (bath, L, embed_dim)
torchinfo.summary(rnn, input=(1, sequence_lenth, embed_dim))

Layer (type:depth-idx)                   Param #
RNN                                      51
Total params: 51
Trainable params: 51
Non-trainable params: 0

In [46]:
# input embedding 1
e1.shape, e1.unsqueeze(0).shape

(torch.Size([5, 4]), torch.Size([1, 5, 4]))

In [47]:
rnn_op, rnn_hn = rnn(e1.unsqueeze(0))
rnn_op.shape, rnn_hn.shape

(torch.Size([1, 5, 3]), torch.Size([2, 1, 3]))

##  Visualize output

In [48]:
# all hiden state [batch, L, H_out]
rnn_op[:, -1, :]

tensor([[ 0.3051,  0.1533, -0.8618]], grad_fn=<SliceBackward0>)

In [51]:
# final hidden state for each element in the batch.
# [num_layers, N, H_out]
rnn_hn[-1, :, :]

tensor([[ 0.3051,  0.1533, -0.8618]], grad_fn=<SliceBackward0>)

In [57]:
# batch = 8 ?
batch_size = 8
rnn_op, rnn_hn = rnn(torch.rand([batch_size, 5, 4]))
rnn_op.shape, rnn_hn.shape

(torch.Size([8, 5, 3]), torch.Size([2, 8, 3]))

## Bidirectional

In [67]:
batch_size = 8
sequence_lenth = 5
embed_dim = 4
hidden_dim = 3
num_layers = 1
rnn = nn.RNN(embed_dim, hidden_dim, num_layers, bidirectional=True, batch_first=True)
torchinfo.summary(rnn, input=(batch_size, sequence_lenth, embed_dim))

Layer (type:depth-idx)                   Param #
RNN                                      54
Total params: 54
Trainable params: 54
Non-trainable params: 0

In [68]:
rnn_op, rnn_hn = rnn(torch.rand([batch_size, sequence_lenth, embed_dim]))
rnn_op.shape, rnn_hn.shape

(torch.Size([8, 5, 6]), torch.Size([2, 8, 3]))