# Recurrent Neural Networks

### Computing Activations and Outputs in RNN

In [1]:
import torch 
import torch.nn as nn

import platform as pl 

use_GPU = True

if use_GPU:
    if pl.system().lower() == "linux":
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
else:
    device = torch.device("cpu")

print(f"Using device {device}")

Using device cuda


In [2]:
torch.manual_seed(1)
rnn_layer = nn.RNN(input_size=5, hidden_size=2, batch_first=True, num_layers=1)

In [3]:
# extract the recurrence layers (layers with weights)
w_xh = rnn_layer.weight_ih_l0 
w_hh = rnn_layer.weight_hh_l0
b_xh = rnn_layer.bias_ih_l0
b_hh = rnn_layer.bias_hh_l0

print(f"W_xh shape: {w_xh.shape}")
print(f"W_hh shape: {w_hh.shape}")
print(f"B_xh shape: {b_xh.shape}")
print(f"B_hh shape: {b_hh.shape}")

W_xh shape: torch.Size([2, 5])
W_hh shape: torch.Size([2, 2])
B_xh shape: torch.Size([2])
B_hh shape: torch.Size([2])


Manually call forward pass on a single layer `rnn_layer` and compute outputs at each time step  $o^{(0)},o^{(1)},o^{(2)}$. 

In [4]:
x_seq = torch.Tensor([[1.0]*5, [2.0]*5, [3.0]*5]).float()
print(f"x_seq shape: {x_seq.shape}")

# output of simple RNN
output, hn = rnn_layer(x_seq.view(1,3,5))

# manually compute the output
out_manual = []
for t in range(3):
    xt = x_seq[t].reshape(1,5)
    print(f"Time step {t} =>")
    print(f"\tInput = {xt.numpy()}")
    ht = torch.matmul(xt, torch.transpose(w_xh, 0, 1)) + b_xh
    print(f"\tHidden = {ht.detach().numpy()}")
    if t > 0:
        prev_h = out_manual[t-1]
    else:
        prev_h = torch.zeros((ht.shape))
    ot = ht + torch.matmul(prev_h, torch.transpose(w_hh, 0, 1)) + b_hh
    ot = torch.tanh(ot)
    out_manual.append(ot)
    print(f"\tOutput (manual) = {ot.detach().numpy()}")
    print(f"\tRNN Output = {output[:,t].detach().numpy()}")


x_seq shape: torch.Size([3, 5])
Time step 0 =>
	Input = [[1. 1. 1. 1. 1.]]
	Hidden = [[-0.4701929   0.58639044]]
	Output (manual) = [[-0.3519801   0.52525216]]
	RNN Output = [[-0.35198015  0.52525216]]
Time step 1 =>
	Input = [[2. 2. 2. 2. 2.]]
	Hidden = [[-0.88883156  1.2364398 ]]
	Output (manual) = [[-0.68424344  0.76074266]]
	RNN Output = [[-0.68424344  0.76074266]]
Time step 2 =>
	Input = [[3. 3. 3. 3. 3.]]
	Hidden = [[-1.3074702  1.8864892]]
	Output (manual) = [[-0.8649416  0.9046636]]
	RNN Output = [[-0.8649416   0.90466356]]


### Project 1: Predicting Sentiment of IMDb Movie Reviews

Before data is fed into the model, the following preprocessing steps are needed:
1. Split training set into train and val
2. Identify unique words 
3. Map unique word to unique integer and encode movie review into encoded integers
4. Divide dataset into mini-batches as RNN input

In [5]:
from torchtext.datasets import IMDB

train_dataset = IMDB(split="train")
test_dataset = IMDB(split="test")



In [6]:
from torch.utils.data.dataset import random_split

# 1. create the datasets
torch.manual_seed(1)
train_dataset, val_dataset = random_split(list(train_dataset), [20000, 5000])

In [7]:
# 2. find unique tokens
import re 
from collections import Counter, OrderedDict

def tokenizer(text):
    # remove URLS
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

token_counts = Counter() # dict subclass for counting frequency of hashable items

for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print(f"Vocab size = {len(token_counts)}")

Vocab size = 69023


In [8]:
# 3. encode each unique token into integers 
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ["this", "is", "an", "example"]])

[11, 7, 35, 457]


In [9]:
# 3-A. define functions for token transformation
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1 if x == "pos" else 0

In [10]:
# 3-B. wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), 
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [11]:
# see how padding works 
from torch.utils.data import DataLoader

dataloader = DataLoader(train_dataset, batch_size=2,shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print("Labels = ", label_batch)
print("Batch lengths = ", length_batch)
print("Batch shape = ", text_batch.shape) # will see that batches all have shape equal to longest length

tensor([[   35,  1739,     7,   449,   721,     6,   301,     4,   787,     9,
             4,    18,    44,     2,  1705,  2460,   186,    25,     7,    24,
           100,  1874,  1739,    25,     7, 34415,  3568,  1103,  7517,   787,
             5,     2,  4991, 12401,    36,     7,   148,   111,   939,     6,
         11598,     2,   172,   135,    62,    25,  3199,  1602,     3,   928,
          1500,     9,     6,  4601,     2,   155,    36,    14,   274,     4,
         42945,     9,  4991,     3,    14, 10296,    34,  3568,     8,    51,
           148,    30,     2,    58,    16,    11,  1893,   125,     6,   420,
          1214,    27, 14542,   940,    11,     7,    29,   951,    18,    17,
         15994,   459,    34,  2480, 15211,  3713,     2,   840,  3200,     9,
          3568,    13,   107,     9,   175,    94,    25,    51, 10297,  1796,
            27,   712,    16,     2,   220,    17,     4,    54,   722,   238,
           395,     2,   787,    32,    27,  5236,  

In [12]:
batch_size = 32  

train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

### Embedding

Idea: use finite-sized vectors to represent infinite number of real numbers.

Since embeddings are learnable, salient features can be extracted.

In [13]:
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0) # 10 just means there's 10 different tokens for this toy example
text_encoded_input = torch.LongTensor([[1,2,3,4],[4,3,2,0]])
print(embedding(text_encoded_input)) 

tensor([[[ 0.7039, -0.8321, -0.4651],
         [-0.3203,  2.2408,  0.5566],
         [ 0.0946, -0.3531,  0.9124],
         [-0.4643,  0.3046,  0.7046]],

        [[-0.4643,  0.3046,  0.7046],
         [ 0.0946, -0.3531,  0.9124],
         [-0.3203,  2.2408,  0.5566],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


### Building RNN Model

Can either use `RNN`, `LSTM`, or `GRU` when building a recurrent neural network.

NOTE: `LSTM` will output a tuple containing (1) the output features and (2) hidden states and cell states.

In [14]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, rnn_layer=nn.RNN):
        super().__init__()
        self.rnn = rnn_layer(input_size, hidden_size, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        _, hidden = self.rnn(x)
        if isinstance(self.rnn, nn.LSTM):
            hidden = hidden[0]
        out = hidden[-1, :, :]
        out = self.fc(out)
        return out

In [15]:
model = RNN(input_size=64, hidden_size=32, rnn_layer=nn.LSTM)
print(model)
print(model(torch.randn((5, 3, 64))))

RNN(
  (rnn): LSTM(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)
tensor([[0.0774],
        [0.0780],
        [0.0749],
        [0.0779],
        [0.0865]], grad_fn=<AddmmBackward0>)


### Building a RNN for Sentiment Analysis Task

In [16]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size, output_size, rnn_layer=nn.RNN):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = rnn_layer(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), 
                                                    enforce_sorted=False, batch_first=True)
        if isinstance(self.rnn, nn.LSTM):
            out, (hidden, cell) = self.rnn(out)
        else:
            out, hidden = self.rnn(out)
        
        out = hidden[-1,:,:]
        out = self.relu(self.fc1(out))
        out = self.sigmoid(self.fc2(out))
        return out     

In [17]:
vocab_size = len(vocab)
print(f"Vocab size = {vocab_size}")

embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
torch.manual_seed(1)

model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size, 1, rnn_layer=nn.LSTM)
print(model)

Vocab size = 69025
RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [18]:
def train(model, train_loader, optimizer, loss_fn):
    model.to(device)
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in train_loader:
        text_batch, label_batch = text_batch.to(device), label_batch.to(device).float()
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:,0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(train_loader.dataset), total_loss/len(train_loader.dataset)

In [19]:
@torch.no_grad()
def evaluate(model, val_loader):
    model.eval()
    for text_batch, label_batch, lengths in val_loader:
        text_batch, label_batch = text_batch.to(device), label_batch.to(device)
        pred = model(text_batch, lengths)[:,0]
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
    return total_acc/len(val_loader.dataset)

In [20]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [21]:
num_epochs = 10
torch.manual_seed(1)
for e in range(num_epochs):
    acc_train, acc_loss = train(model, train_dl, optimizer, loss_fn)
    acc_val = evaluate(model, valid_dl)
    print(f"Epoch {e}/{num_epochs}: Loss = {acc_loss:.4f} | Train Acc = {acc_train:.4f} | Val Acc = {acc_val:.4f}")