**This tutorial is based on / copied from http://peterbloem.nl/blog/transformers.**

# Imports

In [None]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam

!pip install pytorch_lightning
import pytorch_lightning as pl

# Load the TensorBoard notebook extension
%load_ext tensorboard

# Transformer Implementation
Taken from the previous exercise.

The transformer architecture consists of multiple transformer blocks that typically look like this: 

<img src="https://raw.githubusercontent.com/leox1v/dl20/b3d5b5556d1b2bd360a4abeef4fd82f056ab0301/imgs/transformer-block.svg?token=AD5WN2SZYWM6XGH5SXMZM7S7VU3H4" alt="drawing" width="500"/>


It combines a self-attention layer, [layer normalization](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html), a feed-forward layer, and another layer normalization. Additionally, it uses residual connections around the self-attention and feed-forward layers.

In [None]:
# Let's implement a SelfAttention torch module.

class SelfAttention(nn.Module):
    """
    Multi-head scaled dot-product self attention.

    Every head works on full `d`-dimensional queries, keys and values
    (the projections map `d -> d * heads`); the per-head results are
    concatenated and projected back to `d` dimensions.

    Args:
        d: The embedding dimension.
        heads: The number of attention heads.
    """
    def __init__(self, d: int, heads: int=8):
        super().__init__()
        self.k = d
        self.h = heads

        # One bias-free projection each for queries, keys and values;
        # each produces the vectors of all heads at once.
        projected_dim = d * heads
        self.Wq = nn.Linear(d, projected_dim, bias=False)
        self.Wk = nn.Linear(d, projected_dim, bias=False)
        self.Wv = nn.Linear(d, projected_dim, bias=False)

        # Maps the concatenated head outputs back to a single
        # d-dimensional vector per position.
        self.unifyheads = nn.Linear(projected_dim, d)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: The input embedding of shape [b, l, d].

        Returns:
            Self attention tensor of shape [b, l, d].
        """
        batch, seq_len, dim = x.size()
        n_heads = self.h

        def fold_heads(projected: torch.Tensor) -> torch.Tensor:
            # [b, l, h*d] -> [b, l, h, d] -> [b, h, l, d] -> [b*h, l, d].
            # Folding the heads into the batch dimension lets one bmm
            # call handle all heads.
            per_head = projected.view(batch, seq_len, n_heads, dim)
            per_head = per_head.transpose(1, 2).contiguous()
            return per_head.view(batch * n_heads, seq_len, dim)

        queries = fold_heads(self.Wq(x))
        keys = fold_heads(self.Wk(x))
        values = fold_heads(self.Wv(x))

        # Raw attention scores of shape [b*h, l, l], scaled by sqrt(d).
        scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(dim)

        # Normalize each row so the weights over the keys sum to one.
        weights = F.softmax(scores, dim=-1)

        # Weighted sum of the values, then unfold the heads: [b, h, l, d].
        attended = torch.bmm(weights, values).view(batch, n_heads, seq_len, dim)

        # Move the head axis next to the feature axis and concatenate
        # the heads: [b, l, h*d].
        concatenated = attended.transpose(1, 2).contiguous()
        concatenated = concatenated.view(batch, seq_len, n_heads * dim)

        # Project back to [b, l, d].
        return self.unifyheads(concatenated)
  
  
class TransformerBlock(nn.Module):
    """
    One transformer block: self attention followed by a feed-forward
    network, each wrapped in a residual connection and followed by
    layer normalization.

    Args:
        d (int): The embedding dimension.
        heads (int): The number of attention heads.
        n_mlp (int): Width multiplier of the feed-forward hidden layer
            (the hidden layer has `n_mlp * d` units).
    """
    def __init__(self, d: int, heads: int=8, n_mlp: int=4):
        super().__init__()

        # Self attention sub-layer.
        self.attention = SelfAttention(d, heads=heads)

        # One layer norm after each residual connection.
        self.norm1 = nn.LayerNorm(d)
        self.norm2 = nn.LayerNorm(d)

        # Position-wise feed-forward sub-layer: d -> n_mlp*d -> d.
        hidden = n_mlp * d
        self.ff = nn.Sequential(
            nn.Linear(d, hidden),
            nn.ReLU(),
            nn.Linear(hidden, d),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: The input embedding of shape [b, l, d].

        Returns:
            Transformer output tensor of shape [b, l, d].
        """
        # Attention + residual, then normalize.
        attended = self.norm1(self.attention(x) + x)

        # Feed-forward + residual, then normalize.
        return self.norm2(self.ff(attended) + attended)


# IMDB Movie Reviews Sentiment Classification
We train a Transformer classification model to predict the sentiment (positive, negative) of movie reviews in the [IMDB dataset](https://keras.io/api/datasets/imdb/).

We use the [PyTorch Lightning framework](https://pytorch-lightning.readthedocs.io/en/latest/) to massively reduce the amount of code we need to write.

<img src="https://raw.githubusercontent.com/leox1v/dl20/b3d5b5556d1b2bd360a4abeef4fd82f056ab0301/imgs/classifier.svg?token=AD5WN2R4NHQQTQFOSQHAVWK7VU3JU" alt="drawing" width="800"/>

## Data Loader

In [None]:

class IMDBDataModule(pl.LightningDataModule):
    """
    LightningDataModule to load the IMDB movie review sentiment data.

    Call `setup(num_words, max_seq_len)` once before requesting examples
    or dataloaders.

    Args:
        batch_size (int): Number of reviews per batch in every dataloader.
    """

    def __init__(self, batch_size: int):
        super().__init__()
        self.batch_size = batch_size

    # NOTE(review): Lightning's own hook is `setup(stage)`; this override
    # takes different arguments and is meant to be called manually before
    # `trainer.fit`. Confirm the installed Lightning version does not call
    # it again with a `stage` argument.
    def setup(self, num_words: int, max_seq_len: int):
        """
        Initial loading of the dataset and transformation.

        Args:
            num_words (int): The vocabulary size. The vocabulary is
                sorted by frequency of appearance in the dataset.
            max_seq_len (int): The length every review is padded to.
                Reviews that are too long for this limit are dropped by
                `imdb.load_data` rather than truncated, which is why the
                printed number of examples is smaller than the full
                dataset.
        """
        (self.x_train, self.y_train), (self.x_test, self.y_test) = imdb.load_data(
            num_words=num_words,
            maxlen=max_seq_len
        )
        print(f'# Training Examples: {len(self.y_train)}')
        print(f'# Test Examples: {len(self.y_test)}')

        # The Keras word index starts at 1; the ids in the dataset are
        # shifted by 3 to make room for the special tokens below.
        self.word2idx = dict(
            **{k: v+3 for k, v in imdb.get_word_index().items()},
            **{'<PAD>': 0,
               '<START>': 1,
               '<UNK>': 2,
               '<UNUSED>': 3,
              },
        )
        self.idx2word = {v: k for k, v in self.word2idx.items()}

        # Pad every review with the <PAD> id (0) up to max_seq_len.
        self.x_train = pad_sequences(self.x_train, maxlen=max_seq_len, value=0)
        self.x_test = pad_sequences(self.x_test, maxlen=max_seq_len, value=0)

    def example(self):
        """Returns a random training example as readable text with its label."""
        idx = np.random.randint(0, len(self.x_train))
        x, y = self.x_train[idx], self.y_train[idx]
        # Ids 0 (<PAD>) and 1 (<START>) are skipped in the printed review.
        review = ' '.join(self.idx2word[token_id] for token_id in x if token_id > 1)
        sentiment = 'POSITIVE' if y else 'NEGATIVE'
        return f'{review}\nSentiment: {sentiment}'

    def _make_loader(self, x, y, shuffle: bool = False):
        """Wraps padded token ids `x` and labels `y` in a DataLoader."""
        dataset = TensorDataset(torch.LongTensor(x), torch.LongTensor(y))
        return DataLoader(dataset, self.batch_size, shuffle=shuffle)

    def train_dataloader(self):
        # Reshuffle every epoch so the model does not see the same
        # batches in the same order each time.
        return self._make_loader(self.x_train, self.y_train, shuffle=True)

    def test_dataloader(self):
        return self._make_loader(self.x_test, self.y_test)

    def val_dataloader(self):
        # No separate validation split exists: validation reuses the
        # test set.
        return self._make_loader(self.x_test, self.y_test)
    
# Quick sanity check of the data module: load a small configuration
# and print three random training reviews with their labels.
imdb_data = IMDBDataModule(batch_size=128)
imdb_data.setup(num_words=30000, max_seq_len=100)
print(
    '\nExamples:',
    '\n\n'.join(imdb_data.example() for _ in range(3)),
    sep='\n',
)
    

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
# Training Examples: 2773
# Test Examples: 2963
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json

Examples:
its not braveheart thankfully but it is fine entertainment with engaging characters and good acting all around i enjoyed this film when it was released and upon viewing it again last week find it has held up well over time not a classic film but a very fine and watchable movie to enjoy as great entertainment
Sentiment: POSITIVE

this is an absolutely incredible film it shows south african racism from the perspective of the victims and provokes a feeling of anti racism in everyone who sees it it is the best historic film i have ever seen
Sentiment: POSITIVE

this movie really kicked some ass i watched it over and over and it never got boring angelina jolie really kicked some ass in the movie you should see the movie you won't be disappoin

In [None]:
class TextClassificationTransformer(pl.LightningModule):
    """
    Stacked Transformer blocks for sequence classification.

    Token and learned position embeddings are summed, passed through
    `depth` transformer blocks, mean-pooled over the sequence dimension
    and mapped to class logits.

    Args:
        d (int): The embedding dimension.
        heads (int): The number of attention heads for each transformer block.
        depth (int): The number of transformer blocks.
        max_seq_len (int): The maximum number of tokens of each sequence.
        num_tokens (int): The vocabulary size (number of token embeddings).
        num_classes (int): The number of classification classes.
    """
    def __init__(self, d: int=128, heads: int=8, depth: int=6,
                max_seq_len: int=512, num_tokens: int=30000,
                num_classes: int=2):
        super().__init__()
        self.save_hyperparameters()

        self.num_tokens = num_tokens

        # Embeddings for tokens and position.
        self.token_emb = nn.Embedding(num_tokens, d)
        self.pos_emb = nn.Embedding(max_seq_len, d)

        # The stacked transformer blocks.
        self.transformer_blocks = nn.Sequential(
            *[TransformerBlock(d=d, heads=heads) for _ in range(depth)]
        )

        # Mapping of final output sequence to class logits.
        self.classification = nn.Linear(d, num_classes)

        self.criterion = nn.CrossEntropyLoss()
        # NOTE(review): `pl.metrics` only exists in older Lightning
        # releases (newer ones moved metrics to `torchmetrics`) - confirm
        # against the installed version.
        self.accuracy = pl.metrics.Accuracy()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): A tensor of shape (b, l) of integer values
                representing words in some predetermined vocabulary.

        Returns:
            A tensor of shape (b, c) of logits over the classes
                (c is the number of classes).

        Raises:
            ValueError: If the sequence length exceeds the number of
                available position embeddings.
        """
        b, l = x.size()
        max_len = self.pos_emb.num_embeddings
        if l > max_len:
            # Fail with a clear message instead of an opaque index error
            # from the position embedding lookup.
            raise ValueError(
                f'Sequence length {l} exceeds max_seq_len={max_len}.'
            )

        # 1. Generate token embeddings. Shape: [b, l, d].
        tokens = self.token_emb(x)

        # 2. Generate position embeddings. Shape: [b, l, d].
        #    Positions are created on the same device as the input.
        positions = torch.arange(l, device=x.device)
        positions = self.pos_emb(positions)[None, :, :].expand(b, l, -1)

        # 3. Generate final embedding by taking the sum of the two embeddings.
        x = tokens + positions

        # 4. Feed the embedding into the transformer blocks. Shape: [b, l, d].
        x = self.transformer_blocks(x)

        # 5. Compute the mean latent vector for each sequence.
        #    The mean is applied over dim=1 (time). Shape: [b, d].
        #    Padding positions are included in the mean.
        x = x.mean(dim=1)

        # 6. Classify. Shape: [b, num_classes].
        return self.classification(x)

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=1e-4)

    def training_step(self, batch, batch_idx):
        x, y = batch

        # Forward pass.
        logits = self(x)

        # Compute the loss with CrossEntropy.
        loss = self.criterion(logits, y)

        # Log the metrics.
        self.log('loss', loss, on_epoch=True, prog_bar=True)
        self.log('acc', self.accuracy(logits, y), on_epoch=True, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        # Lightning automatically disables gradients and puts model in eval mode.
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)

        # Log the metrics.
        self.log('test_loss', loss, on_epoch=True)
        self.log('test_acc', self.accuracy(logits, y), on_epoch=True, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        # Validation reuses the test logic, so its metrics are also
        # logged under the 'test_loss' / 'test_acc' names.
        return self.test_step(batch, batch_idx)
        
        
        

In [None]:
# Hyperparameters for the transformer run.
NUM_WORDS = 10000
MAX_SEQ_LEN = 128
EMBEDDING_DIM = 128
BATCH_SIZE = 32

imdb_data = IMDBDataModule(batch_size=BATCH_SIZE)
imdb_data.setup(num_words=NUM_WORDS,
                max_seq_len=MAX_SEQ_LEN)

model = TextClassificationTransformer(d=EMBEDDING_DIM,
                                      max_seq_len=MAX_SEQ_LEN,
                                      num_tokens=NUM_WORDS)
logger = pl.loggers.TensorBoardLogger('tb_logs', name='transformer')
# Train on one GPU when CUDA is available; otherwise leave `gpus` at its
# default (None) so the cell also runs on CPU-only machines.
trainer = pl.Trainer(max_epochs=5,
                     default_root_dir='ckpts',
                     gpus=1 if torch.cuda.is_available() else None,
                     logger=logger)
trainer.fit(model, imdb_data)
_ = trainer.test()

## RNN
Let's compare our transformer model against an RNN. We want to build a basic RNN architecture as shown in the image below, where we use the output of the final cell to predict the sentiment label. 

<img src="https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/assets/sentiment4.png?raw=true" alt="drawing" width="600"/>

In [None]:
class TextClassificationRNN(pl.LightningModule):
    """
    RNN for sequence classification.

    Tokens are embedded, passed through a stacked GRU, and the GRU output
    at the last time step is mapped to class logits.

    Args:
        d (int): The embedding dimension (also the GRU hidden size).
        depth (int): The number of stacked GRU layers.
        max_seq_len (int): The maximum number of tokens of each sequence.
            Not used by the layers of this model.
        num_tokens (int): The vocabulary size (number of token embeddings).
        num_classes (int): The number of classification classes.
    """
    def __init__(self, d: int=128, depth: int=6,
                max_seq_len: int=512, num_tokens: int=30000,
                num_classes: int=2):
        super().__init__()
        self.save_hyperparameters()

        self.num_tokens = num_tokens

        # Embeddings for tokens.
        self.token_emb = nn.Embedding(num_tokens, d)

        # The stacked GRU layers.
        self.rnn = nn.GRU(input_size=d,
                          hidden_size=d,
                          num_layers=depth,
                          batch_first=True)

        # Mapping of final output sequence to class logits.
        self.classification = nn.Linear(d, num_classes)

        self.criterion = nn.CrossEntropyLoss()
        # NOTE(review): `pl.metrics` only exists in older Lightning
        # releases (newer ones moved metrics to `torchmetrics`) - confirm
        # against the installed version.
        self.accuracy = pl.metrics.Accuracy()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): A tensor of shape (b, l) of integer values
                representing words in some predetermined vocabulary.

        Returns:
            A tensor of shape (b, c) of logits over the classes
                (c is the number of classes).
        """
        # 1. Generate token embeddings. (No position embedding required for RNNs!)
        #     Shape: [b, l, d].
        embedded = self.token_emb(x)

        # 2. Feed the embedding into the GRU. Shape: [b, l, d]. Use the output
        #     of the last token as the encoding.
        #     NOTE(review): this relies on real tokens sitting at the end of
        #     the sequence, i.e. on left-padded inputs - confirm the padding
        #     side used by the data pipeline.
        outputs, _ = self.rnn(embedded)
        encoding = outputs[:, -1, :]

        # 3. Classify. Shape: [b, num_classes].
        return self.classification(encoding)

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=1e-4)

    def training_step(self, batch, batch_idx):
        x, y = batch

        # Forward pass.
        logits = self(x)

        # Compute the loss with CrossEntropy.
        loss = self.criterion(logits, y)

        # Log the metrics.
        self.log('loss', loss, on_epoch=True, prog_bar=True)
        self.log('acc', self.accuracy(logits, y), on_epoch=True, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        # Lightning automatically disables gradients and puts model in eval mode.
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)

        # Log the metrics.
        self.log('test_loss', loss, on_epoch=True)
        self.log('test_acc', self.accuracy(logits, y), on_epoch=True, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        # Validation reuses the test logic, so its metrics are also
        # logged under the 'test_loss' / 'test_acc' names.
        return self.test_step(batch, batch_idx)

In [None]:
# Hyperparameters for the RNN run.
NUM_WORDS = 10000
MAX_SEQ_LEN = 128
EMBEDDING_DIM = 512
BATCH_SIZE = 32

imdb_data = IMDBDataModule(batch_size=BATCH_SIZE)
imdb_data.setup(num_words=NUM_WORDS,
                max_seq_len=MAX_SEQ_LEN)

model = TextClassificationRNN(d=EMBEDDING_DIM,
                              max_seq_len=MAX_SEQ_LEN,
                              num_tokens=NUM_WORDS)
logger = pl.loggers.TensorBoardLogger('tb_logs', name='rnn')
# Train on one GPU when CUDA is available; otherwise leave `gpus` at its
# default (None) so the cell also runs on CPU-only machines.
trainer = pl.Trainer(max_epochs=5,
                     default_root_dir='ckpts',
                     gpus=1 if torch.cuda.is_available() else None,
                     logger=logger)
trainer.fit(model, imdb_data)
_ = trainer.test()

In [None]:
%tensorboard --logdir tb_logs