In [1]:
## Standard libraries
import os
import numpy as np
import random
import math
import json
from functools import partial

## Imports for plotting
import matplotlib.pyplot as plt
plt.set_cmap('cividis')
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgb
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
import seaborn as sns
sns.reset_orig()

## tqdm for loading bars
from tqdm.notebook import tqdm

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

## Torchvision
import torchvision
from torchvision.datasets import CIFAR100
from torchvision import transforms

  set_matplotlib_formats('svg', 'pdf') # For export


In [3]:
import pytorch_lightning as pl

# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
DATASET_PATH = "../data"
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = "../saved_models/tutorial6"

# Fix the global random seed so that repeated runs of the notebook are comparable
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility.
# NOTE: the flag is spelled `deterministic`; assigning to a misspelled attribute
# name does not configure cuDNN at all.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the first GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Global seed set to 42


Device: cpu


In [4]:
# Download the two pre-trained models used in this tutorial (skipped if already present)

import urllib.request
from urllib.error import HTTPError, URLError
# Github URL where saved models are stored for this tutorial
base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial6/"
# Files to download
pretrained_files = ["ReverseTask.ckpt", "SetAnomalyTask.ckpt"]

# Create checkpoint path if it doesn't exist yet
os.makedirs(CHECKPOINT_PATH, exist_ok=True)

# For each file, check whether it already exists. If not, try downloading it.
for file_name in pretrained_files:
    file_path = os.path.join(CHECKPOINT_PATH, file_name)
    if "/" in file_name:
        # File lives in a sub-folder of the checkpoint directory: create it first
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if not os.path.isfile(file_path):
        file_url = base_url + file_name
        print(f"Downloading {file_url}...")
        try:
            urllib.request.urlretrieve(file_url, file_path)
        except URLError as e:
            # URLError is the base class of HTTPError, so this also covers
            # connection/DNS failures and interrupted downloads.
            print("Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", e)
            # Do not leave a truncated file behind: it would be mistaken for a
            # complete checkpoint the next time this cell runs.
            if os.path.isfile(file_path):
                os.remove(file_path)

Downloading https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial6/ReverseTask.ckpt...
Downloading https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial6/SetAnomalyTask.ckpt...


### What is Attention? 
The attention mechanism describes a relatively new group of layers in neural networks that has attracted a lot of interest in the past few years, especially in sequence tasks. <br>

*The attention mechanism describes a weighted average of (sequence) elements with the weights dynamically computed based on an input query and elements’ keys.* We want to dynamically decide on which inputs we want to “attend” more than others. <br>

There are 4 usual parts: 
1. **Query**: feature vector that describes what we are looking for in the sequence i.e. what we maybe want to pay attention to
2. **Keys**: for each input element, we have a key (a feature vector); roughly describes what this element is "offering" or when it might be important. Keys should be designed in a way in which we can identify the elements we want to pay attention to based on the query.
3. **Values**: for each input element, we have a value vector. This is the feature vector that we want to average over.
4. **Score Function**: to rate which elements we want to pay attention to, we need to specify a score function $f_{attn}$. The score function takes the query and a key as input, and outputs the score/attention weight of the query-key pair. It is usually implemented by simple similarity metrics such as the dot product, or by a small MLP. <br>

The weights of the average are calculated by a softmax over all score function outputs. Hence, we assign those value vectors a higher weight whose corresponding key is most similar to the query.

The attention applied inside the Transformer architecture is called **self-attention**.

### Scaled Dot Product Attention
Allows for a network to attend over a sequence.

In [6]:
def scaled_dot_product(q, k, v, mask=None):
    """Scaled dot product attention.

    Inputs:
        q - Queries; the last dimension is the hidden dimensionality d_k
        k - Keys; the last two dimensions are swapped before the product with q
        v - Values that are averaged using the attention weights
        mask - Optional mask broadcastable to the attention logits; positions
               where mask == 0 are filled with -9e15 before the softmax
    Returns:
        (values, attention) - the attention-weighted values and the softmax
        attention weights (normalized over the last dimension)
    """
    hidden_dim = q.shape[-1]
    # Similarity of every query with every key
    scores = q @ k.transpose(-2, -1)
    # Scale so the variance of the logits does not grow with the hidden dimensionality
    scores = scores / math.sqrt(hidden_dim)
    if mask is not None:
        # Masked-out positions get a very large negative logit
        scores = scores.masked_fill(mask == 0, -9e15)
    attention = torch.softmax(scores, dim=-1)
    return attention @ v, attention

In [9]:
# Toy example: attention over a random sequence of 3 elements with 2 features each
seq_len, d_k = 3, 2
# Drawn in the order q, k, v so the random stream is consumed as before
q, k, v = (torch.rand(seq_len, d_k) for _ in range(3))
values, attention = scaled_dot_product(q, k, v)

# Attention will have dim (seq_len, seq_len)
for label, tensor in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("Values\n", values),
    ("Attention\n", attention),
):
    print(label, tensor)

Q
 tensor([[0.2666, 0.6274],
        [0.2696, 0.4414],
        [0.2969, 0.8317]])
K
 tensor([[0.1053, 0.2695],
        [0.3588, 0.1994],
        [0.5472, 0.0062]])
V
 tensor([[0.9516, 0.0753],
        [0.8860, 0.5832],
        [0.3376, 0.8090]])
Values
 tensor([[0.7303, 0.4861],
        [0.7262, 0.4902],
        [0.7336, 0.4830]])
Attention
 tensor([[0.3351, 0.3408, 0.3241],
        [0.3302, 0.3390, 0.3308],
        [0.3388, 0.3429, 0.3184]])


### Multi-Head Attention

Often there are multiple different aspects a sequence element wants to attend to, and a single weighted average is not a good option for it. This is why we extend the attention mechanism to multiple heads, i.e. multiple different query-key-value triplets on the same features. Given a query, key, and value matrix, we transform those into sub-queries, sub-keys, and sub-values, which we pass through the scaled dot product attention independently. Afterward, we concatenate the heads and combine them with a final weight matrix. 

In [None]:
class MultiheadAttention(nn.Module):
    """Multi-head self-attention layer.

    The input is projected to stacked queries, keys and values for all heads,
    each head is passed through ``scaled_dot_product`` independently, and the
    concatenated head outputs are combined by a final linear layer.
    """

    def __init__(self, input_dim, embed_dim, num_heads):
        """
        Inputs:
            input_dim - Dimensionality of the input features
            embed_dim - Dimensionality of the output; split evenly across the heads
            num_heads - Number of attention heads (must divide embed_dim)
        """
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be 0 modulo number of heads."

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # stack all weight matrices 1...h for efficiency
        # NOTE: in many implementations, see "bias=False" which is optional
        self.qkv_proj = nn.Linear(input_dim, 3*embed_dim)
        self.o_proj = nn.Linear(embed_dim, embed_dim)

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-uniform weights and zero biases for both projections."""
        # original Transformer initialization
        nn.init.xavier_uniform_(self.qkv_proj.weight)
        self.qkv_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.o_proj.weight)
        self.o_proj.bias.data.fill_(0)

    def forward(self, x, mask=None, return_attention=False):
        """
        Inputs:
            x - Input of shape [batch, seq_len, input_dim]
            mask - Optional mask forwarded unchanged to scaled_dot_product; it must
                   be broadcastable with the attention logits of shape
                   [batch, head, seq_len, seq_len]
            return_attention - If True, also return the attention weights
        Returns:
            Output of shape [batch, seq_len, embed_dim], and additionally the
            attention weights if return_attention is True.
        """
        # The last input dimension is input_dim, which may differ from embed_dim,
        # so it must not be used to shape the output below.
        batch_size, seq_len, _ = x.size()
        qkv = self.qkv_proj(x) # [batch, seq_len, 3*embed_dim]

        # separate Q, K, V from linear output
        qkv = qkv.reshape(batch_size, seq_len, self.num_heads, 3*self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3) # [batch, head, seq_len, 3*head_dim]
        q, k, v = qkv.chunk(3, dim=-1) # each [batch, head, seq_len, head_dim]

        # Determine value outputs
        values, attention = scaled_dot_product(q, k, v, mask=mask)
        values = values.permute(0, 2, 1, 3) # [batch, seq_len, head, head_dim]
        # Concatenate the heads: head * head_dim == embed_dim
        values = values.reshape(batch_size, seq_len, self.embed_dim)
        o = self.o_proj(values)

        if return_attention:
            return o, attention
        return o

Multi-head attention looks at the input as a set rather than a sequence (it is permutation-equivariant with respect to its inputs). If it is important for the task to consider order, then it is common to encode the position in the input features. Compared to RNNs, self-attention layers can parallelize all of their operations, making them much faster to compute for smaller sequence lengths. When the sequence length exceeds the hidden dimensionality, self-attention is more expensive to compute than an RNN.

### Transformer Encoder
Originally, the Transformer model was designed for machine translation. Hence, it got an encoder-decoder structure where the encoder takes as input the sentence in the original language and generates an attention-based representation. On the other hand, the decoder attends over the encoded information and generates the translated sentence in an autoregressive manner, as in a standard RNN. <br>

The encoder consists of $N$ identical blocks that are applied in sequence. The input $x$ is passed through a Multi-Head Attention layer. The output is then added to the original input using a residual connection and then a Layer Norm is applied. The residual connection is important for two reasons:
1. Transformers are designed to be very deep. Residual connections are crucial for enabling smooth gradient flow through the model.
2. W/o residual connection, info about the original sequence is lost. <br>

Layer Norm enables faster training and provides small regularization. It also ensures that features are in a similar magnitude among the elements in a sequence. Batch Norm is NOT used because it depends on the batch size, which is normally very small for Transformers. <br>

In addition to Multi-head Attention, a small fully connected feed-forward network is added to the model, which is applied to each position separately and identically. We can see this MLP as a post-processing step that prepares the features for the next attention block. Usually, the dimensionality of this MLP is 2 to 8 times larger than $d_{model}$. The general advantage of a wider layer instead of a narrow, multi-layer MLP is the faster, parallelizable execution.

In [None]:
class EncoderBlock(nn.Module):
    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
        """
        Inputs:
            input_dim - Dimensionality of the input
            num_heads - Number of heads to use in the attention block
            dim_feedforward - Dimensionality of the hidden layer in the MLP
            dropout - Dropout probability to use in the dropout layers
        """
        super().__init__()

        # Attentionlayer
        self.self_attn = MultiheadAttention(input_dim, input_dim, num_heads)

        # Two-Layer MLP
        self.linear_net = nn.Sequential(
            nn.Linear(input_dim, dim_feedforward),
            nn.Dropout(dropout),
            nn.ReLu(inplace=True),
            nn.Linear(dim_feedforward, input_dim)
        )
