# Attention Is All You Need: Vanilla Transformer

In [4]:
import copy
import math

import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

import matplotlib.pyplot as plt
# Report the runtime environment and choose the compute device:
# the first CUDA GPU when one is usable, otherwise the CPU.
print("PyTorch Version: ", torch.__version__)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print('Device:', device)
num_gpu = torch.cuda.device_count()
print('Number of GPUs Available:', num_gpu)

PyTorch Version:  1.1.0
Device: cuda:0
Number of GPUs Available: 8


In [6]:
# Toy dimensions used by the demo tensors built later in this notebook.
batch_size, sequence_length = 64, 10   # examples per batch, positions per sequence
hidden_size, attention_heads = 32, 8   # model width, number of attention heads

In [10]:
# Replicate a module N times
def clones(module, N):
    """Return an ``nn.ModuleList`` of ``N`` independent deep copies of ``module``.

    Each copy is produced with ``copy.deepcopy``, so the copies start out with
    the same parameter values as ``module`` but do not share storage with it
    or with each other.
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

# Implement attention
def attention(query, key, value, dropout=None, mask=None):
    """Compute scaled dot-product attention.

    The scores are ``query @ key^T / sqrt(d_k)`` where ``d_k`` is the size of
    the last dimension of ``query``; a softmax over the last (key) axis turns
    them into weights that are then used to average ``value``.

    Args:
        query: tensor whose last two dims are (n_queries, d_k).
        key: tensor whose last two dims are (n_keys, d_k).
        value: tensor whose last two dims are (n_keys, d_v).
        dropout: optional callable (e.g. ``nn.Dropout``) applied to the
            attention weights before they are multiplied with ``value``.
        mask: optional tensor broadcastable to the score shape
            (..., n_queries, n_keys). Positions where ``mask == 0`` receive
            (numerically) zero attention weight. ``None`` disables masking.

    Returns:
        Tuple ``(attention_result, p_attn)``: the weighted values and the
        attention weights (after dropout, when dropout is given).
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Use the dtype's most negative finite value rather than a literal such
        # as -1e9 (overflows in float16) or -inf (a fully masked row would
        # become NaN after softmax).
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    attention_result = torch.matmul(p_attn, value)
    return attention_result, p_attn


In [11]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built on the module-level ``attention`` function.

    ``hidden_size`` is split into ``h`` heads of width ``d_k = hidden_size // h``.
    Query, key and value are each passed through their own projection, split
    into heads, attended per head, re-joined, and passed through a final
    output projection.

    Args:
        h: number of attention heads; must divide ``hidden_size``.
        hidden_size: size of the last dimension of the inputs and the output.
        linears: when True, use four learned ``nn.Linear(hidden_size,
            hidden_size)`` layers (query, key, value, output). When False, the
            four projections are parameter-free identities.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, h, hidden_size, linears=True, dropout=0.1):
        super().__init__()
        assert hidden_size % h == 0, "hidden_size must be divisible by h"
        # We assume d_v always equals d_k
        self.d_k = hidden_size // h
        self.h = h
        if linears:
            self.linears = clones(nn.Linear(hidden_size, hidden_size), 4)
        else:
            # Identity modules instead of lambdas: the module stays picklable
            # (torch.save / multiprocessing) and self.linears has the same
            # container type on both branches.
            self.linears = nn.ModuleList([nn.Identity() for _ in range(4)])
        # Attention weights returned by the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value):
        """Apply multi-head attention.

        Args:
            query: tensor of shape (batch, n_queries, hidden_size).
            key: tensor of shape (batch, n_keys, hidden_size).
            value: tensor of shape (batch, n_keys, hidden_size).

        Returns:
            Tensor of shape (batch, n_queries, hidden_size). The attention
            weights of this call are kept in ``self.attn``.
        """
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from hidden_size => h x d_k
        #    zip stops after three pairs, so only the first three projections
        #    are used here; the fourth is the output projection below.
        query, key, value = [
            proj(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for proj, x in zip(self.linears, (query, key, value))
        ]

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query, key, value, self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        #    contiguous() is required because view cannot operate on the
        #    transposed (non-contiguous) tensor.
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)

        x = self.linears[-1](x)
        return x

In [13]:
# Shape check: attend from a single query position over `sequence_length` values.
mha = MultiHeadedAttention(h=attention_heads, hidden_size=hidden_size)
# NOTE(review): the banner below says "as many attention queries as there are
# values", but `query` has one position per batch element while `value` has
# `sequence_length`. The text is left unchanged so the recorded output matches.
print("With as many attention queries as there are values:\n")
# torch.ones builds the float tensors directly, without a NumPy float64 detour.
query = torch.ones(batch_size, 1, hidden_size)
value = torch.ones(batch_size, sequence_length, hidden_size)
# Call the module itself rather than .forward() so registered hooks run.
result = mha(query, value, value)
print("query:", query.size())
print("value:", value.size())
print("result:", result.size())
print("\n")


With as many attention queries as there are values:

query: torch.Size([64, 1, 32])
value: torch.Size([64, 10, 32])
result: torch.Size([64, 1, 32])


