linear_attention.py
70 lines (52 loc) · 2.41 KB
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
# Apoorv Vyas <avyas@idiap.ch>
#

"""Implement unmasked linear attention."""

import torch
from torch.nn import Module


def elu_feature_map(x):
    return torch.nn.functional.elu(x) + 1


class LinearAttention(Module):
    """Implement unmasked attention using dot product of feature maps in
    O(N D^2) complexity.

    Given the queries, keys and values as Q, K, V, instead of computing

        V' = softmax(Q.mm(K.t()), dim=-1).mm(V),

    we make use of a feature map function Φ(.) and perform the following
    computation

        V' = normalize(Φ(Q).mm(Φ(K).t())).mm(V).

    The above can be computed in O(N D^2) complexity where D is the
    dimensionality of Q, K and V and N is the sequence length. Depending on
    the feature map, however, the complexity of the attention might be
    limited.

    Arguments
    ---------
        feature_map: callable, a callable that applies the feature map to the
                     last dimension of a tensor (default: elu(x)+1)
        eps: float, a small number to ensure the numerical stability of the
             denominator (default: 1e-6)
    """
    def __init__(self, feature_map=None, eps=1e-6):
        super(LinearAttention, self).__init__()
        self.feature_map = feature_map or elu_feature_map
        self.eps = eps

    def forward(self, queries, keys, values, attn_mask, query_lengths,
                key_lengths):
        # Apply the feature map to the queries and keys
        Q = self.feature_map(queries)
        K = self.feature_map(keys)

        # Apply the key padding mask and make sure that the attn_mask is
        # all_ones
        if not attn_mask.all_ones:
            raise RuntimeError(("LinearAttention does not support arbitrary "
                                "attention masks"))
        K = K * key_lengths.float_matrix[:, :, None, None]

        # Compute the KV matrix, namely the dot product of keys and values so
        # that we never explicitly compute the attention matrix and thus
        # decrease the complexity.
        # Einsum indices: n=batch, l=query length, s=key length, h=heads,
        # d=query/key feature dimension, m=value dimension.
        KV = torch.einsum("nshd,nshm->nhmd", K, values)

        # Compute the normalizer
        Z = 1/(torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1))+self.eps)

        # Finally compute and return the new values
        V = torch.einsum("nlhd,nhmd,nlh->nlhm", Q, KV, Z)

        return V.contiguous()
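
Below is a minimal usage sketch, not part of the module above. The `_AllOnesMask` and `_FullLengths` classes are hypothetical stand-ins that only expose the `all_ones` and `float_matrix` attributes read by `forward` (the library provides its own mask helpers for this role), and the final check simply re-derives the docstring formula V' = normalize(Φ(Q).mm(Φ(K).t())).mm(V) per batch and head to show that the einsum path matches it.

import torch

# Hypothetical stand-ins for the mask arguments of forward(): attn_mask only
# needs an `all_ones` flag, and key_lengths only a `float_matrix` of shape
# (batch, key_length) marking valid key positions with 1.
class _AllOnesMask:
    all_ones = True

class _FullLengths:
    def __init__(self, n, s):
        self.float_matrix = torch.ones(n, s)

if __name__ == "__main__":
    n, l, s, h, d, m = 2, 5, 7, 4, 8, 8  # batch, query/key lengths, heads, dims
    queries = torch.randn(n, l, h, d)
    keys = torch.randn(n, s, h, d)
    values = torch.randn(n, s, h, m)

    attn = LinearAttention()
    out = attn(queries, keys, values,
               attn_mask=_AllOnesMask(),
               query_lengths=None,  # not used by this forward pass
               key_lengths=_FullLengths(n, s))
    print(out.shape)  # torch.Size([2, 5, 4, 8])

    # Re-derive the docstring formula explicitly for comparison:
    # A = Φ(Q) Φ(K)^T, row-normalized, then applied to V per batch and head.
    Q, K = elu_feature_map(queries), elu_feature_map(keys)
    A = torch.einsum("nlhd,nshd->nhls", Q, K)
    A = A / (A.sum(dim=-1, keepdim=True) + 1e-6)
    ref = torch.einsum("nhls,nshm->nlhm", A, values)
    print(torch.allclose(out, ref, atol=1e-5))  # should print True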