In [1]:
import tensorflow as tf

# attention

attention 最早是应用于计算机视觉，随后在NLP领域中应用。随着
GPT和BERT的大火，attention也随之得到了关注。
attention的核心就是从关注全部到关注重点。

attention两个优点
1 参数少
2 速度快

attention 可以解决rnn不能并行计算的问题，每一步计算不依赖于上一步的计算结果。



早期attention引入NLP中，是和RNN相结合，应用于Machine Translation中。对于NMT任务，一般使用seq2seq模型，或者说是encoder-decoder模型。其流程是将输入语句通过encoder编码为一个向量，然后通过decoder进行解码，最后输出目标语句。

这种方法存在一些问题：RNN中存在长程梯度消失的问题，对于较长的句子，我们很难寄希望于将输入的序列转化为定长的向量而保存所有的有效信息，所以随着所需翻译句子的长度增加，这种结构的效果会显著下降。

attention 的结构如下

![title](img/attention_v1.png)

attention + RNN做NMT的流程如下：
1）利用RNN结构得到encoder的hidden state(h1, h2, ...hT)
2）假设当前decoder的hidden state是$s_{t-1}$，我们可以计算每一个输入位置$j$与当前输出位置的关联性，$e_{tj}=a(s_{t-1}, h_j)$，写成相应的向量形式即为
$\overrightarrow{e_t}=(a(s_{t-1}, h_1),...,a(s_{t-1}, h_T))$，其中$a$是一种相关性的算符，例如常见的有点乘形式$\overrightarrow{e_t}=\overrightarrow{s_{t-1}}^T\overrightarrow{h}$，加权点乘$\overrightarrow{e_t}=\overrightarrow{s_{t-1}}^TW\overrightarrow{h}$，加和$\overrightarrow{e_t}=\overrightarrow{v}^T\tanh(W_1\overrightarrow{h}+W_2\overrightarrow{s_{t-1}})$

对于$\overrightarrow{e_t}$进行softmax操作将其normalize得到attention的分布，$\overrightarrow{\alpha_t}=softmax(\overrightarrow{e_t})$，展开形式为$\alpha_{tj}=\frac{\exp(e_{tj})}{\sum_{k=1}^{T}{\exp(e_{tk})}}$

利用$\overrightarrow{\alpha_t}$我们可以进行加权求和得到相应的context vector $\overrightarrow{c_t}=\sum_{j=1}^{T}{\alpha_{tj}h_j}$

由此，我们可以计算decoder的下一个hidden state $s_t=f(s_{t-1}, y_{t-1}, c_t)$以及该位置的输出$p(y_t|y_1,...,y_{t-1},\overrightarrow{x})=g(y_{t-1}, s_t, c_t)$

这里的关键操作是计算encoder 和 decoder state 之间的关联的权重，得到attention 分布，从而对于当前输出位置得到比较重要的输入位置的权重，在预测输出时相应的会占较大的比重。

## attention 原理【1】
![title](img/attention_v2.png)

对于每一个query，第一步，计算query和每一个key的相似度，得到权值；第二步，将权值经过softmax归一化得到权重；第三步，将权重与对应的value进行加权平均。

attention 有很多不同的类型，

![title](img/attention_v3.png)

## 1 按照计算区域划分，

- soft-attention: 对所有key求相似度权重，每个key都有一个对应的权重
- hard-attention: 这种方式直接精准定位到某个key, 其余key就都不管了，相当于这个key的概率是1， 其余key的概率全部是0， 因此，这种对齐方式要求很高，要求一步到位，而且不可导，一般需要用强化学习的方法进行训练
- local-attention: 这种方式其实是soft-attention和hard-attention的综合，首先使用hard-attention的方式定位到一个小的区域，然后在这个小区域内用soft-attention.

按所用信息划分
- general-attention
- self-attention

按照使用模型划分
- CNN+attention
- LSTM+Attention

按照模型结构划分
- 单层attention
- 多层attention
- 多头attention

按照权值计算方式划分
- 点乘： $s(q,k)=q^{T}k$
- 矩阵相乘（加权点乘）： $s(q,k)=q^{T}Wk$
- 余弦相似度 $s(q,k)=\frac{q^{T}k}{\|q\|\cdot\|k\|}$
- 串联concatenate： $s(q,k)=W[q,k]$
- 多层感知机MLP: $s(q,k)=v_{\alpha}^{T}\tanh(Wq+Uk)$



[1] 参考资料
（1） Attention专题 https://zhuanlan.zhihu.com/p/104677204

In [2]:
# Read the whole tab-separated parallel corpus into memory as one string.
with open("cmn.txt", mode="r", encoding="utf-8") as corpus_file:
    data = corpus_file.read()

In [3]:
import re
import tensorflow as tf

In [4]:
# Preprocess the raw corpus into two parallel lists of token lists.
en = []  # English side: lower-cased word / punctuation tokens per sentence
cn = []  # Chinese side: one token per character
for line in data.split("\n")[:100]:  # only the first 100 lines are used
    fields = line.split("\t")
    if len(fields) < 2:
        # Blank or malformed line without a translation column.
        continue
    # Use only the first two columns so that lines carrying additional
    # tab-separated fields do not break the unpacking.
    ei, ci = fields[0], fields[1]
    ei = ei.lower()
    # Detach ".", "?" and "," from the preceding word so they become tokens.
    ei = re.sub(r"([.?,])", r" \1", ei)

    # str.split() collapses runs of whitespace and ignores leading/trailing
    # whitespace, so no empty-string tokens end up in the vocabulary.
    en.append(ei.split())
    cn.append(list(ci))
    

In [5]:
# Seed both vocabularies with their reserved tokens; id 0 is the padding id.
en_word2id = {
    "<start>": 1,
    "<pad>": 0,
}
cn_word2id = {
    "<end>": 1,
    "<start>": 2,
    "<pad>": 0,
}

In [6]:
# The encoder input is the tokenised English sentences as-is.
input_en = list(en)

# Assign the next free id to every token not yet in the English vocabulary.
for sentence in en:
    for token in sentence:
        en_word2id.setdefault(token, len(en_word2id))

In [7]:
# Decoder input is the sentence prefixed with <start>; the target is the same
# sentence suffixed with <end>.
input_cn = [["<start>"] + chars for chars in cn]
target_cn = [chars + ["<end>"] for chars in cn]

# Assign the next free id to every character not yet in the Chinese vocabulary.
for chars in cn:
    for ch in chars:
        cn_word2id.setdefault(ch, len(cn_word2id))

In [8]:
def _encode(sentences, vocab):
    """Map every token of every sentence to its integer id in ``vocab``."""
    return [[vocab[token] for token in sentence] for sentence in sentences]


# Turn the three token-level corpora into id sequences.
input_en_id = _encode(input_en, en_word2id)
input_cn_id = _encode(input_cn, cn_word2id)
target_cn_id = _encode(target_cn, cn_word2id)

In [9]:
def _pad_post(sequences):
    """Pad (or truncate) every sequence to length 64, padding at the end."""
    return tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=64, padding="post"
    )


# Bring all three corpora to the same fixed length.
input_en_id_pad = _pad_post(input_en_id)
input_cn_id_pad = _pad_post(input_cn_id)
target_cn_id_pad = _pad_post(target_cn_id)

In [10]:
# Each element is (encoder input, decoder input, decoder target); elements are
# shuffled with a buffer of 100 and grouped into batches of 100.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((input_en_id_pad, input_cn_id_pad, target_cn_id_pad))
    .shuffle(100)
    .batch(100)
)

In [11]:
# Model widths.
EMBEDING_SIZE = 10  # dimensionality of the token embeddings
LSTM_SIZE = 10      # number of units in each LSTM

# Vocabulary sizes, reserved tokens included.
EN_VOCAB_SIZE = len(en_word2id)
CN_VOCAB_SIZE = len(cn_word2id)

In [12]:
# Show the size of the Chinese vocabulary (reserved tokens included).
CN_VOCAB_SIZE

149

In [13]:
# Grab a single batch for smoke-testing the model pieces below. The dataset
# yields (encoder input, decoder input, decoder target) in that order.
first_batch = next(iter(dataset))
train_x, train_y, train_yd = first_batch

In [14]:
class Encoder(tf.keras.Model):
    """Embeds English token ids and encodes them with a single LSTM layer."""

    def __init__(self):
        super().__init__()
        self.word_embed = tf.keras.layers.Embedding(
            input_dim=EN_VOCAB_SIZE, output_dim=EMBEDING_SIZE
        )
        # return_sequences / return_state: the layer hands back its output at
        # every position as well as its two final state tensors.
        self.lstm = tf.keras.layers.LSTM(
            units=LSTM_SIZE, return_sequences=True, return_state=True
        )

    def call(self, input_x):
        """Encode a batch of token ids.

        Args:
            input_x: integer token ids to look up in the embedding table.

        Returns:
            A tuple ``(sequence_output, h_state, c_state)`` exactly as produced
            by the LSTM layer.
        """
        embedded = self.word_embed(input_x)
        sequence_output, h_state, c_state = self.lstm(embedded)
        return sequence_output, h_state, c_state
        

In [15]:

# Instantiate the encoder and run it once on the sample batch to obtain the
# per-position outputs and the two final LSTM states.
encoder = Encoder()
out, h_state, c_state = encoder(train_x)

In [16]:
class Attention(tf.keras.layers.Layer):
    """Parameter-free dot-product attention of a query over a value sequence."""

    def __init__(self):
        super(Attention, self).__init__()

    def call(self, query, value):
        """Compute the attention-weighted summary of ``value``.

        Args:
            query: rank-3 tensor that ``value`` can be matrix-multiplied with.
                This notebook passes the encoder state expanded on axis 2,
                i.e. presumably (batch, units, 1) — confirm for other callers.
            value: rank-3 tensor; presumably (batch, seq_len, units), the
                encoder's per-position outputs.

        Returns:
            ``softmax(value @ query)^T @ value`` — one context vector per
            query column.
        """
        # One similarity score per (sequence position, query column).
        score = tf.matmul(value, query)

        # Normalise across the sequence axis (axis 1). The default last axis
        # has size 1 for a single query, which would turn every weight into
        # 1.0 and degrade attention into an unweighted sum of the values.
        attention = tf.nn.softmax(score, axis=1)

        # Swap the last two axes so the weights multiply the value sequence.
        context = tf.matmul(tf.transpose(attention, perm=[0, 2, 1]), value)

        return context
        

In [17]:
# Smoke-test the attention layer on the encoder results of the sample batch.
attention = Attention()
# Append a trailing axis to each final state so it can act as the right-hand
# operand of the batched matmul inside Attention.call.
ah_state = tf.expand_dims(h_state, axis=2)
ch_state = tf.expand_dims(c_state, axis=2)

# The query is the element-wise sum of the two final encoder states.
state_v =  ah_state + ch_state
c = attention(state_v, out)

In [18]:
class Decoder(tf.keras.Model):
    """Single-step LSTM decoder that conditions on an attention context."""

    def __init__(self):
        super(Decoder, self).__init__()
        self.word_embed = tf.keras.layers.Embedding(CN_VOCAB_SIZE, EMBEDING_SIZE)
        self.lstm = tf.keras.layers.LSTM(LSTM_SIZE, return_state=True)
        self.attention = Attention()
        # No activation here: the layer has to emit raw logits because the
        # loss used for training is created with from_logits=True. Applying
        # softmax in this layer as well would normalise the scores twice.
        self.out = tf.keras.layers.Dense(CN_VOCAB_SIZE)

    def call(self, input_x, encoder_out, encoder_state, origin_state):
        """Decode one step.

        Args:
            input_x: token ids of the current decoder input; presumably
                (batch, 1) as passed by this notebook — confirm for other
                callers.
            encoder_out: encoder outputs, used as the attention values.
            encoder_state: tensor used as the attention query.
            origin_state: ``[h, c]`` list used as the LSTM initial state.

        Returns:
            ``(logits, h_state, c_state)``: unnormalised scores over the
            Chinese vocabulary and the LSTM states after this step.
        """
        x = self.word_embed(input_x)

        context = self.attention(encoder_state, encoder_out)

        # Feed the attention context alongside the token embedding.
        x = tf.concat([context, x], axis=-1)

        output, h_state, c_state = self.lstm(x, initial_state=origin_state)

        logits = self.out(output)

        return logits, h_state, c_state
        

In [19]:
# Check the shape of the decoder-input batch.
train_y.shape

TensorShape([100, 64])

In [20]:
# Instantiate the decoder and run a single step on the first token column of
# the sample batch, starting from the encoder's final states.
decoder = Decoder()
logits, dh, dc = decoder(train_y[:,0:1], out, state_v, [h_state, c_state])

In [21]:
# Check the shape of the single-step decoder output.
logits.shape

TensorShape([100, 149])

In [22]:
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the loss expects unnormalised scores, not probabilities.
loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [23]:
@tf.function
def train_step(input_x, input_y):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        input_x: batch of source token ids fed to the encoder.
        input_y: batch of decoder-side token ids. Column ``i - 1`` is fed to
            the decoder and column ``i`` is the target for that step. The
            second dimension must be statically known because it drives a
            Python ``range``.

    Returns:
        The loss summed over all decoding steps divided by the number of
        steps (each per-step loss is whatever ``loss_func`` returns).
    """
    seq_len = input_y.shape[1]
    num_steps = seq_len - 1

    with tf.GradientTape() as tape:
        out, h_state, c_state = encoder(input_x)

        # The decoder starts from the encoder's final LSTM states.
        d_state = [h_state, c_state]

        # Attention query: sum of the two final encoder states, with a
        # trailing axis added so it can be matrix-multiplied with `out`.
        state_v = tf.expand_dims(h_state, axis=2) + tf.expand_dims(c_state, axis=2)

        current_word = input_y[:, 0:1]
        # Start from a plain 0.0 instead of None so that no truthiness test
        # on a tensor is needed when accumulating.
        loss = 0.0
        for i in range(1, seq_len):
            y = input_y[:, i:i + 1]

            # Teacher forcing: feed the previous ground-truth token and ask
            # the decoder to predict the current one. Feeding `y` itself
            # would let the model simply copy its input.
            logits, dh, dc = decoder(current_word, out, state_v, d_state)
            d_state = [dh, dc]

            loss += loss_func(y, logits)
            current_word = y

    # Report the mean loss per decoding step rather than dividing by a
    # hard-coded constant.
    batch_loss = loss / num_steps

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss
            
            
            
            

In [24]:
%%time
# Time a single training step on the sample batch.
loss = train_step(train_x, train_y)

KeyboardInterrupt: 

In [None]:
# Number of full passes over the dataset.
epoch = 100
for i in range(epoch):

    # Each batch is (encoder input, decoder input, decoder target); the
    # target column is not consumed by train_step, hence the underscore.
    for j, (tx, ty, _) in enumerate(dataset):
        loss = train_step(tx, ty)

        # Log every 100th batch of the epoch.
        if j % 100 == 0:
            print("epoch {0} batch {1} loss {2}".format(i, j, loss))

In [64]:
# Scratch: softmax over the last axis of `c`.
# NOTE(review): the recorded output below has shape (2, 1, 2), so `c` here
# presumably came from an earlier experiment, not from the cells above — confirm.
v = tf.keras.activations.softmax(c)

<tf.Tensor: id=2139, shape=(2, 1, 2), dtype=float32, numpy=
array([[[0.49975556, 0.50024444]],

       [[0.49998578, 0.50001425]]], dtype=float32)>

In [67]:
# Scratch: display the tf.shape function object itself (it is not called).
tf.shape

<function tensorflow.python.ops.array_ops.shape_v2(input, out_type=tf.int32, name=None)>

In [68]:
# Scratch: static shape of the softmax result.
v.shape

TensorShape([2, 1, 2])

In [71]:
# Scratch: shape of the batched product of `v` and `b`.
# NOTE(review): `b` is not defined anywhere above this cell in file order;
# it presumably survived from a deleted cell — confirm before re-running.
tf.matmul(v, b).shape

TensorShape([2, 1, 10])

In [129]:
import numpy as np

In [130]:
# Two small 2x2 integer matrices for comparing `*` with matrix multiplication.
a = np.array([
    (1, 2),
    (3, 4),
])
b = np.array([
    (4, 5),
    (6, 7),
])

In [131]:
# `*` multiplies the arrays element by element.
a * b

array([[ 4, 10],
       [18, 28]])

In [132]:
# np.matmul computes the matrix product instead.
np.matmul(a, b)

array([[16, 19],
       [36, 43]])

In [140]:
# Random 4-D operands whose trailing two axes (3x4 and 4x6) are
# matmul-compatible and whose leading axes match.
a = np.random.random_sample((1, 2, 3, 4))
b = np.random.random_sample((1, 2, 4, 6))

In [141]:
# With 4-D inputs np.matmul multiplies the trailing two axes and treats the
# leading axes as batch dimensions (the output below is 1x2x3x6).
np.matmul(a, b)

array([[[[0.43513685, 0.40704364, 0.77139082, 0.53660503, 0.35848674,
          0.46423774],
         [1.45423849, 1.0364473 , 1.42037525, 1.54664568, 0.89226894,
          1.16445379],
         [0.35798607, 0.37586426, 0.845875  , 0.70075828, 0.27867491,
          0.55235812]],

        [[0.51958773, 0.7358483 , 0.79695818, 0.44954007, 0.6752351 ,
          0.94518021],
         [0.40665771, 0.59176367, 0.75550295, 0.41395344, 0.49090909,
          0.68102663],
         [1.01395338, 0.84519891, 0.84555878, 0.53843571, 1.24088543,
          1.46941139]]]])