In [2]:
# 1) 常用api测试

In [3]:
import tensorflow as tf 

class TransformerV1:
    
    def forward(self, inputs, seq_len, is_need_mask):
        inputs_dim = inputs.get_shape().as_list()[-1]

        inputs_3d = tf.cond(is_need_mask,
            lambda: tf.reshape(inputs, [-1, seq_len, inputs_dim]), #(B,D) reshape to (B/seq_len, seq_len, D) 
            lambda: tf.reshape(inputs, [1, -1, inputs_dim]))
        inputs_att = self.multihead_attention(inputs_3d, num_heads=1) # (B/seq_len, seq_len, D) in, (B/seq_len, seq_len, D) out
        inputs_ff = self.feed_forward(inputs_att, [inputs_dim, inputs_dim]) 
        ff_out = tf.reshape(inputs_ff, [-1, inputs_dim]) # (B/seq_len, seq_len, D) reshape to (B,D)

        return ff_out

    def multihead_attention(self, keys, num_heads=1):
        num_units = keys.get_shape().as_list()[-1]

        # Linear Projections 线性投影.
        Q = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # [Bs, Kl, C]
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # [Bs, Kl, C]
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # [Bs, Kl, C]

        # Split and concat 分割成 head = num_heads 块，再拼起来.
        Q_ = tf.concat(tf.split(Q, num_heads, axis=-1), axis=0)  # [num_heads*Bs, Kl, C/num_heads]
        K_ = tf.concat(tf.split(K, num_heads, axis=-1), axis=0)  # [num_heads*Bs, Kl, C/num_heads]
        V_ = tf.concat(tf.split(V, num_heads, axis=-1), axis=0)  # [num_heads*Bs, Kl, C/num_heads]

        # Attention 根据公式，做 Attention 计算 weight matrix.
        weights = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # [num_heads*Bs, Kl, Kl]

        # Norm操作 weights = weights / sqrt(C/num_heads).
        weights = weights / (K_.get_shape().as_list()[-1]
        ** 0.5) # [num_heads*Bs, Kl, Kl]

        # Activation: weights is a weight matrix.
        weights = tf.nn.softmax(weights)  # [num_heads*Bs, Kl, Kl]

        # weighted sum
        outputs = tf.matmul(weights, V_) # [num_heads*Bs, Kl, C/num_heads]
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # [Bs, Kl, C]

        # residual connection
        outputs += keys

        # layer normaliztion
        outputs = tf.contrib.layers.layer_norm(outputs)

        return outputs

    def feed_forward(self, inputs, num_units):
        outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)
        outputs = tf.layers.dense(outputs, num_units[1])
        outputs += inputs
        outputs = tf.contrib.layers.layer_norm(outputs)

In [4]:
mytransformer = TransformerV1()

inputs = tf.random.normal(shape=(6, 20)) # （B,D)
seq_len = 3
is_need_mask = tf.placeholder(tf.bool, name="is_need_mask")
ops_v1 = mytransformer.forward(inputs, seq_len, is_need_mask)

with tf.Session() as sess:
    ret_v1 = sess.run(inputs)
    # ret_v1 = sess.run(ops_v1)

    # inputs_v1, ret_v1 = sess.run([inputs, ops_v1, ])
    # print("TransformerV1 inputs_v1: ", inputs_v1)
    print("TransformerV1 ret_v1: ", ret_v1)

ValueError: Tried to convert 'tensor' to a tensor and failed. Error: None values not supported.