- [google-research/albert: ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://github.com/google-research/albert)

For blog:

[ALBERT 论文+代码笔记 | Yam](https://yam.gift/2020/05/10/Paper/2020-05-10-ALBERT/)

In [1]:
class AlbertConfig:
    def __init__(
        self,
        vocab_size,
        embedding_size=128,
        hidden_size=4096,
        num_hidden_layers=12,
        num_hidden_groups=1,
        num_attention_heads=64,
        intermediate_size=16384,
        inner_group_num=1,
        down_scale_factor=1,
        hidden_act="gelu",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02):
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_hidden_groups = num_hidden_groups
        self.num_attention_heads = num_attention_heads
        self.inner_group_num = inner_group_num
        self.down_scale_factor = down_scale_factor
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range

In [2]:
config = AlbertConfig(vocab_size=32000, 
                      hidden_size=512, 
                      num_hidden_layers=8, 
                      num_attention_heads=6, 
                      intermediate_size=1024)

In [3]:
import collections
import copy
import json
import math
import re
import numpy as np
import six
from six.moves import range
import tensorflow.compat.v1 as tf

In [4]:
class AlbertModel:
    def __init__(
        self,
        config,
        is_training,
        input_ids,
        input_mask=None,
        token_type_ids=None,
        use_one_hot_embeddings=False,
        use_einsum=True,
        scope=None):
        
        config = copy.deepcopy(config)
        input_shape = get_shape_list(input_ids, expected_rank=2)
        batch_size, seq_length = input_shape
        
        with tf.variable_scope(scope, default_name="bert"):
            with tf.variable_scope("embeddings"):
                self.word_embedding_output, self.output_embedding_table = embedding_lookup(
                    input_ids=input_ids, 
                    vocab_size=config.vocab_size, 
                    embedding_size=config.embedding_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name="word_embeddings",
                    use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, 
                # then layer normalize and perform dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.word_embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob,
                    use_one_hot_embeddings=use_one_hot_embeddings)
            
            with tf.variable_scope("encoder"):
                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=input_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_hidden_groups=config.num_hidden_groups,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    inner_group_num=config.inner_group_num,
                    intermediate_act_fn=get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True,
                    use_einsum=use_einsum)
        
            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=create_initializer(config.initializer_range))

## Embedding

In [138]:
# (batch_size, seq_length)
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
token_type_ids = tf.constant([[0, 0, 1], [0, 0, 1]])

### Embedding Lookup

In [98]:
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     word_embedding_name="word_embeddings"):
    # This function assumes that the input is of shape [batch_size, seq_length, num_inputs].
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    if input_ids.shape.ndims == 2:
        # [batch_size, seq_length, 1]
        input_ids = tf.expand_dims(input_ids, axis=[-1])
    
    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(0.02))
    
    output = tf.nn.embedding_lookup(embedding_table, input_ids)
    print(output.shape)
    input_shape = get_shape_list(input_ids)
    output = tf.reshape(output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
    return (output, embedding_table)

In [99]:
input_ids.shape.ndims

2

In [100]:
tf.expand_dims(input_ids, axis=[-1]).shape

TensorShape([2, 3, 1])

In [101]:
input_shape = get_shape_list(input_ids)

In [102]:
word_embedding_output, output_embedding_table = embedding_lookup(input_ids, 100)

(2, 3, 1, 128)


In [103]:
output_embedding_table.shape

TensorShape([100, 128])

In [104]:
word_embedding_output.shape

TensorShape([2, 3, 128])

### Embedding Postprocessor

In [105]:
def embedding_postprocessor(input_tensor,
                            token_type_ids,
                            token_type_vocab_size=2,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            token_type_embedding_name="token_type_embeddings",
                            position_embedding_name="position_embeddings"):
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size, seq_length, width = input_shape
    
    output = input_tensor
    token_type_table = tf.get_variable(
        token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(0.02)
    )
    # 
    token_type_embeddings = tf.nn.embedding_lookup(token_type_table, token_type_ids)
    print(token_type_embeddings.shape)
    output += token_type_embeddings
    
    full_position_embeddings = tf.get_variable(
        name=position_embedding_name,
        shape=[max_position_embeddings, width],
        initializer=create_initializer(0.02))
    
    position_embeddings = tf.slice(full_position_embeddings, [0, 0], [seq_length, -1])
    num_dims = len(output.shape.as_list())

    position_broadcast_shape = []
    for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
    position_broadcast_shape.extend([seq_length, width])
    position_embeddings = tf.reshape(position_embeddings, position_broadcast_shape)
    print(position_embeddings.shape)
    output += position_embeddings
    
    output = layer_norm_and_dropout(output, dropout_prob)
    return output

In [106]:
full_position_embeddings = tf.get_variable(
        name="position_embedding_name",
        shape=[512, 128],
        initializer=create_initializer(0.02))

In [107]:
full_position_embeddings.shape

TensorShape([512, 128])

In [108]:
position_broadcast_shape

[1, 3, 128]

In [109]:
position_embeddings = tf.slice(full_position_embeddings, [0, 0], [3, -1])

In [110]:
position_embeddings.shape

TensorShape([3, 128])

In [111]:
token_type_ids = tf.zeros(shape=[2, 3], dtype=tf.int32)

In [112]:
embedding_output = embedding_postprocessor(word_embedding_output, token_type_ids)

(2, 3, 128)
(1, 3, 128)


In [113]:
embedding_output.shape

TensorShape([2, 3, 128])

### Transformer

In [63]:
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_hidden_groups=1,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      inner_group_num=1,
                      intermediate_act_fn="gelu",
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      use_einsum=True):
    attention_head_size = hidden_size // num_attention_heads
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    input_width = input_shape[2]
    
    all_layer_outputs = []
    
    if input_width != hidden_size:
        # 本文的情况（第一个调整点）
        prev_output = dense_layer_2d(
            input_tensor, hidden_size, create_initializer(initializer_range),
            None, use_einsum=use_einsum, name="embedding_hidden_mapping_in")
    else:
        # 正常情况（如 Bert）
        prev_output = input_tensor
    
    with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE):
        for layer_idx in range(num_hidden_layers):
            group_idx = int(layer_idx / num_hidden_layers * num_hidden_groups)
            with tf.variable_scope("group_%d" % group_idx):
                with tf.name_scope("layer_%d" % layer_idx):
                    layer_output = prev_output
                    for inner_group_idx in range(inner_group_num):
                        with tf.variable_scope("inner_group_%d" % inner_group_idx):
                            layer_output = attention_ffn_block(
                                layer_input=layer_output,
                                hidden_size=hidden_size,
                                attention_mask=attention_mask,
                                num_attention_heads=num_attention_heads,
                                attention_head_size=attention_head_size,
                                attention_probs_dropout_prob=attention_probs_dropout_prob,
                                intermediate_size=intermediate_size,
                                intermediate_act_fn=intermediate_act_fn,
                                initializer_range=initializer_range,
                                hidden_dropout_prob=hidden_dropout_prob,
                                use_einsum=use_einsum)
                            prev_output = layer_output
                            all_layer_outputs.append(layer_output)
    if do_return_all_layers:
        return all_layer_outputs
    else:
        return all_layer_outputs[-1]

In [131]:
embedding_output.shape

TensorShape([2, 3, 128])

In [136]:
prev_output = dense_layer_2d(
            embedding_output, 768, create_initializer(0.02),
            None, use_einsum=True, name="embedding_hidden_mapping_in")
prev_output.shape

TensorShape([2, 3, 768])

#### attention ffn block

In [133]:
def attention_ffn_block(layer_input,
                        attention_mask,
                        hidden_size=768,
                        num_attention_heads=12,
                        attention_head_size=64,
                        attention_probs_dropout_prob=0.1,
                        intermediate_size=3072,
                        intermediate_act_fn="gleu",
                        initializer_range=0.02,
                        hidden_dropout_prob=0.1,
                        use_einsum=True):
    with tf.variable_scope("attention_1"):
        with tf.variable_scope("self"):
            attention_output = attention_layer(
                from_tensor=layer_input,
                to_tensor=layer_input,
                attention_mask=attention_mask,
                num_attention_heads=num_attention_heads,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                use_einsum=use_einsum
            )
        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
            attention_output = dense_layer_3d_proj(
                attention_output,
                hidden_size,
                attention_head_size,
                create_initializer(initializer_range),
                None,
                use_einsum=use_einsum,
                name="dense"
            )
            attention_output = dropout(attention_output, hidden_dropout_prob)
    attention_output = layer_norm(attention_output + layer_input)
    
    with tf.variable_scope("ffn_1"):
        with tf.variable_scope("intermediate"):
            intermediate_output = dense_layer_2d(
                attention_output,
                intermediate_size,
                create_initializer(initializer_range),
                intermediate_act_fn,
                use_einsum=use_einsum,
                num_attention_heads=num_attention_heads,
                name="dense")
        with tf.variable_scope("output"):
            ffn_output = dense_layer_2d(
                intermediate_output,
                hidden_size,
                create_initializer(initializer_range),
                None,
                use_einsum=use_einsum,
                num_attention_heads=num_attention_heads,
                name="dense")
        ffn_output = dropout(ffn_output, hidden_dropout_prob)
    ffn_output = layer_norm(ffn_output + attention_output)
    return ffn_output

#### attention layer

In [140]:
get_shape_list(prev_output, expected_rank=[2, 3])

[2, 3, 768]

In [166]:
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask,
                    num_attention_heads=12,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.1,
                    initializer_range=0.02,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    use_einsum=True):
    
    # (batch_size, seq_length, hidden_size)
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
    # 768/12 = 64
    size_per_head = int(from_shape[2]/num_attention_heads)
    
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]


    # `query_layer` = [B, F, N, H]
    q = dense_layer_3d(from_tensor, num_attention_heads, size_per_head, 
                       create_initializer(initializer_range), query_act, use_einsum, "query")

    # `key_layer` = [B, T, N, H]
    k = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                       create_initializer(initializer_range), key_act, use_einsum, "key")
    # `value_layer` = [B, T, N, H]
    v = dense_layer_3d(to_tensor, num_attention_heads, size_per_head, 
                       create_initializer(initializer_range), value_act, use_einsum, "value")
    
    q = tf.transpose(q, [0, 2, 1, 3])
    k = tf.transpose(k, [0, 2, 1, 3])
    v = tf.transpose(v, [0, 2, 1, 3])
    
    if attention_mask is not None:
        attention_mask = tf.reshape(attention_mask, [batch_size, 1, to_seq_length, 1])
    
    # 'new_embeddings = [B, N, F, H]'
    new_embeddings = dot_product_attention(q, k, v, attention_mask, attention_probs_dropout_prob)
    return tf.transpose(new_embeddings, [0, 2, 1, 3])

In [146]:
q = dense_layer_3d(prev_output, 12, 64, create_initializer(0.02), None, True, "query")
q.shape

TensorShape([2, 3, 12, 64])

#### dot production attention

In [149]:
from_shape=get_shape_list(q)
from_shape

[2, 3, 12, 64]

In [155]:
def dot_product_attention(q, k, v, mask, dropout_rate=0.1):
    # (seq_length, num_heads, q_length, kv_length)
    logits = tf.matmul(q, k, transpose_b=True)  # [..., length_q, length_kv]
    logits = tf.multiply(logits, 1.0 / math.sqrt(float(get_shape_list(q)[-1])))
    if mask is not None:
        # `attention_mask` = [B, T]
        from_shape = get_shape_list(q)
        broadcast_ones = tf.ones([from_shape[0], 1, from_shape[2], 1], tf.float32)
        mask = tf.matmul(broadcast_ones,
                         tf.cast(mask, tf.float32), transpose_b=True)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - mask) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        logits += adder
    else:
        adder = 0.0

    attention_probs = tf.nn.softmax(logits, name="attention_probs")
    attention_probs = dropout(attention_probs, dropout_rate)
    return tf.matmul(attention_probs, v)

In [156]:
q = tf.transpose(q, [0, 2, 1, 3])
q.shape

TensorShape([2, 12, 3, 64])

In [157]:
broadcast_ones = tf.ones([from_shape[0], 1, from_shape[2], 1], tf.float32)
broadcast_ones.shape

TensorShape([2, 1, 12, 1])

In [159]:
attention_mask = tf.reshape(input_mask, [2, 1, 3, 1])
attention_mask.shape

TensorShape([2, 1, 3, 1])

In [160]:
bias = tf.matmul(broadcast_ones, tf.cast(attention_mask, tf.float32), transpose_b=True)
bias.shape

TensorShape([2, 1, 12, 3])

In [164]:
input_mask

<tf.Tensor: shape=(2, 3), dtype=int32, numpy=
array([[1, 1, 1],
       [1, 1, 0]], dtype=int32)>

In [163]:
bias

<tf.Tensor: shape=(2, 1, 12, 3), dtype=float32, numpy=
array([[[[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]]],


       [[[1., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.],
         [1., 1., 0.]]]], dtype=float32)>

In [162]:
adder = (1.0 - bias) * -10000.0
adder

<tf.Tensor: shape=(2, 1, 12, 3), dtype=float32, numpy=
array([[[[    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.],
         [    -0.,     -0.,     -0.]]],


       [[[    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.],
         [    -0.,     -0., -10000.]]]], dtype=float32)>

In [161]:
logits = tf.matmul(q, q, transpose_b=True)  # [..., length_q, length_kv]
logits = tf.multiply(logits, 1.0 / math.sqrt(float(get_shape_list(q)[-1])))
logits.shape

TensorShape([2, 12, 3, 3])

In [174]:
# 最后一层
all_layer_outputs = transformer_model(embedding_output, input_mask)

In [175]:
all_layer_outputs.shape

TensorShape([2, 3, 768])

### Pooling

In [176]:
first_token_tensor = tf.squeeze(all_layer_outputs[:, 0:1, :], axis=1)
first_token_tensor.shape

TensorShape([2, 768])

In [181]:
pool = tf.keras.layers.Dense(768, activation=tf.tanh, kernel_initializer=create_initializer(0.02))
pooled_output = pool(first_token_tensor)
pooled_output.shape

TensorShape([2, 768])

## SOP Loss

In [232]:
output_weights = tf.get_variable(
    "output_weights", 
    shape=[2, 768],
    initializer=create_initializer(0.02))
output_bias = tf.get_variable(
    "output_bias", shape=[2], initializer=tf.zeros_initializer())

In [233]:
output_weights

<tf.Variable 'output_weights:0' shape=(2, 768) dtype=float32, numpy=
array([[ 0.00258045,  0.00957985, -0.00063457, ..., -0.00678589,
         0.01690069,  0.02294636],
       [ 0.00786927,  0.0039197 , -0.00218187, ..., -0.026448  ,
        -0.03057633, -0.01545808]], dtype=float32)>

In [234]:
output_bias

<tf.Variable 'output_bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>

In [235]:
logits = tf.matmul(pooled_output, output_weights, transpose_b=True)
logits.shape

TensorShape([2, 2])

In [236]:
logits = tf.nn.bias_add(logits, output_bias)
logits.shape

TensorShape([2, 2])

In [237]:
logits

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[ 0.28498146, -0.3474557 ],
       [ 0.38374817, -0.06567392]], dtype=float32)>

In [238]:
log_probs = tf.nn.log_softmax(logits, axis=-1)
log_probs.shape

TensorShape([2, 2])

In [239]:
log_probs

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-0.426114  , -1.0585512 ],
       [-0.49347395, -0.942896  ]], dtype=float32)>

In [240]:
tf.nn.softmax(logits, axis=-1)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.6530419 , 0.34695813],
       [0.6105018 , 0.38949817]], dtype=float32)>

In [241]:
labels = tf.constant([[0], [1]])
labels.shape

TensorShape([2, 1])

In [242]:
labels = tf.reshape(labels, [-1])
labels.shape

TensorShape([2])

In [243]:
one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
one_hot_labels.shape

TensorShape([2, 2])

In [244]:
one_hot_labels

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[1., 0.],
       [0., 1.]], dtype=float32)>

In [245]:
per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
loss = tf.reduce_mean(per_example_loss)

In [246]:
per_example_loss.shape

TensorShape([2])

In [247]:
per_example_loss

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.426114, 0.942896], dtype=float32)>

In [248]:
loss

<tf.Tensor: shape=(), dtype=float32, numpy=0.684505>

## Helpers

In [11]:
def layer_norm(input_tensor, name=None):
    return tf.keras.layers.LayerNormalization(name=name,axis=-1,epsilon=1e-12,dtype=tf.float32)(input_tensor)

In [29]:
def dropout(input_tensor, dropout_prob):
    if dropout_prob is None or dropout_prob == 0.0:
        return input_tensor
    output = tf.nn.dropout(input_tensor, rate=dropout_prob)
    return output

In [25]:
def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
    output_tensor = layer_norm(input_tensor, name)
    output_tensor = dropout(output_tensor, dropout_prob)
    return output_tensor

In [12]:
def get_shape_list(tensor, expected_rank=None, name=None):
    return tensor.shape.as_list()

In [13]:
def create_initializer(initializer_range=0.02):
    return tf.truncated_normal_initializer(stddev=initializer_range)

In [143]:
def dense_layer_3d_proj(input_tensor,
                        hidden_size,
                        head_size,
                        initializer,
                        activation,
                        use_einsum,
                        name=None):
    input_shape = get_shape_list(input_tensor)
    num_attention_heads = input_shape[2]
    with tf.variable_scope(name):
        w = tf.get_variable(
            name="kernel", shape=[num_attention_heads * head_size, hidden_size], initializer=initializer)
    w = tf.reshape(w, [num_attention_heads, head_size, hidden_size])
    b = tf.get_variable(
        name="bias", shape=[hidden_size], initializer=tf.zeros_initializer)
    if use_einsum:
        ret = tf.einsum("BFND,NDH->BFH", input_tensor, w)
    else:
        ret = einsum_via_matmul(input_tensor, w, 2)
    ret += b
    if activation is not None:
        return activation(ret)
    else:
        return ret

In [169]:
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   head_size,
                   initializer,
                   activation,
                   use_einsum,
                   name=None):
    input_shape = get_shape_list(input_tensor)
    hidden_size = input_shape[2]
    
    with tf.variable_scope(name):
        w = tf.get_variable(
            name="kernel", shape=[hidden_size, num_attention_heads * head_size], initializer=initializer)
    w = tf.reshape(
        w, [hidden_size, num_attention_heads, head_size])
    b = tf.get_variable(
        name="bias", shape=[num_attention_heads * head_size], initializer=tf.zeros_initializer)
    b = tf.reshape(b, [num_attention_heads, head_size])
    if use_einsum:
        ret = tf.einsum("BFH,HND->BFND", input_tensor, w)
    else:
        ret = einsum_via_matmul(input_tensor, w, 1)
    ret += b
    if activation is not None:
        return get_activation(activation)(ret)
    else:
        return ret

In [170]:
def dense_layer_2d(input_tensor,
                   output_size,
                   initializer,
                   activation,
                   use_einsum,
                   num_attention_heads=1,
                   name=None):
    input_shape = get_shape_list(input_tensor)
    hidden_size = input_shape[2]
    with tf.variable_scope(name):
        w = tf.get_variable(name="kernel", shape=[hidden_size, output_size], initializer=initializer)
        b = tf.get_variable(name="bias", shape=[output_size], initializer=tf.zeros_initializer)
    if use_einsum:
        ret = tf.einsum("BFH,HO->BFO", input_tensor, w)
    else:
        ret = tf.matmul(input_tensor, w)
    ret += b
    if activation is not None:
        return get_activation(activation)(ret)
    else:
        return ret

In [173]:
def gelu(x):
    """Gaussian Error Linear Unit.

    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
    x: float Tensor to perform activation.

    Returns:
    `x` with the GELU activation applied.
    """
    cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf


def get_activation(activation_string):
    if not isinstance(activation_string, six.string_types):
        return activation_string

    if not activation_string:
        return None

    act = activation_string.lower()
    if act == "linear":
        return None
    elif act == "relu":
        return tf.nn.relu
    elif act == "gelu":
        return gelu
    elif act == "tanh":
        return tf.tanh

In [116]:
x = tf.constant([[[1, 2, 3], [4, 5, 6]]])
x.shape

TensorShape([1, 2, 3])

In [118]:
w = tf.constant([[1], [2], [3]])
w.shape

TensorShape([3, 1])

In [119]:
tf.matmul(x, w)

<tf.Tensor: shape=(1, 2, 1), dtype=int32, numpy=
array([[[14],
        [32]]], dtype=int32)>

In [120]:
tf.einsum("BFH,HO->BFO", x, w)

<tf.Tensor: shape=(1, 2, 1), dtype=int32, numpy=
array([[[14],
        [32]]], dtype=int32)>

In [123]:
tf.tensordot(x, w, 1)

<tf.Tensor: shape=(1, 2, 1), dtype=int32, numpy=
array([[[14],
        [32]]], dtype=int32)>

## Data

In [253]:
pvals = 1. / np.arange(1, 3 + 1)
pvals /= pvals.sum(keepdims=True)
pvals

array([0.54545455, 0.27272727, 0.18181818])

In [258]:
i = 2
p=pvals[:i] /pvals[:i].sum(keepdims=True)
p

array([0.66666667, 0.33333333])

In [259]:
ngrams = np.arange(1, 3 + 1, dtype=np.int64)
ngrams

array([1, 2, 3])

In [262]:
n = np.random.choice(ngrams[:i], p=pvals[:i] /pvals[:i].sum(keepdims=True))
n

1

In [264]:
import random
rng = random.Random(12345)

In [265]:
cd ~/Documents/Study/DL-Models/albert/

/Users/HaoShaochun/Documents/Study/DL-Models/albert


In [267]:
import tokenization

In [268]:
tokenizer = tokenization.FullTokenizer(
      vocab_file="vocab.txt", do_lower_case=True,
      spm_model_file=None)

In [275]:
tokens_a = "Text should be one-sentence-per-line, with empty lines between documents.".split()
tokens_b = "This sample text is public domain and was randomly selected from Project Guttenberg.".split()

In [276]:
tokens = []
tokens.append("[CLS]")
for token in tokens_a:
    tokens.append(token)
tokens.append("[SEP]")
for token in tokens_b:
    tokens.append(token)
tokens.append("[SEP]")

In [281]:
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])

In [382]:
(output_tokens, 
 masked_lm_positions, 
 masked_lm_labels) = create_masked_lm_predictions(tokens)

num_to_predict:  4
[[[9]], [[9], [11]], [[9], [11], [12]]]
[[[20]], [[20], [21]], [[20], [21], [22]]]
[[[3]], [[3], [4]], [[3], [4], [5]]]


In [383]:
masked_lm_positions

[9, 11, 12, 20]

In [384]:
masked_lm_labels

['documents.', 'This', 'sample', 'selected']

In [385]:
print(" ".join(tokens))

[CLS] Text should be one-sentence-per-line, with empty lines between documents. [SEP] This sample text is public domain and was randomly selected from Project Guttenberg. [SEP]


In [386]:
print(" ".join(output_tokens))

[CLS] Text should be one-sentence-per-line, with empty lines between [MASK] [SEP] [MASK] [MASK] text is public domain and was randomly 屿 from Project Guttenberg. [SEP]


In [381]:
def create_masked_lm_predictions(
    tokens, 
    masked_lm_prob=0.15, 
    max_predictions_per_seq=20, 
    vocab_words=list(tokenizer.vocab.keys()), 
    rng=rng):
    
    cand_indexes = []
    for (i, token) in enumerate(tokens):
        if token == "[CLS]" or token == "[SEP]":
            continue
        cand_indexes.append([i])
    
    output_tokens = list(tokens)
    masked_lm_positions = []
    masked_lm_labels = []
    num_to_predict = min(max_predictions_per_seq, 
                         max(1, int(round(len(tokens) * masked_lm_prob))))
    
    print("num_to_predict: ", num_to_predict)
    ngrams = np.arange(1, 3 + 1, dtype=np.int64)
    pvals = 1. / np.arange(1, 3 + 1)
    pvals /= pvals.sum(keepdims=True)
    
    ngram_indexes = []
    for idx in range(len(cand_indexes)):
        ngram_index = []
        for n in ngrams:
            ngram_index.append(cand_indexes[idx:idx+n])
        ngram_indexes.append(ngram_index)
    rng.shuffle(ngram_indexes)
    
    masked_lms = []
    covered_indexes = set()
    for cand_index_set in ngram_indexes:
        print(cand_index_set)
        if len(masked_lms) >= num_to_predict:
            break
        n = np.random.choice(
            ngrams[:len(cand_index_set)], 
            p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True))
        # [16, 17] = sum([[16], [17]], [])
        index_set = sum(cand_index_set[n - 1], [])
        for index in index_set:
            masked_token = None
            if rng.random() < 0.8:
                masked_token = "[MASK]"
            else:
                if rng.random() < 0.5:
                    masked_token = tokens[index]
                else:
                    masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
            output_tokens[index] = masked_token
            masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
    rng.shuffle(ngram_indexes)
  
    masked_lms = sorted(masked_lms, key=lambda x: x.index)

    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_labels.append(p.label)
    return (output_tokens, masked_lm_positions, masked_lm_labels)

In [370]:
def create_masked_lm_predictions(
    tokens, 
    masked_lm_prob=0.15, 
    max_predictions_per_seq=20, 
    vocab_words=list(tokenizer.vocab.keys()), 
    rng=rng):
    
    cand_indexes = []
    token_boundary = [0] * len(tokens)

    for (i, token) in enumerate(tokens):
        if token == "[CLS]" or token == "[SEP]":
            token_boundary[i] = 1
            continue
        cand_indexes.append([i])
        token_boundary[i] = 1
    
    output_tokens = list(tokens)
    masked_lm_positions = []
    masked_lm_labels = []
    num_to_predict = min(max_predictions_per_seq, 
                         max(1, int(round(len(tokens) * masked_lm_prob))))
    
    print("num_to_predict: ", num_to_predict)
    ngrams = np.arange(1, 3 + 1, dtype=np.int64)
    pvals = 1. / np.arange(1, 3 + 1)
    pvals /= pvals.sum(keepdims=True)
    
    ngram_indexes = []
    for idx in range(len(cand_indexes)):
        ngram_index = []
        for n in ngrams:
            ngram_index.append(cand_indexes[idx:idx+n])
        ngram_indexes.append(ngram_index)
    rng.shuffle(ngram_indexes)
    
    masked_lms = []
    covered_indexes = set()
    for cand_index_set in ngram_indexes:
        print("cand_index_set", cand_index_set)
        if len(masked_lms) >= num_to_predict:
            break
        if not cand_index_set:
            continue
        for index_set in cand_index_set[0]:
            print(index_set)
            print("H?")
            for index in index_set:
                print(index, covered_indexes)
                if index in covered_indexes:
                    continue

        n = np.random.choice(
            ngrams[:len(cand_index_set)], 
            p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True))
        # [16, 17] = sum([[16], [17]], [])
        index_set = sum(cand_index_set[n - 1], [])
        n -= 1
        while len(masked_lms) + len(index_set) > num_to_predict:
            print("I?")
            if n == 0:
                break
            index_set = sum(cand_index_set[n - 1], [])
            n -= 1
        if len(masked_lms) + len(index_set) > num_to_predict:
            continue
        is_any_index_covered = False
        for index in index_set:
            print("J?")
            if index in covered_indexes:
                is_any_index_covered = True
                break
        if is_any_index_covered:
            continue
        for index in index_set:
            covered_indexes.add(index)
            masked_token = None
            if rng.random() < 0.8:
                masked_token = "[MASK]"
            else:
                if rng.random() < 0.5:
                    masked_token = tokens[index]
                else:
                    masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
            output_tokens[index] = masked_token
            masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
    rng.shuffle(ngram_indexes)
  
    masked_lms = sorted(masked_lms, key=lambda x: x.index)

    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_labels.append(p.label)
    return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary)

In [371]:
(output_tokens, 
 masked_lm_positions, 
 masked_lm_labels,
token_boundary) = create_masked_lm_predictions(tokens)

num_to_predict:  4
cand_index_set [[[12]], [[12], [13]], [[12], [13], [14]]]
[12]
H?
12 set()
J?
J?
cand_index_set [[[3]], [[3], [4]], [[3], [4], [5]]]
[3]
H?
3 {12, 13}
J?
cand_index_set [[[22]], [[22], [23]], [[22], [23]]]
[22]
H?
22 {3, 12, 13}
J?
cand_index_set [[[15]], [[15], [16]], [[15], [16], [17]]]


In [317]:
masked_lm_positions

[5, 21, 22, 23]

In [318]:
masked_lm_labels

['with', 'from', 'Project', 'Guttenberg.']

In [316]:
output_tokens

['[CLS]',
 'Text',
 'should',
 'be',
 'one-sentence-per-line,',
 '##聰',
 'empty',
 'lines',
 'between',
 'documents.',
 '[SEP]',
 'This',
 'sample',
 'text',
 'is',
 'public',
 'domain',
 'and',
 'was',
 'randomly',
 'selected',
 '[MASK]',
 '[MASK]',
 'Guttenberg.',
 '[SEP]']