In [27]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras import Model
import math
import json

In [2]:
def gelu(x):
    cdf = 0.5 * (1.0 + tf.tanh(
    (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf

In [3]:
def reshape_matrix(input_tensor):
    ndims = input_tensor.shape.ndims
    if ndims < 2:
        raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     (input_tensor.shape))
    if ndims == 2:
        return input_tensor

    width = input_tensor.shape[-1]
    output_tensor = tf.reshape(input_tensor, [-1, width])
    return output_tensor



In [4]:
def attention_layer(seq_length=128,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    from_seq_length=None,
                    to_seq_length=None):
    
    def transpose_for_scores(input_tensor, num_attention_heads, seq_length, size_per_head):
        output_tensor = tf.reshape(input_tensor, [-1, seq_length, num_attention_heads, size_per_head])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor
        
    
    
    # input_shape = [None, seq_length, hidden_size]
    
    inputs = tf.keras.Input([num_attention_heads*size_per_head])
    reshape_inputs = reshape_matrix(inputs)
    query_layer = tf.keras.layers.Dense(num_attention_heads * size_per_head, activation=query_act)(reshape_inputs)
    key_layer = tf.keras.layers.Dense(num_attention_heads * size_per_head, activation=key_act)(reshape_inputs)
    value_layer = tf.keras.layers.Dense(num_attention_heads * size_per_head, activation=value_act)(reshape_inputs)
    
    query_layer = transpose_for_scores(query_layer, num_attention_heads, seq_length, size_per_head)
    key_layer = transpose_for_scores(key_layer, num_attention_heads, seq_length, size_per_head)

    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores, 1.0/math.sqrt(float(size_per_head)))
    
    if attention_mask is not None:
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
        attention_scores += adder
    
    attention_probs = tf.keras.layers.Softmax()(attention_scores)
    attention_probs = tf.keras.layers.Dropout(attention_probs_dropout_prob)(attention_probs)
        
    value_layer = transpose_for_scores(value_layer, num_attention_heads, seq_length, size_per_head)

    context_layer = tf.matmul(attention_probs, value_layer)
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    
    if do_return_2d_tensor:
        context_layer = tf.reshape(context_layer, [-1, num_attention_heads*size_per_head])
    else:
        context_layer = tf.reshape(context_layer, [-1, seq_length, num_attention_heads, size_per_head])
    
    return Model(inputs, context_layer)

In [6]:
def transformer_model(seq_length=128,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    inputs = tf.keras.Input([seq_length, hidden_size])
    
    if hidden_size % num_attention_heads != 0:
        raise ValueError("The hidden size (%d) is not a multiple of the number of attention heads (%d)" % (hidden_size, num_attention_heads))
    
    attention_head_size = int(hidden_size / num_attention_heads)
    prev_output = reshape_matrix(inputs)
    
    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        layer_input = prev_output
        
        attention_heads = []
        attention_head = attention_layer(seq_length=seq_length,
                                        attention_mask=attention_mask,
                                        num_attention_heads=num_attention_heads,
                                        size_per_head=attention_head_size,
                                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                                        initializer_range=initializer_range,
                                        do_return_2d_tensor=True)(layer_input)
        attention_heads.append(attention_head)
        
        attention_output = None
        
        # attention layer를 한번에 연산할지 나눠서 연산할지에 따라 다르게 구성 => 논문에서는 두가지 방법은 사실상 같다고 언급
        if len(attention_heads) == 1:
            attention_output = attention_heads[0]
        else:
            attention_output = tf.concat(attention_heads, axis=-1)
        
        attention_output = tf.keras.layers.Dense(hidden_size)(attention_output)
        attention_output = tf.keras.layers.Dropout(hidden_dropout_prob)(attention_output)
        attention_output = tf.keras.layers.LayerNormalization()(attention_output + layer_input)
        
        intermediate_output = tf.keras.layers.Dense(intermediate_size, activation=intermediate_act_fn)(attention_output)
        
        layer_output = tf.keras.layers.Dense(hidden_size)(intermediate_output)
        layer_output = tf.keras.layers.Dropout(hidden_dropout_prob)(layer_output)
        layer_output = tf.keras.layers.LayerNormalization()(layer_output + attention_output)
        
        prev_output = layer_output
        all_layer_outputs.append(layer_output)
    
    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = tf.reshape(layer_output, [-1, seq_length, hidden_size])
            final_outputs.append(final_output)
        return Model(inputs, final_outputs)
    else:
        final_output = tf.reshape(prev_output, [-1, seq_length, hidden_size])
        return Model(inputs, final_output)

In [36]:
class BertModel(object):
    
    def __init__(self,
                 config,
                 is_training,
                input_mask=None,
                token_type_ids=None,
                use_one_hot_embedding=False):
        
        inputs = tf.keras.Input([config["seq_length"]])
        
        config = config.copy()
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0
        
        if input_mask is None:
            input_mask = tf.ones(shape=[None, inputs.shape[1]], dtype=tf.int32)
        
        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[inputs.shape[0], inputs.shape[1]], dtype=tf.int32)
            
        self.embedding_output = tf.keras.layers.Embedding(config["vocab_size"], config["hidden_size"])(inputs)
        self.embedding_output = embedding_postprocessor(config["seq_length"],
                                                        config["hidden_size"],
                                                       use_token_type=True,
                                                       token_type_ids=token_type_ids,
                                                       token_type_vocab_size=config["type_vocab_size"],
                                                       token_type_embedding_name="token_type_embeddings",
                                                       use_position_embeddings=True,
                                                       position_embedding_name="position_embeddings",
                                                       initializer_range=config["initializer_range"],
                                                       max_position_embeddings=config["max_position_embeddings"],
                                                       dropout_prob=config["hidden_dropout_prob"])
        
        to_mask = tf.cast(tf.reshape(input_mask, [-1, 1, config["seq_length"]]), tf.float32)
        broadcast_ones = tf.ones(shape=[inputs.shape[0], inputs.shape[1], 1], dtype=tf.float32)
        attention_mask = broadcast_ones * to_mask
        
        self.all_encoder_layer = transformer_model(seq_length = config["seq_length"],
                                                attention_mask=attention_mask,
                                                hidden_size=config["hidden_size"],
                                                num_hidden_layers=config["num_hidden_layers"],
                                                num_attention_heads=config["num_attention_heads"],
                                                intermediate_size=config["intermediate_size"],
                                                intermediate_act_fn=get_activation(config["hidden_act"]),
                                                hidden_dropout_prob=config["hidden_dropout_prob"],
                                                attention_probs_dropout_prob=config["attention_probs_dropout_prob"],
                                                initializer_range=config["initializer_range"],
                                                do_return_all_layers=True)(self.embedding_output)
        
        self.sequence_output = self.all_encoder_layer[-1]
        
        self.model = Model(inputs, self.sequence_output)
        
        return self.model
        
    def get_pooled_output(self):
        
        first_token_tensor = tf.squeeze(self.model.output[:, 0:1, :], axis=1)
        self.pooled_output = tf.keras.layers.Dense(config["hidden_size"],
                                                  activation=tf.tanh)(first_token_tensor)

        return self.pooled_output
    
    def get_sequence_output(self):
        return self.model.output
    
        

In [37]:
def embedding_postprocessor(seq_length,
                            hidden_size,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):

    inputs = tf.keras.Input([seq_length, hidden_size])
    
    output = inputs
    
    if use_token_type:
        if token_type_ids is None:
            raise ValueError("'token_type_ids' must be specified if 'use_token_type' is True.")
        
        token_type_table = tf.keras.layers.Embedding(token_type_vocab_size, inputs.shape[2])(inputs)
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings, [-1, seq_length, inputs.shape[2]])
        
        output += token_type_embeddings
        
    if use_position_embeddings:
        full_position_embeddings = tf.keras.layers.Embedding(max_position_embeddings, inputs.shape[2])
        position_embeddings = tf.slice(full_position_embeddings, [0, 0], [seq_length, -1])
        num_dims = len(output.shape.as_list())
        
        position_broadcast_shape = []
        for _ in range(num_dim - 2):
            position_broadcast_shape.append(1)
        position_broadcast_shape.extend([seq_length, inputs.shape[2]])
        position_embeddings = tf.reshape(position_embeddings, position_broadcast_shape)
        
        output += position_embeddings
    
    output = tf.keras.layers.LayerNormalization()(output)
    output = tf.keras.layers.Dropout(dropout_prob)(output)
    
    return Model(inputs, output)

In [38]:
bert_config = {
    "vocab_size" : 32000,
    "hidden_size" : 768,
    "num_hidden_layers" : 12,
    "num_attention_heads" : 12,
    "intermediate_size" : 3072,
    "hidden_act" : 'gelu',
    "hidden_dropout_prob" : 0.1,
    "attention_probs_dropout_prob" : 0.1,
    "max_position_embeddings" : 512,
    "type_vocab_size" : 16,
    "initializer_range" : 0.02,
    "seq_length" : 128
}

In [39]:
bert = BertModel(config=bert_config, is_training=True)

ValueError: Attempt to convert a value (None) with an unsupported type (<class 'NoneType'>) to a Tensor.

In [None]:
def create_attention_mask_from_input_mask(seq_length, hidden_size, to_mask):
    
    inputs = tf.keras.Input([seq_length, hidden_size])
    
    to_mask = tf.cast(tf.reshape(to_mask, [-1, 1, seq_length]), tf.float32)
    
    broadcast_ones = tf.ones(shape=[inputs.shape[0], inputs.shape[1], 1], dtype=tf.float32)
    
    mask = broadcast_ones * to_mask
    
    return mask

In [16]:
help(tf.assert_greater)

Help on function assert_greater_v2 in module tensorflow.python.ops.check_ops:

assert_greater_v2(x, y, message=None, summarize=None, name=None)
    Assert the condition `x > y` holds element-wise.
    
    This Op checks that `x[i] > y[i]` holds for every pair of (possibly
    broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
    trivially satisfied.
    
    If `x` is not greater than `y` element-wise, `message`, as well as the first
    `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError` is
    raised.
    
    Args:
      x:  Numeric `Tensor`.
      y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
      message: A string to prefix to the default message.
      summarize: Print this many entries of each tensor.
      name: A name for this operation (optional).  Defaults to "assert_greater".
    
    Returns:
      Op that raises `InvalidArgumentError` if `x > y` is False. This can be
        used with `tf.control_dependenci

In [7]:
tr = transformer_model()

In [17]:
help(tf.control_dependencies)

Help on function control_dependencies in module tensorflow.python.framework.ops:

control_dependencies(control_inputs)
    Wrapper for `Graph.control_dependencies()` using the default graph.
    
    See `tf.Graph.control_dependencies` for more details.
    
    Note: *In TensorFlow 2 with eager and/or Autograph, you should not require
    this method, as ops execute in the expected order thanks to automatic control
    dependencies.* Only use `tf.control_dependencies` when working with v1
    `tf.Graph` code.
    
    When eager execution is enabled, any callable object in the `control_inputs`
    list will be called.
    
    Args:
      control_inputs: A list of `Operation` or `Tensor` objects which must be
        executed or computed before running the operations defined in the context.
        Can also be `None` to clear the control dependencies. If eager execution
        is enabled, any callable object in the `control_inputs` list will be
        called.
    
    Returns:
     

In [64]:
32000 * 768

24576000