In [3]:
from transformers import BertTokenizer
# Load the pretrained `bert-base-cased` tokenizer from Hugging Face; its ids
# are used as the reference when checking the TF Text tokenizer built below.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [4]:
import tensorflow_text as text
import tensorflow as tf

In [5]:
# Inspect whether the reference tokenizer lowercases its input; the TF Text
# tokenizer built below is given a matching `lower_case` value.
tokenizer.do_lower_case

False

In [7]:
import os

# The TF Text tokenizer reads its vocabulary from a plain-text file. No other
# cell in this notebook creates that file, so export the Hugging Face
# tokenizer's files when it is missing; this keeps the notebook runnable on a
# fresh kernel / fresh checkout.
vocab_file = 'bert_tokenizer_dir/vocab.txt'
if not os.path.exists(vocab_file):
    tokenizer.save_pretrained(os.path.dirname(vocab_file))
def _create_vocab_table_and_initializer(vocab_file):
    """Build a static token -> id lookup table from a vocabulary text file.

    Every whole line of `vocab_file` becomes a string key and the line's
    number becomes its int64 value. Keys that are not in the file map to -1.

    Args:
        vocab_file: path of the newline-separated vocabulary file.

    Returns:
        A `(table, initializer)` tuple: the `tf.lookup.StaticHashTable` and
        the `tf.lookup.TextFileInitializer` it was built from.
    """
    file_index = tf.lookup.TextFileIndex
    table_initializer = tf.lookup.TextFileInitializer(
        vocab_file,
        key_dtype=tf.string,
        key_index=file_index.WHOLE_LINE,
        value_dtype=tf.int64,
        value_index=file_index.LINE_NUMBER,
    )
    lookup_table = tf.lookup.StaticHashTable(table_initializer, default_value=-1)
    return lookup_table, table_initializer

# Build the vocabulary lookup table and the TF Text tokenizer on top of it.
# `lower_case=False` matches `tokenizer.do_lower_case` shown above.
vocab_table , vocab_initializer = _create_vocab_table_and_initializer(vocab_file)
bert_tokenizer = text.BertTokenizer(
        vocab_table, lower_case=False)

In [10]:
# Two sample sentences used to compare the tokenizers.
# NOTE(review): this rebinds `text`, which an earlier cell bound to the
# `tensorflow_text` module (`import tensorflow_text as text`). From here on
# `text.BertTokenizer` no longer resolves to the module; cells that need the
# module after this point must re-import it (ideally under another alias).
text = ['Hello how are you', 
       'Plants are beautiful :-)']

In [17]:
# Reference ids from the Hugging Face tokenizer, without special tokens
# added, for comparison with the TF Text tokenizer.
tokenizer(text, add_special_tokens=False)['input_ids']

[[8667, 1293, 1132, 1128], [25880, 1132, 2712, 131, 118, 114]]

In [23]:
# Tokenize the same sentences with the TF Text tokenizer, merge the last two
# (ragged) axes into one, and convert to a dense tensor; the shorter row is
# padded with 0 in the output below.
enc = bert_tokenizer.tokenize(tf.identity(text))
enc.merge_dims(-2, -1).to_tensor()

<tf.Tensor: shape=(2, 6), dtype=int64, numpy=
array([[ 8667,  1293,  1132,  1128,     0,     0],
       [25880,  1132,  2712,   131,   118,   114]])>

In [24]:
def tokenize_map(text):
    """Tokenize a string tensor into a dense tensor of wordpiece ids.

    The output of `bert_tokenizer.tokenize` has its last two axes merged
    into a single axis and is then converted to a dense tensor.

    Args:
        text: string tensor (or value convertible to one) to tokenize.

    Returns:
        Dense integer tensor of token ids.
    """
    wordpieces = bert_tokenizer.tokenize(tf.identity(text))
    merged_ids = wordpieces.merge_dims(-2, -1)
    return merged_ids.to_tensor()

In [25]:
# One dataset element per sample sentence (scalar string tensors).
dataset = tf.data.Dataset.from_tensor_slices(text)

In [29]:
# Show the raw elements of the dataset, one per sentence.
for item in dataset:
    print(item)
    print('--------')

tf.Tensor(b'Hello how are you', shape=(), dtype=string)
--------
tf.Tensor(b'Plants are beautiful :-)', shape=(), dtype=string)
--------


In [31]:
# Tokenize every element inside the tf.data pipeline.
dataset_mapped = dataset.map(tokenize_map)

In [32]:
# Show the tokenized elements; each one is tokenized on its own, so no
# padding appears (compare with the batched call above).
for item in dataset_mapped:
    print(item)
    print('--------')

tf.Tensor([[8667 1293 1132 1128]], shape=(1, 4), dtype=int64)
--------
tf.Tensor([[25880  1132  2712   131   118   114]], shape=(1, 6), dtype=int64)
--------


In [90]:
import numpy as np
# Id of the [MASK] token in the reference tokenizer's vocabulary; read as a
# global by `get_masked_input_and_labels` below.
mask_token_id = tokenizer.mask_token_id
def get_masked_input_and_labels(encoded_texts, mask_id=None, vocab_size=None):
    """Apply BERT-style random masking to an array of token ids.

    Every position is selected for prediction with probability 0.15. Of the
    selected positions, about 80% are replaced by the mask id, about 10% by
    an id drawn uniformly from the whole vocabulary, and about 10% are left
    unchanged.

    Args:
        encoded_texts: integer NumPy array (any shape) of token ids.
        mask_id: id written at masked positions. Defaults to the
            notebook-level `mask_token_id`.
        vocab_size: exclusive upper bound of the ids used for random
            replacement. Defaults to `tokenizer.vocab_size`.

    Returns:
        A tuple `(encoded_texts_masked, y_labels, sample_weights)`:
          * encoded_texts_masked: copy of the input with masking applied.
          * y_labels: unmodified copy of the input ids.
          * sample_weights: float array, 1 at positions selected for
            prediction and 0 elsewhere.
    """
    if mask_id is None:
        mask_id = mask_token_id
    if vocab_size is None:
        vocab_size = tokenizer.vocab_size

    encoded_texts = np.asarray(encoded_texts)
    shape = encoded_texts.shape

    # Select 15% of the positions for prediction.
    # NOTE: special tokens are not excluded from selection; the tokenized
    # examples shown in this notebook do not contain any.
    inp_mask = np.random.rand(*shape) < 0.15

    # Labels default to -1 ("ignore"); selected positions keep the true id.
    labels = -1 * np.ones(shape, dtype=int)
    labels[inp_mask] = encoded_texts[inp_mask]

    # Prepare input
    encoded_texts_masked = np.copy(encoded_texts)
    # 90% of the selected positions are overwritten with the mask id; the
    # remaining 10% keep their original token.
    inp_mask_2mask = inp_mask & (np.random.rand(*shape) < 0.90)
    encoded_texts_masked[inp_mask_2mask] = mask_id

    # 1/9 of those 90% (i.e. 10% of the selected positions) get a random
    # token instead of the mask id. The id is drawn from the whole
    # vocabulary: the previous range [3, mask_token_id) only made sense for
    # a vocabulary whose mask token is the last entry, and with this
    # notebook's vocabulary it only ever produced ids below the mask id.
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*shape) < 1 / 9)
    encoded_texts_masked[inp_mask_2random] = np.random.randint(
        0, vocab_size, inp_mask_2random.sum()
    )

    # Prepare sample_weights to pass to .fit() method
    sample_weights = np.ones(labels.shape)
    sample_weights[labels == -1] = 0

    # y_labels would be same as encoded_texts i.e input tokens
    y_labels = np.copy(encoded_texts)
    return encoded_texts_masked, y_labels, sample_weights

In [91]:
def augment(encoded_text):
    """Apply random masking to one encoded example.

    The masking helper works with NumPy indexing, so the incoming tensor is
    first converted with `.numpy()`.

    Args:
        encoded_text: tensor of token ids exposing a `.numpy()` method.

    Returns:
        Whatever `get_masked_input_and_labels` returns for the converted ids.
    """
    token_ids = encoded_text.numpy()
    return get_masked_input_and_labels(token_ids)

In [96]:
# Wrap the NumPy-based `augment` in `tf.py_function` so it can be used inside
# `Dataset.map`. The three declared dtypes correspond to the
# (masked ids, labels, sample weights) tuple that `augment` returns.
dataset_mapped_new = dataset_mapped.map(
        lambda x: tf.py_function(augment, [x], [tf.int32, tf.int32, tf.float32])
)

In [98]:
# Show (masked ids, labels, sample weights) for each example.
for item in dataset_mapped_new:
    print(item)

(<tf.Tensor: shape=(1, 4), dtype=int32, numpy=array([[ 103, 1293, 1132, 1128]], dtype=int32)>, <tf.Tensor: shape=(1, 4), dtype=int32, numpy=array([[8667, 1293, 1132, 1128]], dtype=int32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[1., 0., 0., 0.]], dtype=float32)>)
(<tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[25880,    69,  2712,   131,   118,   114]], dtype=int32)>, <tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[25880,  1132,  2712,   131,   118,   114]], dtype=int32)>, <tf.Tensor: shape=(1, 6), dtype=float32, numpy=array([[0., 1., 0., 0., 0., 0.]], dtype=float32)>)


In [101]:
def dynamic_masking(text):
    """Tokenize `text` and apply random masking in one mapping step.

    Args:
        text: string tensor to tokenize.

    Returns:
        The outputs of `tf.py_function(augment, ...)`, declared as
        `[tf.int32, tf.int32, tf.float32]`.
    """
    output_dtypes = [tf.int32, tf.int32, tf.float32]
    return tf.py_function(augment, [tokenize_map(text)], output_dtypes)

In [102]:
# Tokenization and masking are both applied inside the pipeline, starting
# from the raw strings.
dataset_new = dataset.map(dynamic_masking)

In [104]:
# Repeat the masked dataset 7 times to observe how the masking varies across
# passes. It is rebuilt from `dataset` rather than from `dataset_new` itself
# so that re-running this cell does not compound the repeat count.
dataset_new = dataset.map(dynamic_masking).repeat(7)

In [105]:
# Iterate over the repeated dataset and print each
# (masked ids, labels, sample weights) triple.
for item in dataset_new:
    print(item)
    print('--------------')

(<tf.Tensor: shape=(1, 4), dtype=int32, numpy=array([[8667,  103, 1132, 1128]], dtype=int32)>, <tf.Tensor: shape=(1, 4), dtype=int32, numpy=array([[8667, 1293, 1132, 1128]], dtype=int32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0., 1., 0., 0.]], dtype=float32)>)
--------------
(<tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[25880,  1132,  2712,    56,   118,   114]], dtype=int32)>, <tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[25880,  1132,  2712,   131,   118,   114]], dtype=int32)>, <tf.Tensor: shape=(1, 6), dtype=float32, numpy=array([[0., 0., 0., 1., 0., 0.]], dtype=float32)>)
--------------
(<tf.Tensor: shape=(1, 4), dtype=int32, numpy=array([[8667, 1293, 1132, 1128]], dtype=int32)>, <tf.Tensor: shape=(1, 4), dtype=int32, numpy=array([[8667, 1293, 1132, 1128]], dtype=int32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0., 0., 0., 0.]], dtype=float32)>)
--------------
(<tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[25880,  1132,  2712,

In [None]:
class BertTokenizer(tf.keras.layers.Layer):
  """Wraps BertTokenizer with pre-defined vocab as a Keras Layer.

  NOTE(review): defining this class rebinds the notebook-level name
  `BertTokenizer`, which an earlier cell imported from `transformers`.

  Attributes:
    tokenize_with_offsets: If true, calls
      `text.BertTokenizer.tokenize_with_offsets()` instead of plain
      `text.BertTokenizer.tokenize()` and outputs a triple of
      `(tokens, start_offsets, limit_offsets)`.
    raw_table_access: An object with methods `.lookup(keys)` and `.size()`
      that operate on the raw lookup table of tokens. It can be used to
      look up special token symbols like `[MASK]`.
      NOTE(review): no attribute of this name is set by the code below.
    vocab_size: Size of the vocabulary lookup table.
  """

  def __init__(self, *,
               vocab_file: str,
               lower_case: bool,
               tokenize_with_offsets: bool = False,
               **kwargs):
    """Initialize a `BertTokenizer` layer.
    Args:
      vocab_file: A Python string with the path of the vocabulary file.
        This is a text file with newline-separated wordpiece tokens.
        This layer initializes a lookup table from it that gets used with
        `text.BertTokenizer`.
      lower_case: A Python boolean forwarded to `text.BertTokenizer`.
        If true, input text is converted to lower case (where applicable)
        before tokenization. This must be set to match the way in which
        the `vocab_file` was created.
      tokenize_with_offsets: A Python boolean. If true, this layer calls
        `text.BertTokenizer.tokenize_with_offsets()` instead of plain
        `text.BertTokenizer.tokenize()` and outputs a triple of
        `(tokens, start_offsets, limit_offsets)`
        instead of just tokens.
      **kwargs: Standard arguments to `Layer()`.
    Raises:
      ImportError: If importing `tensorflow_text` failed.
    """
    # Import under a private alias instead of relying on a module-level
    # `text` name: in this notebook `text` is rebound to a list of sample
    # sentences, and no install-check helper is defined.
    try:
      import tensorflow_text as tf_text
    except ImportError as e:
      raise ImportError(
          "BertTokenizer requires the tensorflow_text package.") from e

    self.tokenize_with_offsets = tokenize_with_offsets
    # TODO(b/177326279): Stop storing the vocab table initializer as an
    # attribute when https://github.com/tensorflow/tensorflow/issues/46456
    # has been fixed in the TensorFlow versions of the TF Hub users that load
    # a SavedModel created from this layer. Due to that issue, loading such a
    # SavedModel forgets to add .vocab_table._initializer as a trackable
    # dependency of .vocab_table, so that saving it again to a second SavedModel
    # (e.g., the final model built using TF Hub) does not properly track
    # the ._vocab_table._initializer._filename as an Asset.
    self._vocab_table, self._vocab_initializer_donotuse = (
        self._create_vocab_table_and_initializer(vocab_file))
    self._special_tokens_dict = self._create_special_tokens_dict(
        self._vocab_table, vocab_file)
    super().__init__(**kwargs)
    self._bert_tokenizer = tf_text.BertTokenizer(
        self._vocab_table, lower_case=lower_case)

  @property
  def vocab_size(self):
    """Returns the size of the vocabulary lookup table."""
    return self._vocab_table.size()

  def _create_vocab_table_and_initializer(self, vocab_file):
    """Builds the token -> id lookup table from `vocab_file`.

    Each whole line of the file is a string key and its line number is the
    int64 value; keys that are not found map to -1.

    Args:
      vocab_file: Path of the newline-separated vocabulary file.
    Returns:
      A `(vocab_table, vocab_initializer)` tuple.
    """
    vocab_initializer = tf.lookup.TextFileInitializer(
        vocab_file,
        key_dtype=tf.string, key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype=tf.int64, value_index=tf.lookup.TextFileIndex.LINE_NUMBER)
    vocab_table = tf.lookup.StaticHashTable(vocab_initializer, default_value=-1)
    return vocab_table, vocab_initializer

  def call(self, inputs: tf.Tensor):
    """Calls `text.BertTokenizer` on inputs.
    Args:
      inputs: A string Tensor of shape `(batch_size,)`.
    Returns:
      One or three of `RaggedTensors` if `tokenize_with_offsets` is False or
      True, respectively. These are
        tokens: A `RaggedTensor` of shape
          `[batch_size, (words), (pieces_per_word)]`
          and type int32. `tokens[i,j,k]` contains the k-th wordpiece of the
          j-th word in the i-th input.
        start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
          RaggedTensors of type int64 with the same indices as tokens.
          Element `[i,j,k]` contains the byte offset at the start, or past the
          end, resp., for the k-th wordpiece of the j-th word in the i-th input.
    """
    # Prepare to reshape the result to work around broken shape inference.
    batch_size = tf.shape(inputs)[0]
    def _reshape(rt):
      # Rebuild the outer ragged dimension with row_splits reshaped to the
      # statically expected length of batch_size + 1.
      values = rt.values
      row_splits = rt.row_splits
      row_splits = tf.reshape(row_splits, [batch_size + 1])
      return tf.RaggedTensor.from_row_splits(values, row_splits)

    # Call the tokenizer.
    if self.tokenize_with_offsets:
      tokens, start_offsets, limit_offsets = (
          self._bert_tokenizer.tokenize_with_offsets(inputs))
      tokens = tf.cast(tokens, dtype=tf.int32)
      return _reshape(tokens), _reshape(start_offsets), _reshape(limit_offsets)
    else:
      tokens = self._bert_tokenizer.tokenize(inputs)
      tokens = tf.cast(tokens, dtype=tf.int32)
      return _reshape(tokens)

  def get_config(self):
    """Not implemented; always raises `NotImplementedError`."""
    # Skip in tf.saved_model.save(); fail if called directly.
    raise NotImplementedError("TODO(b/170480226): implement")

  def get_special_tokens_dict(self):
    """Returns dict of token ids, keyed by standard names for their purpose.
    Returns:
      A dict from Python strings to Python integers. Each key is a standard
      name for a special token describing its use. (For example, "padding_id"
      is what BERT traditionally calls "[PAD]" but others may call "<pad>".)
      The corresponding value is the integer token id. If a special token
      is not found, its entry is omitted from the dict.
      The supported keys and tokens are:
        * start_of_sequence_id: looked up from "[CLS]"
        * end_of_segment_id: looked up from "[SEP]"
        * padding_id: looked up from "[PAD]"
        * mask_id: looked up from "[MASK]"
        * vocab_size: one past the largest token id used
    """
    return self._special_tokens_dict

  def _create_special_tokens_dict(self, vocab_table, vocab_file):
    """Looks up the ids of the special tokens once, at construction time.

    Args:
      vocab_table: The lookup table to query when executing eagerly.
      vocab_file: Path of the vocabulary file; used to build a temporary
        table in the non-eager case and in warning messages.
    Returns:
      A dict with `vocab_size` plus one entry per special token that was
      found in the vocabulary (see `get_special_tokens_dict()`).
    """
    # Local import: no cell of this notebook imports a `logging` module.
    import logging

    special_tokens = dict(start_of_sequence_id="[CLS]",
                          end_of_segment_id="[SEP]",
                          padding_id="[PAD]",
                          mask_id="[MASK]")
    with tf.init_scope():
      if tf.executing_eagerly():
        special_token_ids = vocab_table.lookup(
            tf.constant(list(special_tokens.values()), tf.string))
        vocab_size = vocab_table.size()
      else:
        # A blast from the past: non-eager init context while building Model.
        # This can happen with Estimator or tf.compat.v1.disable_v2_behavior().
        logging.warning(
            "Non-eager init context; computing "
            "BertTokenizer's special_tokens_dict in tf.compat.v1.Session")
        # Evaluate the lookups in a throwaway graph and session so that the
        # caller's graph is left untouched.
        with tf.Graph().as_default():
          local_vocab_table, _ = self._create_vocab_table_and_initializer(
              vocab_file)
          special_token_ids_tensor = local_vocab_table.lookup(
              tf.constant(list(special_tokens.values()), tf.string))
          vocab_size_tensor = local_vocab_table.size()
          init_ops = [tf.compat.v1.initialize_all_tables()]
          with tf.compat.v1.Session() as sess:
            sess.run(init_ops)
            special_token_ids, vocab_size = sess.run(
                [special_token_ids_tensor, vocab_size_tensor])
      result = dict(
          vocab_size=int(vocab_size)  # Numpy to Python.
      )
      # The lookup keys were built from `special_tokens.values()`, so the
      # returned ids line up with the dict's keys in iteration order.
      for k, v in zip(special_tokens, special_token_ids):
        v = int(v)
        if v >= 0:
          result[k] = v
        else:
          # The table's default value (-1) marks a token missing from the
          # vocabulary.
          logging.warning("Could not find %s as token \"%s\" in vocab file %s",
                          k, special_tokens[k], vocab_file)
    return result
