## Pre-train BERT

In [1]:
from dataclasses import dataclass

import numpy as np
from pprint import pprint

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds

from tokenizers import BertWordPieceTokenizer

tf.version.VERSION

'2.6.0'

In [2]:
@dataclass
class Config:
    """Hyperparameters shared by the tokenizer, the data pipeline and the models."""
    # NOTE(review): none of the attributes below carry a type annotation, so
    # `@dataclass` generates no fields for them; they are plain class attributes
    # read through the `config` instance created below.

    # tokenizer hyperparameters
    VOCAB_SIZE = 30000  # tokenizer vocabulary size; also sizes the word embedding and the MLM output layer
    LIMIT_ALPHABET = 1000  # passed to the tokenizer trainer as `limit_alphabet`
    MIN_TOKEN_FREQ = 2  # passed to the tokenizer trainer as `min_frequency`
    SPECIAL_TOKENS = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
    WORDPIECES_PREFIX = '##'
    MAX_LEN = 360  # every encoded review is truncated/padded to this many ids
    PROB_OUTPUT_MASK = 0.15  # chance that a token position is selected as an MLM target
    PROB_INPUT_MASK = 0.90  # chance that a selected position is overwritten with [MASK]
    PROB_INPUT_RANDOM = 0.1  # chance that a [MASK]-ed position is then overwritten with a random token

    # training hyperparameters
    BATCH_SIZE = 32
    LR = 1e-3  # Adam learning rate used for MLM pre-training
    EPSILON = 1e-6  # Adam epsilon used for MLM pre-training
    BUFFER_SIZE = 10000  # shuffle buffer size of the tf.data pipelines
    EPOCHS = 10  # number of MLM pre-training epochs
    DROPOUT = 0.1

    # encoder architecture
    EMBED_DIM = 128
    NUM_HEAD = 8
    FF_DIM = 128
    NUM_LAYERS = 1  # number of stacked encoder blocks


# Single shared instance that the rest of the notebook reads from.
config = Config()

In [3]:
# Load the IMDB reviews dataset through TensorFlow Datasets; the result is
# indexed by split name in the next cell.
dataset = tfds.load('imdb_reviews')

In [4]:
# Keep handles on the two splits used by the rest of the notebook.
train_ds = dataset['train']
test_ds = dataset['test']

In [5]:
# Number of examples in the training split.
train_ds.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=25000>

In [6]:
# Number of examples in the test split.
test_ds.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=25000>

In [7]:
def get_text(train_ds, test_ds):
    """Yield every review as a ``str``: all of ``train_ds`` first, then all of ``test_ds``.

    Each entry must be a mapping whose ``'text'`` value exposes ``.numpy()``
    returning UTF-8 encoded bytes.
    """
    for split in (train_ds, test_ds):
        for record in split:
            yield record['text'].numpy().decode('utf-8')


# Start from an empty WordPiece tokenizer; its vocabulary is learned below.
# NOTE(review): the sample batch printed further down starts each row with an
# ordinary token id rather than the [CLS] id, so it looks like no [CLS]/[SEP]
# template is applied by this tokenizer - confirm if those tokens are expected.
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)

# Learn the vocabulary from the raw text of both splits (train and test).
tokenizer.train_from_iterator(
    get_text(train_ds, test_ds),
    vocab_size=config.VOCAB_SIZE,
    min_frequency=config.MIN_TOKEN_FREQ,
    show_progress=True,
    special_tokens=config.SPECIAL_TOKENS,
    limit_alphabet=config.LIMIT_ALPHABET,
    wordpieces_prefix=config.WORDPIECES_PREFIX,
)

# Display the trained tokenizer's summary.
tokenizer

Tokenizer(vocabulary_size=30000, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True, wordpieces_prefix=##)

In [8]:
# Inspect the learned token -> id mapping.
# NOTE(review): this displays the entire vocabulary; consider showing only a
# small sample to keep the notebook output readable.
tokenizer.get_vocab()

{'pushing': 7798,
 'deck': 11143,
 'phool': 28649,
 '##eland': 7864,
 'humanoids': 25770,
 'defense': 7915,
 'singers': 9865,
 'rel': 773,
 '##qt': 24278,
 'declared': 15420,
 'catalog': 12129,
 'heartless': 16064,
 'esp': 7045,
 '##yn': 1526,
 'chom': 19768,
 'monster': 2201,
 'medall': 28719,
 'indiana': 11597,
 'decapitated': 18760,
 'abrasive': 25067,
 '4th': 10718,
 'silliest': 22647,
 'ron': 4235,
 'vort': 22755,
 '##abilities': 13066,
 'inquisition': 27756,
 'speeches': 10605,
 'baloo': 27523,
 'zion': 29881,
 '##seud': 6504,
 'vanilla': 15117,
 'noooo': 28472,
 'overlo': 21005,
 'modest': 9358,
 'macy': 7973,
 'mahat': 24545,
 'pisc': 29868,
 'winter': 6938,
 'aime': 13866,
 'george': 2161,
 '##where': 1793,
 '##ervice': 17881,
 'ellie': 28452,
 'khan': 6440,
 'reminisc': 5778,
 'ronald': 10061,
 '##math': 11764,
 'whereabouts': 17826,
 '##medi': 2781,
 'trem': 12127,
 '##lit': 25141,
 'mcgowan': 26479,
 'scr': 3909,
 'ligh': 11319,
 '##athom': 11854,
 'mess': 1306,
 'surgeon':

In [9]:
# Make every encoding exactly MAX_LEN ids long: longer reviews are truncated
# and shorter ones are padded, so examples can be batched directly.
tokenizer.enable_truncation(max_length=config.MAX_LEN)
tokenizer.enable_padding(length=config.MAX_LEN)

In [10]:
MASK_TOKEN_ID = tokenizer.token_to_id('[MASK]')
# Ids of all special tokens; these positions are never chosen as MLM targets.
SPECIAL_TOKEN_IDS = np.array(
    [tokenizer.token_to_id(token) for token in config.SPECIAL_TOKENS])
# Ids that may be drawn as a "random replacement": the vocabulary minus the special tokens.
REPLACEMENT_TOKEN_IDS = np.setdiff1d(
    np.arange(tokenizer.get_vocab_size()), SPECIAL_TOKEN_IDS)


def tokenize_tensor(tensor):
    """Tokenize one review and build a masked-language-model training example.

    Args:
        tensor: scalar string tensor holding the UTF-8 encoded review text.

    Returns:
        A tuple of three 1-D tensors, one element per token position:
        - int32 input ids with the selected positions corrupted,
        - int32 target ids (the original, uncorrupted ids),
        - float32 sample weights: 1 at positions selected as MLM targets, else 0.

    Each non-special position is selected as a target with probability
    ``PROB_OUTPUT_MASK``. A selected position is overwritten with ``[MASK]``
    with probability ``PROB_INPUT_MASK`` (otherwise left unchanged), and a
    ``[MASK]``-ed position is then overwritten with a random regular token with
    probability ``PROB_INPUT_RANDOM``.
    """
    text = tensor.numpy().decode('utf-8')
    result = tokenizer.encode(text, add_special_tokens=True)
    ids = np.array(result.ids)
    sentence_len = len(ids)

    # Select target positions, never choosing any special token (padding included).
    # The previous `ids <= 2` test left [SEP] and [MASK] selectable.
    inp_mask = np.random.rand(sentence_len) < config.PROB_OUTPUT_MASK
    inp_mask[np.isin(ids, SPECIAL_TOKEN_IDS)] = False

    # Prepare the corrupted input.
    encoded_texts_masked = np.copy(ids)
    # Replace most of the selected positions with [MASK]; the rest stay unchanged.
    inp_mask_2mask = inp_mask & (np.random.rand(sentence_len) < config.PROB_INPUT_MASK)
    encoded_texts_masked[inp_mask_2mask] = MASK_TOKEN_ID

    # Replace a fraction of the [MASK]-ed positions with a random regular token.
    # The previous `np.random.randint(3, MASK_TOKEN_ID, ...)` could only ever
    # produce id 3 because [MASK] is not the last id of this vocabulary.
    inp_mask_2random = inp_mask_2mask & (np.random.rand(sentence_len) < config.PROB_INPUT_RANDOM)
    encoded_texts_masked[inp_mask_2random] = np.random.choice(
        REPLACEMENT_TOKEN_IDS, size=inp_mask_2random.sum()
    )

    # Sample weights for the loss: only the selected positions contribute.
    sample_weights = inp_mask.astype(float)

    # Targets are the original (uncorrupted) token ids.
    y_labels = np.copy(ids)

    encoded_texts_masked_tensor = tf.constant(encoded_texts_masked, tf.int32)
    y_labels_tensor = tf.constant(y_labels, tf.int32)
    sample_weights_tensor = tf.constant(sample_weights, tf.float32)
    return encoded_texts_masked_tensor, y_labels_tensor, sample_weights_tensor


def preprocess_text(entry):
    """Map one dataset entry to ``(masked ids, target ids, sample weights)``.

    ``tokenize_tensor`` runs eagerly on the entry's ``'text'`` via
    ``tf.py_function`` so it can be used inside ``Dataset.map``.
    """
    masked_ids, target_ids, weights = tf.py_function(
        tokenize_tensor,
        [entry['text']],
        Tout=(tf.int32, tf.int32, tf.float32),
    )
    return masked_ids, target_ids, weights


def show_example(ds):
    """Print the first batch of ``ds``, then the decoded first row of its inputs and targets."""
    for batch in ds:
        print(batch)
        masked_batch, target_batch, _weights = batch
        # Decode both rows before printing either; special tokens are kept visible.
        decoded_rows = [
            tokenizer.decode(row, skip_special_tokens=False)
            for row in (masked_batch[0], target_batch[0])
        ]
        print(*decoded_rows, sep='\n')
        # Only the first batch is shown.
        return


# MLM pre-training pipeline over the train and test reviews together:
# shuffle -> tokenize and mask each review -> batch. Only the review text is
# read by `preprocess_text`; the sentiment labels are not used here.
all_ds = train_ds.concatenate(test_ds).shuffle(config.BUFFER_SIZE).map(preprocess_text).batch(config.BATCH_SIZE)
show_example(all_ds)

(<tf.Tensor: shape=(32, 360), dtype=int32, numpy=
array([[   51,   240,  3497, ...,     0,     0,     0],
       [13624,  1418,     4, ...,   331,   183,     4],
       [ 1903,     4,   183, ...,     0,     0,     0],
       ...,
       [ 2625,    12,  9005, ...,     0,     0,     0],
       [    4,   488,  1718, ...,     0,     0,     0],
       [  241,    51,  1134, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(32, 360), dtype=int32, numpy=
array([[   51,   240,  3497, ...,     0,     0,     0],
       [13624,  1418,   330, ...,   331,   183,   761],
       [ 1903,   204,   183, ...,     0,     0,     0],
       ...,
       [ 2625,    12,  9005, ...,     0,     0,     0],
       [  183,   488,  1718, ...,     0,     0,     0],
       [  241,    51,  1134, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(32, 360), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       .

In [11]:
def bert_module(query, key, value, i):
    """Build encoder block ``i``: multi-head attention and a feed-forward net.

    Both sub-layers are followed by dropout, a residual connection and layer
    normalization.

    Args:
        query: tensor used as the attention query and as the first residual input.
        key: tensor used as the attention key.
        value: tensor used as the attention value.
        i: index of the block, used to give its layers unique names.

    Returns:
        The block's output tensor.
    """
    # Multi headed self-attention.
    # Keras' MultiHeadAttention call order is (query, value, key); pass key and
    # value by keyword so they cannot be swapped.
    attention_output = layers.MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name=f"encoder_{i}/multiheadattention")(query, value=value, key=key)

    attention_output = layers.Dropout(config.DROPOUT, name=f"encoder_{i}/att_dropout")(attention_output)
    # Residual connection around the attention sub-layer.
    attention_output = layers.LayerNormalization(
        epsilon=1e-6, name=f"encoder_{i}/att_layernormalization")(query + attention_output)

    # Feed-forward layer
    ffn = keras.Sequential([
        layers.Dense(config.FF_DIM, activation="relu"),
        layers.Dense(config.EMBED_DIM),
    ], name=f"encoder_{i}/ffn")

    ffn_output = ffn(attention_output)
    ffn_output = layers.Dropout(config.DROPOUT, name=f"encoder_{i}/ffn_dropout")(ffn_output)
    # Residual connection around the feed-forward sub-layer.
    sequence_output = layers.LayerNormalization(
        epsilon=1e-6, name=f"encoder_{i}/ffn_layernormalization")(attention_output + ffn_output)
    return sequence_output


def get_pos_encoding_matrix(max_len, d_emb):
    """Return the fixed sinusoidal position-encoding table.

    Args:
        max_len: number of positions (rows).
        d_emb: embedding width (columns).

    Returns:
        A float array of shape ``(max_len, d_emb)``. Row 0 is all zeros. For
        every other position ``pos``, even column ``j`` holds
        ``sin(pos / 10000 ** (j / d_emb))`` and odd column ``j`` holds
        ``cos(pos / 10000 ** ((j - 1) / d_emb))``.
    """
    positions = np.arange(max_len, dtype=np.float64)[:, np.newaxis]
    # Each (even, odd) column pair shares one frequency exponent.
    exponents = 2 * (np.arange(d_emb) // 2) / d_emb
    angles = positions / np.power(10000, exponents)

    # Starting from zeros keeps row 0 all-zero and also handles max_len == 0.
    pos_enc = np.zeros((max_len, d_emb), dtype=np.float64)
    pos_enc[1:, 0::2] = np.sin(angles[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(angles[1:, 1::2])  # dim 2i+1
    return pos_enc


# Cross-entropy on integer targets. Reduction is disabled, so the loss is not
# averaged here; the per-position sample weights are applied in `train_step`.
loss_fn = keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE
)
# Running mean of the training loss, updated and reported by `train_step`.
loss_tracker = tf.keras.metrics.Mean(name="loss")


class MaskedLanguageModel(tf.keras.Model):
    """Keras model whose training step applies the module-level ``loss_fn`` with sample weights."""

    def train_step(self, inputs):
        """Run one optimization step.

        Args:
            inputs: ``(features, labels, sample_weight)`` or ``(features, labels)``.

        Returns:
            A dict with the running mean loss under the key ``"loss"``.
        """
        has_weights = len(inputs) == 3
        if has_weights:
            features, labels, sample_weight = inputs
        else:
            features, labels = inputs
            sample_weight = None

        # Forward pass recorded for differentiation.
        with tf.GradientTape() as tape:
            token_probs = self(features, training=True)
            weighted_loss = loss_fn(labels, token_probs, sample_weight=sample_weight)

        # Backward pass and weight update.
        variables = self.trainable_variables
        grads = tape.gradient(weighted_loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))

        # Track the loss with the module-level metric.
        loss_tracker.update_state(weighted_loss, sample_weight=sample_weight)

        # Metric name -> current value.
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        """Metrics owned by this model.

        Listing ``loss_tracker`` here lets Keras call ``reset_states()`` on it
        automatically at the start of each epoch or of ``evaluate()``; without
        this property the reset would have to be done by hand.
        """
        return [loss_tracker]


def create_masked_language_bert_model():
    """Build the encoder with a masked-language-model head.

    Returns:
        A ``(mlm_model, model_to_train)`` pair built on the same input and
        output tensors: ``mlm_model`` is a plain ``keras.Model`` and
        ``model_to_train`` is a compiled ``MaskedLanguageModel``.
    """
    token_ids = layers.Input((config.MAX_LEN,), dtype=tf.int32)

    position_ids = tf.range(start=0, limit=config.MAX_LEN, delta=1, dtype=tf.int32)
    token_embedding = layers.Embedding(
        config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding")
    token_vectors = token_embedding(token_ids)
    # Position embeddings start from the sinusoidal table.
    position_embedding = layers.Embedding(
        input_dim=config.MAX_LEN,
        output_dim=config.EMBED_DIM,
        weights=[get_pos_encoding_matrix(config.MAX_LEN, config.EMBED_DIM)],
        name="position_embedding")
    position_vectors = position_embedding(position_ids)
    # Plain `+` is used on purpose: combining the two with
    # `layers.Add(name='combine_embedding')([position_vectors, token_vectors])`
    # was found to misbehave.
    # NOTE(review): presumably because `position_vectors` comes from a plain
    # `tf.range` tensor rather than a Keras input - confirm.
    hidden_states = token_vectors + position_vectors

    # Stack the encoder blocks as self-attention (query = key = value).
    for layer_index in range(config.NUM_LAYERS):
        hidden_states = bert_module(hidden_states, hidden_states, hidden_states, layer_index)

    # Per-position probability distribution over the vocabulary.
    mlm_head = layers.Dense(config.VOCAB_SIZE, name="mlm_cls", activation="softmax")
    token_probs = mlm_head(hidden_states)

    mlm_model = keras.Model(token_ids, token_probs, name='bert_model')
    model_to_train = MaskedLanguageModel(token_ids, token_probs, name="masked_bert_model")

    model_to_train.compile(
        optimizer=keras.optimizers.Adam(learning_rate=config.LR, epsilon=config.EPSILON),
        metrics=[tf.keras.metrics.Mean(name="loss")])
    return mlm_model, model_to_train


# Clear Keras' global state before building, then create both views of the
# network and print their layer summaries.
keras.backend.clear_session()
bert_model, bert_masked_model = create_masked_language_bert_model()
bert_model.summary()
bert_masked_model.summary()

Model: "bert_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 360)]        0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 360, 128)     3840000     input_1[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 360, 128)     0           word_embedding[0][0]             
__________________________________________________________________________________________________
encoder_0/multiheadattention (M (None, 360, 128)     66048       tf.__operators__.add[0][0]       
                                                                 tf.__operators__.add[0][

In [12]:
class MaskedTextGenerator(keras.callbacks.Callback):
    """Callback that prints the model's top guesses for a ``[MASK]`` after each epoch.

    Only the first row of ``sample_tokens`` and the first ``[MASK]`` in that
    row are inspected.

    Args:
        sample_tokens: 2-D integer array of token ids, shape ``(rows, sequence length)``.
        top_k: number of highest-probability candidates to print.
    """

    def __init__(self, sample_tokens, top_k=5):
        # Let the Keras base class initialize its own attributes.
        super().__init__()
        self.sample_tokens = sample_tokens
        self.k = top_k

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, skipping padding positions."""
        pad_id = tokenizer.token_to_id('[PAD]')
        return " ".join(
            tokenizer.id_to_token(int(token_id))
            for token_id in ids
            if token_id != pad_id
        )

    def convert_ids_to_tokens(self, id):
        """Return the token string for a single id."""
        return tokenizer.id_to_token(int(id))

    def on_epoch_end(self, epoch, logs=None):
        """Print the ``top_k`` candidate fillings of the first ``[MASK]`` in the sample."""
        prediction = self.model.predict(self.sample_tokens)

        sample_row = self.sample_tokens[0]
        masked_positions = np.where(sample_row == MASK_TOKEN_ID)[0]
        if masked_positions.size == 0:
            # Nothing to fill in: the sample contains no [MASK] token.
            return
        first_mask = masked_positions[0]

        # Vocabulary probabilities predicted at the masked position.
        mask_prediction = prediction[0][first_mask]
        top_indices = mask_prediction.argsort()[-self.k :][::-1]
        values = mask_prediction[top_indices]

        input_text = self.decode(sample_row)
        for p, v in zip(top_indices, values):
            # Substitute the candidate into a copy of the sample row.
            tokens = np.copy(sample_row)
            tokens[first_mask] = p
            result = {
                "input_text": input_text,
                "prediction": self.decode(tokens),
                "probability": v,
                "predicted mask token": self.convert_ids_to_tokens(p),
            }
            pprint(result)


# Probe sentence containing one [MASK]; the callback prints the model's top
# candidates for it at the end of every epoch.
sample_text = 'rest star cast was simply okay. music and all songs are good, himesh is impressive as [MASK] singer here.'
encoded_sample = tokenizer.encode(sample_text, add_special_tokens=True)

# The callback expects a 2-D array, hence the extra list around the ids.
generator_callback = MaskedTextGenerator(np.array([encoded_sample.ids]))
# Pre-train with the masked-language-model objective.
bert_masked_model.fit(all_ds, epochs=config.EPOCHS, callbacks=[generator_callback])

Epoch 1/10
{'input_text': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as [MASK] singer here .',
 'predicted mask token': 'the',
 'prediction': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as the singer here .',
 'probability': 0.059381038}
{'input_text': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as [MASK] singer here .',
 'predicted mask token': '.',
 'prediction': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as . singer here .',
 'probability': 0.056060143}
{'input_text': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as [MASK] singer here .',
 'predicted mask token': ',',
 'prediction': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as 

{'input_text': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as [MASK] singer here .',
 'predicted mask token': 'the',
 'prediction': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as the singer here .',
 'probability': 0.11944608}
{'input_text': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as [MASK] singer here .',
 'predicted mask token': 'a',
 'prediction': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as a singer here .',
 'probability': 0.07721416}
{'input_text': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as [MASK] singer here .',
 'predicted mask token': 'and',
 'prediction': 'rest star cast was simply okay . music and all songs are good '
               ', himesh is impressive as and singer 

<keras.callbacks.History at 0x7f4870140760>

In [13]:
# Persist the plain Keras model; it was built from the same input and output
# tensors as the model that was just trained.
bert_model.save("bert_mlm_imdb.h5")





## Training Downstream Task

In [24]:
def tokenize_text(entry):
    """Map one dataset entry to ``(token ids, label)`` for the downstream classifier.

    The text is encoded eagerly through ``tf.py_function``; the entry's
    ``'label'`` is passed through unchanged.
    """
    def _encode(text_tensor):
        # Decode the raw bytes, run the tokenizer, and return the ids as int32.
        raw_text = text_tensor.numpy().decode('utf-8')
        encoding = tokenizer.encode(raw_text, add_special_tokens=True)
        return tf.constant(np.array(encoding.ids), tf.int32)

    ids_tensor = tf.py_function(_encode, [entry['text']], Tout=tf.int32)
    return ids_tensor, entry['label']


def show_downstream_task_example(ds):
    """Print the shapes of the first ``(inputs, labels)`` batch and its first decoded input."""
    for token_batch, label_batch in ds:
        first_text = tokenizer.decode(token_batch[0])
        print(token_batch.shape, label_batch.shape, first_text, sep='\n')
        # Only the first batch is shown.
        return


# Downstream (sentiment) pipelines: shuffle -> tokenize -> batch.
# NOTE(review): the test split is shuffled as well; that is not needed for
# evaluation and could be dropped - confirm nothing relies on it.
tokenized_train_ds = train_ds.shuffle(config.BUFFER_SIZE).map(tokenize_text).batch(config.BATCH_SIZE)
tokenized_test_ds = test_ds.shuffle(config.BUFFER_SIZE).map(tokenize_text).batch(config.BATCH_SIZE)
# Show one batch from each pipeline as a sanity check.
show_downstream_task_example(tokenized_train_ds)
show_downstream_task_example(tokenized_test_ds)

(32, 360)
(32,)
is this the movie??? is this what indians are trying to show?? i think this is one more effort from a sick - minded director to turn down pakistani soldiers and in fact country.... but what we pakistani's know that we are always ahead of india in every part of our lives... not only in armed counters. < br / > < br / > well... this is bad filmed as that of border in early 1997... and director and writer just tried to overcome a shame of defeat in kargil by pakistani armed forces, by creating films like these.. < br / > < br / > one thing is very clear... whenever there will be an encounter between pakistan and india.... we will win....!!! so mr. dutta try to make some good movies instead of nonsense movies like this
(32, 360)
(32,)
i went to see the movie because my boyfriend was raving about how much he wanted to see it, and how his friends had already been and loved it. so i came in with a neutral attitude, not really expecting the worst. unfortunately, that is what i 

In [25]:
def create_downstream_classifier_model(pretrained_path="bert_mlm_imdb.h5"):
    """Build a binary classifier on top of the saved pre-trained encoder.

    Args:
        pretrained_path: path of the saved pre-trained model to load.

    Returns:
        A ``(downstream_model, pretrained_model)`` pair. ``downstream_model``
        maps the pre-trained model's input to one sigmoid output and reuses the
        layers of ``pretrained_model``, which is returned so the caller can
        toggle its ``trainable`` flag.
    """
    pretrained_model = keras.models.load_model(pretrained_path)

    # Take the output of the last encoder block (not the MLM head) as features.
    last_encoder_layer = f"encoder_{config.NUM_LAYERS - 1}/ffn_layernormalization"
    sequence_output = pretrained_model.get_layer(last_encoder_layer).output

    # Collapse the sequence axis, then classify.
    pooled_output = layers.GlobalMaxPooling1D()(sequence_output)
    hidden_layer = layers.Dense(64, activation="relu")(pooled_output)
    outputs = layers.Dense(1, activation="sigmoid")(hidden_layer)

    downstream_model = tf.keras.Model(
        pretrained_model.input,
        outputs,
        name='downstream_classifier_model')
    return downstream_model, pretrained_model


# Clear Keras' global state, load the pre-trained model from disk and build
# the classifier on top of it.
keras.backend.clear_session()
downstream_model, pretrained = create_downstream_classifier_model()
downstream_model.summary()





Model: "downstream_classifier_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 360)]        0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 360, 128)     3840000     input_1[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 360, 128)     0           word_embedding[0][0]             
__________________________________________________________________________________________________
encoder_0/multiheadattention (M (None, 360, 128)     66048       tf.__operators__.add[0][0]       
                                                                 tf.__op

In [26]:
EPOCHS = 5


def _run_training_stage(train_pretrained):
    """Set the pre-trained model's ``trainable`` flag, recompile and fit for ``EPOCHS`` epochs."""
    pretrained.trainable = train_pretrained
    # Recompile after changing `trainable`, with a fresh optimizer.
    downstream_model.compile(
        optimizer=keras.optimizers.Adam(),
        loss="binary_crossentropy",
        metrics=["accuracy"])
    return downstream_model.fit(
        tokenized_train_ds,
        epochs=EPOCHS,
        validation_data=tokenized_test_ds)


# Train the classifier with frozen BERT stage
_run_training_stage(False)

# Unfreeze the BERT model for fine-tuning
_run_training_stage(True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f47b40c5280>