<a href="https://github.com/abheesht17/keras-nlp/blob/bert_large_vars/tools/checkpoint_conversion/bert_large_en_cased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install deps

In [1]:
!pip install git+https://github.com/abheesht17/keras-nlp.git@more-bert-variants tensorflow tf-models-official tensorflow_hub --upgrade --quiet

[K     |████████████████████████████████| 511.7 MB 6.8 kB/s 
[K     |████████████████████████████████| 2.1 MB 48.2 MB/s 
[K     |████████████████████████████████| 4.6 MB 48.8 MB/s 
[K     |████████████████████████████████| 5.8 MB 52.6 MB/s 
[K     |████████████████████████████████| 438 kB 64.8 MB/s 
[K     |████████████████████████████████| 1.6 MB 56.7 MB/s 
[K     |████████████████████████████████| 238 kB 68.5 MB/s 
[K     |████████████████████████████████| 352 kB 38.1 MB/s 
[K     |████████████████████████████████| 116 kB 74.8 MB/s 
[K     |████████████████████████████████| 99 kB 10.1 MB/s 
[K     |████████████████████████████████| 43 kB 2.2 MB/s 
[K     |████████████████████████████████| 1.3 MB 50.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 56.0 MB/s 
[K     |████████████████████████████████| 636 kB 69.5 MB/s 
[?25h  Building wheel for keras-nlp (setup.py) ... [?25l[?25hdone
  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building whee

In [2]:
import json
import os

import keras_nlp
import tensorflow as tf
from tensorflow import keras

import tensorflow_hub as hub

In [3]:
MODEL_TYPE = "bert_small"
MODEL_SUFFIX = "uncased"
MODEL_SPEC_STR = "L-4_H-512_A-8"
MODEL_NAME = f"{MODEL_TYPE}_{MODEL_SUFFIX}"
VOCAB_SIZE = 30522
NUM_LAYERS = 4
NUM_ATTN_HEADS = 8
EMBEDDING_SIZE = 512

In [4]:
# BERT ckpt https://github.com/google-research/bert/blob/master/README.md.
zip_path = f"""https://storage.googleapis.com/bert_models/2020_02_20/{MODEL_SUFFIX}_{MODEL_SPEC_STR}.zip"""
zip_file = keras.utils.get_file(
    f"""/content/{MODEL_NAME}""",
    zip_path,
    extract=True,
    archive_format="zip",
)

Downloading data from https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-4_H-512_A-8.zip


In [5]:
!unzip """{MODEL_NAME}""" -d """{MODEL_SUFFIX}_{MODEL_SPEC_STR}"""

Archive:  bert_small_uncased
  inflating: uncased_L-4_H-512_A-8/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-4_H-512_A-8/bert_config.json  
  inflating: uncased_L-4_H-512_A-8/vocab.txt  
  inflating: uncased_L-4_H-512_A-8/bert_model.ckpt.index  


In [6]:
# BERT paths.
extract_dir = f"/content/{MODEL_SUFFIX}_{MODEL_SPEC_STR}"
vocab_path = os.path.join(extract_dir, "vocab.txt")
checkpoint_path = os.path.join(extract_dir, "bert_model.ckpt")
config_path = os.path.join(extract_dir, "bert_config.json")

In [7]:
vars = tf.train.list_variables(checkpoint_path)
weights = {}
for name, shape in vars:
    print(name, shape)
    weight = tf.train.load_variable(checkpoint_path, name)
    weights[name] = weight

bert/embeddings/LayerNorm/beta [512]
bert/embeddings/LayerNorm/gamma [512]
bert/embeddings/position_embeddings [512, 512]
bert/embeddings/token_type_embeddings [2, 512]
bert/embeddings/word_embeddings [30522, 512]
bert/encoder/layer_0/attention/output/LayerNorm/beta [512]
bert/encoder/layer_0/attention/output/LayerNorm/gamma [512]
bert/encoder/layer_0/attention/output/dense/bias [512]
bert/encoder/layer_0/attention/output/dense/kernel [512, 512]
bert/encoder/layer_0/attention/self/key/bias [512]
bert/encoder/layer_0/attention/self/key/kernel [512, 512]
bert/encoder/layer_0/attention/self/query/bias [512]
bert/encoder/layer_0/attention/self/query/kernel [512, 512]
bert/encoder/layer_0/attention/self/value/bias [512]
bert/encoder/layer_0/attention/self/value/kernel [512, 512]
bert/encoder/layer_0/intermediate/dense/bias [2048]
bert/encoder/layer_0/intermediate/dense/kernel [512, 2048]
bert/encoder/layer_0/output/LayerNorm/beta [512]
bert/encoder/layer_0/output/LayerNorm/gamma [512]
bert/

## Load BertSmall model with KerasNLP.

In [8]:
model = keras_nlp.models.BertSmall(vocabulary_size=VOCAB_SIZE)
model.summary()

Model: "bert_custom"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_ids (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 token_embedding (Embedding)    (None, None, 512)    15627264    ['token_ids[0][0]']              
                                                                                                  
 segment_ids (InputLayer)       [(None, None)]       0           []                               
                                                                                                  
 position_embedding (PositionEm  (None, None, 512)   262144      ['token_embedding[0][0]']        
 bedding)                                                                               

## Convert Weights

In [9]:
model.get_layer("token_embedding").embeddings.assign(
    weights["bert/embeddings/word_embeddings"]
)
model.get_layer("position_embedding").position_embeddings.assign(
    weights["bert/embeddings/position_embeddings"]
)
model.get_layer("segment_embedding").embeddings.assign(
    weights["bert/embeddings/token_type_embeddings"]
)
model.get_layer("embeddings_layer_norm").gamma.assign(
    weights["bert/embeddings/LayerNorm/gamma"]
)
model.get_layer("embeddings_layer_norm").beta.assign(
    weights["bert/embeddings/LayerNorm/beta"]
)

for i in range(model.num_layers):
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._key_dense.kernel.assign(
        weights[f"bert/encoder/layer_{i}/attention/self/key/kernel"].reshape(
            (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1)
        )
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._key_dense.bias.assign(
        weights[f"bert/encoder/layer_{i}/attention/self/key/bias"].reshape(
            (NUM_ATTN_HEADS, -1)
        )
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._query_dense.kernel.assign(
        weights[f"bert/encoder/layer_{i}/attention/self/query/kernel"].reshape(
            (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1)
        )
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._query_dense.bias.assign(
        weights[f"bert/encoder/layer_{i}/attention/self/query/bias"].reshape(
            (NUM_ATTN_HEADS, -1)
        )
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._value_dense.kernel.assign(
        weights[f"bert/encoder/layer_{i}/attention/self/value/kernel"].reshape(
            (EMBEDDING_SIZE, NUM_ATTN_HEADS, -1)
        )
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._value_dense.bias.assign(
        weights[f"bert/encoder/layer_{i}/attention/self/value/bias"].reshape(
            (NUM_ATTN_HEADS, -1)
        )
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._output_dense.kernel.assign(
        weights[
            f"bert/encoder/layer_{i}/attention/output/dense/kernel"
        ].reshape((NUM_ATTN_HEADS, -1, EMBEDDING_SIZE))
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._output_dense.bias.assign(
        weights[f"bert/encoder/layer_{i}/attention/output/dense/bias"]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layernorm.gamma.assign(
        weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma"]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layernorm.beta.assign(
        weights[f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta"]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_intermediate_dense.kernel.assign(
        weights[f"bert/encoder/layer_{i}/intermediate/dense/kernel"]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_intermediate_dense.bias.assign(
        weights[f"bert/encoder/layer_{i}/intermediate/dense/bias"]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_output_dense.kernel.assign(
        weights[f"bert/encoder/layer_{i}/output/dense/kernel"]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_output_dense.bias.assign(
        weights[f"bert/encoder/layer_{i}/output/dense/bias"]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_layernorm.gamma.assign(
        weights[f"bert/encoder/layer_{i}/output/LayerNorm/gamma"]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_layernorm.beta.assign(
        weights[f"bert/encoder/layer_{i}/output/LayerNorm/beta"]
    )

model.get_layer("pooled_dense").kernel.assign(
    weights["bert/pooler/dense/kernel"]
)
model.get_layer("pooled_dense").bias.assign(weights["bert/pooler/dense/bias"])
pass

## Load Bert Small from TF-Hub.

These weights have been ratified by the authors of BERT: https://github.com/google-research/bert/blob/master/README.md.

### BERT README statement:

"***** New February 7th, 2019: TfHub Module *****
BERT has been uploaded to TensorFlow Hub. See run_classifier_with_tfhub.py for an example of how to use the TF Hub module, or run an example in the browser on Colab."

In [10]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

preprocessor = hub.load(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
tokenizer = hub.KerasLayer(preprocessor.tokenize, name="tokenizer")
tokenized_text = tokenizer(text_input)

packer = hub.KerasLayer(
    preprocessor.bert_pack_inputs, arguments=dict(seq_length=512), name="packer"
)
encoder_inputs = packer([tokenized_text])

encoder = hub.KerasLayer(
    f"https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_{MODEL_SPEC_STR}/2",
    trainable=True,
)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]  # [batch_size, 1024].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 1024].

embedding_model = tf.keras.Model(text_input, (pooled_output, sequence_output))

In [11]:
def preprocess(x):
    tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
        vocabulary=vocab_path, lowercase=False
    )
    packer = keras_nlp.layers.MultiSegmentPacker(
        sequence_length=model.max_sequence_length,
        start_value=tokenizer.token_to_id("[CLS]"),
        end_value=tokenizer.token_to_id("[SEP]"),
    )
    return packer(tokenizer(x))


token_ids, segment_ids = preprocess(["the quick brown fox."])

In [12]:
keras_nlp_output = model(
    {
        "token_ids": token_ids,
        "segment_ids": segment_ids,
        "padding_mask": token_ids != 0,
    }
)

hub_pooled_output, hub_sequence_output = embedding_model(
    tf.constant(["the quick brown fox."])
)

In [13]:
keras_nlp_output["pooled_output"][0, :10], hub_pooled_output[0, :10]

(<tf.Tensor: shape=(10,), dtype=float32, numpy=
 array([ 0.99953026,  0.7659703 ,  0.00867063,  0.4091944 , -0.48858136,
         0.8769211 ,  0.9966225 , -0.98421854, -0.4603666 , -0.75314736],
       dtype=float32)>, <tf.Tensor: shape=(10,), dtype=float32, numpy=
 array([ 0.9995303 ,  0.7659644 ,  0.00867085,  0.40919945, -0.48858863,
         0.8769245 ,  0.9966227 , -0.9842189 , -0.4603672 , -0.75314724],
       dtype=float32)>)

In [14]:
# Very close! Though not 100% exact.
(
    tf.reduce_mean(keras_nlp_output["pooled_output"] - hub_pooled_output),
    tf.reduce_mean(keras_nlp_output["sequence_output"] - hub_sequence_output),
)

(<tf.Tensor: shape=(), dtype=float32, numpy=-5.848767e-08>,
 <tf.Tensor: shape=(), dtype=float32, numpy=-6.5012145e-08>)

In [15]:
# Save BertSmall checkpoint
model.save_weights(f"""{MODEL_NAME}.h5""")

In [16]:
model2 = keras_nlp.models.BertSmall(vocabulary_size=VOCAB_SIZE)
model2.load_weights(f"""{MODEL_NAME}.h5""")

In [17]:
# Same output from loaded checkpoint
keras_nlp_output2 = model2(
    {
        "token_ids": token_ids,
        "segment_ids": segment_ids,
        "padding_mask": token_ids != 0,
    }
)

(
    tf.reduce_mean(
        keras_nlp_output["pooled_output"] - keras_nlp_output2["pooled_output"]
    ),
    tf.reduce_mean(
        keras_nlp_output["sequence_output"]
        - keras_nlp_output2["sequence_output"]
    ),
)

(<tf.Tensor: shape=(), dtype=float32, numpy=0.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0>)

In [18]:
# Save vocab file as well
vocab_info = tf.io.gfile.GFile(vocab_path).read()
f = open("vocab.txt", "w")
f.write(vocab_info)

228209

In [19]:
# Get MD5 of model
!md5sum """{MODEL_NAME}.h5"""

08632c9479b034f342ba2c2b7afba5f7  bert_small_uncased.h5


In [None]:
# Upload model to drive
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Check uploaded model once added to repo
model_cloud = keras_nlp.models.BertSmall(weights="uncased_en")

Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_large_en_cased/model.h5


In [None]:
# Same output from cloud model
keras_nlp_output_cloud = model_cloud(
    {
        "token_ids": token_ids,
        "segment_ids": segment_ids,
        "padding_mask": token_ids != 0,
    }
)["pooled_output"]
tf.reduce_mean(keras_nlp_output["pooled_output"] - keras_nlp_output_cloud)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [None]:
keras_nlp_output_cloud[0, :10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 0.97578144,  0.9996469 ,  0.9997959 , -0.94946283,  0.99925387,
        0.9986442 , -0.9969186 , -0.9611691 ,  0.99938154,  0.9999203 ],
      dtype=float32)>