<a href="https://github.com/abheesht17/keras-nlp/blob/bert-base-chinese/tools/checkpoint_conversion/bert_base_multi_cased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install deps

In [1]:
!pip install git+https://github.com/abheesht17/keras-nlp.git@bert-base-chinese tensorflow tf-models-official --upgrade --quiet

[K     |████████████████████████████████| 511.7 MB 6.3 kB/s 
[K     |████████████████████████████████| 2.1 MB 42.7 MB/s 
[K     |████████████████████████████████| 4.6 MB 49.4 MB/s 
[K     |████████████████████████████████| 438 kB 69.8 MB/s 
[K     |████████████████████████████████| 5.8 MB 48.8 MB/s 
[K     |████████████████████████████████| 1.6 MB 57.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 72.3 MB/s 
[K     |████████████████████████████████| 43 kB 2.2 MB/s 
[K     |████████████████████████████████| 99 kB 10.6 MB/s 
[K     |████████████████████████████████| 116 kB 63.0 MB/s 
[K     |████████████████████████████████| 1.3 MB 64.7 MB/s 
[K     |████████████████████████████████| 352 kB 63.4 MB/s 
[K     |████████████████████████████████| 238 kB 100.5 MB/s 
[K     |████████████████████████████████| 636 kB 73.7 MB/s 
[?25h  Building wheel for keras-nlp (setup.py) ... [?25l[?25hdone
  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building whe

In [2]:
import json
import os

import keras_nlp
import tensorflow as tf
import tensorflow_models as tfm
from tensorflow import keras

In [3]:
MODEL_TYPE = "bert_base"
MODEL_SUFFIX = "multi_cased"
MODEL_NAME = f"{MODEL_TYPE}_{MODEL_SUFFIX}"
VOCAB_SIZE = 119547

## Load the model garden checkpoints and weights

In [4]:
# Model garden BERT paths.
zip_path = f"""https://storage.googleapis.com/tf_model_garden/nlp/bert/v3/{MODEL_SUFFIX}_L-12_H-768_A-12.tar.gz"""
zip_file = keras.utils.get_file(
    f"""/content/{MODEL_NAME}""",
    zip_path,
    extract=True,
    archive_format="tar",
)

Downloading data from https://storage.googleapis.com/tf_model_garden/nlp/bert/v3/multi_cased_L-12_H-768_A-12.tar.gz


In [5]:
!tar -xvf """{MODEL_NAME}"""

tmp/temp_dir/raw/
tmp/temp_dir/raw/vocab.txt
tmp/temp_dir/raw/bert_model.ckpt.index
tmp/temp_dir/raw/bert_model.ckpt.data-00000-of-00001
tmp/temp_dir/raw/bert_config.json


In [6]:
# Model garden BERT paths.
extract_dir = "/content/tmp/temp_dir/raw/"
vocab_path = os.path.join(extract_dir, "vocab.txt")
checkpoint_path = os.path.join(extract_dir, "bert_model.ckpt")
config_path = os.path.join(extract_dir, "bert_config.json")

In [7]:
vars = tf.train.list_variables(checkpoint_path)
weights = {}
for name, shape in vars:
    print(name, shape)
    weight = tf.train.load_variable(checkpoint_path, name)
    weights[name] = weight

_CHECKPOINTABLE_OBJECT_GRAPH []
encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE [119547, 768]
encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE [512, 768]
encoder/layer_with_weights-10/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE [12, 64]
encoder/layer_with_weights-10/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE [768, 12, 64]
encoder/layer_with_weights-10/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE [768]
encoder/layer_with_weights-10/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE [12, 64, 768]
encoder/layer_with_weights-10/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE [12, 64]
encoder/layer_with_weights-10/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE [768, 12, 64]
encoder/layer_with_weights-10/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE [12, 64]
encoder/layer_with_weights-10/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABL

## Load BertBase model with KerasNLP.

In [8]:
model = keras_nlp.models.BertBase(vocabulary_size=VOCAB_SIZE)
model.summary()

Model: "bert_custom"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_ids (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 token_embedding (Embedding)    (None, None, 768)    91812096    ['token_ids[0][0]']              
                                                                                                  
 segment_ids (InputLayer)       [(None, None)]       0           []                               
                                                                                                  
 position_embedding (PositionEm  (None, None, 768)   393216      ['token_embedding[0][0]']        
 bedding)                                                                               

## Convert Weights

In [9]:
model.get_layer("token_embedding").embeddings.assign(
    weights[
        "encoder/layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE"
    ]
)
model.get_layer("position_embedding").position_embeddings.assign(
    weights[
        "encoder/layer_with_weights-1/embeddings/.ATTRIBUTES/VARIABLE_VALUE"
    ]
)
model.get_layer("segment_embedding").embeddings.assign(
    weights[
        "encoder/layer_with_weights-2/embeddings/.ATTRIBUTES/VARIABLE_VALUE"
    ]
)
model.get_layer("embeddings_layer_norm").gamma.assign(
    weights["encoder/layer_with_weights-3/gamma/.ATTRIBUTES/VARIABLE_VALUE"]
)
model.get_layer("embeddings_layer_norm").beta.assign(
    weights["encoder/layer_with_weights-3/beta/.ATTRIBUTES/VARIABLE_VALUE"]
)

for i in range(model.num_layers):
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._key_dense.kernel.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._key_dense.bias.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_attention_layer/_key_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._query_dense.kernel.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._query_dense.bias.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_attention_layer/_query_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._value_dense.kernel.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._value_dense.bias.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_attention_layer/_value_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._output_dense.kernel.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layer._output_dense.bias.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_attention_layer/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layernorm.gamma.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._self_attention_layernorm.beta.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_attention_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_intermediate_dense.kernel.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_intermediate_dense.bias.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_intermediate_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_output_dense.kernel.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_output_dense/kernel/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_output_dense.bias.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_output_dense/bias/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_layernorm.gamma.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )
    model.get_layer(
        f"transformer_layer_{i}"
    )._feedforward_layernorm.beta.assign(
        weights[
            f"encoder/layer_with_weights-{i + 4}/_output_layer_norm/beta/.ATTRIBUTES/VARIABLE_VALUE"
        ]
    )

model.get_layer("pooled_dense").kernel.assign(
    weights[
        f"encoder/layer_with_weights-{model.num_layers + 4}/kernel/.ATTRIBUTES/VARIABLE_VALUE"
    ]
)
model.get_layer("pooled_dense").bias.assign(
    weights[
        f"encoder/layer_with_weights-{model.num_layers + 4}/bias/.ATTRIBUTES/VARIABLE_VALUE"
    ]
)
pass

## Compare Output

In [10]:
def preprocess(x):
    tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
        vocabulary=vocab_path, lowercase=False
    )
    packer = keras_nlp.layers.MultiSegmentPacker(
        sequence_length=model.max_sequence_length,
        start_value=tokenizer.token_to_id("[CLS]"),
        end_value=tokenizer.token_to_id("[SEP]"),
    )
    return packer(tokenizer(x))


token_ids, segment_ids = preprocess(["The झटपट brown लोमड़ी."])

In [11]:
encoder_config = tfm.nlp.encoders.EncoderConfig(
    type="bert",
    bert=json.load(tf.io.gfile.GFile(config_path)),
)
mg_model = tfm.nlp.encoders.build_encoder(encoder_config)
checkpoint = tf.train.Checkpoint(encoder=mg_model)
checkpoint.read(checkpoint_path).assert_consumed()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f8f003c50d0>

In [12]:
keras_nlp_output = model(
    {
        "token_ids": token_ids,
        "segment_ids": segment_ids,
        "padding_mask": token_ids != 0,
    }
)["pooled_output"]

mg_output = mg_model(
    {
        "input_word_ids": token_ids,
        "input_type_ids": segment_ids,
        "input_mask": token_ids != 0,
    }
)["pooled_output"]

In [13]:
keras_nlp_output[0, :10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 0.23481919, -0.09908431,  0.191012  , -0.09323437,  0.0487478 ,
        0.2899589 ,  0.08575539,  0.19417824, -0.25255474,  0.22692412],
      dtype=float32)>

In [14]:
mg_output[0, :10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 0.23481807, -0.09908444,  0.19100997, -0.09323332,  0.04874839,
        0.28995794,  0.08575849,  0.19417666, -0.2525542 ,  0.22692516],
      dtype=float32)>

In [15]:
# Very close! Though not 100% exact.
tf.reduce_mean(keras_nlp_output - mg_output)

<tf.Tensor: shape=(), dtype=float32, numpy=8.3201634e-08>

In [16]:
# Save BertBase checkpoint
model.save_weights(f"""{MODEL_NAME}.h5""")

In [17]:
model2 = keras_nlp.models.BertBase(vocabulary_size=VOCAB_SIZE)
model2.load_weights(f"""{MODEL_NAME}.h5""")

In [18]:
# Same output from loaded checkpoint
keras_nlp_output2 = model2(
    {
        "token_ids": token_ids,
        "segment_ids": segment_ids,
        "padding_mask": token_ids != 0,
    }
)["pooled_output"]
tf.reduce_mean(keras_nlp_output - keras_nlp_output2)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [19]:
# Save vocab file as well
vocab_info = tf.io.gfile.GFile(vocab_path).read()
f = open("vocab.txt", "w")
f.write(vocab_info)

764415

In [None]:
# Get MD5 of model
!md5sum """{MODEL_NAME}.h5"""

b0631cec0a1f2513c6cfd75ba29c33aa  bert_base_multi_cased.h5


In [None]:
# Upload model to drive
# from google.colab import drive
# drive.mount('/content/drive')

In [20]:
# Check uploaded model once added to repo
model_cloud = keras_nlp.models.BertBase(weights=MODEL_SUFFIX)

Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_base_multi_cased/model.h5


In [21]:
# Same output from cloud model
keras_nlp_output_cloud = model_cloud(
    {
        "token_ids": token_ids,
        "segment_ids": segment_ids,
        "padding_mask": token_ids != 0,
    }
)["pooled_output"]
tf.reduce_mean(keras_nlp_output - keras_nlp_output_cloud)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [22]:
keras_nlp_output_cloud[0, :10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 0.23481919, -0.09908431,  0.191012  , -0.09323437,  0.0487478 ,
        0.2899589 ,  0.08575539,  0.19417824, -0.25255474,  0.22692412],
      dtype=float32)>