In [1]:
!pip install transformers --quiet
!pip install git+https://github.com/abheesht17/keras-nlp.git@distilbert-presets tensorflow tensorflow-text --upgrade --quiet

[K     |████████████████████████████████| 5.5 MB 5.3 MB/s 
[K     |████████████████████████████████| 182 kB 55.4 MB/s 
[K     |████████████████████████████████| 7.6 MB 50.1 MB/s 
[K     |████████████████████████████████| 578.1 MB 26 kB/s 
[K     |████████████████████████████████| 5.9 MB 43.8 MB/s 
[K     |████████████████████████████████| 5.9 MB 46.3 MB/s 
[K     |████████████████████████████████| 438 kB 59.4 MB/s 
[K     |████████████████████████████████| 1.7 MB 56.3 MB/s 
[?25h  Building wheel for keras-nlp (setup.py) ... [?25l[?25hdone


In [2]:
import json
import os

import keras_nlp
import numpy as np

import tensorflow as tf
from tensorflow import keras

import torch

from transformers import AutoTokenizer, AutoModel

tf.__version__

'2.10.1'

In [3]:
# Variables.

variant = "distilbert-base-uncased"
variant_ = variant.replace("-", "_")
model_save_name = f"{variant}_en.h5".replace("-", "_")

# Paths.
extract_dir = f"/content/pt_{variant}"
vocab_path = os.path.join(extract_dir, "vocab.txt")
config_path = os.path.join(extract_dir, "config.json")

In [4]:
!mkdir {extract_dir}
!wget https://huggingface.co/{variant}/raw/main/config.json -P {extract_dir}
!wget https://huggingface.co/{variant}/raw/main/vocab.txt -P {extract_dir}

--2022-11-17 04:01:22--  https://huggingface.co/distilbert-base-uncased/raw/main/config.json
Resolving huggingface.co (huggingface.co)... 34.200.207.32, 34.227.196.80, 2600:1f18:147f:e800:7a81:37eb:4345:da2, ...
Connecting to huggingface.co (huggingface.co)|34.200.207.32|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 483 [text/plain]
Saving to: ‘/content/pt_distilbert-base-uncased/config.json’


2022-11-17 04:01:22 (76.2 MB/s) - ‘/content/pt_distilbert-base-uncased/config.json’ saved [483/483]

--2022-11-17 04:01:23--  https://huggingface.co/distilbert-base-uncased/raw/main/vocab.txt
Resolving huggingface.co (huggingface.co)... 34.200.207.32, 34.227.196.80, 2600:1f18:147f:e800:7a81:37eb:4345:da2, ...
Connecting to huggingface.co (huggingface.co)|34.200.207.32|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 231508 (226K) [text/plain]
Saving to: ‘/content/pt_distilbert-base-uncased/vocab.txt’


2022-11-17 04:01:23 (2.02 MB/s) - ‘/conte

In [5]:
# Build config.

cfg = {}

with open(config_path, "r") as pt_cfg_handler:
    pt_cfg = json.load(pt_cfg_handler)


cfg["vocabulary_size"] = pt_cfg["vocab_size"]
cfg["num_layers"] = pt_cfg["n_layers"]
cfg["num_heads"] = pt_cfg["n_heads"]
cfg["hidden_dim"] = pt_cfg["dim"]
cfg["intermediate_dim"] = pt_cfg["hidden_dim"]
cfg["dropout"] = pt_cfg["dropout"]
cfg["max_sequence_length"] = pt_cfg["max_position_embeddings"]

cfg

{'vocabulary_size': 30522,
 'num_layers': 6,
 'num_heads': 12,
 'hidden_dim': 768,
 'intermediate_dim': 3072,
 'dropout': 0.1,
 'max_sequence_length': 512}

In [6]:
model = keras_nlp.models.DistilBert.from_preset(f"{variant_}_en", load_weights=False)
model.summary()

Model: "backbone"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_ids (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 token_and_position_embedding (  (None, None, 768)   23834112    ['token_ids[0][0]']              
 TokenAndPositionEmbedding)                                                                       
                                                                                                  
 embeddings_layer_norm (LayerNo  (None, None, 768)   1536        ['token_and_position_embedding[0]
 rmalization)                                                    [0]']                            
                                                                                           

In [7]:
pt_model = AutoModel.from_pretrained(variant)
pt_wts = pt_model.state_dict()

pt_model

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [8]:
print(str(pt_wts.keys()).replace(", ", "\n").replace("odict_keys([", "").replace("]", "").replace(")",""))

'embeddings.word_embeddings.weight'
'embeddings.position_embeddings.weight'
'embeddings.LayerNorm.weight'
'embeddings.LayerNorm.bias'
'transformer.layer.0.attention.q_lin.weight'
'transformer.layer.0.attention.q_lin.bias'
'transformer.layer.0.attention.k_lin.weight'
'transformer.layer.0.attention.k_lin.bias'
'transformer.layer.0.attention.v_lin.weight'
'transformer.layer.0.attention.v_lin.bias'
'transformer.layer.0.attention.out_lin.weight'
'transformer.layer.0.attention.out_lin.bias'
'transformer.layer.0.sa_layer_norm.weight'
'transformer.layer.0.sa_layer_norm.bias'
'transformer.layer.0.ffn.lin1.weight'
'transformer.layer.0.ffn.lin1.bias'
'transformer.layer.0.ffn.lin2.weight'
'transformer.layer.0.ffn.lin2.bias'
'transformer.layer.0.output_layer_norm.weight'
'transformer.layer.0.output_layer_norm.bias'
'transformer.layer.1.attention.q_lin.weight'
'transformer.layer.1.attention.q_lin.bias'
'transformer.layer.1.attention.k_lin.weight'
'transformer.layer.1.attention.k_lin.bias'
'transform

In [9]:
model.get_layer("token_and_position_embedding").token_embedding.embeddings.assign(
    pt_wts["embeddings.word_embeddings.weight"]
)
model.get_layer("token_and_position_embedding").position_embedding.position_embeddings.assign(
    pt_wts["embeddings.position_embeddings.weight"]
)

model.get_layer("embeddings_layer_norm").gamma.assign(
    pt_wts["embeddings.LayerNorm.weight"]
)
model.get_layer("embeddings_layer_norm").beta.assign(
    pt_wts["embeddings.LayerNorm.bias"]
)

for i in range(model.num_layers):
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._query_dense.kernel.assign(
        pt_wts[f"transformer.layer.{i}.attention.q_lin.weight"].transpose(1, 0).reshape((cfg["hidden_dim"], cfg["num_heads"], -1)).numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._query_dense.bias.assign(
        pt_wts[f"transformer.layer.{i}.attention.q_lin.bias"].reshape((cfg["num_heads"], -1)).numpy()
    )

    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._key_dense.kernel.assign(
        pt_wts[f"transformer.layer.{i}.attention.k_lin.weight"].transpose(1, 0).reshape((cfg["hidden_dim"], cfg["num_heads"], -1)).numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._key_dense.bias.assign(
        pt_wts[f"transformer.layer.{i}.attention.k_lin.bias"].reshape((cfg["num_heads"], -1)).numpy()
    )

    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._value_dense.kernel.assign(
        pt_wts[f"transformer.layer.{i}.attention.v_lin.weight"].transpose(1, 0).reshape((cfg["hidden_dim"], cfg["num_heads"], -1)).numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._value_dense.bias.assign(
        pt_wts[f"transformer.layer.{i}.attention.v_lin.bias"].reshape((cfg["num_heads"], -1)).numpy()
    )


    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._output_dense.kernel.assign(
        pt_wts[f"transformer.layer.{i}.attention.out_lin.weight"].transpose(1, 0).reshape((cfg["num_heads"], -1, cfg["hidden_dim"])).numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._output_dense.bias.assign(
         pt_wts[f"transformer.layer.{i}.attention.out_lin.bias"].numpy()
    )

    model.get_layer(f"transformer_layer_{i}")._self_attention_layernorm.gamma.assign(
        pt_wts[f"transformer.layer.{i}.sa_layer_norm.weight"].numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layernorm.beta.assign(
        pt_wts[f"transformer.layer.{i}.sa_layer_norm.bias"].numpy()
    )

    model.get_layer(f"transformer_layer_{i}")._feedforward_intermediate_dense.kernel.assign(
        pt_wts[f"transformer.layer.{i}.ffn.lin1.weight"].transpose(1, 0).numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_intermediate_dense.bias.assign(
        pt_wts[f"transformer.layer.{i}.ffn.lin1.bias"].numpy()
    )

    model.get_layer(f"transformer_layer_{i}")._feedforward_output_dense.kernel.assign(
        pt_wts[f"transformer.layer.{i}.ffn.lin2.weight"].transpose(1, 0).numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_output_dense.bias.assign(
        pt_wts[f"transformer.layer.{i}.ffn.lin2.bias"].numpy()
    )

    model.get_layer(f"transformer_layer_{i}")._feedforward_layernorm.gamma.assign(
        pt_wts[f"transformer.layer.{i}.output_layer_norm.weight"].numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_layernorm.beta.assign(
        pt_wts[f"transformer.layer.{i}.output_layer_norm.bias"].numpy()
    )


In [10]:
# Instantiate KerasNLP `DistilBertPreprocessor`.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    f"{variant_}_en"
)

Downloading data from https://storage.googleapis.com/keras-nlp/models/distilbert_base_uncased_en/vocab.txt


In [11]:
# Instantiate the HF model and tokenizer.

hf_model = AutoModel.from_pretrained(variant)
hf_model.eval()

hf_tokenizer = AutoTokenizer.from_pretrained(variant)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
sample_text = ["cricket is awesome, easily the best sport in the world!"]
sample_input = preprocessor(tf.constant(sample_text))
hf_sample_input = hf_tokenizer(sample_text, padding="max_length", return_tensors="pt")

sample_input["token_ids"][:, :20], hf_sample_input["input_ids"][:, :20]

(<tf.Tensor: shape=(1, 20), dtype=int32, numpy=
 array([[  101,  4533,  2003, 12476,  1010,  4089,  1996,  2190,  4368,
          1999,  1996,  2088,   999,   102,     0,     0,     0,     0,
             0,     0]], dtype=int32)>,
 tensor([[  101,  4533,  2003, 12476,  1010,  4089,  1996,  2190,  4368,  1999,
           1996,  2088,   999,   102,     0,     0,     0,     0,     0,     0]]))

In [13]:
keras_nlp_output = model.predict(sample_input)
hf_output = hf_model(**hf_sample_input)

keras_nlp_output[0, 0, :10], hf_output.last_hidden_state[0, 0, :10]



(array([ 0.02147477, -0.15703455,  0.0651448 , -0.09119402, -0.31850424,
        -0.61443514,  0.22434759,  0.8855009 , -0.21813184, -0.14228325],
       dtype=float32),
 tensor([ 0.0215, -0.1570,  0.0651, -0.0912, -0.3185, -0.6144,  0.2243,  0.8855,
         -0.2181, -0.1423], grad_fn=<SliceBackward0>))

In [14]:
np.mean(keras_nlp_output - hf_output.last_hidden_state.detach().numpy())

-4.0443325e-08

In [15]:
# Save DistilBERT checkpoint.
model.save_weights(model_save_name)

In [None]:
!cp $vocab_path ./
!md5sum vocab.txt

64800d5d8528ce344256daf115d4965e  vocab.txt


In [None]:
!md5sum $model_save_name

6625a649572e74086d74c46b8d0b0da3  distilbert_base_uncased_en.h5


In [16]:
model2 = keras_nlp.models.DistilBert.from_preset(f"{variant_}_en", load_weights=False)
model2.load_weights(model_save_name)

In [17]:
keras_nlp_output2 = model2.predict(sample_input)
tf.reduce_mean(keras_nlp_output - keras_nlp_output2)



<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [18]:
# Check uploaded model once added to repo
model_cloud = keras_nlp.models.DistilBert.from_preset(
    f"{variant_}_en", load_weights=True
)

Downloading data from https://storage.googleapis.com/keras-nlp/models/distilbert_base_uncased_en/model.h5


In [19]:
# Same output from cloud model
keras_nlp_output_cloud = model_cloud.predict(sample_input)
tf.reduce_mean(keras_nlp_output - keras_nlp_output_cloud)



<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [20]:
keras_nlp_output_cloud[0, 0, :10]

array([ 0.02147477, -0.15703455,  0.0651448 , -0.09119402, -0.31850424,
       -0.61443514,  0.22434759,  0.8855009 , -0.21813184, -0.14228325],
      dtype=float32)