In [1]:
!pip install transformers --quiet
!pip install tensorflow==2.10 tensorflow-text==2.10 git+https://github.com/abheesht17/keras-nlp.git@xlm-roberta-presets

[K     |████████████████████████████████| 5.5 MB 29.6 MB/s 
[K     |████████████████████████████████| 182 kB 76.6 MB/s 
[K     |████████████████████████████████| 7.6 MB 49.4 MB/s 
[?25hLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/abheesht17/keras-nlp.git@xlm-roberta-presets
  Cloning https://github.com/abheesht17/keras-nlp.git (to revision xlm-roberta-presets) to /tmp/pip-req-build-qxzqaoa4
  Running command git clone -q https://github.com/abheesht17/keras-nlp.git /tmp/pip-req-build-qxzqaoa4
  Running command git checkout -b xlm-roberta-presets --track origin/xlm-roberta-presets
  Switched to a new branch 'xlm-roberta-presets'
  Branch 'xlm-roberta-presets' set up to track remote branch 'xlm-roberta-presets' from 'origin'.
Collecting tensorflow==2.10
  Downloading tensorflow-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (578.0 MB)
[K     |████████████████████████████████| 578

In [2]:
import json
import os

import keras_nlp
import numpy as np

import tensorflow as tf
from tensorflow import keras

import torch

from keras_nlp.models import XLMRobertaTokenizer, XLMRobertaPreprocessor
from transformers import AutoTokenizer, AutoModel

tf.__version__

'2.10.0'

In [3]:
MODEL_SIZE = "base"

In [4]:
# Get the official checkpoints from fairseq.
!wget "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.{MODEL_SIZE}.tar.gz"
!tar -xvf "xlmr.{MODEL_SIZE}.tar.gz"

--2022-11-18 16:57:01--  https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 512274718 (489M) [application/gzip]
Saving to: ‘xlmr.base.tar.gz’


2022-11-18 16:57:13 (45.8 MB/s) - ‘xlmr.base.tar.gz’ saved [512274718/512274718]

xlmr.base/
xlmr.base/dict.txt
xlmr.base/sentencepiece.bpe.model
xlmr.base/model.pt


In [5]:
# XLM-R paths.
extract_dir = f"/content/xlmr.{MODEL_SIZE}"
spm_path = os.path.join(extract_dir, "sentencepiece.bpe.model")
checkpoint_path = os.path.join(extract_dir, "model.pt")

In [6]:
# Load PyTorch XLM-R checkpoint.
pt_ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu'))
pt_cfg = pt_ckpt["args"]
pt_model = pt_ckpt['model']

In [7]:
cfg = {}

cfg["num_layers"] = pt_cfg.encoder_layers
cfg["num_heads"] = pt_cfg.encoder_attention_heads
cfg["hidden_dim"] = pt_cfg.encoder_embed_dim
cfg["intermediate_dim"] = pt_cfg.encoder_ffn_embed_dim
cfg["dropout"] = pt_cfg.dropout
cfg["max_sequence_length"] = pt_cfg.max_positions
cfg["vocab_size"] = pt_model["decoder.sentence_encoder.embed_tokens.weight"].numpy().shape[0]

cfg

{'num_layers': 12,
 'num_heads': 12,
 'hidden_dim': 768,
 'intermediate_dim': 3072,
 'dropout': 0.1,
 'max_sequence_length': 512,
 'vocab_size': 250002}

In [8]:
model = keras_nlp.models.XLMRoberta.from_preset(
    f"xlm_roberta_{MODEL_SIZE}",
    load_weights=False
)

In [9]:
model.summary()

Model: "backbone"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_ids (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 embeddings (TokenAndPositionEm  (None, None, 768)   192394752   ['token_ids[0][0]']              
 bedding)                                                                                         
                                                                                                  
 embeddings_layer_norm (LayerNo  (None, None, 768)   1536        ['embeddings[0][0]']             
 rmalization)                                                                                     
                                                                                           

In [10]:
# Embedding Layer.
model.get_layer("embeddings").token_embedding.embeddings.assign(
    pt_model["decoder.sentence_encoder.embed_tokens.weight"].numpy()
)
model.get_layer("embeddings").position_embedding.position_embeddings.assign(
    pt_model["decoder.sentence_encoder.embed_positions.weight"].numpy()[2:, :]
)

# Embedding LayerNorm.
model.get_layer("embeddings_layer_norm").gamma.assign(
    pt_model['decoder.sentence_encoder.emb_layer_norm.weight'].numpy()
)
model.get_layer("embeddings_layer_norm").beta.assign(
    pt_model['decoder.sentence_encoder.emb_layer_norm.bias'].numpy()
)


range_1 = (0, cfg["hidden_dim"])
range_2 = (cfg["hidden_dim"], 2 * cfg["hidden_dim"]) 
range_3 = (2 * cfg["hidden_dim"], 3 * cfg["hidden_dim"])

# Transformer layers.
for i in range(model.num_layers):
    q_k_v_wts =  (
        pt_model[f"decoder.sentence_encoder.layers.{i}.self_attn.in_proj_weight"].numpy().T
    )
    q_k_v_bias = (
        pt_model[f"decoder.sentence_encoder.layers.{i}.self_attn.in_proj_bias"].numpy().T
    )

    # Query
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._query_dense.kernel.assign(
        q_k_v_wts[:, range_1[0]: range_1[1]].reshape((cfg["hidden_dim"], cfg["num_heads"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._query_dense.bias.assign(
        q_k_v_bias[range_1[0]: range_1[1]].reshape((cfg["num_heads"], -1))
    )

    # Key
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._key_dense.kernel.assign(
        q_k_v_wts[:, range_2[0]: range_2[1]].reshape((cfg["hidden_dim"], cfg["num_heads"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._key_dense.bias.assign(
        q_k_v_bias[range_2[0]: range_2[1]].reshape((cfg["num_heads"], -1))
    )

    # Value
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._value_dense.kernel.assign(
        q_k_v_wts[:, range_3[0]: range_3[1]].reshape((cfg["hidden_dim"], cfg["num_heads"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._value_dense.bias.assign(
        q_k_v_bias[range_3[0]: range_3[1]].reshape((cfg["num_heads"], -1))
    )

    # Attention output
    attn_output_wts = (
        pt_model[f'decoder.sentence_encoder.layers.{i}.self_attn.out_proj.weight'].numpy().T
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._output_dense.kernel.assign(
        attn_output_wts.reshape((cfg["num_heads"], -1, cfg["hidden_dim"]))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._output_dense.bias.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.self_attn.out_proj.bias'].numpy()
    )

    # Attention LayerNorm
    model.get_layer(f"transformer_layer_{i}")._self_attention_layernorm.gamma.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.self_attn_layer_norm.weight'].numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layernorm.beta.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.self_attn_layer_norm.bias'].numpy()
    )

    # Intermediate FF layer
    model.get_layer(f"transformer_layer_{i}")._feedforward_intermediate_dense.kernel.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.fc1.weight'].numpy().T
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_intermediate_dense.bias.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.fc1.bias'].numpy()
    )

    # Output dense layer
    model.get_layer(f"transformer_layer_{i}")._feedforward_output_dense.kernel.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.fc2.weight'].numpy().T
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_output_dense.bias.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.fc2.bias'].numpy()
    )

    # FF LayerNorm
    model.get_layer(f"transformer_layer_{i}")._feedforward_layernorm.gamma.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.final_layer_norm.weight'].numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_layernorm.beta.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.final_layer_norm.bias'].numpy()
    )


In [11]:
# Define preprocessor.
xlm_roberta_preprocessor = XLMRobertaPreprocessor.from_preset(
    f"xlm_roberta_{MODEL_SIZE}"
)

hf_tokenizer = AutoTokenizer.from_pretrained(f"xlm-roberta-{MODEL_SIZE}")

Downloading data from https://storage.googleapis.com/keras-nlp/models/xlm_roberta_base/vocab.spm


Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [12]:
# Instantiate the HF model.

hf_model = AutoModel.from_pretrained(f"xlm-roberta-{MODEL_SIZE}")
hf_model.eval()
pass

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Do a forward pass on both models and check whether the outputs match.

sample_input = ["cricket is awesome! sachin is the goat handsdown."]

inputs = xlm_roberta_preprocessor(tf.constant(sample_input))
hf_inputs = hf_tokenizer(sample_input, padding="max_length", return_tensors="pt")

keras_nlp_output = model.predict(inputs)
hf_output = hf_model(**hf_inputs).last_hidden_state



In [14]:
keras_nlp_output[0, 0, :10], hf_output[0, 0, :10]

(array([ 0.13367906,  0.10714789,  0.05662365, -0.00263481,  0.10225391,
        -0.03752052,  0.04252483, -0.06509331,  0.11365063, -0.14182499],
       dtype=float32),
 tensor([ 0.1337,  0.1071,  0.0566, -0.0026,  0.1023, -0.0375,  0.0425, -0.0651,
          0.1137, -0.1418], grad_fn=<SliceBackward0>))

In [15]:
np.mean(keras_nlp_output - hf_output.detach().numpy())

-7.657932e-05

In [16]:
# Save XLM-RoBERTa checkpoint.
model.save_weights(f"xlm_roberta_{MODEL_SIZE}.h5")

In [17]:
model2 = keras_nlp.models.XLMRoberta.from_preset(
    f"xlm_roberta_{MODEL_SIZE}", load_weights=False
)
model2.load_weights(f"xlm_roberta_{MODEL_SIZE}.h5")

In [18]:
keras_nlp_output2 = model2.predict(inputs)
tf.reduce_mean(keras_nlp_output - keras_nlp_output2)



<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [None]:
# Obtain all the necessary files.
!cp $spm_path /content

!md5sum /content/sentencepiece.bpe.model
!md5sum "/content/xlm_roberta_{MODEL_SIZE}.h5"

bf25eb5120ad92ef5c7d8596b5dc4046  /content/sentencepiece.bpe.model
2eb6fcda5a42f0a88056213ba3d93906  /content/xlm_roberta_base.h5


In [19]:
# Check uploaded model once added to repo
model_cloud = keras_nlp.models.XLMRoberta.from_preset(
    f"xlm_roberta_{MODEL_SIZE}",
    load_weights=True,
)

Downloading data from https://storage.googleapis.com/keras-nlp/models/xlm_roberta_base/model.h5


In [20]:
# Same output from cloud model
keras_nlp_output_cloud = model_cloud.predict(inputs)
tf.reduce_mean(keras_nlp_output - keras_nlp_output_cloud)



<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [21]:
keras_nlp_output_cloud[0, 0, :10]

array([ 0.13367906,  0.10714789,  0.05662365, -0.00263481,  0.10225391,
       -0.03752052,  0.04252483, -0.06509331,  0.11365063, -0.14182499],
      dtype=float32)