In [1]:
!pip install transformers --quiet
!pip install git+https://github.com/abheesht17/keras-nlp.git@xlm-r tensorflow tensorflow-text --upgrade --quiet

[K     |████████████████████████████████| 4.9 MB 7.4 MB/s 
[K     |████████████████████████████████| 120 kB 62.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 52.1 MB/s 
[K     |████████████████████████████████| 578.0 MB 17 kB/s 
[K     |████████████████████████████████| 5.9 MB 46.7 MB/s 
[K     |████████████████████████████████| 1.7 MB 54.2 MB/s 
[K     |████████████████████████████████| 438 kB 60.5 MB/s 
[K     |████████████████████████████████| 5.9 MB 46.6 MB/s 
[?25h  Building wheel for keras-nlp (setup.py) ... [?25l[?25hdone


In [2]:
import json
import os

import keras_nlp
import numpy as np

import tensorflow as tf
from tensorflow import keras

import torch

from keras_nlp.models.xlm_roberta import XLMRobertaPreprocessor
from transformers import AutoTokenizer, AutoModel

tf.__version__

'2.10.0'

In [3]:
MODEL_SIZE = "base"

In [4]:
# Get the official checkpoints from fairseq.
!wget "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.{MODEL_SIZE}.tar.gz"
!tar -xvf "xlmr.{MODEL_SIZE}.tar.gz"

--2022-09-22 20:26:33--  https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 512274718 (489M) [application/gzip]
Saving to: ‘xlmr.base.tar.gz’


2022-09-22 20:26:46 (40.3 MB/s) - ‘xlmr.base.tar.gz’ saved [512274718/512274718]

xlmr.base/
xlmr.base/dict.txt
xlmr.base/sentencepiece.bpe.model
xlmr.base/model.pt


In [5]:
# XLM-R paths.
extract_dir = f"/content/xlmr.{MODEL_SIZE}"
spm_path = os.path.join(extract_dir, "sentencepiece.bpe.model")
checkpoint_path = os.path.join(extract_dir, "model.pt")

In [6]:
# Load PyTorch XLM-R checkpoint.
pt_ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu'))
pt_cfg = pt_ckpt["args"]
pt_model = pt_ckpt['model']

In [7]:
pt_cfg

Namespace(activation_dropout=0.0, activation_fn='gelu', adam_betas='(0.9, 0.98)', adam_eps=1e-06, arch='roberta_base', attention_dropout=0.1, best_checkpoint_metric='loss', bpe=None, bucket_cap_mb=25, clip_norm=1.0, cpu=False, criterion='masked_lm', curriculum=0, dataset_impl=None, ddp_backend='c10d', device_id=0, disable_validation=False, distributed_backend='nccl', distributed_no_spawn=False, distributed_port=55498, distributed_rank=0, distributed_world_size=256, dropout=0.1, encoder_attention_heads=12, encoder_embed_dim=768, encoder_ffn_embed_dim=3072, encoder_layers=12, end_learning_rate=0.0, fast_stat_sync=True, find_unused_parameters=False, fix_batches_to_gpus=False, force_anneal=None, fp16=True, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, keep_interval_updates=-1, keep_last_epochs=-1, leave_unmasked_prob=0.1, log_format='json', log_interval=100, lr=[0.0006], lr_scheduler='polynomial_decay', mask_prob=0.15, max_epoch=0, max_positions=512, max_sentences=

In [8]:
# Verify whether our preprocessor works okay.

sample_input = "cricket is awesome!"

xlm_roberta_preprocessor = XLMRobertaPreprocessor(
    proto=spm_path,
    sequence_length=pt_cfg.max_positions
)

hf_tokenizer = AutoTokenizer.from_pretrained(f"xlm-roberta-{MODEL_SIZE}")

# The vocab size returned by our tokenizer is 250000. The vocab size
# of the PT model is 250002. This makes sense if we consider `<pad>`
# and shifting all indices by one.
print(xlm_roberta_preprocessor.vocabulary_size())

tok_sample = xlm_roberta_preprocessor(sample_input)
hf_tok_sample = hf_tokenizer(sample_input, padding="max_length")

(
    tok_sample["token_ids"][:15],
    hf_tok_sample["input_ids"][:15],
    tok_sample["padding_mask"][:15],
    hf_tok_sample["attention_mask"][:15]
)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

250000


(<tf.Tensor: shape=(15,), dtype=int32, numpy=
 array([    0, 13625, 27853,    83, 79929,    38,     2,     1,     1,
            1,     1,     1,     1,     1,     1], dtype=int32)>,
 [0, 13625, 27853, 83, 79929, 38, 2, 1, 1, 1, 1, 1, 1, 1, 1],
 <tf.Tensor: shape=(15,), dtype=bool, numpy=
 array([ True,  True,  True,  True,  True,  True,  True, False, False,
        False, False, False, False, False, False])>,
 [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [9]:
# Check multi-segment inputs.

sample_input_1 = "best in the world."

tok_sample = xlm_roberta_preprocessor((sample_input, sample_input_1))

hf_tok_sample = hf_tokenizer(sample_input, text_pair=sample_input_1, padding="max_length")

(
    tok_sample["token_ids"][:15],
    hf_tok_sample["input_ids"][:15],
    tok_sample["padding_mask"][:15],
    hf_tok_sample["attention_mask"][:15]
)

(<tf.Tensor: shape=(15,), dtype=int32, numpy=
 array([    0, 13625, 27853,    83, 79929,    38,     2,     2,  2965,
           23,    70,  8999,     5,     2,     1], dtype=int32)>,
 [0, 13625, 27853, 83, 79929, 38, 2, 2, 2965, 23, 70, 8999, 5, 2, 1],
 <tf.Tensor: shape=(15,), dtype=bool, numpy=
 array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True, False])>,
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

In [10]:
cfg = {}

cfg["num_layers"] = pt_cfg.encoder_layers
cfg["num_heads"] = pt_cfg.encoder_attention_heads
cfg["hidden_dim"] = pt_cfg.encoder_embed_dim
cfg["intermediate_dim"] = pt_cfg.encoder_ffn_embed_dim
cfg["dropout"] = pt_cfg.dropout
cfg["max_sequence_length"] = pt_cfg.max_positions
cfg["vocab_size"] = pt_model["decoder.sentence_encoder.embed_tokens.weight"].numpy().shape[0]

cfg

{'num_layers': 12,
 'num_heads': 12,
 'hidden_dim': 768,
 'intermediate_dim': 3072,
 'dropout': 0.1,
 'max_sequence_length': 512,
 'vocab_size': 250002}

In [11]:
model = keras_nlp.models.XLMRobertaBase(weights=None, vocabulary_size=cfg["vocab_size"])

In [12]:
model.summary()

Model: "xlm_roberta_custom"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_ids (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 embeddings (TokenAndPositionEm  (None, None, 768)   192394752   ['token_ids[0][0]']              
 bedding)                                                                                         
                                                                                                  
 embeddings_layer_norm (LayerNo  (None, None, 768)   1536        ['embeddings[0][0]']             
 rmalization)                                                                                     
                                                                                 

In [13]:
# Embedding Layer.
model.get_layer("embeddings").token_embedding.embeddings.assign(
    pt_model["decoder.sentence_encoder.embed_tokens.weight"].numpy()
)
model.get_layer("embeddings").position_embedding.position_embeddings.assign(
    pt_model["decoder.sentence_encoder.embed_positions.weight"].numpy()[2:, :]
)

# Embedding LayerNorm.
model.get_layer("embeddings_layer_norm").gamma.assign(
    pt_model['decoder.sentence_encoder.emb_layer_norm.weight'].numpy()
)
model.get_layer("embeddings_layer_norm").beta.assign(
    pt_model['decoder.sentence_encoder.emb_layer_norm.bias'].numpy()
)


range_1 = (0, cfg["hidden_dim"])
range_2 = (cfg["hidden_dim"], 2 * cfg["hidden_dim"]) 
range_3 = (2 * cfg["hidden_dim"], 3 * cfg["hidden_dim"])

# Transformer layers.
for i in range(model.num_layers):
    q_k_v_wts =  (
        pt_model[f"decoder.sentence_encoder.layers.{i}.self_attn.in_proj_weight"].numpy().T
    )
    q_k_v_bias = (
        pt_model[f"decoder.sentence_encoder.layers.{i}.self_attn.in_proj_bias"].numpy().T
    )

    # Query
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._query_dense.kernel.assign(
        q_k_v_wts[:, range_1[0]: range_1[1]].reshape((cfg["hidden_dim"], cfg["num_heads"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._query_dense.bias.assign(
        q_k_v_bias[range_1[0]: range_1[1]].reshape((cfg["num_heads"], -1))
    )

    # Key
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._key_dense.kernel.assign(
        q_k_v_wts[:, range_2[0]: range_2[1]].reshape((cfg["hidden_dim"], cfg["num_heads"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._key_dense.bias.assign(
        q_k_v_bias[range_2[0]: range_2[1]].reshape((cfg["num_heads"], -1))
    )

    # Value
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._value_dense.kernel.assign(
        q_k_v_wts[:, range_3[0]: range_3[1]].reshape((cfg["hidden_dim"], cfg["num_heads"], -1))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._value_dense.bias.assign(
        q_k_v_bias[range_3[0]: range_3[1]].reshape((cfg["num_heads"], -1))
    )

    # Attention output
    attn_output_wts = (
        pt_model[f'decoder.sentence_encoder.layers.{i}.self_attn.out_proj.weight'].numpy().T
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._output_dense.kernel.assign(
        attn_output_wts.reshape((cfg["num_heads"], -1, cfg["hidden_dim"]))
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layer._output_dense.bias.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.self_attn.out_proj.bias'].numpy()
    )

    # Attention LayerNorm
    model.get_layer(f"transformer_layer_{i}")._self_attention_layernorm.gamma.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.self_attn_layer_norm.weight'].numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._self_attention_layernorm.beta.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.self_attn_layer_norm.bias'].numpy()
    )

    # Intermediate FF layer
    model.get_layer(f"transformer_layer_{i}")._feedforward_intermediate_dense.kernel.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.fc1.weight'].numpy().T
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_intermediate_dense.bias.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.fc1.bias'].numpy()
    )

    # Output dense layer
    model.get_layer(f"transformer_layer_{i}")._feedforward_output_dense.kernel.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.fc2.weight'].numpy().T
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_output_dense.bias.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.fc2.bias'].numpy()
    )

    # FF LayerNorm
    model.get_layer(f"transformer_layer_{i}")._feedforward_layernorm.gamma.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.final_layer_norm.weight'].numpy()
    )
    model.get_layer(f"transformer_layer_{i}")._feedforward_layernorm.beta.assign(
        pt_model[f'decoder.sentence_encoder.layers.{i}.final_layer_norm.bias'].numpy()
    )


In [14]:
# Instantiate the HF model.

hf_model = AutoModel.from_pretrained(f"xlm-roberta-{MODEL_SIZE}")
hf_model.eval()
pass

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Do a forward pass on both models and check whether the outputs match.

sample_input = "cricket is awesome! sachin is the goat hands-down."

inputs = xlm_roberta_preprocessor(tf.constant(sample_input)[tf.newaxis])
hf_inputs = hf_tokenizer([sample_input], padding="max_length", return_tensors="pt")

keras_nlp_output = model.predict(inputs)
hf_output = hf_model(**hf_inputs).last_hidden_state



In [16]:
keras_nlp_output[0, 0, :10], hf_output[0, 0, :10]

(array([ 0.12744462,  0.08904259,  0.05758653,  0.00650121,  0.08354896,
        -0.03188083,  0.05017043, -0.05571432,  0.10335617, -0.12708372],
       dtype=float32),
 tensor([ 0.1274,  0.0890,  0.0576,  0.0065,  0.0835, -0.0319,  0.0502, -0.0557,
          0.1034, -0.1271], grad_fn=<SliceBackward0>))

In [17]:
np.mean(keras_nlp_output - hf_output.detach().numpy())

1.8176232e-05

In [18]:
# Save XLM-RoBERTa checkpoint.
model.save_weights(f"xlm_roberta_{MODEL_SIZE}.h5")

In [19]:
model2 = keras_nlp.models.XLMRobertaBase(weights=None, vocabulary_size=cfg['vocab_size'])
model2.load_weights(f"xlm_roberta_{MODEL_SIZE}.h5")

In [20]:
inputs = xlm_roberta_preprocessor(tf.constant(sample_input)[tf.newaxis])
keras_nlp_output2 = model2.predict(inputs)

tf.reduce_mean(keras_nlp_output - keras_nlp_output2)



<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [21]:
# Obtain all the necessary files.

!cp $spm_path /content

In [None]:
!md5sum /content/sentencepiece.bpe.model
!md5sum "/content/xlm_roberta_{MODEL_SIZE}.h5"

bf25eb5120ad92ef5c7d8596b5dc4046  /content/sentencepiece.bpe.model
2eb6fcda5a42f0a88056213ba3d93906  /content/xlm_roberta_base.h5


In [22]:
sample_input = "cricket is awesome! sachin is the goat hands-down."

xlm_roberta_preprocessor_cloud = XLMRobertaPreprocessor(
    proto="common_crawl",
    sequence_length=pt_cfg.max_positions
)

inputs = xlm_roberta_preprocessor_cloud(tf.constant(sample_input)[tf.newaxis])

Downloading data from https://storage.googleapis.com/keras-nlp/models/xlm_roberta_base/vocab.spm


In [24]:
# Check uploaded model once added to repo
model_cloud = keras_nlp.models.XLMRobertaBase(weights="xlm_roberta_base")

Downloading data from https://storage.googleapis.com/keras-nlp/models/xlm_roberta_base/model.h5


In [25]:
# Same output from cloud model
keras_nlp_output_cloud = model_cloud.predict(inputs)
tf.reduce_mean(keras_nlp_output - keras_nlp_output_cloud)



<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [26]:
keras_nlp_output_cloud[0, 0, :10]

array([ 0.12744462,  0.08904259,  0.05758653,  0.00650121,  0.08354896,
       -0.03188083,  0.05017043, -0.05571432,  0.10335617, -0.12708372],
      dtype=float32)