## Imports

In [None]:
!pip install -q transformers keras-nlp

[K     |████████████████████████████████| 142 kB 7.5 MB/s 
[K     |████████████████████████████████| 4.6 MB 45.5 MB/s 
[K     |████████████████████████████████| 511.7 MB 5.1 kB/s 
[K     |████████████████████████████████| 511.7 MB 4.0 kB/s 
[K     |████████████████████████████████| 4.9 MB 36.6 MB/s 
[?25h

In [None]:
import tensorflow as tf
from tensorflow import keras
import keras_nlp
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
# Sample sentence; it is tokenized once and the resulting ids are fed to both
# the HuggingFace model and the Keras model further down.
text = "Replace me by any text you'd like."

## Run HuggingFace Encoding

In [None]:
# Load the pretrained 'xlm-roberta-base' tokenizer.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize the sample text; return_tensors='pt' requests PyTorch tensors.
encoded_input = tokenizer(text, return_tensors='pt')

Downloading config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading sentencepiece.bpe.model:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

## Run HuggingFace Model

In [None]:
# Load the masked-LM checkpoint and keep only its `roberta` submodule.
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base").roberta

# Put the module in eval mode before running it.
model.eval()

# Forward pass on the tokenized sample.
output = model(**encoded_input)

## Run the Converted XLM-R Checkpoint Model

In [None]:
class XLMRModel(keras.Model):
    """XLM-R style transformer encoder assembled from `keras_nlp` layers.

    The forward pass embeds the token ids (token + position embeddings),
    applies layer normalization and dropout, and then runs the result through
    a stack of `keras_nlp.layers.TransformerEncoder` blocks.

    Args:
        vocab_size: Size of the token vocabulary (e.g. 250002).
        num_layers: Number of transformer encoder blocks.
        hidden_size: Dimension of the token and position embeddings.
        dropout: Dropout rate applied after the embeddings and passed to
            every encoder block.
        num_attention_heads: Number of attention heads per encoder block.
        inner_size: `intermediate_dim` of every encoder block.
        inner_activation: Activation (name or callable) used inside the
            encoder blocks; resolved through `keras.activations.get`.
        initializer_range: Standard deviation of the truncated-normal kernel
            initializer handed to the encoder blocks.
        max_sequence_length: `sequence_length` of the position embedding.
        **kwargs: Forwarded to `keras.Model.__init__`.
    """

    def __init__(
        self,
        vocab_size,
        num_layers=12,
        hidden_size=768,
        dropout=0.1,
        num_attention_heads=12,
        inner_size=3072,
        inner_activation="gelu",
        initializer_range=0.02,
        max_sequence_length=512,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Keep the hyper-parameters around so get_config() can report them.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_attention_heads = num_attention_heads
        self.max_sequence_length = max_sequence_length
        self.inner_size = inner_size
        self.inner_activation = keras.activations.get(inner_activation)
        self.initializer_range = initializer_range
        self.initializer = keras.initializers.TruncatedNormal(
            stddev=initializer_range
        )
        self.dropout = dropout

        # NOTE: the private attribute names below are kept exactly as they
        # were; renaming them could break restoring an existing checkpoint.
        self._token_and_position_embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
            vocabulary_size=vocab_size,
            sequence_length=max_sequence_length,
            embedding_dim=hidden_size,
            name="token_and_position_embeddings"
        )

        self._embedding_norm_layer = keras.layers.LayerNormalization(
            name="embeddings/layer_norm",
            axis=-1,
            epsilon=1e-5,
            dtype=tf.float32,
        )

        self._embedding_dropout = keras.layers.Dropout(
            rate=dropout, name="embedding_dropout"
        )

        # One encoder block per layer; all blocks share the same initializer
        # object and activation.
        self._transformer_layers = []
        for i in range(num_layers):
            layer = keras_nlp.layers.TransformerEncoder(
                num_heads=num_attention_heads,
                intermediate_dim=inner_size,
                activation=self.inner_activation,
                dropout=dropout,
                kernel_initializer=self.initializer,
                name=f"transformer/layer_{i}",
            )
            self._transformer_layers.append(layer)

        # Symbolic placeholders for the input dict. `segment_ids` is declared
        # here but is not read by call().
        self.inputs = dict(
            input_ids=keras.Input(shape=(None,), dtype=tf.int32),
            input_mask=keras.Input(shape=(None,), dtype=tf.int32),
            segment_ids=keras.Input(shape=(None,), dtype=tf.int32),
        )

    def call(self, inputs):
        """Run the encoder on a dict of inputs.

        Args:
            inputs: Dict with an "input_ids" entry (required) and an optional
                "input_mask" entry. The mask (or `None` when absent) is
                forwarded to every encoder block as `padding_mask`.

        Returns:
            The output of the last transformer encoder block.

        Raises:
            ValueError: If `inputs` is not a dict, or if it has no
                "input_ids" entry.
        """
        if not isinstance(inputs, dict):
            raise ValueError(f"Inputs should be a dict. Received: {inputs}.")

        input_ids = inputs.get("input_ids")
        if input_ids is None:
            # Fail early with a clear message instead of handing `None` to the
            # embedding layer.
            raise ValueError(
                'Inputs must contain an "input_ids" entry. '
                f"Received keys: {list(inputs)}."
            )
        input_mask = inputs.get("input_mask")

        embeddings = self._token_and_position_embedding_layer(input_ids)
        embeddings = self._embedding_norm_layer(embeddings)
        embeddings = self._embedding_dropout(embeddings)

        x = embeddings
        for layer in self._transformer_layers:
            x = layer(x, padding_mask=input_mask)
        sequence_output = x
        return sequence_output

    def get_embedding_table(self):
        """Return the `embeddings` variable of the token embedding sub-layer."""
        return self._token_and_position_embedding_layer.token_embedding.embeddings

    def get_config(self):
        """Return the base config extended with this model's hyper-parameters.

        The activation is stored in serialized form via
        `keras.activations.serialize`.
        """
        config = super().get_config()
        config.update(
            {
                "vocab_size": self.vocab_size,
                "hidden_size": self.hidden_size,
                "num_layers": self.num_layers,
                "num_attention_heads": self.num_attention_heads,
                "max_sequence_length": self.max_sequence_length,
                "inner_size": self.inner_size,
                "inner_activation": keras.activations.serialize(
                    self.inner_activation
                ),
                "dropout": self.dropout,
                "initializer_range": self.initializer_range,
            }
        )
        return config
# Instantiate the Keras model with a 250002-token vocabulary (all other
# hyper-parameters use the constructor defaults).
tf_model = XLMRModel(vocab_size=250002)
# NOTE(review): this relative path presumably points into a mounted Google
# Drive; no mount step is visible in this notebook — confirm before running
# on a fresh kernel.
tf_model.load_weights('drive/MyDrive/tf_xlmr_ckp')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fe3caeab6d0>

In [None]:
# Re-package the tokenizer output as TensorFlow tensors under the key names
# the Keras model reads ("input_ids" and "input_mask").
tf_encoded_input = {
    'input_ids': tf.convert_to_tensor(encoded_input['input_ids'].numpy()),
    'input_mask': tf.convert_to_tensor(encoded_input['attention_mask'].numpy()),
}
tf_encoded_input

{'input_ids': <tf.Tensor: shape=(1, 13), dtype=int64, numpy=
 array([[    0,   853, 23935,   163,   390,  2499,  7986,   398,    25,
            71,  1884,     5,     2]])>,
 'input_mask': <tf.Tensor: shape=(1, 13), dtype=int64, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>}

In [None]:
# Forward pass of the Keras model on the converted inputs.
ours = tf_model(tf_encoded_input)

## Compare

In [None]:
# Display the Keras model output.
ours

<tf.Tensor: shape=(1, 13, 768), dtype=float32, numpy=
array([[[ 1.0484349e-01,  1.4283907e-01,  7.8912623e-02, ...,
         -1.0977873e-01,  6.2910996e-02, -2.2781435e-03],
        [ 9.2993587e-02,  2.8810972e-02, -3.1257663e-02, ...,
          1.1292055e-01, -6.4963967e-02, -8.1136853e-02],
        [ 6.6008449e-02,  4.8830777e-02, -1.9372761e-02, ...,
         -1.9778726e-01, -7.0832990e-02, -2.4076793e-01],
        ...,
        [-4.2952802e-02,  1.5743051e-01, -7.8555718e-03, ...,
          1.0656585e-01,  1.2336706e-05, -5.1163714e-02],
        [ 5.7681262e-02,  8.8349335e-02,  1.6320150e-02, ...,
         -1.9969495e-02,  4.4005312e-02,  2.1914441e-02],
        [ 9.5686302e-02,  1.3345972e-01,  1.3284454e-02, ...,
         -2.1225557e-01, -2.1289704e-02,  4.8880361e-02]]], dtype=float32)>

In [None]:
# Display the HuggingFace model's last hidden state for a side-by-side look.
output['last_hidden_state']

tensor([[[ 1.0484e-01,  1.4284e-01,  7.8913e-02,  ..., -1.0978e-01,
           6.2911e-02, -2.2782e-03],
         [ 9.2994e-02,  2.8811e-02, -3.1258e-02,  ...,  1.1292e-01,
          -6.4964e-02, -8.1137e-02],
         [ 6.6009e-02,  4.8831e-02, -1.9372e-02,  ..., -1.9779e-01,
          -7.0833e-02, -2.4077e-01],
         ...,
         [-4.2953e-02,  1.5743e-01, -7.8550e-03,  ...,  1.0657e-01,
           1.2460e-05, -5.1164e-02],
         [ 5.7681e-02,  8.8349e-02,  1.6320e-02,  ..., -1.9969e-02,
           4.4005e-02,  2.1914e-02],
         [ 9.5686e-02,  1.3346e-01,  1.3284e-02,  ..., -2.1226e-01,
          -2.1290e-02,  4.8880e-02]]], grad_fn=<NativeLayerNormBackward0>)

In [None]:
# Sum over all elements of the Keras output, used as a quick scalar checksum.
tf.reduce_sum(ours)

<tf.Tensor: shape=(), dtype=float32, numpy=263.13733>

In [None]:
# Convert the HuggingFace hidden states to a TF tensor; detach() is needed
# because the PyTorch output still carries autograd information.
hf_hidden = tf.convert_to_tensor(output['last_hidden_state'].detach().numpy())

# Equal sums alone cannot rule out element-wise differences that cancel out,
# so also require every element to agree within a small float32 tolerance.
tf.debugging.assert_near(ours, hf_hidden, rtol=1e-4, atol=1e-4)

# Sum over all elements of the HuggingFace output (same scalar checksum as
# computed for the Keras output).
tf.reduce_sum(hf_hidden)

<tf.Tensor: shape=(), dtype=float32, numpy=263.13733>