# Benchmarking a RoBERTa Converted from a Fairseq Checkpoint against Hugging Face RoBERTa


## Imports

In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 4.7 MB 9.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 55.5 MB/s 
[K     |████████████████████████████████| 101 kB 8.1 MB/s 
[?25h

In [None]:
import keras_nlp
import tensorflow as tf
from tensorflow import keras
from transformers import RobertaModel, RobertaTokenizer, TFRobertaModel

In [None]:
# Sample sentence to tokenize; any text can be substituted.
text = "Replace me by any text you'd like."

## Load HuggingFace transformer

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Load the TensorFlow variant: later cells call `model.summary()` and consume
# TF tensors, neither of which works with the PyTorch `RobertaModel`.
model = TFRobertaModel.from_pretrained('roberta-base')
# Tokenize once as TF tensors; `encoded_input` is read by the HuggingFace
# forward pass and again by the final comparison cell.
encoded_input = tokenizer(text, return_tensors='tf')

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Print the HuggingFace model's repr to inspect what was loaded.
print(model)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [None]:
# Keras-style parameter summary.
# NOTE(review): the recorded output below ("tf_roberta_model_3") came from a
# TF RoBERTa model — confirm `model` is a TF/Keras model when this cell runs.
model.summary()

Model: "tf_roberta_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 124645632 
 )                                                               
                                                                 
Total params: 124,645,632
Trainable params: 124,645,632
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Forward pass of the HuggingFace model; `encoded_input` is expected to be the
# tokenizer output for `text`.
output = model(encoded_input)
# Keep the 'last_hidden_state' entry: it is the reference tensor that the
# comparison at the end of the notebook reduces.
lhs = output['last_hidden_state']
# Bare last expression so the notebook displays the full model output.
output


TFBaseModelOutputWithPoolingAndCrossAttentions([('last_hidden_state',
                                                 <tf.Tensor: shape=(1, 12, 768), dtype=float32, numpy=
                                                 array([[[-0.11464322,  0.11033366, -0.01485661, ..., -0.08089949,
                                                          -0.00180671, -0.02707539],
                                                         [-0.02248321,  0.16116916,  0.05555495, ...,  0.536597  ,
                                                           0.11962057,  0.1575807 ],
                                                         [ 0.05315709, -0.00201554,  0.03704416, ..., -0.4886861 ,
                                                           0.16412729,  0.2736186 ],
                                                         ...,
                                                         [-0.1585741 ,  0.0837442 ,  0.1301794 , ...,  0.3970098 ,
                                                  

## TF Converted RoBERTa model

In [None]:
class RoBERTaModel(keras.Model):
    """RoBERTa-style encoder assembled from KerasNLP building blocks.

    Data flow: token + position embeddings -> layer normalization -> dropout
    -> a stack of `num_layers` `keras_nlp.layers.TransformerEncoder` blocks.

    Args:
        vocab_size: Size of the token vocabulary (50265 for roberta-base).
        num_layers: Number of transformer encoder blocks.
        hidden_size: Embedding dimension.
        dropout: Dropout rate used after the embeddings and inside every
            transformer block.
        num_attention_heads: Number of attention heads per transformer block.
        inner_size: Intermediate (feed-forward) dimension of each block.
        inner_activation: Activation of the feed-forward sub-layer; anything
            accepted by `keras.activations.get`.
        initializer_range: Standard deviation of the truncated-normal
            initializer passed to the transformer blocks.
        max_sequence_length: Sequence length given to the position embedding.
        **kwargs: Forwarded to `keras.Model`.
    """

    def __init__(
        self,
        vocab_size, #50265
        num_layers=12,
        hidden_size=768,
        dropout=0.1,
        num_attention_heads=12,
        inner_size=3072,
        inner_activation="gelu",
        initializer_range=0.02,
        max_sequence_length=512,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Keep the raw hyper-parameters so `get_config` can report them.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_attention_heads = num_attention_heads
        self.max_sequence_length = max_sequence_length
        self.inner_size = inner_size
        self.inner_activation = keras.activations.get(inner_activation)
        self.initializer_range = initializer_range
        self.initializer = keras.initializers.TruncatedNormal(
            stddev=initializer_range
        )
        self.dropout = dropout

        self._token_and_position_embedding_layer = (
            keras_nlp.layers.TokenAndPositionEmbedding(
                vocabulary_size=vocab_size,
                sequence_length=max_sequence_length,
                embedding_dim=hidden_size,
                name="token_and_position_embeddings",
            )
        )

        self._embedding_norm_layer = keras.layers.LayerNormalization(
            name="embeddings/layer_norm",
            axis=-1,
            epsilon=1e-5,
            dtype=tf.float32,
        )

        self._embedding_dropout = keras.layers.Dropout(
            rate=dropout, name="embedding_dropout"
        )

        self._transformer_layers = []
        for i in range(num_layers):
            layer = keras_nlp.layers.TransformerEncoder(
                num_heads=num_attention_heads,
                intermediate_dim=inner_size,
                activation=self.inner_activation,
                dropout=dropout,
                kernel_initializer=self.initializer,
                name="transformer/layer_%d" % i,
            )
            self._transformer_layers.append(layer)

        # Symbolic input spec. Note that `segment_ids` is declared here but
        # is not read by `call`.
        self.inputs = dict(
            input_ids=keras.Input(shape=(None,), dtype=tf.int32),
            input_mask=keras.Input(shape=(None,), dtype=tf.int32),
            segment_ids=keras.Input(shape=(None,), dtype=tf.int32),
        )

    def call(self, inputs):
        """Encode a batch of token ids.

        Args:
            inputs: Dict with key "input_ids" and, optionally, a padding mask
                under "input_mask". If "input_mask" is absent, the HuggingFace
                tokenizer key "attention_mask" is used instead, so that a
                tokenizer's output dict can be passed directly without the
                mask being silently dropped.

        Returns:
            The output of the last transformer block (presumably shaped
            `(batch, sequence, hidden_size)` — confirm against the layers).

        Raises:
            ValueError: If `inputs` is not a dict or lacks "input_ids".
        """
        if not isinstance(inputs, dict):
            raise ValueError(f"Inputs should be a dict. Received: {inputs}.")

        input_ids = inputs.get("input_ids")
        if input_ids is None:
            raise ValueError(
                'Inputs must contain "input_ids". '
                f"Received keys: {list(inputs.keys())}."
            )

        # Compare with `is None` rather than truthiness: the mask is a tensor.
        input_mask = inputs.get("input_mask")
        if input_mask is None:
            input_mask = inputs.get("attention_mask")

        embeddings = self._token_and_position_embedding_layer(input_ids)
        embeddings = self._embedding_norm_layer(embeddings)
        embeddings = self._embedding_dropout(embeddings)

        x = embeddings
        for layer in self._transformer_layers:
            x = layer(x, padding_mask=input_mask)
        sequence_output = x
        return sequence_output

    def get_embedding_table(self):
        """Return the token-embedding weight variable of the embedding layer."""
        return self._token_and_position_embedding_layer.token_embedding.embeddings

    def get_config(self):
        """Return the base config extended with this model's hyper-parameters."""
        config = super().get_config()
        config.update(
            {
                "vocab_size": self.vocab_size,
                "hidden_size": self.hidden_size,
                "num_layers": self.num_layers,
                "num_attention_heads": self.num_attention_heads,
                "max_sequence_length": self.max_sequence_length,
                "inner_size": self.inner_size,
                "inner_activation": keras.activations.serialize(
                    self.inner_activation
                ),
                "dropout": self.dropout,
                "initializer_range": self.initializer_range,
            }
        )
        return config

# 50265 matches the word-embedding table size shown in the printed HuggingFace model.
# NOTE: this rebinds `model`; from here on the HuggingFace model is only
# represented by its saved hidden states in `lhs`.
model = RoBERTaModel(vocab_size=50265)

In [None]:
# Load the converted weights into the Keras model. The checkpoint path is
# environment-specific (presumably a mounted Google Drive folder — adjust as needed).
model.load_weights('drive/MyDrive/tf_roberta_ckp')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7ff8792f0ad0>

## Compare

In [None]:
# Run the converted model on the same tokenized input. `.data` unwraps the
# tokenizer output into the plain dict that `RoBERTaModel.call` requires.
ours = model(encoded_input.data)

# Deliberately not named `sum`: that would shadow the Python builtin for the
# rest of the kernel session.
ours_sum = tf.reduce_sum(ours)
print(ours_sum)

# A matching sum can hide errors that cancel out, so also report the largest
# element-wise deviation from the HuggingFace hidden states.
max_abs_diff = tf.reduce_max(tf.abs(ours - lhs))
print(max_abs_diff)

tf.Tensor(168.82913, shape=(), dtype=float32)


In [None]:
# Same reduction over the HuggingFace hidden states, for comparison with the
# converted model's sum printed above.
tf.reduce_sum(lhs)

<tf.Tensor: shape=(), dtype=float32, numpy=168.82816>