## Imports

In [None]:
import tensorflow as tf
from tensorflow import keras
import keras_nlp
import torch

## Define the Model

In [None]:
class RoBERTaModel(keras.Model):
    """RoBERTa-style encoder assembled from `keras_nlp` building blocks.

    Token ids are embedded (token + position embeddings), layer-normalized,
    passed through dropout and then through a stack of
    `keras_nlp.layers.TransformerEncoder` layers.

    Sub-layers are created in a fixed order (embedding, layer norm, dropout,
    encoder stack). Code that indexes `model.layers` by position relies on
    that order, so do not reorder the attribute assignments in `__init__`.

    Args:
        vocab_size: Size of the token vocabulary (the original notebook
            notes 50265 for this value).
        num_layers: Number of `TransformerEncoder` layers in the stack.
        hidden_size: Embedding dimension.
        dropout: Dropout rate used after the embeddings and passed to every
            encoder layer.
        num_attention_heads: Number of attention heads per encoder layer.
        inner_size: `intermediate_dim` of every encoder layer.
        inner_activation: Activation identifier, resolved through
            `keras.activations.get`, used by every encoder layer.
        initializer_range: Standard deviation of the `TruncatedNormal`
            initializer passed as `kernel_initializer` to the encoder layers.
        max_sequence_length: `sequence_length` of the position embedding.
        **kwargs: Forwarded to `keras.Model.__init__`.
    """

    def __init__(
        self,
        vocab_size,
        num_layers=12,
        hidden_size=768,
        dropout=0.1,
        num_attention_heads=12,
        inner_size=3072,
        inner_activation="gelu",
        initializer_range=0.02,
        max_sequence_length=512,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Keep the raw hyper-parameters so `get_config` can report them.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_attention_heads = num_attention_heads
        self.max_sequence_length = max_sequence_length
        self.inner_size = inner_size
        self.inner_activation = keras.activations.get(inner_activation)
        self.initializer_range = initializer_range
        self.initializer = keras.initializers.TruncatedNormal(
            stddev=initializer_range
        )
        self.dropout = dropout

        self._token_and_position_embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
            vocabulary_size=vocab_size,
            sequence_length=max_sequence_length,
            embedding_dim=hidden_size,
            name="token_and_position_embeddings"
        )

        self._embedding_norm_layer = keras.layers.LayerNormalization(
            name="embeddings/layer_norm",
            axis=-1,
            epsilon=1e-5,
            dtype=tf.float32,
        )

        self._embedding_dropout = keras.layers.Dropout(
            rate=dropout, name="embedding_dropout"
        )

        self._transformer_layers = []
        for i in range(num_layers):
            layer = keras_nlp.layers.TransformerEncoder(
                num_heads=num_attention_heads,
                intermediate_dim=inner_size,
                activation=self.inner_activation,
                dropout=dropout,
                kernel_initializer=self.initializer,
                name=f"transformer/layer_{i}",
            )
            self._transformer_layers.append(layer)

        # Symbolic description of the expected input dict. `segment_ids` is
        # declared here but is not read by `call`.
        self.inputs = dict(
            input_ids=keras.Input(shape=(None,), dtype=tf.int32),
            input_mask=keras.Input(shape=(None,), dtype=tf.int32),
            segment_ids=keras.Input(shape=(None,), dtype=tf.int32),
        )

    def call(self, inputs):
        """Run the encoder and return the final encoder layer's output.

        Args:
            inputs: Dict with an `input_ids` entry and, optionally, an
                `input_mask` entry that is forwarded to every encoder layer
                as `padding_mask`. Any other entries are ignored.

        Returns:
            The output of the last `TransformerEncoder` layer.

        Raises:
            ValueError: If `inputs` is not a dict or has no `input_ids`.
        """
        if not isinstance(inputs, dict):
            raise ValueError(f"Inputs should be a dict. Received: {inputs}.")

        input_ids = inputs.get("input_ids")
        if input_ids is None:
            # Fail here with a clear message instead of passing None into
            # the embedding layer.
            raise ValueError(
                "Inputs must contain an `input_ids` entry. "
                f"Received keys: {list(inputs)}."
            )
        # A missing mask is allowed; the encoder layers then receive
        # `padding_mask=None`.
        input_mask = inputs.get("input_mask")

        embeddings = self._token_and_position_embedding_layer(input_ids)
        embeddings = self._embedding_norm_layer(embeddings)
        embeddings = self._embedding_dropout(embeddings)

        x = embeddings
        for layer in self._transformer_layers:
            x = layer(x, padding_mask=input_mask)
        sequence_output = x
        return sequence_output

    def get_embedding_table(self):
        """Return the token-embedding variable of the embedding layer."""
        return self._token_and_position_embedding_layer.token_embedding.embeddings

    def get_config(self):
        """Return the base config extended with this model's hyper-parameters."""
        config = super().get_config()
        config.update(
            {
                "vocab_size": self.vocab_size,
                "hidden_size": self.hidden_size,
                "num_layers": self.num_layers,
                "num_attention_heads": self.num_attention_heads,
                "max_sequence_length": self.max_sequence_length,
                "inner_size": self.inner_size,
                "inner_activation": keras.activations.serialize(
                    self.inner_activation
                ),
                "dropout": self.dropout,
                "initializer_range": self.initializer_range,
            }
        )
        return config


In [None]:
# Instantiate the encoder and call it once on symbolic int32 inputs; the
# embedding variable must exist before `get_embedding_table()` is used below.
model = RoBERTaModel(vocab_size=50265)
model({
    feature: keras.Input(shape=(None,), dtype=tf.int32)
    for feature in ("input_ids", "input_mask", "segment_ids")
})

# Masked-LM head that reuses the encoder's token-embedding table.
mlm_head = keras_nlp.layers.MLMHead(
    embedding_weights=model.get_embedding_table(),
    vocabulary_size=50265,
)

In [None]:
# List the variables the MLM head currently tracks. The recorded output shows
# a single (50265, 768) variable: the encoder's token-embedding table.
# NOTE(review): presumably the head's own variables are missing because the
# head has not been called yet — confirm.
mlm_head.weights

[<tf.Variable 'ro_ber_ta_model_2/token_and_position_embeddings/token_embedding2/embeddings:0' shape=(50265, 768) dtype=float32, numpy=
 array([[-0.00134202, -0.00773878,  0.00736229, ...,  0.00861523,
          0.00948601, -0.00805972],
        [-0.00815954,  0.00617338,  0.00717904, ...,  0.00723502,
         -0.00207715,  0.00026414],
        [-0.00254391,  0.00130945,  0.00356971, ..., -0.00068971,
         -0.00576109, -0.0100769 ],
        ...,
        [ 0.00705787, -0.00680857,  0.00919869, ..., -0.01081331,
         -0.00946475, -0.00179045],
        [ 0.0077587 ,  0.00287509,  0.00430533, ...,  0.00194891,
         -0.0082851 ,  0.00681807],
        [-0.00688892,  0.00786575, -0.00903068, ...,  0.00690948,
         -0.01009752, -0.00629842]], dtype=float32)>]

In [None]:
# Print a per-layer summary of the encoder.
# NOTE(review): the recorded output for this cell is a NameError, so it looks
# like it was last run in a session where a required name was undefined —
# re-run after the model-building cell and confirm.
model.summary()

NameError: ignored

## Load PyTorch Checkpoints

In [None]:
# Load the fairseq RoBERTa checkpoint onto the CPU.
# NOTE: torch.load unpickles the file, so only use it on checkpoints from a
# trusted source.
checkpoint = torch.load("drive/MyDrive/roberta.base/model.pt", map_location=torch.device('cpu'))
ckp = checkpoint['model'] # parameter-name -> tensor mapping; used by the conversion cells below
# Show the available parameter names.
ckp.keys()

odict_keys(['decoder.sentence_encoder.embed_tokens.weight', 'decoder.sentence_encoder.embed_positions.weight', 'decoder.sentence_encoder.layers.0.self_attn.in_proj_weight', 'decoder.sentence_encoder.layers.0.self_attn.in_proj_bias', 'decoder.sentence_encoder.layers.0.self_attn.out_proj.weight', 'decoder.sentence_encoder.layers.0.self_attn.out_proj.bias', 'decoder.sentence_encoder.layers.0.self_attn_layer_norm.weight', 'decoder.sentence_encoder.layers.0.self_attn_layer_norm.bias', 'decoder.sentence_encoder.layers.0.fc1.weight', 'decoder.sentence_encoder.layers.0.fc1.bias', 'decoder.sentence_encoder.layers.0.fc2.weight', 'decoder.sentence_encoder.layers.0.fc2.bias', 'decoder.sentence_encoder.layers.0.final_layer_norm.weight', 'decoder.sentence_encoder.layers.0.final_layer_norm.bias', 'decoder.sentence_encoder.layers.1.self_attn.in_proj_weight', 'decoder.sentence_encoder.layers.1.self_attn.in_proj_bias', 'decoder.sentence_encoder.layers.1.self_attn.out_proj.weight', 'decoder.sentence_enco

In [None]:
# Shapes of the LM-head tensors stored in the checkpoint, in the order:
# weight, bias, dense.weight, dense.bias.
tuple(
    ckp[f'decoder.lm_head.{suffix}'].shape
    for suffix in ('weight', 'bias', 'dense.weight', 'dense.bias')
)

(torch.Size([50265, 768]),
 torch.Size([50265]),
 torch.Size([768, 768]),
 torch.Size([768]))

## Get TF RoBERTa layers


In [None]:
# For every top-level layer of the model, record the names and the shapes of
# its variables (in the order the layer reports them).
layer2name = {
    sublayer: [variable.name for variable in sublayer.weights]
    for sublayer in model.layers
}
layer2shape = {
    sublayer: [variable.shape for variable in sublayer.weights]
    for sublayer in model.layers
}
layer2name

{<keras.layers.core.dropout.Dropout at 0x7efc17895910>: [],
 <keras.layers.normalization.layer_normalization.LayerNormalization at 0x7efc17895650>: ['ro_ber_ta_model_1/embeddings/layer_norm/gamma:0',
  'ro_ber_ta_model_1/embeddings/layer_norm/beta:0'],
 <keras_nlp.layers.token_and_position_embedding.TokenAndPositionEmbedding at 0x7efc17887fd0>: ['ro_ber_ta_model_1/token_and_position_embeddings/token_embedding2/embeddings:0',
  'ro_ber_ta_model_1/token_and_position_embeddings/position_embedding2/embeddings:0'],
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc17872c10>: ['ro_ber_ta_model_1/transformer/layer_10/multi_head_attention_10/query/kernel:0',
  'ro_ber_ta_model_1/transformer/layer_10/multi_head_attention_10/query/bias:0',
  'ro_ber_ta_model_1/transformer/layer_10/multi_head_attention_10/key/kernel:0',
  'ro_ber_ta_model_1/transformer/layer_10/multi_head_attention_10/key/bias:0',
  'ro_ber_ta_model_1/transformer/layer_10/multi_head_attention_10/value/kernel:0',


In [None]:
# Display the variable shapes per layer; these are the target shapes the
# converted checkpoint tensors must match.
layer2shape

{<keras.layers.core.dropout.Dropout at 0x7efc17895910>: [],
 <keras.layers.normalization.layer_normalization.LayerNormalization at 0x7efc17895650>: [TensorShape([768]),
  TensorShape([768])],
 <keras_nlp.layers.token_and_position_embedding.TokenAndPositionEmbedding at 0x7efc17887fd0>: [TensorShape([50265, 768]),
  TensorShape([512, 768])],
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc17872c10>: [TensorShape([768, 12, 64]),
  TensorShape([12, 64]),
  TensorShape([768, 12, 64]),
  TensorShape([12, 64]),
  TensorShape([768, 12, 64]),
  TensorShape([12, 64]),
  TensorShape([12, 64, 768]),
  TensorShape([768]),
  TensorShape([768]),
  TensorShape([768]),
  TensorShape([768]),
  TensorShape([768]),
  TensorShape([768, 3072]),
  TensorShape([3072]),
  TensorShape([3072, 768]),
  TensorShape([768])],
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc17872d90>: [TensorShape([768, 12, 64]),
  TensorShape([12, 64]),
  TensorShape([768, 12, 64]),
  TensorShap

In [None]:
# Display the layers in order; the conversion cells below index this list by
# position (0: embeddings, 1: layer norm, 2: dropout, 3+: encoder layers).
model.layers

[<keras_nlp.layers.token_and_position_embedding.TokenAndPositionEmbedding at 0x7efc17887fd0>,
 <keras.layers.normalization.layer_normalization.LayerNormalization at 0x7efc17895650>,
 <keras.layers.core.dropout.Dropout at 0x7efc17895910>,
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc17895bd0>,
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc178aa1d0>,
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc178aa2d0>,
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc178aa650>,
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc178aad10>,
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc178aa9d0>,
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc17896150>,
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc1787bd50>,
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at 0x7efc17872d90>,
 <keras_nlp.layers.transformer_encoder.TransformerEncoder at

## Convert PyTorch Tensor to TF Tensor

### Embedding Layers

In [None]:
# First layer in model.layers is the combined token + position embedding.
tokenandpositionembedding_layer = model.layers[0] #TokenAndPositionEmbedding

In [None]:
# Token-embedding table from the checkpoint, as a NumPy array.
embedding_tensor = ckp['decoder.sentence_encoder.embed_tokens.weight'].numpy()
# Position-embedding table with its first two rows dropped, which leaves the
# 512 rows the Keras layer expects (see the shapes printed below).
# NOTE(review): presumably the dropped rows are fairseq's padding-index
# offset positions — confirm.
position_tensor = ckp['decoder.sentence_encoder.embed_positions.weight'].numpy()[2:, :]
# Show both shapes for comparison with the Keras layer's variable shapes.
embedding_tensor.shape, position_tensor.shape

((50265, 768), (512, 768))

In [None]:
# Order matters: token table first, position table second, matching the
# variable order listed for this layer in layer2name.
tokenandpositionembedding_layer.set_weights([embedding_tensor, position_tensor])

In [None]:
# Second layer in model.layers is the LayerNormalization applied to the embeddings.
embedding_layernorm = model.layers[1]

In [None]:
# Scale (gamma) and offset (beta) of the embedding layer norm. Converted to
# NumPy like every other checkpoint tensor in this notebook, so that
# `set_weights` receives arrays rather than torch tensors.
gamma_tensor = ckp['decoder.sentence_encoder.emb_layer_norm.weight'].numpy()
beta_tensor = ckp['decoder.sentence_encoder.emb_layer_norm.bias'].numpy()

In [None]:
# Order matters: gamma first, beta second, matching the variable order listed
# for this layer in layer2name.
embedding_layernorm.set_weights([gamma_tensor, beta_tensor])

### Transformer Layers

In [None]:
# Copy every encoder layer of the checkpoint into the matching keras_nlp
# TransformerEncoder. Dimensions come from the model's own configuration
# instead of being hard-coded, so the loop stays valid for other model sizes.
size = model.hidden_size
num_heads = model.num_attention_heads
head_dim = size // num_heads

for i in range(model.num_layers):
  # Encoder layers follow the embedding, layer-norm and dropout layers,
  # hence the offset of 3 into model.layers.
  transformer_layer = model.layers[i+3]
  prefix = f'decoder.sentence_encoder.layers.{i}'

  # The checkpoint packs the query/key/value projections into one tensor.
  # Load it once and transpose so the three projections lie side by side
  # along the last axis: [:size] query, [size:2*size] key, [2*size:] value.
  in_proj_weight = ckp[f'{prefix}.self_attn.in_proj_weight'].numpy().T
  in_proj_bias = ckp[f'{prefix}.self_attn.in_proj_bias'].numpy()

  # query
  query_weights = in_proj_weight[:, :size].reshape(size, num_heads, head_dim)
  query_bias = in_proj_bias[:size].reshape(num_heads, head_dim)
  # key
  key_weights = in_proj_weight[:, size:size*2].reshape(size, num_heads, head_dim)
  key_bias = in_proj_bias[size:size*2].reshape(num_heads, head_dim)
  # value
  value_weights = in_proj_weight[:, size*2:size*3].reshape(size, num_heads, head_dim)
  value_bias = in_proj_bias[size*2:size*3].reshape(num_heads, head_dim)
  # attention output: transposed, then split into (heads, head_dim, hidden)
  attention_weight = ckp[f'{prefix}.self_attn.out_proj.weight'].numpy()
  attention_weight = attention_weight.T.reshape(num_heads, head_dim, size)
  attention_bias = ckp[f'{prefix}.self_attn.out_proj.bias'].numpy()
  # layer norms
  layernorm_1_gamma = ckp[f'{prefix}.self_attn_layer_norm.weight'].numpy()
  layernorm_1_beta = ckp[f'{prefix}.self_attn_layer_norm.bias'].numpy()
  layernorm_2_gamma = ckp[f'{prefix}.final_layer_norm.weight'].numpy()
  layernorm_2_beta = ckp[f'{prefix}.final_layer_norm.bias'].numpy()
  # dense: transposed to match the Keras kernel shapes shown in layer2shape
  dense_1_weight = ckp[f'{prefix}.fc1.weight'].numpy().T
  dense_1_bias = ckp[f'{prefix}.fc1.bias'].numpy()
  dense_2_weight = ckp[f'{prefix}.fc2.weight'].numpy().T
  dense_2_bias = ckp[f'{prefix}.fc2.bias'].numpy()

  # The order must match the layer's variable order (see layer2shape):
  # q, k, v, attention output, the two layer norms, the two dense layers.
  weights = [
      query_weights, query_bias, key_weights, key_bias, value_weights, value_bias,
      attention_weight, attention_bias,
      layernorm_1_gamma, layernorm_1_beta, layernorm_2_gamma, layernorm_2_beta,
      dense_1_weight, dense_1_bias, dense_2_weight, dense_2_bias
  ]

  transformer_layer.set_weights(weights)

In [None]:
# Persist the converted encoder weights under the given path prefix.
model.save_weights('drive/MyDrive/tf_roberta_ckp_2')