In [7]:
from bigbird.core import modeling
import tensorflow.compat.v2 as tf
from tqdm import tqdm
from transformers import BigBirdPegasusConfig, BigBirdPegasusModel, BigBirdPegasusForConditionalGeneration, BigBirdPegasusTokenizer, BigBirdConfig, EncoderDecoderConfig, EncoderDecoderModel
from bigbird.summarization.run_summarization import serving_input_fn_builder
import torch
import numpy as np
import os

# Turn on TensorFlow 2.x behaviour for the `compat.v2` module imported above as `tf`.
tf.enable_v2_behavior()

In [8]:
# t = BigBirdPegasusTokenizer("tf_ckpt/spiece.model")
# t.save_pretrained("google/bigbird-pegasus-large-arxiv")
# t = BigBirdPegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")

In [9]:
# o = t("This is a long example input string containing special characters .\n$?-, numbers 2872 234 12 and words.", max_length=30, padding="max_length").input_ids

# print(o)
# # t.convert_ids_to_tokens(o)
# ifn = serving_input_fn_builder(batch_size=1, max_encoder_length=30, vocab_model_file="tf_ckpt/spiece.model", substitute_newline=False)

# ifn()

In [10]:
def difference_between_tensors(tf_tensor, pt_tensor):
    """Return the largest element-wise absolute difference between two tensors.

    Args:
        tf_tensor: TensorFlow tensor (or anything ``np.asarray`` can convert).
        pt_tensor: PyTorch tensor. It is detached from autograd and copied to
            host memory before being converted to NumPy.

    Returns:
        ``max(|tf_tensor - pt_tensor|)`` as a NumPy scalar.

    Raises:
        ValueError: if the two tensors do not have the same shape.
    """
    tf_np = np.asarray(tf_tensor)
    # .cpu() is a no-op for host tensors but is required before NumPy
    # conversion when the tensor lives on an accelerator.
    pt_np = pt_tensor.detach().cpu().numpy()
    if tf_np.shape != pt_np.shape:
        # Without this check NumPy would broadcast the operands and still
        # report a number, hiding the fact that the outputs are not comparable.
        raise ValueError(
            f"shape mismatch: tf tensor {tf_np.shape} vs pt tensor {pt_np.shape}"
        )
    return np.max(np.abs(tf_np - pt_np))

# TensorFlow checkpoint path handed to NewCheckpointReader when the TF weights are loaded.
TF_CKPT_DIR = "tf_ckpt/bigbird-roberta-arxiv/model.ckpt-300000"
# Directory that is joined with "pytorch_model.bin" to locate the PyTorch weights.
# NOTE(review): this looks like a Hub model id but is used as a local directory — confirm it exists on disk.
HF_CKPT_DIR = "google/bigbird-roberta-arxiv"


In [34]:
# `couple_encoder_decoder` switches between the Pegasus variant and the encoder-decoder variant of the model.

# Hyper-parameters handed to the TensorFlow BigBird `TransformerModel`.
# NOTE(review): `attention_type` is "original_full" here, while the HF encoder
# config printed further down reports "block_sparse" — confirm that both sides
# of the comparison use the same attention implementation.
bbc = dict(
    # --- basic transformer settings ---
    couple_encoder_decoder=True,
    vocab_size=50358,
    attention_probs_dropout_prob=0.0,
    hidden_act="gelu",
    hidden_dropout_prob=0.0,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    max_position_embeddings=4096,
    num_attention_heads=12,
    num_hidden_layers=12,
    num_decoder_layer=12,
    type_vocab_size=1,
    use_bias=True,
    rescale_embedding=False,
    scope="bert",
    # --- sparse attention settings ---
    attention_type="original_full",  # alternatives tried: "block_sparse", "original_full"
    norm_type="postnorm",
    block_size=16,
    num_rand_blocks=3,
    # --- sequence / batch settings ---
    max_encoder_length=256,
    max_decoder_length=16,
    batch_size=1,
    beam_size=5,
    alpha=0.1,
)
# hf_bigbird_config = BigBirdConfig(
#         vocab_size=bbc['vocab_size'],
#         hidden_size=768,
#         num_hidden_layers=bbc["num_hidden_layers"],
#         num_attention_heads=bbc['num_attention_heads'],
#         intermediate_size=bbc["intermediate_size"],
#         hidden_act="gelu_fast",
#         hidden_dropout_prob=0.1,
#         attention_probs_dropout_prob=0.1,
#         max_position_embeddings=4096,
#         type_vocab_size=bbc['type_vocab_size'],
#         initializer_range=0.02,
#         layer_norm_eps=1e-12,
#         use_cache=True,
#         rescale_embeddings=False,
#         attention_type=bbc['attention_type'], # only for encoder
#         block_size=bbc['block_size'],
#         num_random_blocks=bbc['num_rand_blocks'],
#         use_bias=bbc['use_bias'],
# )

# Alias, not a copy: `bigbird_config` and `bbc` refer to the same dict object.
bigbird_config = bbc

# b = BigBirdConfig.from_pretrained("google/bigbird-roberta-base")
# hf_config = EncoderDecoderConfig.from_encoder_decoder_configs(b, b)

In [49]:
# Load the "google/bigbird-roberta-base" config, passing the block size and
# number of random blocks from `bbc` as overrides.
c = BigBirdConfig.from_pretrained("google/bigbird-roberta-base", block_size=bbc['block_size'], num_random_blocks=bbc['num_rand_blocks'])
# Load the encoder-decoder config and hand it `c` through the `encoder` keyword.
# NOTE(review): how `from_pretrained` combines the `encoder=` keyword with the stored config is not visible here — check the printed `hf_config.encoder`.
hf_config = EncoderDecoderConfig.from_pretrained("google/bigbird-roberta-arxiv", encoder=c)

In [51]:
# Display the encoder sub-config to check which values the model will be built with.
hf_config.encoder

BigBirdConfig {
  "architectures": [
    "BigBirdForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 16,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_fast",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 4096,
  "model_type": "big_bird",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_random_blocks": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rescale_embeddings": false,
  "sep_token_id": 66,
  "transformers_version": "4.5.0.dev0",
  "type_vocab_size": 2,
  "use_bias": true,
  "use_cache": true,
  "vocab_size": 50358
}

In [52]:
# s1 = batch size, s2 = encoder sequence length, s3 = decoder sequence length.
s1, s2, s3 = (
    bigbird_config[key]
    for key in ("batch_size", "max_encoder_length", "max_decoder_length")
)

# Encoder ids: one seeded draw of integers in [1, s2), shared by both frameworks
# so that the TF and PyTorch models see exactly the same input.
np.random.seed(0)
arr = np.random.randint(1, s2, size=(s1, s2))
input_ids = tf.convert_to_tensor(arr, dtype=tf.int32)
hf_input_ids = torch.from_numpy(arr).long()

# Decoder ids: the seed is reset first, then integers in [1, s3) are drawn the same way.
np.random.seed(0)
arr = np.random.randint(1, s3, size=(s1, s3))
target_ids = tf.convert_to_tensor(arr, dtype=tf.int32)
hf_target_ids = torch.from_numpy(arr).long()

In [53]:
# Build the PyTorch encoder-decoder from `hf_config`, then fill it with the
# converted weights stored under HF_CKPT_DIR.
hf_model = EncoderDecoderModel(config=hf_config)
# map_location="cpu" lets a checkpoint that was saved from GPU tensors load on a
# CPU-only machine; load_state_dict then copies the values into the model.
# NOTE(review): torch.load unpickles the file — only point it at checkpoints you trust.
hf_model.load_state_dict(
    torch.load(os.path.join(HF_CKPT_DIR, "pytorch_model.bin"), map_location="cpu")
)
# Inference only: put the module in evaluation mode and disable gradients on
# every parameter so the comparison with the TF model is forward-pass only.
hf_model.eval()
for p in hf_model.parameters():
    p.requires_grad_(False)

In [55]:
# Show which attention implementation the HF encoder is configured with
# (compare with `attention_type` in `bbc`, which configures the TF model).
hf_model.encoder.config.attention_type

'block_sparse'

In [32]:
# Instantiate the TF model and run one throw-away forward pass; the result is
# deleted immediately.
# presumably this call is what creates the model's variables so that
# set_weights has something to fill — TODO confirm
model = modeling.TransformerModel(bigbird_config)
o = model(input_ids, target_ids=target_ids)
del o

# Look up each trainable variable in the TF checkpoint by name and load the values.
# v.name[:-2] drops the last two characters of the variable name
# (presumably the ":0" suffix — verify against the checkpoint's variable list).
# NOTE(review): set_weights is given one array per *trainable* weight; that only
# lines up if the model has no non-trainable weights — confirm.
ckpt_reader = tf.compat.v1.train.NewCheckpointReader(TF_CKPT_DIR)
model.set_weights([ckpt_reader.get_tensor(v.name[:-2]) for v in tqdm(model.trainable_weights, position=0)])

# The model is used for comparison only, so mark it as not trainable.
model.trainable = False

INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****


In [59]:
# Forward pass of the TF model on the random encoder and decoder ids.
tf_out = model(input_ids, target_ids=target_ids)

INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****
INFO:absl:**** Using original full attention ****


In [57]:
# Forward pass of the HF model on the same ids; the target ids are fed directly
# as decoder inputs.
hf_out = hf_model(input_ids=hf_input_ids, decoder_input_ids=hf_target_ids)

In [60]:
# Element of the TF output that is compared against the HF logits further down
# (presumably the decoder logits — confirm against the TF model's return value).
tf_out[0][1]

<tf.Tensor: shape=(1, 16, 50358), dtype=float32, numpy=
array([[[ 2.7104834e-01, -5.6082445e-01, -2.1321970e-01, ...,
         -4.0912282e-01, -1.3124352e+00, -1.2683020e+00],
        [-2.2986460e-01, -9.4347262e-01,  1.3863336e+00, ...,
          1.8453598e-04, -1.3332489e+00, -1.4688762e+00],
        [ 4.4439459e-01, -1.1039134e+00,  2.9542822e-01, ...,
         -1.5371733e+00, -9.6540868e-01, -8.9439464e-01],
        ...,
        [ 9.7916800e-01, -6.8775547e-01,  1.3556516e-01, ...,
         -1.4633611e+00, -9.5423007e-01, -9.6981692e-01],
        [ 1.0442281e+00, -4.0241051e-01,  1.5293916e-01, ...,
         -1.3055569e+00, -8.4877443e-01, -8.5212266e-01],
        [ 1.0604000e+00, -1.4837348e-01,  1.7223862e-01, ...,
         -1.3593765e+00, -8.5776043e-01, -8.7298346e-01]]], dtype=float32)>

In [62]:
# HF logits, shown for a visual comparison with the TF output above.
hf_out['logits']

tensor([[[ 6.6704e-01,  8.3960e-01,  1.5101e+00,  ..., -1.3282e+00,
           2.5375e-01, -9.3701e-03],
         [ 6.6382e-01,  8.3080e-01,  1.5285e+00,  ..., -1.2450e+00,
           2.7344e-01,  4.9897e-04],
         [ 5.9984e-01,  8.0295e-01,  1.4645e+00,  ..., -1.2910e+00,
           2.4942e-01, -1.5131e-02],
         ...,
         [ 5.9278e-01,  8.0517e-01,  1.4685e+00,  ..., -1.2839e+00,
           2.5876e-01, -1.0901e-02],
         [ 5.9458e-01,  8.0307e-01,  1.4751e+00,  ..., -1.2913e+00,
           2.5816e-01, -1.1425e-02],
         [ 5.9999e-01,  8.1174e-01,  1.4873e+00,  ..., -1.2903e+00,
           2.6024e-01, -9.9632e-03]]])

In [15]:
# a = set([v.name[:-2] for v in model.trainable_variables])
# b = set([b[0] for b in tf.train.list_variables(TF_CKPT_DIR)])

In [16]:
# Report the max absolute TF-vs-HF difference at three points.
# NOTE(review): `encoder_o` is not assigned anywhere in this notebook, and
# `hf_model.model` is not created here either — presumably debug attributes
# added to local copies of the libraries (and a different HF model class);
# confirm before re-running this cell.
print("difference in encoder out", difference_between_tensors(model.encoder_o, hf_model.model.encoder.encoder_o))

# Encoder output as returned by each model's forward pass.
print("difference in encoder out", difference_between_tensors(tf_out[1], hf_out['encoder_last_hidden_state']))

# Final output of each model.
print("difference in final out", difference_between_tensors(tf_out[0][1], hf_out['logits']))

difference in encoder out 0.0009796619
difference in encoder out 0.0009796619
difference in final out 0.0019226074


In [17]:
# Spot-check a small window of the HF logits: first batch item, positions 4-7,
# last-axis indices 128-155.
hf_out.logits[0, 4:8, 128:156]

tensor([[ 3.7736,  0.6459,  5.9393, -2.0550,  1.3957,  1.6994,  1.7002,  4.3194,
          6.7270,  0.8877,  2.7457,  0.3128, -0.3091,  3.7636,  6.4191,  3.2155,
         -0.9953,  7.4407,  3.8938,  0.4070,  3.7436,  3.7248,  5.6073,  3.8378,
         -1.9400,  5.2315,  4.6829,  2.0397],
        [ 3.8075,  0.5993,  5.9881, -1.9268,  1.7395,  1.9801,  1.4785,  4.4040,
          6.9427,  0.6825,  2.8742,  0.7088, -0.6241,  3.3309,  6.5836,  3.2848,
         -1.1375,  7.2144,  4.1101,  0.8657,  3.9520,  3.5079,  5.4696,  3.9301,
         -2.2243,  5.2562,  4.6510,  1.9688],
        [ 3.9393,  0.5811,  6.1118, -1.9829,  1.9584,  2.0622,  1.6118,  4.5815,
          7.1832,  0.6703,  2.9474,  0.8766, -0.7241,  3.3090,  6.7720,  3.4544,
         -1.0948,  7.0197,  4.2286,  1.1543,  4.0334,  3.4939,  5.5613,  4.1545,
         -2.2169,  5.4238,  4.7881,  1.9614],
        [ 4.0004,  0.5768,  6.1671, -2.1092,  2.0556,  2.0222,  1.5487,  4.5812,
          7.2975,  0.7099,  3.0134,  0.9069, -0.8406

In [18]:
# print("difference in embed out", difference_between_tensors(model.embed_o, hf_model.model.encoder.embed_o))

# print("difference in before_attn_o out", difference_between_tensors(model.encoder.encoder_layers[0].before_attn_o, hf_model.model.encoder.layers[0].before_attn_o))

# print("difference in after self_o out", difference_between_tensors(tf.reshape(model.encoder.encoder_layers[0].self_o, (1, 128, 1024)), hf_model.model.encoder.layers[0].self_attn.self_o))

# print("difference in after so_o out", difference_between_tensors(model.encoder.encoder_layers[0].so_o, hf_model.model.encoder.layers[0].self_attn.so_o))

# print("difference in after self-attn out", difference_between_tensors(model.encoder.encoder_layers[0].after_attn_o, hf_model.model.encoder.layers[0].after_attn_o))

# print("difference in before inter out", difference_between_tensors(model.encoder.encoder_layers[0].before_inter_o, hf_model.model.encoder.layers[0].before_inter_o))

# print("difference in after inter out", difference_between_tensors(model.encoder.encoder_layers[0].after_inter_o, hf_model.model.encoder.layers[0].after_inter_o))

# print("difference in output out", difference_between_tensors(model.encoder.encoder_layers[0].output_o, hf_model.model.encoder.layers[0].output_o))

# print("difference in l0 out", difference_between_tensors(model.encoder.l0_o, hf_model.model.encoder.l0_o))

# print("difference in l last out", difference_between_tensors(model.encoder.llast_o, hf_model.model.encoder.llast_o))

# print("difference in ki", difference_between_tensors(model.encoder.encoder_layers[0].attn_layer.ki, hf_model.model.encoder.layers[0].self_attn.self.qi))
# print("difference in qi", difference_between_tensors(model.encoder.encoder_layers[0].attn_layer.qi, hf_model.model.encoder.layers[0].self_attn.self.qi))

# print("difference in qo out", difference_between_tensors(model.encoder.encoder_layers[0].attn_layer.qo, hf_model.model.encoder.layers[0].self_attn.self.qo))

# print("difference in ko out", difference_between_tensors(model.encoder.encoder_layers[0].attn_layer.ko, hf_model.model.encoder.layers[0].self_attn.self.ko))

# print("difference in vo out", difference_between_tensors(model.encoder.encoder_layers[0].attn_layer.vo, hf_model.model.encoder.layers[0].self_attn.self.vo))

In [19]:
# bigbird pegasus large pubmed
# difference in encoder out 0.0002682209
# difference in encoder out 0.0002682209
# difference in final out 0.0008444786

# bigbird pegasus large bigpatent
# difference in encoder out 0.00029605627
# difference in encoder out 0.00029605627
# difference in final out 0.00074386597

# bigbird pegasus large arxiv
# difference in encoder out 0.0005502105
# difference in encoder out 0.0005502105
# difference in final out 0.00051164627


# bigbird pegasus large
# difference in encoder out 0.00011986494
# difference in encoder out 0.00011986494
# difference in final out 0.012252808

In [35]:
from transformers import EncoderDecoderModel, BigBirdModel, EncoderDecoderConfig, BigBirdConfig
import tensorflow as tf

In [38]:
# model = EncoderDecoderModel.from_encoder_decoder_pretrained("google/bigbird-roberta-base", "google/bigbird-roberta-base")

In [1]:
# [k for k in model.state_dict().keys()]

# token_type_embeddings = None
# position_ids = torch.arange(config.max_position_embeddings).expand((1, -1))

In [16]:
# [(c[0], c[1]) for c in tf.train.list_variables("tf_ckpt/bigbird-roberta-arxiv/model.ckpt-300000")]
# encoder.encoder.layer.0

In [22]:
# [k for k in model.state_dict().keys()]

In [42]:

# config

When using `BigBirdForCausalLM` as decoder, then `attention_type` must be `original_full`. Setting `attention_type=original_full`
When using `BigBirdForCausalLM` as decoder, then `attention_type` must be `original_full`. Setting `attention_type=original_full`


In [None]:
# Same encoder inputs, but the target ids are passed as `labels` instead of
# `decoder_input_ids`. This cell has no execution count, i.e. it was not run.
# NOTE(review): this rebinds `hf_out`, replacing the result of the earlier forward pass.
hf_out = hf_model(input_ids=hf_input_ids, labels=hf_target_ids)