In [1]:
%env XLA_PYTHON_CLIENT_PREALLOCATE=false
# Enable 64-bit floats/ints in JAX. The flag is set through `jax.config` directly:
# the `jax.config` submodule import path is gone in newer JAX releases, and not importing
# a bare `config` name avoids clashing with the BertConfig stored in `config` further down.
import jax
jax.config.update("jax_enable_x64", True)
from dataclasses import is_dataclass
from jax import numpy as jnp
import transformers
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForMaskedLM, 
    BertTokenizer, 
    BertTokenizerFast, 
    BertEmbeddings,
    BfBertEmbeddings,
    BertConfig,
    BertSelfAttention,
    BfBertSelfAttention,
    BertSelfOutput,
    BfBertSelfOutput,
    BertAttention,
    BfBertAttention,
    BertLayer,
    BfBertLayer,
    BertEncoder,
    BfBertEncoder,
    BaseModelOutputWithPastAndCrossAttentions,
    BfBaseModelOutputWithPastAndCrossAttentions,
    BertForMaskedLM,
    BfBertForMaskedLM,
)
from brunoflow.ad.utils import check_node_equals_tensor, check_node_allclose_tensor
from utils import check_bf_param_weights_match_torch, check_equivalent_class, check_dataclass_keys_match, check_model_outputs_allclose, check_bf_model_outputs_match_torch_outputs, check_bf_param_grads_allclose_torch
torch.manual_seed(0)


env: XLA_PYTHON_CLIENT_PREALLOCATE=false


  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f67759da6d0>

In [2]:
# Build the torch reference model and the Brunoflow (bf) model.
#
# NOTE: BF_FROM_MODEL_ID = True is super hacky and somewhat does not work: from_pretrained
# for Brunoflow is probably somewhat broken, though it is at least a workaround. With it
# enabled the errors also look like they are only bounded by 0.01.
model_id = "google/bert_uncased_L-2_H-128_A-2"  # alternative checkpoint: "bert-base-uncased"
TORCH_FROM_MODEL_ID = True
BF_FROM_MODEL_ID = False
config = BertConfig.from_pretrained(pretrained_model_name_or_path="../../brunoflow/models/bert/config-tiny.json")

# Each model is either loaded from the checkpoint or freshly constructed from `config`.
torch_model = BertForMaskedLM.from_pretrained(model_id) if TORCH_FROM_MODEL_ID else BertForMaskedLM(config)
bf_model = BfBertForMaskedLM.from_pretrained(model_id) if BF_FROM_MODEL_ID else BfBertForMaskedLM(config)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-01-05 00:04:52.619039: E external/org_tensorflow/tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error




In [3]:
# Establish data: tokenize a small two-sentence batch (padded) into torch tensors
tokenizer = BertTokenizerFast.from_pretrained(model_id)
text = [
    "hello I want to eat some [MASK] meat today. It's thanksgiving [MASK] all!",
    "yo yo what's up",
]
tokens = tokenizer(text, return_tensors="pt", padding=True)

# Torch inputs: the token ids plus an all-ones labels tensor of the same shape
input_ids_torch = tokens["input_ids"]
labels_torch = torch.ones_like(input_ids_torch)

# bf inputs: the same two tensors converted torch -> numpy -> jax array
input_ids_bf, labels_bf = (
    jnp.array(torch_tensor.numpy()) for torch_tensor in (input_ids_torch, labels_torch)
)

In [4]:
%%time
# Timed forward pass of the torch model on the tokenized batch; the print shows
# which output container type the model returns.
outputs_torch = torch_model(input_ids_torch)
print(type(outputs_torch))


<class 'transformers.modeling_outputs.MaskedLMOutput'>
CPU times: user 78.4 ms, sys: 0 ns, total: 78.4 ms
Wall time: 14.2 ms


In [5]:
%%time
# Timed forward pass of the bf model on the jax copy of the same token ids; the
# print shows which output container type the model returns.
outputs_bf = bf_model(input_ids_bf)
print(type(outputs_bf))

<class 'transformers.modeling_bf_outputs.BfMaskedLMOutput'>
CPU times: user 1.22 s, sys: 71.1 ms, total: 1.3 s
Wall time: 1.12 s


In [6]:
# Sanity check: the bf forward pass produced output structured like the torch output
if isinstance(outputs_bf, (list, tuple)):
    # Sequence output: same number of entries, and each pair of entries has the same shape
    assert len(outputs_bf) == len(outputs_torch)
    for i, out_bf in enumerate(outputs_bf):
        out_torch = outputs_torch[i]
        assert out_torch.shape == out_bf.shape
elif is_dataclass(outputs_bf):
    # Dataclass output: delegate to the project helpers for class and key comparison
    check_equivalent_class(outputs_bf, outputs_torch)
    check_dataclass_keys_match(outputs_bf, outputs_torch)
else:
    # Anything else is treated as a single object exposing `.shape`
    assert outputs_torch.shape == outputs_bf.shape


In [7]:
# Write the torch model's state dict to disk so it can be loaded into the bf model
save_path = "bertformlm_torch.pt"
torch.save(obj=torch_model.state_dict(), f=save_path)

In [8]:
# Unless the bf model was loaded from the checkpoint itself, read the saved torch
# state dict back from disk and load it into the bf model, so that weights, outputs
# and backprop can be compared between the two models.
if not BF_FROM_MODEL_ID:
    torch_state_dict = torch.load(save_path)
    bf_model.load_state_dict(torch_state_dict)

### Check weights of BF model and Torch model match exactly

In [9]:
# Check weights match
# Compares the parameters of the bf model against those of the torch model; the
# helper reports one "are equal?" line per parameter name.
check_bf_param_weights_match_torch(bf_model, torch_model)

Value of param weight bert.embeddings.word_embeddings.weight for bf and torch are equal? True
Value of param weight bert.embeddings.position_embeddings.weight for bf and torch are equal? True
Value of param weight bert.embeddings.token_type_embeddings.weight for bf and torch are equal? True
Value of param weight bert.embeddings.LayerNorm.weight for bf and torch are equal? True
Value of param weight bert.embeddings.LayerNorm.bias for bf and torch are equal? True
Value of param weight bert.encoder.layer.0.attention.self.query.weight for bf and torch are equal? True
Value of param weight bert.encoder.layer.0.attention.self.query.bias for bf and torch are equal? True
Value of param weight bert.encoder.layer.0.attention.self.key.weight for bf and torch are equal? True
Value of param weight bert.encoder.layer.0.attention.self.key.bias for bf and torch are equal? True
Value of param weight bert.encoder.layer.0.attention.self.value.weight for bf and torch are equal? True
Value of param weight 

### Check model output after forward pass matches for BF and Torch

In [10]:
# Set all dropouts to 0
# Dropout layers are selected by type (torch.nn.Dropout) instead of comparing the string
# returned by the private Module._get_name(), so the check does not depend on a private
# API or on a class merely being named "Dropout". Only the torch model is touched here.
for name, module in torch_model.named_modules():
    if isinstance(module, torch.nn.Dropout):
        print(name, module.p)  # probability before the override
        module.p = 0
        print(name, module.p)  # probability after the override

bert.embeddings.dropout 0.1
bert.embeddings.dropout 0
bert.encoder.layer.0.attention.self.dropout 0.1
bert.encoder.layer.0.attention.self.dropout 0
bert.encoder.layer.0.attention.output.dropout 0.1
bert.encoder.layer.0.attention.output.dropout 0
bert.encoder.layer.0.output.dropout 0.1
bert.encoder.layer.0.output.dropout 0
bert.encoder.layer.1.attention.self.dropout 0.1
bert.encoder.layer.1.attention.self.dropout 0
bert.encoder.layer.1.attention.output.dropout 0.1
bert.encoder.layer.1.attention.output.dropout 0
bert.encoder.layer.1.output.dropout 0.1
bert.encoder.layer.1.output.dropout 0


In [11]:
# Compare the forward-pass outputs of the bf and torch models.
# Both models are taken out of training mode first.
torch_model.eval()
bf_model.train(False)

outputs_bf = bf_model(input_ids_bf)
outputs_torch = torch_model(input_ids_torch)

if isinstance(outputs_bf, (list, tuple)):
    # Sequence output: compare entry by entry with a tight absolute tolerance
    assert len(outputs_bf) == len(outputs_torch)
    for i, out_bf in enumerate(outputs_bf):
        out_torch = outputs_torch[i]
        check_bf_model_outputs_match_torch_outputs(out_bf, out_torch, atol=1e-6)
elif is_dataclass(outputs_bf):
    # Dataclass output: compared as a whole with a much looser tolerance, stats printed
    check_model_outputs_allclose(outputs_bf, outputs_torch, print_stats=True, atol=1e-2)
else:
    # Single output object
    check_bf_model_outputs_match_torch_outputs(outputs_bf, outputs_torch, atol=1e-6)

Checking diff between BF and torch for logits:
Output of bf and torch are within 0.01? True
	Stats on diff in outputs between bf and torch:                   0
count  1.159836e+06
mean   2.621052e-06
std    2.201426e-06
min    1.163514e-13
25%    9.636658e-07
50%    2.073543e-06
75%    3.686615e-06
max    2.491091e-05


### Check grads after backward pass match for BF and Torch

In [12]:
%%time
# Torch backward pass
torch_model.train(True)

if isinstance(outputs_torch, (list, tuple)):
    assert len(outputs_bf) == len(outputs_torch)
    backprop_node_torch = outputs_torch[0]
elif is_dataclass(outputs_torch):
    backprop_node_torch = outputs_torch.logits
else:
    backprop_node_torch = outputs_torch
    
backprop_node_torch.backward(gradient=torch.ones_like(backprop_node_torch))

CPU times: user 112 ms, sys: 0 ns, total: 112 ms
Wall time: 17.6 ms


  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In [13]:
%%time 
# BF backward pass

if isinstance(outputs_bf, (list, tuple)):
    assert len(outputs_bf) == len(outputs_torch)
    backprop_node = outputs_bf[0]
elif is_dataclass(outputs_torch):
    backprop_node = outputs_bf.logits
else:
    backprop_node = outputs_bf
    
backprop_node.backprop(values_to_compute=("grad",))



CPU times: user 11.3 s, sys: 6.45 s, total: 17.8 s
Wall time: 8.85 s


In [20]:
# Run the actual check
# Compares the parameter gradients of the bf model with those of the torch model after
# the two backward passes above, using rtol=6e-2 and atol=1e-2 and printing per-parameter
# results and stats. NOTE(review): use_assert=True presumably makes the helper raise on a
# mismatch instead of only reporting it; confirm in utils.
check_bf_param_grads_allclose_torch(bf_model, torch_model, rtol=6e-2, atol=1e-2, print_output=True, print_stats=True, use_assert=True)

Grad of param bert.embeddings.word_embeddings.weight for bf and torch are within rtol=0.06, atol=0.01? True
Grad of param bert.embeddings.position_embeddings.weight for bf and torch are within rtol=0.06, atol=0.01? True
Grad of param bert.embeddings.token_type_embeddings.weight for bf and torch are within rtol=0.06, atol=0.01? True
Grad of param bert.embeddings.LayerNorm.weight for bf and torch are within rtol=0.06, atol=0.01? True
Grad of param bert.embeddings.LayerNorm.bias for bf and torch are within rtol=0.06, atol=0.01? True
Grad of param bert.encoder.layer.0.attention.self.query.weight for bf and torch are within rtol=0.06, atol=0.01? True
Grad of param bert.encoder.layer.0.attention.self.query.bias for bf and torch are within rtol=0.06, atol=0.01? True
Grad of param bert.encoder.layer.0.attention.self.key.weight for bf and torch are within rtol=0.06, atol=0.01? True
Grad of param bert.encoder.layer.0.attention.self.key.bias for bf and torch are within rtol=0.06, atol=0.01? True
