TF: rework XLA generate tests #16866

Merged 5 commits on Apr 22, 2022
Changes from 4 commits
91 changes: 44 additions & 47 deletions tests/gpt2/test_modeling_tf_gpt2.py
@@ -16,7 +16,7 @@
import unittest

from transformers import GPT2Config, is_tf_available
from transformers.testing_utils import require_tf, slow
from transformers.testing_utils import get_gpu_count, require_tf, slow

from ..test_configuration_common import ConfigTester
from ..test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
@@ -294,21 +294,6 @@ def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask
result = model(inputs)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))

def create_and_check_gpt2_xla_generate(self, config, input_ids, *args):
config.eos_token_id = None
config.max_length = 10
model = TFGPT2LMHeadModel(config=config)

# make sure there are no pad tokens in prompt
input_ids = tf.where(input_ids != config.pad_token_id, input_ids, config.pad_token_id - 1)

generated = model.generate(input_ids)

generate_xla = tf.function(model.generate, jit_compile=True)
generated_xla = generate_xla(input_ids)

self.parent.assertListEqual(generated.numpy().tolist(), generated_xla.numpy().tolist())

def create_and_check_gpt2_double_head(
self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
):
@@ -408,10 +393,6 @@ def test_gpt2_lm_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs)

def test_gpt2_xla_generate(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_xla_generate(*config_and_inputs)

def test_gpt2_double_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
@@ -536,41 +517,57 @@ def test_lm_generate_greedy_distilgpt2_beam_search_special(self):
self.assertListEqual(output_strings, expected_output_string)

@slow
def test_lm_generate_gpt2(self):
@unittest.skipIf(not get_gpu_count(), "XLA not reliable on CPU")
# TODO: remove the skip when the XLA CPU softmax issue gets sorted
def test_lm_generate_gpt2_greedy_xla(self):
# TODO (Joao): convert this to an example with a batch size>1 with different input lengths that works (and fix
# the underlying problem)
model = TFGPT2LMHeadModel.from_pretrained("gpt2")
input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32) # The dog
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# The dog was found in a field near the intersection of West and West Streets.\n\nThe dog
# fmt: off
expected_output_ids = [464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290]
# fmt: on
output_ids = model.generate(input_ids, do_sample=False)
self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

@slow
def test_lm_generate_gpt2_xla_greedy(self):
"""This test gives the exact same results as the non-xla test above"""
model = TFGPT2LMHeadModel.from_pretrained("gpt2")
input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32) # The dog
sentences = ["The dog"]
expected_output_strings = [
"The dog was found in a field near the intersection of West and West Streets.\n\nThe dog",
]
input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids

# The dog was found in a field near the intersection of West and West Streets.\n\nThe dog
# fmt: off
expected_output_ids = [464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290]
# fmt: on
xla_generate = tf.function(model.generate, jit_compile=True)
output_ids = model.generate(input_ids, do_sample=False)
output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
self.assertListEqual(output_strings, expected_output_strings)

xla_generate = tf.function(model.generate, jit_compile=True)
output_ids = xla_generate(input_ids, do_sample=False)
self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
self.assertListEqual(output_strings, expected_output_strings)

@slow
def test_lm_generate_gpt2_xla_sample(self):
@unittest.skipIf(not get_gpu_count(), "XLA not reliable on CPU")
# TODO: remove the skip when the XLA CPU softmax issue gets sorted
def test_lm_generate_gpt2_sample_xla(self):
# NOTE: due to the small numerical differences that are natural when we compile to XLA, sampling the same
# output out of the same seed is far from guaranteed. We can, however, confirm that the results are sensible
# and that we can seed both versions.
model = TFGPT2LMHeadModel.from_pretrained("gpt2")
input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32) # The dog
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# fmt: off
expected_output_ids = [464, 3290, 550, 284, 307, 4376, 287, 281, 4044, 1363, 329, 734, 812, 878, 852, 4376, 757, 329, 2267, 0]
# fmt: on
xla_generate = tf.function(model.generate, jit_compile=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

output_ids = xla_generate(input_ids, do_sample=True, seed=[42, 0])
self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
sentence = ["The dog"]
expected_output_string = [
"The dog must be well educated to do anything. If anything, this must be her best friend"
]
expected_output_string_xla = ["The dog has been named in connection with the murder of a 20-year-old man in!"]
input_ids = tokenizer(sentence, return_tensors="tf", padding=True).input_ids

output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0])
output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
self.assertListEqual(output_strings, expected_output_string)

xla_generate = tf.function(model.generate, jit_compile=True)
output_ids = xla_generate(input_ids, do_sample=True, seed=[7, 0])
output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
self.assertListEqual(output_strings, expected_output_string_xla)
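
For context, the pattern these GPT-2 integration tests exercise is: tokenize with left padding, generate eagerly, then generate through a jit-compiled tf.function and compare. Below is a minimal standalone sketch of that pattern, assuming transformers and TensorFlow are installed and a GPU is available (per the skip decorator above); the prompt is illustrative, not part of the PR.

    import tensorflow as tf
    from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")

    # decoder-only models are padded on the left so generation continues from the last real token
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    input_ids = tokenizer(["The dog"], return_tensors="tf", padding=True).input_ids

    # eager greedy generation
    eager_ids = model.generate(input_ids, do_sample=False)

    # XLA-compiled greedy generation: wrap generate in a jit-compiled tf.function
    xla_generate = tf.function(model.generate, jit_compile=True)
    xla_ids = xla_generate(input_ids, do_sample=False)

    # greedy decoding is expected to match between eager and XLA
    assert tokenizer.batch_decode(eager_ids, skip_special_tokens=True) == \
        tokenizer.batch_decode(xla_ids, skip_special_tokens=True)
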
67 changes: 26 additions & 41 deletions tests/t5/test_modeling_tf_t5.py
@@ -16,7 +16,7 @@
import unittest

from transformers import T5Config, is_tf_available
from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
from transformers.testing_utils import get_gpu_count, require_sentencepiece, require_tf, require_tokenizers, slow
from transformers.utils import cached_property

from ..test_configuration_common import ConfigTester
@@ -227,23 +227,6 @@ def create_and_check_t5_decoder_model_past_large_inputs(
# test that outputs are equal for slice
tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)

def create_and_check_t5_xla_generate(self, config, input_ids, *args):
config.eos_token_id = None
config.max_length = 10
config.do_sample = False
config.num_beams = 1
model = TFT5ForConditionalGeneration(config=config)

# make sure there are no pad tokens in prompt
input_ids = tf.where(input_ids != config.pad_token_id, input_ids, config.pad_token_id + 5)

generated = model.generate(input_ids)

generate_xla = tf.function(model.generate, jit_compile=True)
generated_xla = generate_xla(input_ids)

self.parent.assertListEqual(generated.numpy().tolist(), generated_xla.numpy().tolist())

def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, token_labels) = config_and_inputs
@@ -297,10 +280,6 @@ def test_t5_decoder_model_past_large_inputs(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_t5_decoder_model_past_large_inputs(*config_and_inputs)

def test_t5_model_xla_generate(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_t5_xla_generate(*config_and_inputs)

def test_model_common_attributes(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -481,12 +460,18 @@ def test_train_pipeline_custom_model(self):
@require_tokenizers
class TFT5GenerationIntegrationTests(unittest.TestCase):
@slow
@unittest.skipIf(not get_gpu_count(), "XLA not reliable on CPU")
# TODO: remove the skip when the XLA CPU softmax issue gets sorted
def test_greedy_xla_generate_simple(self):
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

sentence = "Translate English to German: Today is a beautiful day."
input_ids = tokenizer(sentence, return_tensors="tf", padding=True).input_ids
# two examples with different lengths to confirm that attention masks are operational in XLA
sentences = [
"Translate English to German: Today is a beautiful day.",
"Translate English to German: I have four cats, three dogs, two birds, and a horse.",
]
input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids

xla_generate = tf.function(model.generate, jit_compile=True)

@@ -496,7 +481,10 @@ def test_greedy_xla_generate_simple(self):
output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
output_strings_xla = tokenizer.batch_decode(output_ids_xla, skip_special_tokens=True)

expected_output_string = ["Heute ist ein sch枚ner Tag."]
expected_output_string = [
"Heute ist ein sch枚ner Tag.",
"Ich habe vier Katzen, drei Hunde, zwei V枚gel und ein Pferd.",
]
Comment on lines +505 to +508

Member: Are we guaranteed that the logits are stable enough that we'll always sample this exact output? A flaky test can be really annoying!

Member (author): We haven't had problems with similar tests, so I'm assuming we won't have problems :D

Member: ...fair point!

self.assertListEqual(expected_output_string, output_strings)
self.assertListEqual(expected_output_string, output_strings_xla)
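
A side note on the two different-length prompts used above: padding the batch produces an attention mask that XLA-compiled generation must respect, which is what the test is probing. A minimal sketch under the same assumptions as the test (t5-small checkpoint, GPU available); passing attention_mask explicitly is an illustrative addition here, the test itself relies on the default handling derived from the pad token.

    import tensorflow as tf
    from transformers import T5Tokenizer, TFT5ForConditionalGeneration

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = TFT5ForConditionalGeneration.from_pretrained("t5-small")

    sentences = [
        "Translate English to German: Today is a beautiful day.",
        "Translate English to German: I have four cats, three dogs, two birds, and a horse.",
    ]
    # padding=True pads the shorter prompt up to the longest one in the batch
    batch = tokenizer(sentences, return_tensors="tf", padding=True)

    xla_generate = tf.function(model.generate, jit_compile=True)
    # the attention mask marks padded positions so they are ignored during encoding
    output_ids = xla_generate(batch.input_ids, attention_mask=batch.attention_mask)
    print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
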
@@ -525,31 +513,28 @@ def test_greedy_generate(self):
self.assertListEqual(expected_output_string, output_strings)

@slow
@unittest.skipIf(not get_gpu_count(), "XLA not reliable on CPU")
# TODO: remove the skip when the XLA CPU softmax issue gets sorted
def test_sample_xla_generate_simple(self):
# NOTE: due to the small numerical differences that are natural when we compile to XLA, sampling the same
# output out of the same seed is far from guaranteed (unlike this example). We can, however, confirm that the
# results are sensible and that we can seed both versions.
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

sentence = "Translate English to German: Today is a beautiful day."
sentence = "Translate English to German: I have two bananas"
input_ids = tokenizer(sentence, return_tensors="tf", padding=True).input_ids
# XLA reorders ops, which causes operations like FP matmul to have slightly different results, causing
# divergences in generate -- especially with sampling.
expected_output_string = ["Heute ist ein schöner Tag."]
expected_output_string_xla = ["Heute ist ein schöne Tage."]
# However, notice that the first tokens are the same, for the same seed
assert expected_output_string[0][:15] == expected_output_string_xla[0][:15]
expected_output_string = ["Ich habe 2 Bananen"]
expected_output_string_xla = ["Ich habe 2 Bananen"]

# forces the generation to happen on CPU, to avoid GPU-related quirks
with tf.device(":/CPU:0"):
# seed set -> deterministic sampling sequence -> deterministic generation
output_ids = model.generate(input_ids, do_sample=True, seed=[42, 0])
# seed set -> deterministic sampling sequence -> deterministic generation
output_ids = model.generate(input_ids, do_sample=True, seed=[42, 0])
output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
self.assertListEqual(expected_output_string, output_strings)

# forces the generation to happen on CPU, to avoid GPU-related quirks
with tf.device(":/CPU:0"):
xla_generate = tf.function(model.generate, jit_compile=True)
# seed set -> deterministic sampling sequence -> deterministic generation
output_ids_xla = xla_generate(input_ids, do_sample=True, seed=[42, 0])
xla_generate = tf.function(model.generate, jit_compile=True)
# seed set -> deterministic sampling sequence -> deterministic generation
output_ids_xla = xla_generate(input_ids, do_sample=True, seed=[42, 0])
output_strings_xla = tokenizer.batch_decode(output_ids_xla, skip_special_tokens=True)
self.assertListEqual(expected_output_string_xla, output_strings_xla)
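
The sampling tests lean on a property worth spelling out: with a fixed seed, each execution mode is deterministic with itself, even though eager and XLA outputs may diverge from each other because XLA reorders floating-point ops. A minimal sketch of that property, under the same assumptions as the test (t5-small checkpoint, GPU available); the repeated calls are illustrative.

    import tensorflow as tf
    from transformers import T5Tokenizer, TFT5ForConditionalGeneration

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
    input_ids = tokenizer("Translate English to German: I have two bananas", return_tensors="tf").input_ids

    # same seed -> same sampled sequence, run after run, in eager mode
    eager_a = model.generate(input_ids, do_sample=True, seed=[42, 0])
    eager_b = model.generate(input_ids, do_sample=True, seed=[42, 0])
    tf.debugging.assert_equal(eager_a, eager_b)

    # same seed -> same sampled sequence, run after run, under XLA
    xla_generate = tf.function(model.generate, jit_compile=True)
    xla_a = xla_generate(input_ids, do_sample=True, seed=[42, 0])
    xla_b = xla_generate(input_ids, do_sample=True, seed=[42, 0])
    tf.debugging.assert_equal(xla_a, xla_b)

    # eager_a and xla_a may still differ from each other, which is why the test keeps
    # separate expected strings for the eager and XLA paths
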
