diff --git a/tests/models/glm4v/test_modeling_glm4v.py b/tests/models/glm4v/test_modeling_glm4v.py
index 5d5e129f7e5a..6c3845b10e88 100644
--- a/tests/models/glm4v/test_modeling_glm4v.py
+++ b/tests/models/glm4v/test_modeling_glm4v.py
@@ -282,6 +282,8 @@ def test_inputs_embeds_matches_input_ids(self):
 @require_torch
 class Glm4vIntegrationTest(unittest.TestCase):
     def setUp(self):
+        cleanup(torch_device, gc_collect=True)
+
         self.processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
         self.message = [
             {
@@ -340,8 +342,11 @@ def test_small_model_integration_test(self):
         # verify generation
         inputs = inputs.to(torch_device)
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
+        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically"
         self.assertEqual(
             self.processor.decode(output[0], skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
@@ -357,12 +362,15 @@ def test_small_model_integration_test_batch(self):
             batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         ).to(torch_device)
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture has a stocky body, thick fur, and a face that's"
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -395,10 +403,13 @@ def test_small_model_integration_test_with_video(self):
         inputs = processor.apply_chat_template(
             messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
         ).to(torch_device)
+
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = [
-            "\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in the foreground wearing"
-        ]  # fmt: skip
+        EXPECTED_DECODED_TEXT = ["\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in a white shirt"]  # fmt: skip
+
         self.assertEqual(
             processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
@@ -413,6 +424,9 @@ def test_small_model_integration_test_expand(self):
             self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         ).to(torch_device)
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)
         EXPECTED_DECODED_TEXT = [
@@ -442,12 +456,15 @@ def test_small_model_integration_test_batch_wo_image(self):
             padding=True,
         ).to(torch_device)
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\nGot it, the user is asking "Who are you?" I need to respond appropriately. First, I should clarify that I\'m an AI assistant'
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWho are you?\nGot it, let's look at the user's question: \"Who are you?\" This is a common question when someone is just starting a conversation"
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -469,12 +486,15 @@ def test_small_model_integration_test_batch_different_resolutions(self):
             padding=True,
         ).to(torch_device)
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but",
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -501,12 +521,15 @@ def test_small_model_integration_test_batch_flashatt2(self):
             padding=True,
         ).to(torch_device)
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog. Wait, it's a cat,",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -536,12 +559,15 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
             padding=True,
         ).to(torch_device)
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\nGot it, let\'s look at the question. The user is asking "Who are you?" which is a common question when someone meets an AI'
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWho are you?\nGot it, let's look at the user's question: \"Who are you?\" This is a common question when someone is just starting a conversation"
         ]  # fmt: skip
         self.assertEqual(
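For context on the recurring pattern in these hunks: the GLM-4.1V checkpoint ships a `generation_config` with `do_sample=True`, so `model.generate` samples from the token distribution, and the integration tests can only pin exact `EXPECTED_DECODED_TEXT` strings if the global RNG is seeded immediately before each call. A minimal sketch of the idea, using `"gpt2"` as a hypothetical stand-in checkpoint (unlike the GLM model, it does not ship `do_sample=True`, so the flag is passed explicitly here):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Small stand-in checkpoint for illustration only; the tests above use
# THUDM/GLM-4.1V-9B-Thinking, which is far too large for a quick demo.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("What kind of dog is this?", return_tensors="pt")

# Re-seeding the global RNG right before each sampled generation makes the
# output reproducible, which is what lets a test assert an exact decoded string.
torch.manual_seed(42)
out_a = model.generate(**inputs, max_new_tokens=30, do_sample=True)
torch.manual_seed(42)
out_b = model.generate(**inputs, max_new_tokens=30, do_sample=True)
assert torch.equal(out_a, out_b)  # identical tokens on both runs
```

The `cleanup(torch_device, gc_collect=True)` call added to `setUp` is the helper from `transformers.testing_utils`; it empties the accelerator cache (and optionally runs Python garbage collection) between integration tests so that memory held by one test does not affect the next.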