From 3b800059e1b797de94d3b36a901172f78dcb54f1 Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Tue, 16 Sep 2025 11:02:32 +0200
Subject: [PATCH 1/3] fix

---
 tests/models/glm4v/test_modeling_glm4v.py | 26 +++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/models/glm4v/test_modeling_glm4v.py b/tests/models/glm4v/test_modeling_glm4v.py
index 5d5e129f7e5a..4523e21a7f95 100644
--- a/tests/models/glm4v/test_modeling_glm4v.py
+++ b/tests/models/glm4v/test_modeling_glm4v.py
@@ -340,6 +340,9 @@ def test_small_model_integration_test(self):

         # verify generation
         inputs = inputs.to(torch_device)
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30)
         EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
         self.assertEqual(
@@ -357,6 +360,9 @@ def test_small_model_integration_test_batch(self):
             batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)

@@ -395,10 +401,15 @@ def test_small_model_integration_test_with_video(self):
         inputs = processor.apply_chat_template(
             messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
         ).to(torch_device)
+
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30)
         EXPECTED_DECODED_TEXT = [
             "\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in the foreground wearing"
         ]  # fmt: skip
+
         self.assertEqual(
             processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
@@ -413,6 +424,9 @@ def test_small_model_integration_test_expand(self):
             self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)

         EXPECTED_DECODED_TEXT = [
@@ -442,6 +456,9 @@ def test_small_model_integration_test_batch_wo_image(self):
             padding=True,
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)

@@ -469,6 +486,9 @@ def test_small_model_integration_test_batch_different_resolutions(self):
             padding=True,
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)

@@ -501,6 +521,9 @@ def test_small_model_integration_test_batch_flashatt2(self):
             padding=True,
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)

@@ -536,6 +559,9 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
             padding=True,
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)


From e03246967207380736da9c92ccacd8983c24a37c Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Tue, 16 Sep 2025 11:46:13 +0200
Subject: [PATCH 2/3] fix

---
 tests/models/glm4v/test_modeling_glm4v.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/tests/models/glm4v/test_modeling_glm4v.py b/tests/models/glm4v/test_modeling_glm4v.py
index 4523e21a7f95..6de0edeaf7ce 100644
--- a/tests/models/glm4v/test_modeling_glm4v.py
+++ b/tests/models/glm4v/test_modeling_glm4v.py
@@ -344,7 +344,7 @@ def test_small_model_integration_test(self):
         torch.manual_seed(42)

         output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
+        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically"
         self.assertEqual(
             self.processor.decode(output[0], skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
@@ -406,9 +406,7 @@ def test_small_model_integration_test_with_video(self):
         torch.manual_seed(42)

         output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = [
-            "\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in the foreground wearing"
-        ]  # fmt: skip
+        EXPECTED_DECODED_TEXT = ["\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in a white shirt"]  # fmt: skip

         self.assertEqual(
             processor.batch_decode(output, skip_special_tokens=True),
@@ -493,8 +491,8 @@ def test_small_model_integration_test_batch_different_resolutions(self):
         output = model.generate(**inputs, max_new_tokens=30)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but",
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -528,8 +526,8 @@ def test_small_model_integration_test_batch_flashatt2(self):
         output = model.generate(**inputs, max_new_tokens=30)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog. Wait, it's a cat,",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),

From b6af8bab9b490fa4ed2c25dab1f85bca206fbfc5 Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Tue, 16 Sep 2025 12:00:05 +0200
Subject: [PATCH 3/3] fix

---
 tests/models/glm4v/test_modeling_glm4v.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/models/glm4v/test_modeling_glm4v.py b/tests/models/glm4v/test_modeling_glm4v.py
index 6de0edeaf7ce..6c3845b10e88 100644
--- a/tests/models/glm4v/test_modeling_glm4v.py
+++ b/tests/models/glm4v/test_modeling_glm4v.py
@@ -282,6 +282,8 @@ def test_inputs_embeds_matches_input_ids(self):
 @require_torch
 class Glm4vIntegrationTest(unittest.TestCase):
     def setUp(self):
+        cleanup(torch_device, gc_collect=True)
+
         self.processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
         self.message = [
             {
@@ -367,8 +369,8 @@ def test_small_model_integration_test_batch(self):
         output = model.generate(**inputs, max_new_tokens=30)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture has a stocky body, thick fur, and a face that's"
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -461,8 +463,8 @@ def test_small_model_integration_test_batch_wo_image(self):
         output = model.generate(**inputs, max_new_tokens=30)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\nGot it, the user is asking "Who are you?" I need to respond appropriately. First, I should clarify that I\'m an AI assistant'
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWho are you?\nGot it, let's look at the user's question: \"Who are you?\" This is a common question when someone is just starting a conversation"
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -564,8 +566,8 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         output = model.generate(**inputs, max_new_tokens=30)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\nGot it, let\'s look at the question. The user is asking "Who are you?" which is a common question when someone meets an AI'
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWho are you?\nGot it, let's look at the user's question: \"Who are you?\" This is a common question when someone is just starting a conversation"
         ]  # fmt: skip
         self.assertEqual(
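
The recurring `torch.manual_seed(42)` in these hunks exists because the GLM-4.1V checkpoint on the Hub ships a generation config with `do_sample=True`: `model.generate` samples tokens, so decoded text drifts between runs unless the torch RNG is pinned immediately before each call. Below is a minimal sketch of that reproducibility pattern. The checkpoint name and the `apply_chat_template` call are taken from the patch; loading via `AutoModelForImageTextToText` and the text-only message are assumptions for illustration, not how the test file itself constructs the model.

import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

# Checkpoint name as used in the patched tests.
processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
# Assumed loading path; the test file may build the model differently.
model = AutoModelForImageTextToText.from_pretrained(
    "THUDM/GLM-4.1V-9B-Thinking", torch_dtype="auto", device_map="auto"
)

messages = [{"role": "user", "content": [{"type": "text", "text": "Who are you?"}]}]
inputs = processor.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
).to(model.device)

# The checkpoint's generation_config has do_sample=True, so seed right before
# each generate call: same seed -> same sampled tokens -> stable expected text.
torch.manual_seed(42)
first = model.generate(**inputs, max_new_tokens=30)
torch.manual_seed(42)
second = model.generate(**inputs, max_new_tokens=30)
assert torch.equal(first, second)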
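
Patch 3 additionally calls `cleanup(torch_device, gc_collect=True)` at the top of `setUp`, so every integration test starts with accelerator memory reclaimed from the previous one. A short sketch of that pattern, assuming `cleanup` and `torch_device` come from `transformers.testing_utils` as in other model test files (the `tearDown` counterpart is a common convention, not shown in this diff):

import unittest

from transformers import AutoProcessor
from transformers.testing_utils import cleanup, torch_device


class Glm4vIntegrationTest(unittest.TestCase):
    def setUp(self):
        # Empty the device cache and force a gc pass before loading anything,
        # so leftovers from the previous test cannot cause OOM here.
        cleanup(torch_device, gc_collect=True)
        self.processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")

    def tearDown(self):
        cleanup(torch_device, gc_collect=True)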