From 3b800059e1b797de94d3b36a901172f78dcb54f1 Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Tue, 16 Sep 2025 11:02:32 +0200
Subject: [PATCH 1/3] fix

---
 tests/models/glm4v/test_modeling_glm4v.py | 26 +++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/models/glm4v/test_modeling_glm4v.py b/tests/models/glm4v/test_modeling_glm4v.py
index 5d5e129f7e5a..4523e21a7f95 100644
--- a/tests/models/glm4v/test_modeling_glm4v.py
+++ b/tests/models/glm4v/test_modeling_glm4v.py
@@ -340,6 +340,9 @@ def test_small_model_integration_test(self):

         # verify generation
         inputs = inputs.to(torch_device)
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30)
         EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
         self.assertEqual(
@@ -357,6 +360,9 @@ def test_small_model_integration_test_batch(self):
             batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)

@@ -395,10 +401,15 @@ def test_small_model_integration_test_with_video(self):
         inputs = processor.apply_chat_template(
             messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
         ).to(torch_device)
+
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30)
         EXPECTED_DECODED_TEXT = [
             "\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in the foreground wearing"
         ]  # fmt: skip
+
         self.assertEqual(
             processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
@@ -413,6 +424,9 @@ def test_small_model_integration_test_expand(self):
             self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)

         EXPECTED_DECODED_TEXT = [
@@ -442,6 +456,9 @@ def test_small_model_integration_test_batch_wo_image(self):
             padding=True,
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)

@@ -469,6 +486,9 @@ def test_small_model_integration_test_batch_different_resolutions(self):
             padding=True,
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)

@@ -501,6 +521,9 @@ def test_small_model_integration_test_batch_flashatt2(self):
             padding=True,
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)

@@ -536,6 +559,9 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
             padding=True,
         ).to(torch_device)

+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)


From e03246967207380736da9c92ccacd8983c24a37c Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Tue, 16 Sep 2025 11:46:13 +0200
Subject: [PATCH 2/3] fix

---
 tests/models/glm4v/test_modeling_glm4v.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/tests/models/glm4v/test_modeling_glm4v.py b/tests/models/glm4v/test_modeling_glm4v.py
index 4523e21a7f95..6de0edeaf7ce 100644
--- a/tests/models/glm4v/test_modeling_glm4v.py
+++ b/tests/models/glm4v/test_modeling_glm4v.py
@@ -344,7 +344,7 @@ def test_small_model_integration_test(self):
         torch.manual_seed(42)

         output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
+        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically"
         self.assertEqual(
             self.processor.decode(output[0], skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
@@ -406,9 +406,7 @@ def test_small_model_integration_test_with_video(self):
         torch.manual_seed(42)

         output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = [
-            "\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in the foreground wearing"
-        ]  # fmt: skip
+        EXPECTED_DECODED_TEXT = ["\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in a white shirt"]  # fmt: skip

         self.assertEqual(
             processor.batch_decode(output, skip_special_tokens=True),
@@ -493,8 +491,8 @@ def test_small_model_integration_test_batch_different_resolutions(self):
         output = model.generate(**inputs, max_new_tokens=30)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but",
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -528,8 +526,8 @@ def test_small_model_integration_test_batch_flashatt2(self):
         output = model.generate(**inputs, max_new_tokens=30)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog. Wait, it's a cat,",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),

From b6af8bab9b490fa4ed2c25dab1f85bca206fbfc5 Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Tue, 16 Sep 2025 12:00:05 +0200
Subject: [PATCH 3/3] fix

---
 tests/models/glm4v/test_modeling_glm4v.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/models/glm4v/test_modeling_glm4v.py b/tests/models/glm4v/test_modeling_glm4v.py
index 6de0edeaf7ce..6c3845b10e88 100644
--- a/tests/models/glm4v/test_modeling_glm4v.py
+++ b/tests/models/glm4v/test_modeling_glm4v.py
@@ -282,6 +282,8 @@ def test_inputs_embeds_matches_input_ids(self):
 @require_torch
 class Glm4vIntegrationTest(unittest.TestCase):
     def setUp(self):
+        cleanup(torch_device, gc_collect=True)
+
         self.processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
         self.message = [
             {
@@ -367,8 +369,8 @@ def test_small_model_integration_test_batch(self):
         output = model.generate(**inputs, max_new_tokens=30)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture has a stocky body, thick fur, and a face that's"
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -461,8 +463,8 @@ def test_small_model_integration_test_batch_wo_image(self):
         output = model.generate(**inputs, max_new_tokens=30)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\nGot it, the user is asking "Who are you?" I need to respond appropriately. First, I should clarify that I\'m an AI assistant'
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWho are you?\nGot it, let's look at the user's question: \"Who are you?\" This is a common question when someone is just starting a conversation"
         ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -564,8 +566,8 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         output = model.generate(**inputs, max_new_tokens=30)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\nGot it, let\'s look at the question. The user is asking "Who are you?" which is a common question when someone meets an AI'
+            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWho are you?\nGot it, let's look at the user's question: \"Who are you?\" This is a common question when someone is just starting a conversation"
         ]  # fmt: skip
         self.assertEqual(
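
The recurring `torch.manual_seed(42)` in these hunks exists because the GLM-4.1V checkpoint on the Hub ships a generation config with `do_sample=True`: `model.generate` samples tokens, so decoded text drifts between runs unless the torch RNG is pinned immediately before each call. Below is a minimal sketch of that reproducibility pattern. The checkpoint name and the `apply_chat_template` call are taken from the patch; loading via `AutoModelForImageTextToText` and the text-only message are assumptions for illustration, not how the test file itself constructs the model.

import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

# Checkpoint name as used in the patched tests.
processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
# Assumed loading path; the test file may build the model differently.
model = AutoModelForImageTextToText.from_pretrained(
    "THUDM/GLM-4.1V-9B-Thinking", torch_dtype="auto", device_map="auto"
)

messages = [{"role": "user", "content": [{"type": "text", "text": "Who are you?"}]}]
inputs = processor.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
).to(model.device)

# The checkpoint's generation_config has do_sample=True, so seed right before
# each generate call: same seed -> same sampled tokens -> stable expected text.
torch.manual_seed(42)
first = model.generate(**inputs, max_new_tokens=30)
torch.manual_seed(42)
second = model.generate(**inputs, max_new_tokens=30)
assert torch.equal(first, second)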
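
Patch 3 additionally calls `cleanup(torch_device, gc_collect=True)` at the top of `setUp`, so every integration test starts with accelerator memory reclaimed from the previous one. A short sketch of that pattern, assuming `cleanup` and `torch_device` come from `transformers.testing_utils` as in other model test files (the `tearDown` counterpart is a common convention, not shown in this diff):

import unittest

from transformers import AutoProcessor
from transformers.testing_utils import cleanup, torch_device


class Glm4vIntegrationTest(unittest.TestCase):
    def setUp(self):
        # Empty the device cache and force a gc pass before loading anything,
        # so leftovers from the previous test cannot cause OOM here.
        cleanup(torch_device, gc_collect=True)
        self.processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")

    def tearDown(self):
        cleanup(torch_device, gc_collect=True)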