diff --git a/tests/models/glm4v_moe/test_modeling_glm4v_moe.py b/tests/models/glm4v_moe/test_modeling_glm4v_moe.py
index 83a2128c39de..dff5ea7074af 100644
--- a/tests/models/glm4v_moe/test_modeling_glm4v_moe.py
+++ b/tests/models/glm4v_moe/test_modeling_glm4v_moe.py
@@ -14,7 +14,6 @@
 """Testing suite for the PyTorch GLM-4.1V model."""
 
 import copy
-import gc
 import unittest
 
 from transformers import (
@@ -25,9 +24,11 @@
     is_torch_available,
 )
 from transformers.testing_utils import (
+    cleanup,
     require_flash_attn,
     require_torch,
     require_torch_gpu,
+    run_first,
     slow,
     torch_device,
 )
@@ -295,8 +296,26 @@ def test_inputs_embeds_matches_input_ids(self):
 
 @require_torch
 class Glm4vMoeIntegrationTest(unittest.TestCase):
+    model = None
+
+    @classmethod
+    def get_model(cls):
+        if cls.model is None:
+            cls.model = Glm4vMoeForConditionalGeneration.from_pretrained(
+                "zai-org/GLM-4.5V", dtype="auto", device_map="auto"
+            )
+        return cls.model
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.model
+        cleanup(torch_device, gc_collect=True)
+
     def setUp(self):
-        self.processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V")
+        cleanup(torch_device, gc_collect=True)
+        self.processor = AutoProcessor.from_pretrained(
+            "zai-org/GLM-4.5V", size={"shortest_edge": 10800, "longest_edge": 10800}
+        )
         self.message = [
             {
                 "role": "user",
@@ -321,130 +340,56 @@ def setUp(self):
                 ],
             }
         ]
+        self.message_wo_image = [
+            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
+        ]
+
+        question = "Describe this video."
+        video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
+        self.video_messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "video": video_url,
+                    },
+                    {"type": "text", "text": question},
+                ],
+            }
+        ]
 
     def tearDown(self):
-        gc.collect()
-        torch.cuda.empty_cache()
+        cleanup(torch_device, gc_collect=True)
 
     @slow
     def test_small_model_integration_test(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-
         inputs = self.processor.apply_chat_template(
             self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         )
-        expected_input_ids = [151331, 151333, 151336, 198, 151339, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343]  # fmt: skip
+        expected_input_ids = [151331, 151333, 151336, 198, 151339, 151363, 151363, 151363, 151363, 151363, 151363, 151340, 3838, 3093, 315, 5562, 374]  # fmt: skip
         assert expected_input_ids == inputs.input_ids[0].tolist()[:17]
 
         expected_pixel_slice = torch.tensor(
             [
-                [-0.0988, -0.0842, -0.0842],
-                [-0.5660, -0.5514, -0.4200],
-                [-0.0259, -0.0259, -0.0259],
-                [-0.1280, -0.0988, -0.2010],
-                [-0.4638, -0.5806, -0.6974],
-                [-1.2083, -1.2229, -1.2083],
+                [-0.1134, -0.4492, -0.8580],
+                [-0.6244, -1.1645, -0.7120],
+                [-0.3324, -0.7996, -0.7120],
+                [0.2077, 0.2223, 0.4121],
+                [0.4413, 0.1931, 0.4559],
+                [0.5873, 0.3099, 0.4851],
             ],
             dtype=torch.float32,
            device="cpu",
        )
-        assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3)
-
-        # verify generation
-        inputs = inputs.to(torch_device)
-
-        output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
-        self.assertEqual(
-            self.processor.decode(output[0], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
+        torch.testing.assert_close(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=1e-4, rtol=1e-4)
 
     @slow
     def test_small_model_integration_test_batch(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        batch_messages = [self.message] * 2
+        model = self.get_model()
+        batch_messages = [self.message, self.message2, self.message_wo_image]
         inputs = self.processor.apply_chat_template(
-            batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device)
-
-        # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
-        ]  # fmt: skip
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    def test_small_model_integration_test_with_video(self):
-        processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
-        model = Glm4vMoeForConditionalGeneration.from_pretrained(
-            "zai-org/GLM-4.5V", dtype=torch.float16, device_map="auto"
-        )
-        questions = ["Describe this video."] * 2
-        video_urls = [
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
-        ] * 2
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "video",
-                            "video": video_url,
-                        },
-                        {"type": "text", "text": question},
-                    ],
-                }
-            ]
-            for question, video_url in zip(questions, video_urls)
-        ]
-        inputs = processor.apply_chat_template(
-            messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
-        ).to(torch_device)
-        output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = [
-            "\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami",
-            "\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami"
-        ]  # fmt: skip
-        self.assertEqual(
-            processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    def test_small_model_integration_test_expand(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        inputs = self.processor.apply_chat_template(
-            self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device)
-
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat, specifically"
-        ]  # fmt: skip
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    def test_small_model_integration_test_batch_wo_image(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        message_wo_image = [
-            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
-        ]
-        batched_messages = [self.message, message_wo_image]
-        inputs = self.processor.apply_chat_template(
-            batched_messages,
+            batch_messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
@@ -453,42 +398,43 @@ def test_small_model_integration_test_batch_wo_image(self):
         ).to(torch_device)
 
         # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
+        output = model.generate(**inputs, max_new_tokens=10)
 
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\nGot it, the user is asking "Who are you?" I need to respond appropriately. First, I should clarify that I\'m an AI assistant'
+            "\nWhat kind of dog is this?\nGot it, let's try to figure out",
+            "\nWhat kind of dog is this?\nGot it, let's see. The user",
+            '\nWho are you?\nThe user is asking "Who are you?"'
         ]  # fmt: skip
+        decoded = self.processor.batch_decode(output, skip_special_tokens=True)
+        decoded = [x.replace("<|image|>", "") for x in decoded]
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
+            decoded,
             EXPECTED_DECODED_TEXT,
         )
 
     @slow
-    def test_small_model_integration_test_batch_different_resolutions(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        batched_messages = [self.message, self.message2]
-        inputs = self.processor.apply_chat_template(
-            batched_messages,
+    def test_small_model_integration_test_with_video(self):
+        processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
+        model = self.get_model()
+        batch_messages = [self.video_messages]
+        inputs = processor.apply_chat_template(
+            batch_messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt",
             padding=True,
         ).to(torch_device)
-
-        # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
-        ]  # fmt: skip
+        output = model.generate(**inputs, max_new_tokens=3)
+        EXPECTED_DECODED_TEXT = ["\n012345Describe this video.\nGot it"]  # fmt: skip
+        decoded = processor.batch_decode(output, skip_special_tokens=True)
+        decoded = [x.replace("<|image|>", "") for x in decoded]
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
+            decoded,
            EXPECTED_DECODED_TEXT,
        )
 
+    @run_first
     @slow
     @require_flash_attn
     @require_torch_gpu
@@ -499,44 +445,9 @@ def test_small_model_integration_test_batch_flashatt2(self):
             attn_implementation="flash_attention_2",
             device_map="auto",
         )
-        batched_messages = [self.message, self.message2]
-        inputs = self.processor.apply_chat_template(
-            batched_messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            return_tensors="pt",
-            padding=True,
-        ).to(torch_device)
-
-        # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture has a stocky build, thick fur, and a face that's",
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
-        ]  # fmt: skip
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    @require_flash_attn
-    @require_torch_gpu
-    def test_small_model_integration_test_batch_wo_image_flashatt2(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained(
-            "zai-org/GLM-4.5V",
-            dtype=torch.bfloat16,
-            attn_implementation="flash_attention_2",
-            device_map="auto",
-        )
-        message_wo_image = [
-            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
-        ]
-        batched_messages = [self.message, message_wo_image]
+        batch_messages = [self.message, self.message2, self.message_wo_image]
         inputs = self.processor.apply_chat_template(
-            batched_messages,
+            batch_messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
@@ -545,14 +456,16 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         ).to(torch_device)
 
         # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
+        output = model.generate(**inputs, max_new_tokens=3)
 
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\nGot it, let\'s look at the question. The user is asking "Who are you?" which is a common question when someone meets an AI'
+            "\nWhat kind of dog is this?\nGot it",
+            "\nWhat kind of dog is this?\nGot it",
+            "\nWho are you?\nThe user",
        ]  # fmt: skip
-
+        decoded = self.processor.batch_decode(output, skip_special_tokens=True)
+        decoded = [x.replace("<|image|>", "") for x in decoded]
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
+            decoded,
             EXPECTED_DECODED_TEXT,
         )