Fix Glm4vMoeIntegrationTest #40930
@@ -14,7 +14,6 @@
"""Testing suite for the PyTorch GLM-4.1V model."""

import copy
import gc
import unittest

from transformers import (
@@ -25,9 +24,11 @@
    is_torch_available,
)
from transformers.testing_utils import (
    cleanup,
    require_flash_attn,
    require_torch,
    require_torch_gpu,
    run_first,
    slow,
    torch_device,
)
@@ -295,8 +296,26 @@ def test_inputs_embeds_matches_input_ids(self):

@require_torch
class Glm4vMoeIntegrationTest(unittest.TestCase):
    model = None

    @classmethod
    def get_model(cls):
        if cls.model is None:
            cls.model = Glm4vMoeForConditionalGeneration.from_pretrained(
                "zai-org/GLM-4.5V", dtype="auto", device_map="auto"
            )
        return cls.model

    @classmethod
    def tearDownClass(cls):
        del cls.model
        cleanup(torch_device, gc_collect=True)

    def setUp(self):
        self.processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V")
        cleanup(torch_device, gc_collect=True)
        self.processor = AutoProcessor.from_pretrained(
            "zai-org/GLM-4.5V", size={"shortest_edge": 10800, "longest_edge": 10800}
        )
        self.message = [
            {
                "role": "user",
@@ -321,130 +340,56 @@ def setUp(self):
                ],
            }
        ]
        self.message_wo_image = [
            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
        ]

        question = "Describe this video."
        video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
        self.video_messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": video_url,
                    },
                    {"type": "text", "text": question},
                ],
            }
        ]

    def tearDown(self):
        gc.collect()
        torch.cuda.empty_cache()
        cleanup(torch_device, gc_collect=True)

    @slow
    def test_small_model_integration_test(self):
        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")

        inputs = self.processor.apply_chat_template(
            self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
        )
        expected_input_ids = [151331, 151333, 151336, 198, 151339, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343] # fmt: skip
        expected_input_ids = [151331, 151333, 151336, 198, 151339, 151363, 151363, 151363, 151363, 151363, 151363, 151340, 3838, 3093, 315, 5562, 374] # fmt: skip
        assert expected_input_ids == inputs.input_ids[0].tolist()[:17]

        expected_pixel_slice = torch.tensor(
            [
                [-0.0988, -0.0842, -0.0842],
                [-0.5660, -0.5514, -0.4200],
                [-0.0259, -0.0259, -0.0259],
                [-0.1280, -0.0988, -0.2010],
                [-0.4638, -0.5806, -0.6974],
                [-1.2083, -1.2229, -1.2083],
                [-0.1134, -0.4492, -0.8580],
                [-0.6244, -1.1645, -0.7120],
                [-0.3324, -0.7996, -0.7120],
                [0.2077, 0.2223, 0.4121],
                [0.4413, 0.1931, 0.4559],
                [0.5873, 0.3099, 0.4851],
            ],
            dtype=torch.float32,
            device="cpu",
        )
        assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3)

        # verify generation
        inputs = inputs.to(torch_device)

        output = model.generate(**inputs, max_new_tokens=30)
        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )
        torch.testing.assert_close(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=1e-4, rtol=1e-4)

    @slow
    def test_small_model_integration_test_batch(self):
        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
        batch_messages = [self.message] * 2
        model = self.get_model()
        batch_messages = [self.message, self.message2, self.message_wo_image]
        inputs = self.processor.apply_chat_template(
            batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
        ).to(torch_device)

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
        ] # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_with_video(self):
        processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
        model = Glm4vMoeForConditionalGeneration.from_pretrained(
            "zai-org/GLM-4.5V", dtype=torch.float16, device_map="auto"
        )
        questions = ["Describe this video."] * 2
        video_urls = [
            "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
        ] * 2
        messages = [
            [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "video",
                            "video": video_url,
                        },
                        {"type": "text", "text": question},
                    ],
                }
            ]
            for question, video_url in zip(questions, video_urls)
        ]
        inputs = processor.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
        ).to(torch_device)
        output = model.generate(**inputs, max_new_tokens=30)
        EXPECTED_DECODED_TEXT = [
            "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami",
            "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami"
        ] # fmt: skip
        self.assertEqual(
            processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_expand(self):
        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
        inputs = self.processor.apply_chat_template(
            self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
        ).to(torch_device)

        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat, specifically"
        ] # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_batch_wo_image(self):
        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
        message_wo_image = [
            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
        ]
        batched_messages = [self.message, message_wo_image]
        inputs = self.processor.apply_chat_template(
            batched_messages,
            batch_messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
@@ -453,42 +398,43 @@ def test_small_model_integration_test_batch_wo_image(self):
        ).to(torch_device)

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)
        output = model.generate(**inputs, max_new_tokens=10)
[Review comment] 16 minutes for 10 tokens.

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
            '\nWho are you?\n<think>Got it, the user is asking "Who are you?" I need to respond appropriately. First, I should clarify that I\'m an AI assistant'
            "\nWhat kind of dog is this?\n<think>Got it, let's try to figure out",
            "\nWhat kind of dog is this?\n<think>Got it, let's see. The user",
            '\nWho are you?\n<think>The user is asking "Who are you?"'
        ] # fmt: skip
        decoded = self.processor.batch_decode(output, skip_special_tokens=True)
        decoded = [x.replace("<|image|>", "") for x in decoded]
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            decoded,
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_batch_different_resolutions(self):
        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
        batched_messages = [self.message, self.message2]
        inputs = self.processor.apply_chat_template(
            batched_messages,
    def test_small_model_integration_test_with_video(self):
        processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
        model = self.get_model()
        batch_messages = [self.video_messages]
        inputs = processor.apply_chat_template(
            batch_messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
        ).to(torch_device)

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
        ] # fmt: skip
        output = model.generate(**inputs, max_new_tokens=3)
[Review comment] 3 tokens - let's not be crazy and make all the tests this slow. These 3 tokens already take 7 minutes.
[Review comment] Wow, by the way, we can shrink the video by setting a smaller size, I mean in this test and in the batched test above (see the sketch after this test).

        EXPECTED_DECODED_TEXT = ["\n012345Describe this video.\n<think>Got it"] # fmt: skip
        decoded = processor.batch_decode(output, skip_special_tokens=True)
        decoded = [x.replace("<|image|>", "") for x in decoded]
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            decoded,
            EXPECTED_DECODED_TEXT,
        )

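The size suggestion above maps onto the processor kwarg this file already uses for video (max_image_size={"longest_edge": 50176}). Below is a minimal sketch of shrinking that cap; the value is purely illustrative and is not taken from this PR.

from transformers import AutoProcessor

# Sketch only: a smaller longest_edge cap resizes each sampled video frame further down,
# so fewer vision tokens reach the model and generation finishes sooner. The exact value
# here is hypothetical, not one used in this PR.
small_video_processor = AutoProcessor.from_pretrained(
    "zai-org/GLM-4.5V",
    max_image_size={"longest_edge": 12544},
)
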
    @run_first
    @slow
    @require_flash_attn
    @require_torch_gpu
@@ -499,44 +445,9 @@ def test_small_model_integration_test_batch_flashatt2(self):
            attn_implementation="flash_attention_2",
            device_map="auto",
        )
        batched_messages = [self.message, self.message2]
        inputs = self.processor.apply_chat_template(
            batched_messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
        ).to(torch_device)

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture has a stocky build, thick fur, and a face that's",
            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
        ] # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

||
@slow | ||
@require_flash_attn | ||
@require_torch_gpu | ||
def test_small_model_integration_test_batch_wo_image_flashatt2(self): | ||
model = Glm4vMoeForConditionalGeneration.from_pretrained( | ||
"zai-org/GLM-4.5V", | ||
dtype=torch.bfloat16, | ||
attn_implementation="flash_attention_2", | ||
device_map="auto", | ||
) | ||
message_wo_image = [ | ||
{"role": "user", "content": [{"type": "text", "text": "Who are you?"}]}, | ||
] | ||
batched_messages = [self.message, message_wo_image] | ||
batch_messages = [self.message, self.message2, self.message_wo_image] | ||
inputs = self.processor.apply_chat_template( | ||
batched_messages, | ||
batch_messages, | ||
tokenize=True, | ||
add_generation_prompt=True, | ||
return_dict=True, | ||
|
@@ -545,14 +456,16 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
        ).to(torch_device)

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)
        output = model.generate(**inputs, max_new_tokens=3)
[Review comment] Same, 3 tokens, 7 minutes.

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
            '\nWho are you?\n<think>Got it, let\'s look at the question. The user is asking "Who are you?" which is a common question when someone meets an AI'
            "\nWhat kind of dog is this?\n<think>Got it",
            "\nWhat kind of dog is this?\n<think>Got it",
            "\nWho are you?\n<think>The user",
        ] # fmt: skip

        decoded = self.processor.batch_decode(output, skip_special_tokens=True)
        decoded = [x.replace("<|image|>", "") for x in decoded]
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            decoded,
            EXPECTED_DECODED_TEXT,
        )

[Review comment] Combine several tests into this one, but using batch.
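
The batching that comment refers to is the pattern the updated tests adopt: load the checkpoint once per class and push several conversations through one padded generate call. A minimal sketch of that pattern follows; the class name and the duplicated text-only prompt are illustrative stand-ins, while the actual test batches the image, second-image, and text-only conversations built in setUp.

import unittest

from transformers import AutoProcessor, Glm4vMoeForConditionalGeneration
from transformers.testing_utils import cleanup, require_torch, slow, torch_device


@require_torch
class BatchedGlmSmokeTest(unittest.TestCase):
    model = None

    @classmethod
    def get_model(cls):
        # Load the large checkpoint only once for the whole class, as the PR does.
        if cls.model is None:
            cls.model = Glm4vMoeForConditionalGeneration.from_pretrained(
                "zai-org/GLM-4.5V", dtype="auto", device_map="auto"
            )
        return cls.model

    @classmethod
    def tearDownClass(cls):
        del cls.model
        cleanup(torch_device, gc_collect=True)

    @slow
    def test_batched_variants(self):
        processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V")
        text_only = [{"role": "user", "content": [{"type": "text", "text": "Who are you?"}]}]
        # One padded batch covers several prompt variants in a single generate call.
        inputs = processor.apply_chat_template(
            [text_only, text_only],
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
        ).to(torch_device)
        output = self.get_model().generate(**inputs, max_new_tokens=10)
        decoded = processor.batch_decode(output, skip_special_tokens=True)
        self.assertEqual(len(decoded), 2)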