245 changes: 79 additions & 166 deletions tests/models/glm4v_moe/test_modeling_glm4v_moe.py
@@ -14,7 +14,6 @@
"""Testing suite for the PyTorch GLM-4.1V model."""

import copy
import gc
import unittest

from transformers import (
@@ -25,9 +24,11 @@
is_torch_available,
)
from transformers.testing_utils import (
cleanup,
require_flash_attn,
require_torch,
require_torch_gpu,
run_first,
slow,
torch_device,
)
@@ -295,8 +296,26 @@ def test_inputs_embeds_matches_input_ids(self):

@require_torch
class Glm4vMoeIntegrationTest(unittest.TestCase):
model = None

@classmethod
def get_model(cls):
if cls.model is None:
cls.model = Glm4vMoeForConditionalGeneration.from_pretrained(
"zai-org/GLM-4.5V", dtype="auto", device_map="auto"
)
return cls.model

@classmethod
def tearDownClass(cls):
del cls.model
cleanup(torch_device, gc_collect=True)

def setUp(self):
self.processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V")
cleanup(torch_device, gc_collect=True)
self.processor = AutoProcessor.from_pretrained(
"zai-org/GLM-4.5V", size={"shortest_edge": 10800, "longest_edge": 10800}
)
self.message = [
{
"role": "user",
@@ -321,130 +340,56 @@ def setUp(self):
],
}
]
self.message_wo_image = [
{"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
]

question = "Describe this video."
video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
self.video_messages = [
{
"role": "user",
"content": [
{
"type": "video",
"video": video_url,
},
{"type": "text", "text": question},
],
}
]

def tearDown(self):
gc.collect()
torch.cuda.empty_cache()
cleanup(torch_device, gc_collect=True)

@slow
def test_small_model_integration_test(self):
model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")

inputs = self.processor.apply_chat_template(
self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
)
expected_input_ids = [151331, 151333, 151336, 198, 151339, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343] # fmt: skip
expected_input_ids = [151331, 151333, 151336, 198, 151339, 151363, 151363, 151363, 151363, 151363, 151363, 151340, 3838, 3093, 315, 5562, 374] # fmt: skip
assert expected_input_ids == inputs.input_ids[0].tolist()[:17]

expected_pixel_slice = torch.tensor(
[
[-0.0988, -0.0842, -0.0842],
[-0.5660, -0.5514, -0.4200],
[-0.0259, -0.0259, -0.0259],
[-0.1280, -0.0988, -0.2010],
[-0.4638, -0.5806, -0.6974],
[-1.2083, -1.2229, -1.2083],
[-0.1134, -0.4492, -0.8580],
[-0.6244, -1.1645, -0.7120],
[-0.3324, -0.7996, -0.7120],
[0.2077, 0.2223, 0.4121],
[0.4413, 0.1931, 0.4559],
[0.5873, 0.3099, 0.4851],
],
dtype=torch.float32,
device="cpu",
)
assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3)

# verify generation
inputs = inputs.to(torch_device)

output = model.generate(**inputs, max_new_tokens=30)
EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
self.assertEqual(
self.processor.decode(output[0], skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
torch.testing.assert_close(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=1e-4, rtol=1e-4)

@slow
def test_small_model_integration_test_batch(self):
model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
batch_messages = [self.message] * 2
model = self.get_model()
batch_messages = [self.message, self.message2, self.message_wo_image]
Collaborator Author: combine several tests into this one, but using batch

inputs = self.processor.apply_chat_template(
batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
).to(torch_device)

# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30)

EXPECTED_DECODED_TEXT = [
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)

@slow
def test_small_model_integration_test_with_video(self):
processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
model = Glm4vMoeForConditionalGeneration.from_pretrained(
"zai-org/GLM-4.5V", dtype=torch.float16, device_map="auto"
)
questions = ["Describe this video."] * 2
video_urls = [
"https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
] * 2
messages = [
[
{
"role": "user",
"content": [
{
"type": "video",
"video": video_url,
},
{"type": "text", "text": question},
],
}
]
for question, video_url in zip(questions, video_urls)
]
inputs = processor.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
).to(torch_device)
output = model.generate(**inputs, max_new_tokens=30)
EXPECTED_DECODED_TEXT = [
"\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami",
"\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami"
] # fmt: skip
self.assertEqual(
processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)

@slow
def test_small_model_integration_test_expand(self):
model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
inputs = self.processor.apply_chat_template(
self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
).to(torch_device)

output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)

EXPECTED_DECODED_TEXT = [
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat, specifically"
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)

@slow
def test_small_model_integration_test_batch_wo_image(self):
model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
message_wo_image = [
{"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
]
batched_messages = [self.message, message_wo_image]
inputs = self.processor.apply_chat_template(
batched_messages,
batch_messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
@@ -453,42 +398,43 @@ def test_small_model_integration_test_batch_wo_image(self):
).to(torch_device)

# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30)
output = model.generate(**inputs, max_new_tokens=10)
Collaborator Author: 16 minutes for 10 tokens


EXPECTED_DECODED_TEXT = [
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
'\nWho are you?\n<think>Got it, the user is asking "Who are you?" I need to respond appropriately. First, I should clarify that I\'m an AI assistant'
"\nWhat kind of dog is this?\n<think>Got it, let's try to figure out",
"\nWhat kind of dog is this?\n<think>Got it, let's see. The user",
'\nWho are you?\n<think>The user is asking "Who are you?"'
] # fmt: skip
decoded = self.processor.batch_decode(output, skip_special_tokens=True)
decoded = [x.replace("<|image|>", "") for x in decoded]
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
decoded,
EXPECTED_DECODED_TEXT,
)

@slow
def test_small_model_integration_test_batch_different_resolutions(self):
model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
batched_messages = [self.message, self.message2]
inputs = self.processor.apply_chat_template(
batched_messages,
def test_small_model_integration_test_with_video(self):
processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
model = self.get_model()
batch_messages = [self.video_messages]
inputs = processor.apply_chat_template(
batch_messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt",
padding=True,
).to(torch_device)

# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30)

EXPECTED_DECODED_TEXT = [
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
] # fmt: skip
output = model.generate(**inputs, max_new_tokens=3)
Collaborator Author: 3 tokens - let's not be crazy and make all the tests this slow. These 3 tokens already take 7 minutes.

Member: Wow, btw, we can change the video size by setting a small num_frames when calling processor.apply_chat_template. I don't know for sure what the default sampling size is for the model, so maybe it is sampling a lot. I mean in this test and in the batched test above.
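A minimal sketch of the suggestion above, assuming apply_chat_template forwards a num_frames argument to the video loader; the value 8 is illustrative, not the model's default:

# Hedged sketch: cap the number of sampled frames so the video test runs faster.
# Assumes processor.apply_chat_template accepts num_frames and passes it on to video loading.
inputs = processor.apply_chat_template(
    self.video_messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
    num_frames=8,  # illustrative value; fewer frames means fewer video tokens
).to(torch_device)
output = model.generate(**inputs, max_new_tokens=3)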

EXPECTED_DECODED_TEXT = ["\n012345Describe this video.\n<think>Got it"] # fmt: skip
decoded = processor.batch_decode(output, skip_special_tokens=True)
decoded = [x.replace("<|image|>", "") for x in decoded]
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
decoded,
EXPECTED_DECODED_TEXT,
)

@run_first
@slow
@require_flash_attn
@require_torch_gpu
@@ -499,44 +445,9 @@ def test_small_model_integration_test_batch_flashatt2(self):
attn_implementation="flash_attention_2",
device_map="auto",
)
batched_messages = [self.message, self.message2]
inputs = self.processor.apply_chat_template(
batched_messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt",
padding=True,
).to(torch_device)

# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30)

EXPECTED_DECODED_TEXT = [
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture has a stocky build, thick fur, and a face that's",
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)

@slow
@require_flash_attn
@require_torch_gpu
def test_small_model_integration_test_batch_wo_image_flashatt2(self):
model = Glm4vMoeForConditionalGeneration.from_pretrained(
"zai-org/GLM-4.5V",
dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
device_map="auto",
)
message_wo_image = [
{"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
]
batched_messages = [self.message, message_wo_image]
batch_messages = [self.message, self.message2, self.message_wo_image]
inputs = self.processor.apply_chat_template(
batched_messages,
batch_messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
@@ -545,14 +456,16 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
).to(torch_device)

# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30)
output = model.generate(**inputs, max_new_tokens=3)
Collaborator Author: same, 3 tokens, 7 minutes


EXPECTED_DECODED_TEXT = [
"\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
'\nWho are you?\n<think>Got it, let\'s look at the question. The user is asking "Who are you?" which is a common question when someone meets an AI'
"\nWhat kind of dog is this?\n<think>Got it",
"\nWhat kind of dog is this?\n<think>Got it",
"\nWho are you?\n<think>The user",
] # fmt: skip

decoded = self.processor.batch_decode(output, skip_special_tokens=True)
decoded = [x.replace("<|image|>", "") for x in decoded]
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
decoded,
EXPECTED_DECODED_TEXT,
)