54 changes: 40 additions & 14 deletions tests/models/glm4v/test_modeling_glm4v.py
@@ -282,6 +282,8 @@ def test_inputs_embeds_matches_input_ids(self):
 @require_torch
 class Glm4vIntegrationTest(unittest.TestCase):
     def setUp(self):
+        cleanup(torch_device, gc_collect=True)
+
         self.processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
         self.message = [
             {
@@ -340,8 +342,11 @@ def test_small_model_integration_test(self):
         # verify generation
         inputs = inputs.to(torch_device)
 
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
+        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically"
         self.assertEqual(
             self.processor.decode(output[0], skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
@@ -357,12 +362,15 @@ def test_small_model_integration_test_batch(self):
             batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         ).to(torch_device)
 
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
+            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture has a stocky body, thick fur, and a face that's"
         ] # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -395,10 +403,13 @@ def test_small_model_integration_test_with_video(self):
         inputs = processor.apply_chat_template(
             messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
         ).to(torch_device)
+
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = [
-            "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in the foreground wearing"
-        ] # fmt: skip
+        EXPECTED_DECODED_TEXT = ["\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in a white shirt"] # fmt: skip
+
         self.assertEqual(
             processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
@@ -413,6 +424,9 @@ def test_small_model_integration_test_expand(self):
             self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         ).to(torch_device)
 
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)
 
         EXPECTED_DECODED_TEXT = [
@@ -442,12 +456,15 @@ def test_small_model_integration_test_batch_wo_image(self):
             padding=True,
         ).to(torch_device)
 
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\n<think>Got it, the user is asking "Who are you?" I need to respond appropriately. First, I should clarify that I\'m an AI assistant'
+            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWho are you?\n<think>Got it, let's look at the user's question: \"Who are you?\" This is a common question when someone is just starting a conversation"
         ] # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -469,12 +486,15 @@ def test_small_model_integration_test_batch_different_resolutions(self):
             padding=True,
         ).to(torch_device)
 
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
+            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but",
         ] # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -501,12 +521,15 @@ def test_small_model_integration_test_batch_flashatt2(self):
             padding=True,
         ).to(torch_device)
 
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but",
+            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog. Wait, it's a cat,",
+            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
         ] # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -536,12 +559,15 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
             padding=True,
         ).to(torch_device)
 
+        # This model on the hub has `do_sample=True`.
+        torch.manual_seed(42)
+
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\n<think>Got it, let\'s look at the question. The user is asking "Who are you?" which is a common question when someone meets an AI'
+            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
+            "\nWho are you?\n<think>Got it, let's look at the user's question: \"Who are you?\" This is a common question when someone is just starting a conversation"
         ] # fmt: skip
 
         self.assertEqual(
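Note on the pattern repeated in every hunk: the THUDM/GLM-4.1V-9B-Thinking checkpoint ships a hub generation config with `do_sample=True`, so `model.generate` samples from the token distribution instead of decoding greedily. Calling `torch.manual_seed(42)` immediately before each `generate` pins the sampler's RNG state, which is what lets the tests assert on exact decoded strings; the `EXPECTED_DECODED_TEXT` values still had to be re-recorded, because seeded sampling is only reproducible for a fixed torch/CUDA/hardware combination. The `cleanup(torch_device, gc_collect=True)` added to `setUp` frees accelerator memory between tests. Below is a minimal standalone sketch of the seeding pattern, outside the test harness; it assumes the `Glm4vForConditionalGeneration` class, and the message dict and image URL are illustrative placeholders, not taken from the test file.

```python
import torch
from transformers import AutoProcessor, Glm4vForConditionalGeneration

MODEL_ID = "THUDM/GLM-4.1V-9B-Thinking"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Glm4vForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto"
)

# Hypothetical single-image chat message; the tests build theirs in setUp.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://example.com/cat.jpg"},  # placeholder URL
            {"type": "text", "text": "What kind of dog is this?"},
        ],
    }
]

# Same preprocessing call the tests use.
inputs = processor.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
).to(model.device)

# The hub generation_config has do_sample=True, so generate() draws random
# tokens; seeding the global RNG right before the call makes the draw
# deterministic for a given software/hardware stack.
torch.manual_seed(42)
output = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(output[0], skip_special_tokens=True))
```

Seeding must happen directly before each `generate` call (as the diff does) rather than once per test, so that earlier draws cannot shift the RNG state consumed by later assertions.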