In [1]:
import inspect
import sys
from threading import Thread
from typing import Dict

import requests
import torch
from PIL import Image
from torchvision import io
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, TextIteratorStreamer

from tool_server.tf_eval.utils.utils import *
# Imported after the wildcard import above (same relative order as before) so
# that the wildcard cannot shadow this name.
from qwen_vl_utils import process_vision_info

  from .autonotebook import tqdm as notebook_tqdm
2025-01-27 19:13:39 | INFO | qwen_vl_utils.vision_process | set VIDEO_TOTAL_PIXELS: 90316800
2025-01-27 19:13:39 | ERROR | ipykernel.comm | No such comm target registered: jupyter.widget.control
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
2025-01-27 19:13:42 | INFO | accelerate.utils.modeling | We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards:  20%|██        | 1/5 [00:08<00:33,  8.49s/it]
Loading checkpoint shards:  40%|████      | 2/5 [00:14<00:20,  6.99s/it]
Loading checkpoint shards:  60%|██████    | 3/5 [00:20<00:13,  6.58s/it]
Loading checkpoint shards:  80%|████████  | 4/5 [00:26<00:06,  6.41

In [2]:
# Local checkpoint directory; the model weights and the processor must come
# from the same place, so the path is defined exactly once.
MODEL_PATH = "/mnt/petrelfs/songmingyang/songmingyang/model/tool-augment/Qwen2-VL-7B-Instruct"

# Load the model on the available device(s). torch_dtype="auto" and
# device_map="auto" leave dtype selection and weight placement to the library.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_PATH, torch_dtype="auto", device_map="auto"
)
# The processor bundles the tokenizer (used below via `processor.tokenizer`)
# with the image preprocessing for this checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_PATH)


In [8]:
# Build a single-image chat request, run generation, and decode the reply.

# Image: despite the name, `url` is a local filesystem path.
url = "/mnt/petrelfs/songmingyang/code/tools/test_imgs/roxy.jpeg"
image = Image.open(url)

# One user turn containing the PIL image followed by the text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Pad on the left so that generated tokens directly follow the prompt tokens.
processor.tokenizer.padding_side = "left"

# Render the chat into the prompt string (not token ids) and append the
# generation prompt so the model continues as the assistant.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Collect the image / video entries referenced by `messages`.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the model's own device instead of assuming a literal "cuda": the
# weights were placed by device_map="auto".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=2048)

# generate() returns prompt + continuation; drop the prompt prefix of each row
# so only the newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

In [12]:
# Display the list of decoded strings produced by the generation cell.
output_text

["The image is an illustration of an anime-style character. The character has long, flowing blue hair and is wearing a beige coat with a high collar and a black dress underneath. The coat has a brownish hue and features a large, decorative button on the front. The character is holding a large, mechanical-looking sword with a black and silver design. The sword has a long, cylindrical handle and a sharp blade. In the background, there is a floating blue crystal or gemstone, and the character appears to be in a snowy or cold environment, as suggested by the white, snowy landscape and the character's attire. The overall style is vibrant and detailed, with a focus on the character's expression and the mechanical elements of the sword."]

In [16]:
# Profile a short generation on CPU and CUDA and write a TensorBoard trace to
# ./log.
#
# The token budget is deliberately small: every decoding step is recorded with
# stacks and shapes, so profiling a full 2048-token generation produces a very
# large trace.
# NOTE(review): an earlier run of this cell ended with "RuntimeError: Can't
# disable Kineto profiler when it's not running"; the cause is not visible in
# this notebook -- confirm the cell completes on a fresh kernel.
PROFILE_MAX_NEW_TOKENS = 16

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log"),
    with_stack=True,
    record_shapes=True,
    with_flops=True,  # enable FLOPs estimation
) as prof:
    # Stored under its own name so the full-length `generated_ids` from the
    # inference cell is not overwritten by this truncated run.
    profiled_generated_ids = model.generate(
        **inputs, max_new_tokens=PROFILE_MAX_NEW_TOKENS
    )

# Summarise the ten most expensive operators by total CUDA time.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

RuntimeError: Can't disable Kineto profiler when it's not running

In [13]:
from fvcore.nn import FlopCountAnalysis, parameter_count
from thop import profile

In [9]:
# Count FLOPs and parameters of `model` with fvcore.
# NOTE(review): `input_tensor` is not defined in any cell visible in this
# notebook, so this cell presumably relied on state from an earlier session --
# define the intended example input before running it.
flops = FlopCountAnalysis(model, input_tensor)
params = parameter_count(model)

print(f"FLOPS: {flops.total()}")  # total FLOPs
print(f"Params: {params}")        # parameter count

In [39]:
# Pull the processor outputs out by name; `pixel_values` is displayed by a
# later cell.
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
pixel_values = inputs["pixel_values"]
image_grid_thw = inputs["image_grid_thw"]

# thop.profile() only accepts a positional tuple (it calls model(*inputs)).
# Instead of hand-counting None placeholders, look up each tensor's slot in the
# forward() signature: parameters that `inputs` does not provide become None,
# and a key that forward() does not accept fails loudly with ValueError.
forward_arg_names = list(inspect.signature(model.forward).parameters)
last_slot = max(forward_arg_names.index(name) for name in inputs.keys())
profile_args = tuple(
    inputs.get(name) for name in forward_arg_names[: last_slot + 1]
)

# NOTE: thop's README unpacks this result as (macs, params). The name `flops`
# is kept because later cells display it, but the figure should be read as a
# multiply-accumulate count unless confirmed otherwise.
flops, params = profile(model, inputs=profile_args)


In [40]:
# Display the first value returned by the thop profile() call.
flops

95443525828608.0

In [41]:
# Inspect the `pixel_values` tensor taken from the processor output.
pixel_values

tensor([[1.8135, 1.9303, 1.4486,  ..., 2.1459, 2.1459, 2.1459],
        [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
        [1.9303, 1.9011, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
        ...,
        [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
        [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
        [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459]],
       device='cuda:0')

In [42]:
# List the entries the processor produced for this request.
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])

In [43]:
# Display the second value returned by the thop profile() call.
params

7746173952.0