In [2]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Per-channel normalization statistics (three values each). build_transform
# passes them to T.Normalize as `mean` and `std` after images are converted
# to RGB and turned into tensors.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Create the preprocessing pipeline applied to every image tile.

    The returned pipeline, in order: converts any non-RGB image to RGB,
    resizes it to ``input_size`` x ``input_size`` with bicubic interpolation,
    converts it to a tensor, and normalizes it with ``IMAGENET_MEAN`` /
    ``IMAGENET_STD``.

    Args:
        input_size: Side length of the square the image is resized to.

    Returns:
        A ``T.Compose`` object chaining the four steps above.
    """
    def _ensure_rgb(img):
        # Images already in RGB mode are passed through unchanged.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Choose the candidate grid whose aspect ratio is nearest to ``aspect_ratio``.

    Args:
        aspect_ratio: Aspect ratio to match (the caller passes width / height).
        target_ratios: Iterable of indexable pairs; each candidate's aspect
            ratio is ``pair[0] / pair[1]``. Candidates are examined in order.
        width: Image width, used only for the tie-break area test.
        height: Image height, used only for the tie-break area test.
        image_size: Side length of one tile, used only for the tie-break.

    Returns:
        The selected element of ``target_ratios``, or ``(1, 1)`` when no
        candidate is strictly closer than infinity (e.g. an empty iterable).

    A candidate that is strictly closer always wins. A candidate that exactly
    ties the current best replaces it only when ``width * height`` is larger
    than half of ``image_size**2`` times the candidate's tile count.
    """
    pixel_count = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])

        if gap < smallest_gap:
            # Strict improvement: adopt the candidate unconditionally.
            smallest_gap = gap
            chosen = candidate
            continue

        # Exact tie: prefer this candidate only if the source image is large
        # enough relative to the area the candidate grid would cover.
        is_tie = gap == smallest_gap
        if is_tie and pixel_count > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate

    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize ``image`` onto a grid of square tiles and return the tiles.

    Args:
        image: Object exposing ``.size`` (width, height), ``.resize`` and
            ``.crop`` (``load_image`` passes a PIL image).
        min_num: Smallest allowed number of tiles in the grid.
        max_num: Largest allowed number of tiles in the grid.
        image_size: Side length of each square tile.
        use_thumbnail: When true and more than one tile was produced, append
            the whole image resized to ``image_size`` x ``image_size``.

    Returns:
        List of cropped tiles in row-major order, optionally followed by the
        thumbnail.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Enumerate every (columns, rows) grid whose tile count lies within
    # [min_num, max_num], then order the grids by tile count.
    candidate_grids = set(
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if min_num <= cols * rows <= max_num
    )
    target_ratios = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])

    # Pick the grid whose aspect ratio best matches the input image.
    grid = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # Pixel dimensions of the resized image and the number of tiles it holds.
    target_width = image_size * grid[0]
    target_height = image_size * grid[1]
    blocks = grid[0] * grid[1]

    resized_img = image.resize((target_width, target_height))

    # Walk the grid row by row, cropping one image_size x image_size tile each step.
    tiles_per_row = target_width // image_size
    processed_images = []
    for index in range(blocks):
        row, col = divmod(index, tiles_per_row)
        box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        processed_images.append(resized_img.crop(box))
    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        # The thumbnail is made from the original image, not the resized one.
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    """Load an image and convert it into a stacked tensor of normalized tiles.

    Args:
        image_file: Path or file object accepted by ``Image.open``.
        input_size: Side length of each square tile; forwarded to
            ``build_transform`` and to ``dynamic_preprocess`` as ``image_size``.
        max_num: Maximum number of grid tiles; forwarded to
            ``dynamic_preprocess``.

    Returns:
        A tensor produced by ``torch.stack`` over the transformed tiles
        (thumbnail included when more than one tile is produced), so its
        first dimension is the number of tiles.
    """
    # Open inside a context manager so the underlying file handle is released
    # as soon as the pixel data has been copied into the RGB image, instead of
    # relying on garbage collection to close it.
    with Image.open(image_file) as source:
        image = source.convert('RGB')

    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([transform(tile) for tile in tiles])

# Placing the entire model on a single GPU requires a large card (e.g. an 80G A100).
# With less memory, load the model across multiple GPUs instead; see the
# `Multiple GPUs` section of the model documentation.
path = 'OpenGVLab/InternVL2_5-8B'
load_kwargs = {
    'torch_dtype': torch.bfloat16,
    'low_cpu_mem_usage': True,
    'trust_remote_code': True,
}
model = AutoModel.from_pretrained(path, **load_kwargs).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# The two frames to compare; `max_num` below caps the number of tiles per image.
image_path1 = "/data/RSP/rlbench_frames/insert_onto_square_peg_variation0_episode0_front/pair_2_frame0.jpg"  # Replace with your image path
image_path2 = "/data/RSP/rlbench_frames/insert_onto_square_peg_variation0_episode0_front/pair_2_frame1.jpg"  # Replace with your image path

# Greedy decoding (no sampling) with up to 1024 newly generated tokens.
generation_config = {'max_new_tokens': 1024, 'do_sample': False}

# Multi-image, multi-round conversation with combined images: the tiles of
# both frames are concatenated along the first dimension into one batch.
pixel_values1 = load_image(image_path1, max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image(image_path2, max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat([pixel_values1, pixel_values2], dim=0)

# NOTE(review): `prompt` is defined here but the chat call in the following
# cell builds its own `question` and does not reference it.
prompt = """You are a movement analyzer specialized in comparing consecutive video frames. Analyze and describe changes between frames using the following guidelines:
FORMAT:
- Provide one clear sentence summarizing the overall dynamics and activity
- Describe qualitative and relative dynamics between frames
- Use precise directional terms (left, right, up, down, forward, backward)
- Focus on observable, concrete changes

ANALYZE THE FOLLOWING ELEMENTS:
- Main Subject/Object:
    - Position: Track center of mass movement
    - Rotation: Note any turns or spins
    - Orientation: Describe facing direction
    - State Changes: Document visible changes in:
        - Physical form or shape
        - Color or appearance
        - Expression or emotional state (if applicable)
- Background:
    - Note any changes in background elements
    - Identify moving vs static elements

Keep descriptions concise, objective, and focused on visible changes between frames."""


InternLM2ForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


FlashAttention2 is not installed.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Ask the model about the concatenated frame tiles. A single `<image>`
# placeholder is used for the whole `pixel_values` batch.
question = '<image>\nDescribe the changes between the two frames.\n'
response, history = model.chat(
    tokenizer,
    pixel_values,
    question,
    generation_config,
    history=None,
    return_history=True,
)
print(f'User: {question}\nAssistant: {response}')

User: <image>
Describe the changes between the two frames.

Assistant: The two frames show a robot arm in a 3D simulation environment. Here are the changes between the two frames:

1. **Robot Arm Position**: The robot arm has moved slightly. In the first frame, the arm is positioned more towards the left, while in the second frame, it is slightly more towards the right.

2. **End Effector Position**: The end effector (the part of the robot arm that interacts with objects) has moved. In the first frame, the end effector is closer to the base of the robot arm, while in the second frame, it is slightly further away from the base.

3. **Object Position**: The objects on the table have moved. In the first frame, the objects are positioned more towards the left side of the table, while in the second frame, they are slightly more towards the right side of the table.

4. **Lighting and Shadows**: The lighting and shadows in the environment have changed slightly. The shadows of the robot arm an

In [4]:
# Release the 8B model before loading another one. `del` only removes the
# notebook's reference; running the garbage collector and then emptying
# PyTorch's CUDA cache returns the freed GPU blocks to the driver so a
# subsequent model load can use them.
import gc

del model
gc.collect()
torch.cuda.empty_cache()

In [1]:
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
# `path` names the base checkpoint; the pipeline below is built from the
# AWQ variant stored in `model`.
path = 'OpenGVLab/InternVL2_5-26B'
model = 'OpenGVLab/InternVL2_5-26B-AWQ'

# Engine settings: session length of 8192 and tp=2
# (presumably tensor parallelism over two GPUs; confirm in the lmdeploy docs).
engine_config = TurbomindEngineConfig(session_len=8192, tp=2)
pipe = pipeline(model, backend_config=engine_config)


Fetching 32 files:   0%|          | 0/32 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

tokenization_internlm2_fast.py:   0%|          | 0.00/7.79k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

tokenization_internlm2.py:   0%|          | 0.00/8.79k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

InternLM2ForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


FlashAttention2 is not installed.


  return torch.load(checkpoint_file, map_location=torch.device("cpu"))




  tmp = torch.load(shard, map_location='cpu')




In [4]:
from lmdeploy.vl.constants import IMAGE_TOKEN
# Frames to compare.
image_path1 = "/data/RSP/rlbench_frames/insert_onto_square_peg_variation0_episode0_front/pair_2_frame0.jpg"  # Replace with your image path
image_path2 = "/data/RSP/rlbench_frames/insert_onto_square_peg_variation0_episode0_front/pair_2_frame1.jpg"  # Replace with your image path

# One IMAGE_TOKEN placeholder per image, in the same order as the image list.
describe_query = f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images'
frame_pair = [load_image(image_path1), load_image(image_path2)]
response = pipe((describe_query, frame_pair))
print(response.text)



The two images show a robotic arm in a similar position, with a slight difference in the arm's orientation. The background is a plain, dark surface, and the robot is standing on a beige mat. In front of the robot, there is a wooden base with four vertical pegs of different colors: green, red, blue, and another red. The robot appears to be interacting with the pegs, possibly in a sorting or stacking task.


In [7]:
# Movement-analysis query: two image placeholders followed by the analysis
# guidelines, sent together with both frames in a single pipeline call.
analysis_query = f"""Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\nYou are a movement analyzer specialized in comparing consecutive video frames. Analyze and describe changes between frames using the following guidelines:
FORMAT:
- Provide one clear sentence summarizing the overall dynamics and activity
- Describe qualitative and relative dynamics between frames
- Use precise directional terms (left, right, up, down, forward, backward)
- Focus on observable, concrete changes

ANALYZE THE FOLLOWING ELEMENTS:
- Main Subject/Object:
    - Position: Track center of mass movement
    - Rotation: Note any turns or spins
    - Orientation: Describe facing direction
    - State Changes: Document visible changes in:
        - Physical form or shape
        - Color or appearance
        - Expression or emotional state (if applicable)
- Background:
    - Note any changes in background elements
    - Identify moving vs static elements

Keep descriptions concise, objective, and focused on visible changes between frames."""
analysis_frames = [load_image(image_path1), load_image(image_path2)]
response = pipe((analysis_query, analysis_frames))
print(response.text)

- The robot arm has moved slightly to the right.
- The arm's orientation has changed, now facing more towards the right.
- The background remains static with no visible changes.


: 