In [1]:
import torch
from transformers import AutoConfig, AutoImageProcessor, AutoModelForVision2Seq, AutoProcessor
import time
import numpy as np
import cv2
import textwrap
from PIL import Image, ImageDraw, ImageFont
import enum
import json
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Define some utils.
def split_reasoning(text, tags):
    new_parts = {None: text}

    for tag in tags:
        parts = new_parts
        new_parts = dict()

        for k, v in parts.items():
            if tag in v:
                s = v.split(tag)
                new_parts[k] = s[0]
                new_parts[tag] = s[1]
                # print(tag, s)
            else:
                new_parts[k] = v

    return new_parts

class CotTag(enum.Enum):
    TASK = "TASK:"
    PLAN = "PLAN:"
    VISIBLE_OBJECTS = "VISIBLE OBJECTS:"
    SUBTASK_REASONING = "SUBTASK REASONING:"
    SUBTASK = "SUBTASK:"
    MOVE_REASONING = "MOVE REASONING:"
    MOVE = "MOVE:"
    GRIPPER_POSITION = "GRIPPER POSITION:"
    ACTION = "ACTION:"


def get_cot_tags_list():
    return [
        CotTag.TASK.value,
        CotTag.PLAN.value,
        CotTag.VISIBLE_OBJECTS.value,
        CotTag.SUBTASK_REASONING.value,
        CotTag.SUBTASK.value,
        CotTag.MOVE_REASONING.value,
        CotTag.MOVE.value,
        CotTag.GRIPPER_POSITION.value,
        CotTag.ACTION.value,
    ]

def name_to_random_color(name):
    return [(hash(name) // (256**i)) % 256 for i in range(3)]


def draw_gripper(img, pos_list, img_size=(640, 480)):
    for i, pos in enumerate(reversed(pos_list)):
        pos = resize_pos(pos, img_size)
        scale = 255 - int(255 * i / len(pos_list))
        cv2.circle(img, pos, 6, (0, 0, 0), -1)
        cv2.circle(img, pos, 5, (scale, scale, 255), -1)

def get_metadata(reasoning):
    metadata = {"gripper": [[0, 0]], "bboxes": dict()}

    if f" {CotTag.GRIPPER_POSITION.value}" in reasoning:
        gripper_pos = reasoning[f" {CotTag.GRIPPER_POSITION.value}"]
        gripper_pos = gripper_pos.split("[")[-1]
        gripper_pos = gripper_pos.split("]")[0]
        gripper_pos = [int(x) for x in gripper_pos.split(",")]
        gripper_pos = [(gripper_pos[2 * i], gripper_pos[2 * i + 1]) for i in range(len(gripper_pos) // 2)]
        metadata["gripper"] = gripper_pos

    if f" {CotTag.VISIBLE_OBJECTS.value}" in reasoning:
        for sample in reasoning[f" {CotTag.VISIBLE_OBJECTS.value}"].split("]"):
            obj = sample.split("[")[0]
            if obj == "":
                continue
            coords = [int(n) for n in sample.split("[")[-1].split(",")]
            metadata["bboxes"][obj] = coords

    return metadata

def resize_pos(pos, img_size):
    return [(x * size) // 256 for x, size in zip(pos, img_size)]

def draw_bboxes(img, bboxes, img_size=(640, 480)):
    for name, bbox in bboxes.items():
        show_name = name
        # show_name = f'{name}; {str(bbox)}'

        cv2.rectangle(
            img,
            resize_pos((bbox[0], bbox[1]), img_size),
            resize_pos((bbox[2], bbox[3]), img_size),
            name_to_random_color(name),
            1,
        )
        cv2.putText(
            img,
            show_name,
            resize_pos((bbox[0], bbox[1] + 6), img_size),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (255, 255, 255),
            1,
            cv2.LINE_AA,
        )

In [3]:
device = "cuda:0"
# Load Processor & VLA
# path_to_converted_ckpt = "Embodied-CoT/ecot-openvla-7b-bridge"
path_to_converted_ckpt = "Embodied-CoT/ecot-openvla-7b-oxe"
# path_to_converted_ckpt = "../openvla/logs/ecot-openvla-7b-oxe+libero_spatial_no_noops+b16+lr-0.0005+lora-r32+dropout-0.0--image_aug"
processor = AutoProcessor.from_pretrained(path_to_converted_ckpt, trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    path_to_converted_ckpt,
    # attn_implementation="flash_attention_2",  # [Optional] Requires `flash_attn`
    torch_dtype=torch.bfloat16,
    # low_cpu_mem_usage=True,
    trust_remote_code=True,
).to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Expected `transformers==4.40.1` and `tokenizers==0.19.1` but got `transformers==4.48.1` and `tokenizers==0.21.0`; there might be inference-time regressions due to dependency changes. If in doubt, pleaseuse the above versions.
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  8.13it/s]


In [4]:
# vla.language_model.save_pretrained("logs/llama-bridge")
from vllm import LLM, SamplingParams
sampling_params = SamplingParams(temperature=0, max_tokens=64,)
vla.input_embds = vla.language_model.get_input_embeddings()

# save language model 
if not os.path.exists("logs/llama-bridge"):
    vla.language_model.save_pretrained("logs/llama-bridge")
    processor.save_pretrained("logs/llama-bridge")

# load language model with VLLM
if hasattr(vla, "language_model"):
    del vla.language_model
vla.language_model = LLM("logs/llama-bridge", trust_remote_code=True, gpu_memory_utilization=0.8)

INFO 01-25 17:11:16 __init__.py:183] Automatically detected platform cuda.


2025-01-25 17:11:16,605	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 01-25 17:11:20 config.py:528] This model supports multiple tasks: {'classify', 'score', 'embed', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 01-25 17:11:20 llm_engine.py:232] Initializing an LLM engine (v0.1.dev4262+g6609cdf) with config: model='logs/llama-bridge', speculative_config=None, tokenizer='logs/llama-bridge', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=logs/llam

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  33% Completed | 1/3 [00:00<00:00,  2.44it/s]
Loading safetensors checkpoint shards:  67% Completed | 2/3 [00:01<00:00,  1.93it/s]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:01<00:00,  1.79it/s]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:01<00:00,  1.86it/s]


INFO 01-25 17:11:23 model_runner.py:1115] Loading model weights took 12.5527 GB





INFO 01-25 17:11:24 worker.py:266] Memory profiling takes 0.77 seconds
INFO 01-25 17:11:24 worker.py:266] the current vLLM instance can use total_gpu_memory (23.64GiB) x gpu_memory_utilization (0.80) = 18.91GiB
INFO 01-25 17:11:24 worker.py:266] model weights take 12.55GiB; non_torch_memory takes 0.08GiB; PyTorch activation peak memory takes 0.44GiB; the rest of the memory reserved for KV Cache is 5.84GiB.
INFO 01-25 17:11:24 executor_base.py:107] # CUDA blocks: 748, # CPU blocks: 512
INFO 01-25 17:11:24 executor_base.py:112] Maximum concurrency for 2048 tokens per request: 5.84x
INFO 01-25 17:11:25 model_runner.py:1448] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as ne

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:11<00:00,  3.09it/s]

INFO 01-25 17:11:37 model_runner.py:1574] Graph capturing finished in 11 secs, took 0.24 GiB
INFO 01-25 17:11:37 llm_engine.py:429] init engine (profile, create kv cache, warmup model) took 13.45 seconds





In [5]:
SYSTEM_PROMPT = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions."
)
t = CotTag.TASK.value
def get_openvla_prompt(instruction: str, task) -> str:
    return f"{SYSTEM_PROMPT} USER: What action should the robot take to {instruction.lower()}? ASSISTANT: {task}"
INSTRUCTION = "place the watermelon on the towel"
prompt = get_openvla_prompt(INSTRUCTION, t)
image = Image.open("./test.png")
print(prompt.replace(". ", ".\n"))
# print("Image size:", image.size)
dataset_statistics_path = os.path.join(path_to_converted_ckpt, "dataset_statistics.json")
if os.path.isfile(dataset_statistics_path):
    with open(dataset_statistics_path, "r") as f:
        norm_stats = json.load(f)
    vla.norm_stats = norm_stats

A chat between a curious user and an artificial intelligence assistant.
The assistant gives helpful, detailed, and polite answers to the user's questions.
USER: What action should the robot take to place the watermelon on the towel? ASSISTANT: TASK:


# VLLM

different prompt tests (not directly supported)

prepare the inputs 

In [20]:
# sync prompts (one prompt per image )
prompts = []
for t in CotTag:
    prompt_b = f"{SYSTEM_PROMPT} USER: What action should the robot take to {INSTRUCTION.lower()}? ASSISTANT: {t.value}"
    prompts.append(prompt_b)

# async prompts
async_prompts = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: What action should the robot take to place the watermelon on the towel? ASSISTANT: TASK: The task is to place the watermelon on the towel. The first step is to move the robotic arm towards the towel. PLAN: 1. Move to the right and forward. 2. Move down and grip the towel. 3. Move backward and up. 4. Move to the left. VISIBLE OBJECTS: the robot task [100, 1, 153, 105], the towel [160, 99, 220, 164], the towel [160, 99, 221, 165], table [20, 39, 239, 249], the robot task [100, 1, 154, 106] SUBTASK REASONING: The towel is to the right and slightly forward from the current robotic arm position. The robotic arm needs to move forward and up to reach the towel and grip it. SUBTASK: Move forward and up. MOVE REASONING: The robotic arm needs to move forward and up to reach the towel and grip it. MOVE: Move forward up. GRIPPER POSITION: [121, 91, 130, 87, 142, 87, 153, 88, 169, 95] ACTION: 塔瀬ܝĦ越ਿŸ"
# break async_prompts with CotTag keep value before the tag

prompts = []
for t in CotTag:
    # if t == CotTag.PLAN:
    #     break
    prompts.append(async_prompts.split(t.value)[0] + t.value)
    # print(prompts[-1])
    
from transformers.utils import TensorType
# left padding
# processor.tokenizer.padding_side = 'left'
inputs = [processor.tokenizer(p, return_tensors=TensorType.PYTORCH)['input_ids'].to(device) for p in prompts]
pixel_values = processor.image_processor(image, return_tensors=TensorType.PYTORCH)["pixel_values"].to(device, dtype=torch.bfloat16)

In [21]:
sampling_params = SamplingParams(temperature=0,
                                     max_tokens=64,
                                     stop_token_ids=[2])
outputs = vla.vllm_inference(input_ids=inputs, pixel_values=pixel_values)

Processed prompts:   0%|          | 0/9 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 9/9 [00:00<00:00, 12.59it/s, est. speed input: 6354.34 toks/s, output: 191.71 toks/s]


In [22]:
for o , p in zip(outputs, prompts):
    print(p + ' ' + o.outputs[0].text)
    # generated_text = o.outputs[0].token_ids
    # print(generated_text)
    # print(len(generated_text))

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: What action should the robot take to place the watermelon on the towel? ASSISTANT: TASK: The task is to place the watermelon on the towel. The
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: What action should the robot take to place the watermelon on the towel? ASSISTANT: TASK: The task is to place the watermelon on the towel. The first step is to move the robotic arm towards the towel. PLAN: 1. Move to the right, above the towel. 2.
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: What action should the robot take to place the watermelon on the towel? ASSISTANT: TASK: The task is to place the watermelon o

In [10]:
# # use cuda timmer 
# start = torch.cuda.Event(enable_timing=True)
# end = torch.cuda.Event(enable_timing=True)
# start.record()
# action, generated_ids = vla.predict_action(**inputs, unnorm_key='libero_spatial_no_noops', do_sample=False, max_new_tokens=60)
# end.record()
# torch.cuda.synchronize()
# print(start.elapsed_time(end))
# generated_text = processor.batch_decode(generated_ids)
# for i in range(len(prompts)):
#     # print(f"Prompt: {prompts[i]}")
#     print(f"Generated: {generated_text[i]}")
#     print("")

In [11]:
#profile memroy with torch
# print(torch.cuda.memory_summary(device))