In [1]:
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
from PIL import Image
import torch
import os 
os.environ["TRANSFORMERS_OFFLINE"] = "1"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


  from .autonotebook import tqdm as notebook_tqdm
2024-11-05 12:51:36.027128: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-05 12:51:36.053098: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-05 12:51:36.053119: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-05 12:51:36.053807: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-05 12:51:36.0

In [2]:
# Load Processor & VLA

quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16,
                                         llm_int8_skip_modules=["vision_backbone"],)
# quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=10.0)
# path = '/cluster/nvme9a/dzk/'
processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    "openvla/openvla-7b", 
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",  # [Optional] Requires `flash_attn`
    low_cpu_mem_usage=True, 
    trust_remote_code=True,
    quantization_config=quantization_config,
)#.to(device)

# Grab image input & format prompt
image = Image.open("test.png")
instruction = "put eggplant into pot"
prompt = f"In: What action should the robot take to {instruction}?\nOut:"

# Predict Action (7-DoF; un-normalize for BridgeData V2)
# inputs = processor(prompt, image).to(device, dtype=torch.bfloat16)
# action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)
# Execute...
# robot.act(action, ...)

Expected `transformers==4.40.1` and `tokenizers==0.19.1` but got `transformers==4.46.1` and `tokenizers==0.20.1`; there might be inference-time regressions due to dependency changes. If in doubt, pleaseuse the above versions.
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.12it/s]


In [3]:
vla.vision_backbone.featurizer.blocks[0].attn.qkv.weight.dtype
# for name, module in vla.named_modules():
#     print(name)

torch.bfloat16

In [4]:
# from torch.profiler import profile, record_function, ProfilerActivity

# === BFLOAT16 MODE ===
inputs = processor(prompt, image).to(device, dtype=torch.bfloat16)
# inputs["input_ids"] = inputs["input_ids"][:, 1:]

# Run OpenVLA Inference
torch.manual_seed(0)

def trace_handler(prof):
    # print(prof.key_averages().table(
    #     sort_by="self_cuda_time_total", row_limit=-1))
    prof.export_chrome_trace("tmp/test_trace_" + str(prof.step_num) + ".json")

# with torch.profiler.profile(
#     activities=[
#         torch.profiler.ProfilerActivity.CPU,
#         torch.profiler.ProfilerActivity.CUDA,
#     ],
#     schedule=torch.profiler.schedule(
#         wait=1,
#         warmup=1,
#         active=1),
#     on_trace_ready=trace_handler,
#     with_stack=True,
#     profile_memory=True,
#     with_flops = True
#     ) as p:
#         for iter in range(3):
#             action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)
#             p.step()



In [5]:
# profile latecy with cuda event
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
times = []
for i in range(10):
    start.record()
    action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)
    end.record()
    torch.cuda.synchronize()
    times.append(start.elapsed_time(end))
print("Average inference time: ", sum(times)/len(times))
print("Std: ", torch.tensor(times).std().item())

We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)


Average inference time:  205.7176956176758
Std:  96.32500457763672


In [6]:
#profile memroy with torch
print(torch.cuda.memory_summary(device))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   5297 MiB |   5712 MiB | 195273 MiB | 189976 MiB |
|       from large pool |   5164 MiB |   5579 MiB | 189758 MiB | 184593 MiB |
|       from small pool |    132 MiB |    136 MiB |   5515 MiB |   5382 MiB |
|---------------------------------------------------------------------------|
| Active memory         |   5297 MiB |   5712 MiB | 195273 MiB | 189976 MiB |
|       from large pool |   5164 MiB |   5579 MiB | 189758 MiB | 184593 MiB |
|       from small pool |    132 MiB |    136 MiB |   5515 MiB |   5382 MiB |
|---------------------------------------------------------------