In [1]:
import json
import os

# The offline flag is read when `transformers` (via `huggingface_hub`) is first
# imported, so it has to be exported BEFORE that import to have any effect.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

# Preferred GPU index. When fewer GPUs are visible, fall back to the last one so a
# single-GPU machine does not fail later with an invalid device ordinal.
GPU_INDEX = 1
if torch.cuda.is_available():
    device = torch.device(f"cuda:{min(GPU_INDEX, torch.cuda.device_count() - 1)}")
else:
    device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm
2024-11-25 18:55:47.217416: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-25 18:55:47.244204: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-25 18:55:47.244226: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-25 18:55:47.244985: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-25 18:55:47.2

In [None]:
# Checkpoint to load the processor and the VLA model from.
# Previously used alternatives:
#   path = '/cluster/nvme9a/dzk/'
#   path = "openvla/openvla-7b"
path = "logs/ecot-openvla-7b-oxe+libero_spatial_no_noops+b16+lr-0.0005+lora-r32+dropout-0.0--image_aug"

# Both objects come from the same checkpoint directory.
processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    path,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # optional; requires the `flash_attn` package
)
vla = vla.to(device)

# Example observation and task. On a robot the image would come from the camera:
# image: Image.Image = get_from_camera(...)
instruction = "put eggplant into pot"
image = Image.open("test.png")
prompt = f"In: What action should the robot take to {instruction}?\nOut:"

# Preprocess prompt + image, then move the batch to the model's device in bfloat16.
inputs = processor(prompt, image)
inputs = inputs.to(device, dtype=torch.bfloat16)

# Action prediction is left disabled in this cell:
# action = vla.predict_action(**inputs, unnorm_key="libero_spatial_no_noops", do_sample=False)
# Execute...
# robot.act(action, ...)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Expected `transformers==4.40.1` and `tokenizers==0.19.1` but got `transformers==4.46.1` and `tokenizers==0.20.1`; there might be inference-time regressions due to dependency changes. If in doubt, pleaseuse the above versions.
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  5.62it/s]


In [None]:
# If the checkpoint directory ships a dataset_statistics.json, use it as the
# model's normalization statistics; otherwise keep whatever the model already has.
dataset_statistics_path = os.path.join(path, "dataset_statistics.json")
if os.path.isfile(dataset_statistics_path):
    with open(dataset_statistics_path) as f:
        norm_stats = json.load(f)
        vla.norm_stats = norm_stats

# Greedy (non-sampled) action prediction, un-normalized with the LIBERO-spatial key.
action = vla.predict_action(
    **inputs,
    unnorm_key='libero_spatial_no_noops',
    do_sample=False,
)




In [4]:
# Echo the most recent `action` value. In the saved output this is a tuple of a
# 7-element array and a tensor of token ids.
action

(array([0.93420005, 0.87287817, 0.92847689, 0.10351471, 0.17603361,
        0.14506722, 0.99607843]),
 tensor([[    1,   512, 29901,  1724,  3158,   881,   278, 19964,  2125,   304,
           1925, 19710, 24389,   964,  3104, 29973,    13,  3744, 29901,   323]],
        device='cuda:1'))

In [None]:
# from torch.profiler import profile, record_function, ProfilerActivity

# === BFLOAT16 MODE ===
# Rebuild the model inputs and move them to the target device in bfloat16.
inputs = processor(prompt, image)
inputs = inputs.to(device, dtype=torch.bfloat16)
# inputs["input_ids"] = inputs["input_ids"][:, 1:]

# Run OpenVLA Inference
# Fix the torch RNG seed before the profiled calls.
torch.manual_seed(0)
def trace_handler(prof):
    """Write the profiler's collected trace to ``tmp/test_trace_<step_num>.json``.

    Used as the ``on_trace_ready`` callback of ``torch.profiler.profile``.

    Args:
        prof: The profiler object; must expose ``step_num`` and
            ``export_chrome_trace(path)``.
    """
    # print(prof.key_averages().table(
    #     sort_by="self_cuda_time_total", row_limit=-1))
    # The export does not create missing directories, so make sure the output
    # directory exists before writing the trace.
    os.makedirs("tmp", exist_ok=True)
    prof.export_chrome_trace("tmp/test_trace_" + str(prof.step_num) + ".json")

# Profile three predict_action calls on CPU and CUDA. With wait=1 / warmup=1 /
# active=1 the first step is skipped, the second is warm-up, and only the third
# is recorded and handed to `trace_handler`.
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    schedule=torch.profiler.schedule(
        wait=1,
        warmup=1,
        active=1),
    on_trace_ready=trace_handler,
    with_stack=True,
    profile_memory=True,
    with_flops=True,
) as p:
    # Named `step_idx` rather than `iter` so the builtin `iter` is not shadowed
    # for the rest of the kernel session.
    for step_idx in range(3):
        # Same un-normalization key as the earlier prediction cell, which is the
        # one that matches the checkpoint's dataset statistics.
        action = vla.predict_action(**inputs, unnorm_key="libero_spatial_no_noops", do_sample=False)
        p.step()  # advance the profiler schedule by one step



STAGE:2024-11-04 14:16:38 74110:74110 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-11-04 14:16:38 74110:74110 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-11-04 14:16:38 74110:74110 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [None]:
# Profile latency with CUDA events: time 10 predict_action calls and report the
# mean and standard deviation. Event.elapsed_time() returns milliseconds.
times = []
# Events are recorded on the *current* device's stream and synchronize() waits on
# the *current* device. Unless changed elsewhere that is cuda:0, while the model
# may live on another GPU, so make the model's device current while timing.
with torch.cuda.device(device):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    for run_idx in range(10):
        start.record()
        # Same un-normalization key as the earlier prediction cell.
        action = vla.predict_action(**inputs, unnorm_key="libero_spatial_no_noops", do_sample=False)
        end.record()
        # Wait until both events have completed before reading the elapsed time.
        torch.cuda.synchronize()
        times.append(start.elapsed_time(end))
print("Average inference time: ", sum(times)/len(times))
print("Std: ", torch.tensor(times).std().item())

Average inference time:  167.4547607421875
Std:  1.8540565967559814


In [None]:
# Profile memory with torch: print PyTorch's CUDA memory summary for `device`
# (current / peak / total allocated and freed, as shown in the output below).
print(torch.cuda.memory_summary(device))

|                  PyTorch CUDA memory summary, device ID 1                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  14405 MiB |  14692 MiB |  89785 MiB |  75380 MiB |
|       from large pool |  14401 MiB |  14688 MiB |  82197 MiB |  67795 MiB |
|       from small pool |      3 MiB |      7 MiB |   7588 MiB |   7584 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  14405 MiB |  14692 MiB |  89785 MiB |  75380 MiB |
|       from large pool |  14401 MiB |  14688 MiB |  82197 MiB |  67795 MiB |
|       from small pool |      3 MiB |      7 MiB |   7588 MiB |   7584 MiB |
|---------------------------------------------------------------