In [1]:
import os

# Expose only GPU 0 to this process. The variable is set before torch is
# imported so the restriction is already in place when CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch

# Query CUDA availability once and reuse the answer below.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("CUDA is not available, using CPU.")
else:
    gpu_count = torch.cuda.device_count()
    print(f"Number of GPUs available: {gpu_count}")
    # List every visible GPU by index and name.
    for i in range(gpu_count):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

# Device handle for the rest of the notebook: first visible GPU, else CPU.
device = torch.device("cuda:0" if cuda_ready else "cpu")
print(f"Using device: {device}")

Number of GPUs available: 1
GPU 0: NVIDIA L40S
Using device: cuda:0


In [2]:
from scripts.inference import VLAModel
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display

# Path handed to VLAModel, relative to the notebook's working directory
# (presumably the SpatialVLA checkpoint directory — confirm).
model_path = "./SpatialVLA"
# Build the project's inference wrapper. Later cells call
# `model.inference_action(image, prompt)` and inspect `model.model`.
model = VLAModel(model_path)

[2024-10-30 14:05:26,384] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2024-10-30 14:05:26.673581: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-30 14:05:26.702957: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-30 14:05:26.702999: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-30 14:05:26.703763: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-30 14:05:26.708695: I tensorflow/core/platform/cpu_feature_guar

In [3]:
def draw(image, prompt):
    """Overlay ``prompt`` as white text, centred horizontally near the bottom of ``image``.

    The image is modified in place and the same object is returned, so callers
    that need the untouched frame must pass a copy.

    Args:
        image: PIL image to annotate (mutated).
        prompt: Text to render. Values that are not ``str``/``bytes`` (for
            example an action array) are converted with ``str()`` first.

    Returns:
        The same ``image`` object, with the text drawn onto it.
    """
    # Pillow's text routines expect text, not arbitrary objects.
    text = prompt if isinstance(prompt, (str, bytes)) else str(prompt)

    # Named `canvas` so the local does not shadow this function's own name.
    canvas = ImageDraw.Draw(image)
    font = ImageFont.load_default(10)  # You can load a custom font if needed
    image_width, image_height = image.size

    # Bounding box of the rendered text relative to the anchor point (0, 0).
    left, top, right, bottom = canvas.textbbox((0, 0), text, font=font)
    text_width = right - left
    text_height = bottom - top
    padding = 10  # Gap in pixels between the text and the bottom edge

    # The box may start at a non-zero (left, top) offset from the anchor.
    # Subtracting that offset puts the visible text exactly `padding` pixels
    # above the bottom edge and centres it horizontally.
    text_position = (
        (image_width - text_width) // 2 - left,
        image_height - text_height - padding - top,
    )
    canvas.text(text_position, text, font=font, fill="white")  # Text in white color
    return image

In [4]:
import h5py
import os
from tqdm import tqdm
import cv2
import numpy as np

folder_path = '/home/shared/LG_Robot/stack_cup/'
# os.listdir() returns names in arbitrary order; sort them so that `idx`
# selects the same file on every run and on every machine.
hdf5_files = [
    os.path.join(folder_path, file)
    for file in sorted(os.listdir(folder_path))
    if file.endswith(".hdf5")
]

# Position of the recording to process within the sorted file list.
idx = 20

# Annotated frames; consumed by the video-export cell.
img_list = []
# The prompt is the same for every frame, so build it once outside the loop.
# NOTE(review): the prompt talks about a wet tissue and a basket while the data
# folder is named stack_cup — confirm this is the intended instruction.
prompt_str = "What robot should do to transfer the wet tissue to the basket?"

# The context manager closes the HDF5 file even if inference raises.
with h5py.File(hdf5_files[idx], 'r') as data:
    for imgd in tqdm(data['/observation/image']):
        img = Image.fromarray(imgd).convert("RGB")
        res = model.inference_action(img, prompt_str)
        print(res)
        # Debug stop: only the first frame is processed. While this `break`
        # is in place, `img_list` stays empty.
        break
        # img_list.append(draw(img, res))
        # print('action', res.shape)

  0%|                                                                                                                                     | 0/226 [00:00<?, ?it/s]

None
[[-0.01106462  0.00346839 -0.00319081  0.01103914  0.05147479 -0.01263141
   0.59610183]]





In [None]:
# Encode the frames collected in `img_list` into an MP4 file.
if not img_list:
    # Fail with an actionable message instead of a bare IndexError on img_list[0].
    raise ValueError(
        "img_list is empty: collect annotated frames before exporting the video."
    )

# All frames are written at the size of the first image.
width, height = img_list[0].size

fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for .mp4
video_name = 'output_video.mp4'  # Output video file name
fps = 30  # Playback rate of the output video
video = cv2.VideoWriter(video_name, fourcc, fps, (width, height))
if not video.isOpened():
    # Without this check a failed open would go unnoticed and the success
    # message below would still be printed.
    raise RuntimeError(f"Could not open {video_name} for writing.")

try:
    for img in img_list:
        # Ensure a 3-channel RGB image, then convert it to a NumPy array.
        img_array = np.array(img.convert("RGB"))

        # Pillow stores pixels as RGB; OpenCV expects BGR.
        img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

        video.write(img_array)
finally:
    # Always release the writer so the file is closed, even if a frame
    # fails to convert. No GUI windows are opened in this cell, so
    # there is nothing else to clean up.
    video.release()

print(f"Video {video_name} created successfully!")

In [5]:
# Show the wrapped network's repr (its module tree) as the cell output.
model.model

SpatialVLAForCausalLM(
  (model): SpatialVLAModel(
    (embed_tokens): Embedding(32000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [7]:
# Save the wrapped network's configuration under ./my_model_config.
# NOTE(review): presumably a Hugging Face-style config whose save_pretrained
# writes a config.json into that directory — confirm.
model.model.config.save_pretrained("./my_model_config")