In [1]:
import os

# Expose only GPU 0 to this process; the variable is set before torch is
# imported so that it is already in place when torch looks for devices.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch

# Query CUDA once and reuse the answer for both the report and the device choice.
cuda_ready = torch.cuda.is_available()

if not cuda_ready:
    print("CUDA is not available, using CPU.")
else:
    gpu_total = torch.cuda.device_count()
    print(f"Number of GPUs available: {gpu_total}")
    for gpu_index in range(gpu_total):
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")

# First visible GPU when CUDA works, otherwise fall back to the CPU.
if cuda_ready:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Number of GPUs available: 1
GPU 0: NVIDIA L40S
Using device: cuda:0


In [2]:
from mobilevlm.model.mobilevlm import load_pretrained_model
from mobilevlm.conversation import conv_templates, SeparatorStyle
from mobilevlm.utils import disable_torch_init, process_images, tokenizer_image_token, KeywordsStoppingCriteria
from mobilevlm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from scripts.inference import VLMModel
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display

# Identifier of the checkpoint to load (looks like a model-hub repo id;
# how it is resolved is up to VLMModel).
model_path = "remyxai/SpaceLLaVA-lite"

# VLMModel receives its settings as attributes of a single object, so an
# anonymous class is created on the fly and instantiated once to act as a
# lightweight stand-in for parsed command-line arguments.
args = type(
    'Args',
    (),
    dict(
        model_path=model_path,
        conv_mode="v1",
        temperature=0.7,
        num_beams=1,
        top_p=None,
        max_new_tokens=512,
        load_8bit=False,
        load_4bit=False,
    ),
)()

# Build the model wrapper from those settings.
model = VLMModel(args)

[2024-10-24 22:00:53,748] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32000. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


In [3]:
def _wrap_to_width(canvas, text, font, max_width):
    """Greedily word-wrap ``text`` so each rendered line fits in ``max_width`` pixels.

    Existing newlines are kept as paragraph breaks. A single word that is
    wider than ``max_width`` is left on a line of its own (it is not split).
    """
    wrapped_lines = []
    for paragraph in text.split("\n"):
        current = ""
        for word in paragraph.split(" "):
            candidate = word if not current else f"{current} {word}"
            # Only break when there is already something on the line;
            # otherwise an over-long first word would loop forever.
            if current and canvas.textlength(candidate, font=font) > max_width:
                wrapped_lines.append(current)
                current = word
            else:
                current = candidate
        wrapped_lines.append(current)
    return "\n".join(wrapped_lines)


def draw(image, prompt):
    """Overlay ``prompt`` in white, horizontally centred near the bottom of ``image``.

    The image is modified in place and the same object is returned.

    Text that would be wider than the image is word-wrapped to the image
    width; without this the centring offset becomes negative and both ends
    of the text are cut off. The anchor is also clamped so the text never
    starts outside the image.

    Args:
        image: PIL image to draw on (mutated in place).
        prompt: Text to render.

    Returns:
        The same ``image`` object with the text drawn on it.
    """
    # Named `canvas` rather than `draw` so the local does not shadow this function.
    canvas = ImageDraw.Draw(image)
    font = ImageFont.load_default(10)  # You can load a custom font if needed
    image_width, image_height = image.size
    padding = 10  # Padding from the bottom

    def measure(text):
        # Width/height of the rendered text's bounding box.
        left, top, right, bottom = canvas.textbbox((0, 0), text, font=font)
        return right - left, bottom - top

    text_width, text_height = measure(prompt)
    if text_width > image_width:
        # Too wide for one line: wrap, then re-measure the multi-line block.
        prompt = _wrap_to_width(canvas, prompt, font, image_width)
        text_width, text_height = measure(prompt)

    # Clamp to 0 so an over-long word or a very tall block still starts on-image.
    x = max((image_width - text_width) // 2, 0)
    y = max(image_height - text_height - padding, 0)

    # `align` only affects multi-line text (centres lines relative to each other).
    canvas.text((x, y), prompt, font=font, fill="white", align="center")
    return image

In [4]:
import h5py
import os
from tqdm import tqdm
import cv2
import numpy as np

folder_path = '/home/shared/LG_Robot/stack_cup/'

# os.listdir returns entries in arbitrary order, so the list is sorted to make
# `idx` select the same recording on every run and on every machine.
hdf5_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith(".hdf5")
)

idx = 20  # position (in the sorted list) of the recording to annotate

# The question is identical for every frame, so it is defined once outside the loop.
prompt_str = "What is the distance between pink cup and robot gripper in centimeters?"

img_list = []
# The context manager closes the HDF5 file even when the loop is interrupted
# (e.g. by a KeyboardInterrupt during a long inference run).
with h5py.File(hdf5_files[idx], 'r') as data:
    for imgd in tqdm(data['/observation/image']):
        # Each dataset entry is turned into an RGB PIL image, queried, and
        # then annotated with the model's answer.
        img = Image.fromarray(imgd).convert("RGB")
        res = model.inference(img, prompt_str)
        img_list.append(draw(img, res))

  0%|                                                                                                                                                                                                | 0/226 [00:00<?, ?it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  1%|█▋                                                                                                                                                                                      | 2/226 [00:00<01:30,  2.46it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  1%|██▍                                                                                                                                                                                     | 3/226 [00:01<01:24,  2.65it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  2%|███▎                                                                                                                                                                                    | 4/226 [00:01<01:23,  2.66it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  3%|████▉                                                                                                                                                                                   | 6/226 [00:02<01:02,  3.49it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  3%|█████▋                                                                                                                                                                                  | 7/226 [00:02<01:06,  3.27it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  4%|██████▌                                                                                                                                                                                 | 8/226 [00:02<01:07,  3.23it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  4%|████████                                                                                                                                                                               | 10/226 [00:03<01:03,  3.42it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  5%|████████▉                                                                                                                                                                              | 11/226 [00:03<01:07,  3.20it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  6%|██████████▌                                                                                                                                                                            | 13/226 [00:04<00:58,  3.65it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  6%|███████████▎                                                                                                                                                                           | 14/226 [00:04<00:50,  4.21it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  7%|████████████▉                                                                                                                                                                          | 16/226 [00:04<00:50,  4.17it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  8%|██████████████▌                                                                                                                                                                        | 18/226 [00:05<00:47,  4.35it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:


  8%|███████████████▍                                                                                                                                                                       | 19/226 [00:05<01:04,  3.20it/s]

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
What is the distance between pink cup and robot gripper in centimeters? ASSISTANT:





KeyboardInterrupt: 

In [20]:
# Fail early with a clear message instead of an IndexError on img_list[0].
if not img_list:
    raise ValueError("img_list is empty; run the inference cell before exporting a video")

# The video frame size is taken from the first image.
width, height = img_list[0].size

fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for .mp4
video_name = 'output_video.mp4'  # Output video file name
video = cv2.VideoWriter(video_name, fourcc, 30, (width, height))  # 30 fps

try:
    # VideoWriter does not raise when it cannot open the output (missing codec,
    # unwritable path, ...); without this check the cell would report success
    # while producing no usable file.
    if not video.isOpened():
        raise RuntimeError(f"Could not open video writer for {video_name}")

    for img in img_list:
        # Make sure the frame has exactly three RGB channels.
        img = img.convert("RGB")

        # Convert the Pillow image to a NumPy array
        img_array = np.array(img)

        # Pillow gives RGB; OpenCV expects BGR channel order.
        img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

        # Append the frame to the video
        video.write(img_array)
finally:
    # Always release the writer so the file is finalised / the handle is freed,
    # even if a frame conversion fails part-way through.
    video.release()
cv2.destroyAllWindows()

print(f"Video {video_name} created successfully!")

Video output_video.mp4 created successfully!


In [22]:
# Display the network held by the VLMModel wrapper; as the last expression of
# the cell, its repr (the module tree) is rendered as the cell output.
model.model

MobileLlamaForCausalLM(
  (model): MobileLlamaModel(
    (embed_tokens): Embedding(32000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): L