In [1]:
#@title [!Important]Please use a GPU runtime.
!nvidia-smi

Tue Nov 19 18:07:48 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...    Off | 00000000:01:00.0  On |                  N/A |
| N/A   53C    P8               4W /  80W |     66MiB /  8188MiB |     34%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install --upgrade pip
!pip install mbodied

Collecting mbodied
  Using cached mbodied-1.2.6-py3-none-any.whl.metadata (26 kB)
Collecting anthropic (from mbodied)
  Using cached anthropic-0.39.0-py3-none-any.whl.metadata (22 kB)
Collecting art (from mbodied)
  Using cached art-6.3-py3-none-any.whl.metadata (70 kB)
Collecting backoff (from mbodied)
  Using cached backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting compress-pickle>=2.1.0 (from mbodied)
  Using cached compress_pickle-2.1.0-py3-none-any.whl.metadata (3.1 kB)
Collecting datasets (from mbodied)
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting gradio (from mbodied)
  Using cached gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting gradio-client (from mbodied)
  Using cached gradio_client-1.4.3-py3-none-any.whl.metadata (7.1 kB)
Collecting gymnasium (from mbodied)
  Using cached gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting h5py (from mbodied)
  Using cached h5py-3.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014

In [None]:
# @title Install Real2Sim
!pip install numpy==1.24.4
!pip install orbax-checkpoint==0.4.4
!pip install scipy==1.12.0
!pip install keras==2.15.0
!pip install tensorflow==2.15.1
!git clone https://github.com/simpler-env/ManiSkill2_real2sim.git
!pip install -e ./ManiSkill2_real2sim
!git clone https://github.com/simpler-env/SimplerEnv.git
!pip install -e ./SimplerEnv
!mkdir ./SimplerEnv/checkpoints

Collecting numpy==1.24.4
  Using cached numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Using cached numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.3
    Uninstalling numpy-2.1.3:
      Successfully uninstalled numpy-2.1.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mbodied 1.2.6 requires numpy>=1.26.4, but you have numpy 1.24.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.24.4
Collecting orbax-checkpoint==0.4.4
  Using cached orbax_checkpoint-0.4.4-py3-none-any.whl.metadata (1.7 kB)
Collecting absl-py (from orbax-checkpoint==0.4.4)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting etils[epath,epy] (from orbax-checkpoint==

In [1]:
# @title [Important]Post Installation

# run this so local pip installs are recognized
import site
site.main()

In [2]:
import logging
import os

import click
from gymnasium import spaces
from pydantic import BaseModel, Field
from pydantic_core import from_json

from mbodied.agents.language import LanguageAgent
from mbodied.data.recording import Recorder
from mbodied.types.message import Message
from mbodied.types.motion.control import HandControl
from mbodied.types.sample import Sample
from mbodied.types.sense.vision import Image

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from mbodied.agents.motion.openvla_agent import OpenVlaAgent

openvla_agent = OpenVlaAgent(model_src="https://api.mbodi.ai/community-models/")

Loaded as API: https://api.mbodi.ai/community-models/ ✔


In [4]:
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
import mediapy
import sapien.core as sapien
import numpy as np
from mbodied.robots import Robot

import base64
import cv2

In [8]:
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
import sapien.core as sapien
import numpy as np
from mbodied.robots import Robot
import base64
import cv2

class Base64Image:
    def __init__(self, base64_str):
        self.base64 = base64_str

def numpy_to_base64(image: np.ndarray) -> str:
    # 이미지를 base64 문자열로 변환
    _, buffer = cv2.imencode('.png', image)
    base64_str = base64.b64encode(buffer).decode('utf-8')
    return base64_str

class SimplerENVRobot(Robot):
    def __init__(self):
        self.task_name = "widowx_carrot_on_plate"  # Task 설정
        self.env = simpler_env.make(self.task_name)
        sapien.render_config.rt_use_denoiser = False
        self.obs, self.reset_info = self.env.reset()
        print("Reset info", self.reset_info)

    def do(self, action: HandControl):
        print("[ROBOT ACTION]:", action)
        obs, reward, done, truncated, info = self.env.step(np.array(action.flatten('list')))
        self.obs = obs

    def capture(self):
        # 로봇의 관측 데이터를 캡처하고 base64로 변환
        image = get_image_from_maniskill2_obs_dict(self.env, self.obs)
        base64_str = numpy_to_base64(image)
        return Base64Image(base64_str)  # Base64Image 객체 반환



In [4]:
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
import sapien.core as sapien
import numpy as np
from mbodied.robots import Robot
import base64
import cv2

from mbodied.agents.motion.openvla_agent import OpenVlaAgent

openvla_agent = OpenVlaAgent(model_src="https://api.mbodi.ai/community-models/")

class Base64Image:
    def __init__(self, base64_str):
        self.base64 = base64_str

def numpy_to_base64(image: np.ndarray) -> str:
    # 이미지를 base64 문자열로 변환
    _, buffer = cv2.imencode('.png', image)
    base64_str = base64.b64encode(buffer).decode('utf-8')
    return base64_str

class SimplerENVRobot(Robot):
    def __init__(self):
        self.task_name = "widowx_carrot_on_plate"  # Task 설정
        self.env = simpler_env.make(self.task_name)
        sapien.render_config.rt_use_denoiser = False
        self.obs, self.reset_info = self.env.reset()
        print("Reset info", self.reset_info)

    def do(self, action: list):
        print("[ROBOT ACTION]:", action)
        obs, reward, done, truncated, info = self.env.step(action)
        self.obs = obs

    def capture(self):
        # 로봇의 관측 데이터를 캡처하고 base64로 변환
        image = get_image_from_maniskill2_obs_dict(self.env, self.obs)
        base64_str = numpy_to_base64(image)
        return Base64Image(base64_str)  # Base64Image 객체 반환



Loaded as API: https://api.mbodi.ai/community-models/ ✔


In [41]:
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
import sapien.core as sapien
import numpy as np
from mbodied.robots import Robot
import base64
import cv2

from mbodied.agents.motion.openvla_agent import OpenVlaAgent

openvla_agent = OpenVlaAgent(model_src="https://api.mbodi.ai/community-models/")

class Base64Image:
    def __init__(self, base64_str):
        self.base64 = base64_str

def numpy_to_base64(image: np.ndarray) -> str:
    # 이미지를 base64 문자열로 변환
    _, buffer = cv2.imencode('.png', image)
    base64_str = base64.b64encode(buffer).decode('utf-8')
    return base64_str

class SimplerENVRobot(Robot):
    def __init__(self):
        self.task_name = "widowx_carrot_on_plate"  # Task 설정
        self.env = simpler_env.make(self.task_name)
        sapien.render_config.rt_use_denoiser = False
        self.obs, self.reset_info = self.env.reset()
        print("Reset info", self.reset_info)

    def do(self, action: list):
        print("[ROBOT ACTION]:", action)
        obs, reward, done, truncated, info = self.env.step(action)
        self.obs = obs

    def capture(self):
        # 로봇의 관측 데이터를 캡처하고 base64로 변환
        image = get_image_from_maniskill2_obs_dict(self.env, self.obs)
        base64_str = numpy_to_base64(image)
        image = Base64Image(base64_str)
        return image # Base64Image 객체 반환



Loaded as API: https://api.mbodi.ai/community-models/ ✔


In [42]:
robot = SimplerENVRobot()

images = [robot.capture()]

# An example instruction.
instruction = "put carrot on plate"



Reset info {'scene_name': 'bridge_table_1_v1', 'scene_offset': None, 'scene_pose': None, 'scene_table_height': 0.87, 'urdf_version': '', 'rgb_overlay_path': '/home/khw/res/OpenVLA/SimplerEnv_Openvla/ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png', 'rgb_overlay_cameras': ['3rd_view_camera'], 'rgb_overlay_mode': 'background', 'disable_bad_material': False, 'episode_model_ids': ['bridge_carrot_generated_modified', 'bridge_plate_objaverse_larger'], 'episode_model_scales': [1.0, 1.0], 'episode_source_obj_name': 'bridge_carrot_generated_modified', 'episode_target_obj_name': 'bridge_plate_objaverse_larger', 'episode_source_obj_init_pose_wrt_robot_base': Pose([0.382, 0.103, -1.92144], [-0.707107, 2.76248e-08, 1.48615e-08, -0.707107]), 'episode_target_obj_init_pose_wrt_robot_base': Pose([0.382, -0.047, -1.92144], [0, -5.61613e-11, 0, -1]), 'episode_id': 1}


In [44]:


# Run 50 steps.
for i in range(50):
    action = openvla_agent.act(instruction, robot.capture(), unnorm_key="bridge_orig")
    # action = [v for k,v in list(action.pose)] +[list(action.grasp)[0][1]]
    robot.do(action)
    images.append(robot.capture())

[ROBOT ACTION]: HandControl(pose={'x': -0.000208787322, 'y': -0.000100362692, 'z': 1.85814697e-05, 'roll': 0.000499713421, 'pitch': 0.00122899628, 'yaw': -0.00167850941}, grasp={'value': 0.996078431})


In [None]:
# Show the video
images = [img.array for img in images]
mediapy.show_video(images, fps=5)

In [None]:
robot = SimplerENVRobot()

instruction = "put carrot on plate"
action = openvla_agent.act(instruction, robot.capture(), unnorm_key="bridge_orig")



Reset info {'scene_name': 'bridge_table_1_v1', 'scene_offset': None, 'scene_pose': None, 'scene_table_height': 0.87, 'urdf_version': '', 'rgb_overlay_path': '/home/khw/res/OpenVLA/SimplerEnv_Openvla/ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png', 'rgb_overlay_cameras': ['3rd_view_camera'], 'rgb_overlay_mode': 'background', 'disable_bad_material': False, 'episode_model_ids': ['bridge_carrot_generated_modified', 'bridge_plate_objaverse_larger'], 'episode_model_scales': [1.0, 1.0], 'episode_source_obj_name': 'bridge_carrot_generated_modified', 'episode_target_obj_name': 'bridge_plate_objaverse_larger', 'episode_source_obj_init_pose_wrt_robot_base': Pose([0.382, 0.103, -1.92144], [-0.707107, 2.76248e-08, 1.48615e-08, -0.707107]), 'episode_target_obj_init_pose_wrt_robot_base': Pose([0.382, -0.047, -1.92144], [0, -5.61613e-11, 0, -1]), 'episode_id': 1}


In [None]:
# Run 50 steps.
for i in range(50):
    action = openvla_agent.act(instruction, robot.capture(), unnorm_key="bridge_orig")
    robot.do(action)
    images.append(robot.capture())

In [1]:
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
import mediapy
import sapien.core as sapien

task_name = "widowx_carrot_on_plate"  # @param ["google_robot_pick_coke_can", "google_robot_move_near", "google_robot_open_drawer", "google_robot_close_drawer", "widowx_spoon_on_towel", "widowx_carrot_on_plate", "widowx_stack_cube", "widowx_put_eggplant_in_basket"]

if 'env' in locals():
  print("Closing existing env")
  env.close()
  del env
  
env = simpler_env.make(task_name)
# Colab GPU does not supoort denoiser
sapien.render_config.rt_use_denoiser = False
obs, reset_info = env.reset()
instruction = env.get_language_instruction()
print("Reset info", reset_info)
print("Instruction", instruction)

frames = []
done, truncated = False, False
while not (done or truncated):
   # action[:3]: delta xyz; action[3:6]: delta rotation in axis-angle representation;
   # action[6:7]: gripper (the meaning of open / close depends on robot URDF)
   image = get_image_from_maniskill2_obs_dict(env, obs)
   action = env.action_space.sample() # replace this with your policy inference
   print('action: ', action)
   obs, reward, done, truncated, info = env.step(action)
   frames.append(image)

episode_stats = info.get('episode_stats', {})
print("Episode stats", episode_stats)
mediapy.show_video(frames, fps=10)

: 