In [8]:
!pip install 'stable-baselines3[extra]' --quiet
!pip install tqdm --quiet
%env MUJOCO_GL=enable
from myosuite.utils import gym
from showvid import show_video
import skvideo.io
import numpy as np
import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
from stable_baselines3 import PPO


env: MUJOCO_GL=enable


# MyoFingerReach example

In [None]:
# Instantiate the 'myoFingerReachFixed-v0' environment.
env = gym.make("myoFingerReachFixed-v0")

# Train a PPO agent with an MLP policy on that environment (silent logging),
# then write the trained policy to disk under the name "Reach_policy".
model = PPO(policy="MlpPolicy", env=env, verbose=0)
model.learn(total_timesteps=100_000)
model.save("Reach_policy")

In [None]:
# Evaluate the trained policy: roll it out for a fixed number of episodes and
# report the mean of the per-episode summed rewards.
N_EVAL_EPISODES = 20      # number of evaluation episodes
MAX_EPISODE_STEPS = 40    # upper bound on steps taken per episode

all_rewards = []
for _episode in tqdm(range(N_EVAL_EPISODES)):
    ep_rewards = []
    obs, _ = env.reset()
    for _step in range(MAX_EPISODE_STEPS):
        # get the next action from the policy
        action, _ = model.predict(obs)
        # take an action based on the current observation
        obs, reward, terminated, truncated, info = env.step(action)
        ep_rewards.append(reward)
        # Stop as soon as the episode is over; stepping an environment that has
        # already terminated/truncated without a reset is outside the env API.
        done = terminated or truncated
        if done:
            break
    all_rewards.append(np.sum(ep_rewards))
print(f"Average reward: {np.mean(all_rewards)} over {N_EVAL_EPISODES} episodes")


In [None]:
# Roll out the trained policy and collect one offscreen-rendered frame per step.
N_RENDER_STEPS = 1000

frames = []
# A single reset is enough; its observation seeds the first prediction.
obs, _ = env.reset()
for _frame_idx in tqdm(range(N_RENDER_STEPS)):
    # Predict the action for the current observation
    action, _ = model.predict(obs)
    # Render the current frame from the environment (before stepping)
    frames.append(env.sim.renderer.render_offscreen(
                        width=400,
                        height=400,
                        camera_id=-1))
    # Take the action and get the next observation
    obs, _, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    if done:
        # Start a new episode instead of stepping a finished one.
        obs, _ = env.reset()
env.close()

In [None]:
# Create the output folder if it is not there yet.
video_dir = 'videos'
video_path = 'videos/Reach.mp4'
os.makedirs(video_dir, exist_ok=True)

# Stack the collected frames into one array and encode a local MP4 copy.
clip = np.asarray(frames)
skvideo.io.vwrite(video_path, clip, outputdict={"-pix_fmt": "yuv420p"})

# Show the saved clip in the notebook.
show_video(video_path)

# MyoDM example

In [2]:
import numpy as np
from myosuite.utils import gym
import os

from stable_baselines3 import PPO

# Instantiate the 'MyoHandMouseLift-v0' environment and reset it once.
env = gym.make("MyoHandMouseLift-v0")
env.reset()

# PPO settings used for this short demonstration run.
ppo_settings = dict(
    verbose=1,
    ent_coef=0.2,
    gamma=0.95,
    learning_rate=0.015,
)
model = PPO("MlpPolicy", env, **ppo_settings)

# Short run only; to train to convergence use far more timesteps, e.g.
# model.learn(total_timesteps=int(1e7))
model.learn(total_timesteps=10_000)

# Persist the trained policy to disk under the name "inspect_policy".
model.save("inspect_policy")

[36m    MyoDex: A Generalizable Prior for Dexterous Manipulation
        Vittorio Caggiano, Sudeep Dasari, Vikash Kumar
        ICML-2023, https://arxiv.org/abs/2309.03130
    [0m
target [0.05 0.1  0.3 ]
tip_pos [0.1417 0.161  0.3739]
target [0.05 0.1  0.3 ]
tip_pos [0.0267 0.0579 0.054 ]
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
target [0.05 0.1  0.3 ]
tip_pos [0.0267 0.0579 0.054 ]
target [0.05 0.1  0.3 ]
tip_pos [0.0262 0.0589 0.0569]
target [0.05 0.1  0.3 ]
tip_pos [0.0144 0.058  0.0654]
target [0.05 0.1  0.3 ]
tip_pos [0.0267 0.0579 0.054 ]
target [0.05 0.1  0.3 ]
tip_pos [0.0235 0.0544 0.0509]
target [0.05 0.1  0.3 ]
tip_pos [0.017  0.0458 0.0516]
target [0.05 0.1  0.3 ]
tip_pos [0.0267 0.0579 0.054 ]
target [0.05 0.1  0.3 ]
tip_pos [0.0307 0.046  0.0406]
target [0.05 0.1  0.3 ]
tip_pos [0.0345 0.0267 0.0341]
target [0.05 0.1  0.3 ]
tip_pos [0.0267 0.0579 0.054 ]
target [0.05 0.1  0.3 ]
tip_pos [0.0326 0.0547 0.0481]
target [0

In [3]:
# Evaluate the trained policy: roll it out for a fixed number of episodes and
# report the mean of the per-episode summed rewards.
N_EVAL_EPISODES = 20      # number of evaluation episodes
MAX_EPISODE_STEPS = 40    # upper bound on steps taken per episode

all_rewards = []
for _episode in tqdm(range(N_EVAL_EPISODES)):
    ep_rewards = []
    obs, _ = env.reset()
    for _step in range(MAX_EPISODE_STEPS):
        # get the next action from the policy
        action, _ = model.predict(obs)
        # take an action based on the current observation
        obs, reward, terminated, truncated, info = env.step(action)
        ep_rewards.append(reward)
        # Stop as soon as the episode is over; stepping an environment that has
        # already terminated/truncated without a reset is outside the env API.
        done = terminated or truncated
        if done:
            break
    all_rewards.append(np.sum(ep_rewards))
print(f"Average reward: {np.mean(all_rewards)} over {N_EVAL_EPISODES} episodes")


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _ in tqdm(range(20)): # 20 random targets


  0%|          | 0/20 [00:00<?, ?it/s]

target [0.05 0.1  0.3 ]
tip_pos [0.0267 0.0579 0.054 ]
target [0.05 0.1  0.3 ]
tip_pos [0.0307 0.0525 0.0572]
target [0.05 0.1  0.3 ]
tip_pos [0.0348 0.0429 0.0617]
target [0.05 0.1  0.3 ]
tip_pos [0.0282 0.0294 0.064 ]
target [0.05 0.1  0.3 ]
tip_pos [0.0115 0.0129 0.0663]
target [0.05 0.1  0.3 ]
tip_pos [-0.0071 -0.0062  0.0674]
target [0.05 0.1  0.3 ]
tip_pos [-0.014  -0.0272  0.0684]
target [0.05 0.1  0.3 ]
tip_pos [-0.0099 -0.0469  0.0722]
target [0.05 0.1  0.3 ]
tip_pos [ 0.0018 -0.0673  0.0801]
target [0.05 0.1  0.3 ]
tip_pos [ 0.0198 -0.0893  0.0897]
target [0.05 0.1  0.3 ]
tip_pos [ 0.0347 -0.1094  0.0986]
target [0.05 0.1  0.3 ]
tip_pos [ 0.0463 -0.1279  0.1126]
target [0.05 0.1  0.3 ]
tip_pos [ 0.0601 -0.1447  0.1325]
target [0.05 0.1  0.3 ]
tip_pos [ 0.0668 -0.1587  0.1519]
target [0.05 0.1  0.3 ]
tip_pos [ 0.071  -0.1726  0.1647]
target [0.05 0.1  0.3 ]
tip_pos [ 0.0722 -0.1844  0.1753]
target [0.05 0.1  0.3 ]
tip_pos [ 0.0705 -0.1956  0.1914]
target [0.05 0.1  0.3 ]
tip_p

In [6]:
import os

import skvideo.io

N_RENDER_STEPS = 400

# Start a fresh episode and keep its observation for the first prediction.
obs, _ = env.reset()

# Find geometries whose group is 1 (the skins, per the original notebook note)
geom_1_indices = np.where(env.sim.model.geom_group == 1)
# Set their alpha channel to 0 so they are rendered fully transparent
env.sim.model.geom_rgba[geom_1_indices, 3] = 0

# Ask the renderer to draw actuators and tendons (applied once; the original
# cell repeated this whole setup section twice verbatim).
env.sim.renderer.set_viewer_settings(
           render_actuator=True,
           render_tendon=True
       )

# Reload the policy saved by the training cell rather than reusing `model`.
pi = PPO.load("inspect_policy")
frames = []
for _frame_idx in range(N_RENDER_STEPS):
    # Render the current state before acting on it.
    frames.append(env.sim.renderer.render_offscreen(width=400, height=400, camera_id=5))
    action = pi.predict(obs)[0]
    # Reuse the observation returned by step() instead of calling
    # env.get_obs() again on every iteration.
    obs, _, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    if done:
        # Start a new episode instead of stepping a finished one.
        obs, _ = env.reset()

os.makedirs("videos", exist_ok=True)
# make a local copy
skvideo.io.vwrite(
    "videos/inspect.mp4",
    np.asarray(frames),
    outputdict={"-pix_fmt": "yuv420p", "-r": "10"},
)




target [0.05 0.1  0.3 ]
tip_pos [0.0267 0.0579 0.054 ]
target [0.05 0.1  0.3 ]
tip_pos [0.0267 0.0579 0.054 ]
target [0.05 0.1  0.3 ]
tip_pos [0.0167 0.0491 0.0494]
target [0.05 0.1  0.3 ]
tip_pos [0.0167 0.0491 0.0494]
target [0.05 0.1  0.3 ]
tip_pos [0.0027 0.0349 0.0501]
target [0.05 0.1  0.3 ]
tip_pos [0.0027 0.0349 0.0501]
target [0.05 0.1  0.3 ]
tip_pos [-0.0136  0.0192  0.0565]
target [0.05 0.1  0.3 ]
tip_pos [-0.0136  0.0192  0.0565]
target [0.05 0.1  0.3 ]
tip_pos [-0.0351 -0.0013  0.0606]
target [0.05 0.1  0.3 ]
tip_pos [-0.0351 -0.0013  0.0606]
target [0.05 0.1  0.3 ]
tip_pos [-0.0484 -0.0254  0.0633]
target [0.05 0.1  0.3 ]
tip_pos [-0.0484 -0.0254  0.0633]
target [0.05 0.1  0.3 ]
tip_pos [-0.0461 -0.0487  0.0686]
target [0.05 0.1  0.3 ]
tip_pos [-0.0461 -0.0487  0.0686]
target [0.05 0.1  0.3 ]
tip_pos [-0.031  -0.0728  0.0789]
target [0.05 0.1  0.3 ]
tip_pos [-0.031  -0.0728  0.0789]
target [0.05 0.1  0.3 ]
tip_pos [-0.0156 -0.0965  0.0934]
target [0.05 0.1  0.3 ]
tip_pos 

  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.stdin.write(vid.tostring())
  self._proc.std

In [7]:
# Pass the clip written to 'videos/inspect.mp4' to the show_video helper
# (imported from `showvid`; presumably embeds the video in the cell output — confirm there).
show_video('videos/inspect.mp4')