# Simulación de un brazo robótico con FetchPickAndPlace de Gymnasium-Robotics usando MuJoCo y Stable-Baselines3

En este notebook vamos a simular y entrenar un agente (brazo robótico) en el entorno `FetchPickAndPlace-v4` utilizando `gymnasium`, `mujoco` y `stable-baselines3`.

In [5]:
!pip install tensorboard

Collecting tensorboard
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.73.1-cp310-cp310-win_amd64.whl.metadata (4.0 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Downloading markdown-3.8.2-py3-none-any.whl.metadata (5.1 kB)
Collecting protobuf!=4.24.0,>=3.19.6 (from tensorboard)
  Downloading protobuf-6.31.1-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Downloading tensorboard-2.19.0-py3-none-any.whl (5.5 MB)
   ---------------------------------------- 0.0/5.5 MB ? eta -:--:--
   -------------------------------------- - 5.2/5.5 MB 29.0 MB/s eta 0:00:01
   ---------------------------------------- 5.5/5.5 MB 25.8 MB/s eta 0:00:00
Downloading t

## Pruebas


In [None]:
# Scratch check: locate pick_and_place.xml inside the virtual environment.
import os

# Base path to site-packages inside the virtual environment (adjust if yours differs).
base_dir = "./Ve_ArmSimulation2/Lib/site-packages"
target_name = "pick_and_place.xml"

# Lazily walk the tree; next() stops the walk at the first directory containing the file.
candidate_paths = (
    os.path.join(folder, target_name)
    for folder, _subdirs, filenames in os.walk(base_dir)
    if target_name in filenames
)
first_match = next(candidate_paths, None)
if first_match is not None:
    print(first_match)


In [None]:
# ✅ Instalar dependencias necesarias (si no están instaladas)
!pip install gymnasium[robotics] mujoco matplotlib stable-baselines3 --quiet

In [None]:
import gymnasium_robotics
import importlib.metadata

# Report the installed gymnasium-robotics version, then the file the package was imported from.
robotics_version = importlib.metadata.version("gymnasium-robotics")
print(robotics_version)

robotics_module_file = gymnasium_robotics.__file__
print(robotics_module_file)

## 1. Simulación básica del entorno FetchPickAndPlace

In [3]:
import os

# Select the GLFW rendering backend; set before the imports below so the value
# is already in place when the simulator is loaded.
os.environ["MUJOCO_GL"] = "glfw"

import gymnasium as gym
import gymnasium_robotics  # robotics package that provides the Fetch environments
import numpy as np

N_RANDOM_STEPS = 50  # number of random actions to play

# Create the environment with on-screen rendering.
env = gym.make("FetchPickAndPlace-v4", render_mode="human")
try:
    obs, info = env.reset()

    # Drive the arm with random actions, restarting whenever an episode ends.
    for _ in range(N_RANDOM_STEPS):
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        env.render()
        if terminated or truncated:
            obs, info = env.reset()
finally:
    # Always release the environment (and its viewer window), even if a step raised.
    env.close()

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.war

## 2. Entrenamiento con PPO (Stable-Baselines3)

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Vectorized wrapper holding a single copy of the pick-and-place environment.
vec_env = make_vec_env(
    "FetchPickAndPlace-v4",
    n_envs=1,
)

# PPO agent using MultiInputPolicy, with progress logging enabled.
model = PPO(
    policy="MultiInputPolicy",
    env=vec_env,
    verbose=1,
)

# Run training for 50k timesteps.
model.learn(total_timesteps=50_000)

# Write the trained agent to disk.
model.save("fetch_pick_and_place_ppo")

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -48.8    |
|    success_rate    | 0.025    |
| time/              |          |
|    fps             | 231      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 50          |
|    ep_rew_mean          | -48.8       |
|    success_rate         | 0.0247      |
| time/                   |             |
|    fps                  | 215         |
|    iterations           | 2           |
|    time_elapsed         | 19          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004900503 |
|    clip_fraction        | 0.0265      |
|    clip_range           | 0.2         |
|    entropy_loss    

### 3. Evaluación del agente entrenado

In [None]:
# Load the trained PPO model and watch it act in a rendered environment.
# The imports are repeated here so this cell can run without first re-running
# the training cell.
import gymnasium as gym
import gymnasium_robotics  # robotics package that provides the Fetch environments
from stable_baselines3 import PPO

N_EVAL_STEPS = 1000  # total environment steps to visualise

model = PPO.load("fetch_pick_and_place_ppo")
env = gym.make("FetchPickAndPlace-v4", render_mode="human")

try:
    obs, info = env.reset()
    for _ in range(N_EVAL_STEPS):
        # Deterministic prediction, consistent with the SAC evaluation cell.
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        env.render()
        if terminated or truncated:
            obs, info = env.reset()
finally:
    # Always release the environment (and its viewer window), even if a step raised.
    env.close()

# 2 Entrenamiento con SAC (Stable-Baselines3)

In [3]:
# Imports are listed in full so the cell does not depend on names left behind
# by earlier cells (`os`, `gym`).
import os

import gymnasium as gym
import gymnasium_robotics  # robotics package that provides the Fetch environments
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback

# Create the training environment (no render mode requested).
env = gym.make("FetchPickAndPlace-v4")

# One directory for both the periodic checkpoints and the final model.
save_dir = "./sac_models/"
os.makedirs(save_dir, exist_ok=True)

# Callback that saves a checkpoint every 15,000 steps.
checkpoint_callback = CheckpointCallback(
    save_freq=15000,          # every 15,000 steps
    save_path=save_dir,       # directory the checkpoints are written to
    name_prefix="sac_model",  # file-name prefix
)

# Create the SAC model.
model = SAC("MultiInputPolicy", env, verbose=1, tensorboard_log="./sac_logs/")

# Train with checkpoints.
model.learn(total_timesteps=160_000, callback=checkpoint_callback)

# Save the final model.
model.save(os.path.join(save_dir, "sac_final"))

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./sac_logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -50      |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 4        |
|    fps             | 44       |
|    time_elapsed    | 4        |
|    total_timesteps | 200      |
| train/             |          |
|    actor_loss      | -4.8     |
|    critic_loss     | 0.0546   |
|    ent_coef        | 0.971    |
|    ent_coef_loss   | -0.199   |
|    learning_rate   | 0.0003   |
|    n_updates       | 99       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -50      |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 8        |
|    fps             | 40   

### Reanudar entrenamiento desde checkpoint

In [None]:
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback
import gymnasium as gym
import os

# NOTE(review): render_mode="human" draws the scene while training; presumably this
# slows the run down noticeably — consider dropping it for long runs.
env = gym.make("FetchPickAndPlace-v4", render_mode="human", reward_type="sparse")

# Load the previously saved final model and attach the environment to it.
model = SAC.load("./sac_models/sac_final", env=env)

checkpoint_callback = CheckpointCallback(
    save_freq=15000,           # every 15,000 steps
    save_path="./sac_models",  # directory the checkpoints are written to
    name_prefix="sac_model",   # file-name prefix
)

# Continue training.
model.learn(
    total_timesteps=60_000,       # or however many steps you want to add
    callback=checkpoint_callback,
    reset_num_timesteps=False,    # keep the timestep counter running instead of restarting it
)

# Save alongside the other SAC models: the evaluation cell loads
# "./sac_models/sac_final_continuado".
model.save(os.path.join("./sac_models", "sac_final_continuado"))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./sac_logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -48.4    |
|    success_rate    | 0.03     |
| time/              |          |
|    episodes        | 3204     |
|    fps             | 26       |
|    time_elapsed    | 7        |
|    total_timesteps | 160200   |
| train/             |          |
|    actor_loss      | 51.2     |
|    critic_loss     | 7.91     |
|    ent_coef        | 0.0468   |
|    ent_coef_loss   | 0.606    |
|    learning_rate   | 0.0003   |
|    n_updates       | 160099   |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -48.4    |
|    success_rate    | 0.03     |
| time/              |          |
|    episodes        | 3208     |
|    fps             | 27       |
|    time_e

: 

### Evaluar el agente entrenado

In [None]:
from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy
import gymnasium as gym
import numpy as np

# Create the environment first: SAC.load() below receives it through `env=`,
# so it must exist before the model is loaded.
env = gym.make("FetchPickAndPlace-v4", render_mode="human")

model = SAC.load("./sac_models/sac_final_continuado", env=env)

n_episodes = 20
successes = []  # one success flag per episode
rewards = []    # one summed reward per episode

for ep in range(n_episodes):
    obs, _ = env.reset()
    done = False
    total_reward = 0
    success = 0

    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        total_reward += reward
        # Overwritten on every step, so the value kept is the one from the episode's last step.
        success = info.get("is_success", 0)

    successes.append(success)
    rewards.append(total_reward)

print(f"✅ Success rate: {np.mean(successes):.2f}")
print(f"✅ Recompensa promedio: {np.mean(rewards):.2f}")

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
✅ Success rate: 0.05
✅ Recompensa promedio: -46.35


: 

# 3 Entrenamiento con SAC + HER o DDPG + HER

In [4]:
import gymnasium as gym
import gymnasium_robotics  # robotics package that provides the Fetch environments
from stable_baselines3.common.env_util import make_vec_env

# Create the vectorized environment and keep a reference to it: the SAC + HER
# training cell passes `env` to the model, so it must not rely on a leftover
# environment from an earlier section.
env = make_vec_env("FetchPickAndPlace-v4", n_envs=1)
env
 

<stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv at 0x1d543edd240>

## Entrenamiento SAC + HER

In [8]:
import os

from stable_baselines3 import SAC
from stable_baselines3.her import HerReplayBuffer
from stable_baselines3.common.callbacks import CheckpointCallback

# One directory for both the periodic checkpoints and the final model.
save_dir = "./sac+her_models/"
os.makedirs(save_dir, exist_ok=True)

# Callback that saves a checkpoint every 15,000 steps.
checkpoint_callback = CheckpointCallback(
    save_freq=15000,              # every 15,000 steps
    save_path=save_dir,           # directory the checkpoints are written to
    name_prefix="sac_her_model",  # file-name prefix
)

# `env` is expected to exist already (created in an earlier cell).
sac_model = SAC(
    policy="MultiInputPolicy",
    env=env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        n_sampled_goal=4,
        goal_selection_strategy="future",
    ),
    verbose=1,
    tensorboard_log="./her_sac_tensorboard/",
)

# Train the model; the callback must be passed here, otherwise no checkpoint is written.
sac_model.learn(total_timesteps=30_000, callback=checkpoint_callback)

# Save the model that was just trained (`sac_model`, not the `model` left over
# from the earlier sections).
sac_model.save(os.path.join(save_dir, "sac_her_final"))

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./her_sac_tensorboard/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -50      |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 4        |
|    fps             | 61       |
|    time_elapsed    | 3        |
|    total_timesteps | 200      |
| train/             |          |
|    actor_loss      | -5.66    |
|    critic_loss     | 0.218    |
|    ent_coef        | 0.971    |
|    ent_coef_loss   | -0.198   |
|    learning_rate   | 0.0003   |
|    n_updates       | 99       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -50      |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 8        |
|    fps         

### Continuar entrenamiento

In [None]:
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback
import gymnasium as gym
import os

# Directory the SAC + HER training cell writes its final model to.
save_dir = "./sac+her_models/"

# NOTE(review): render_mode="human" draws the scene while training; presumably this
# slows the run down noticeably — consider dropping it for long runs.
env = gym.make("FetchPickAndPlace-v4", render_mode="human", reward_type="sparse")

# Load the SAC + HER model (not the plain SAC one stored under ./sac_models).
model = SAC.load(os.path.join(save_dir, "sac_her_final"), env=env)
# NOTE(review): only the saved model is restored here; no replay buffer is loaded.
# TODO confirm that HER can resume sampling from an empty buffer, or persist it
# with save_replay_buffer() / load_replay_buffer().

checkpoint_callback = CheckpointCallback(
    save_freq=15000,              # every 15,000 steps
    save_path=save_dir,           # directory the checkpoints are written to
    name_prefix="sac_her_model",  # file-name prefix
)

# Continue training.
model.learn(
    total_timesteps=60_000,       # or however many steps you want to add
    callback=checkpoint_callback,
    reset_num_timesteps=False,    # keep the timestep counter running instead of restarting it
)

# Use a HER-specific file name so the continued plain-SAC model
# ("sac_final_continuado") is not overwritten.
model.save(os.path.join(save_dir, "sac_her_final_continuado"))