# LeLamp with Gymnasium and MuJoCo

## Dependencies

In [1]:
%env MUJOCO_GL=egl

env: MUJOCO_GL=egl


In [2]:
import mujoco
import numpy as np

from gymnasium import utils
from gymnasium.envs.mujoco import MujocoEnv
from gymnasium.spaces import Box

In [19]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, BaseCallback
from stable_baselines3.common.monitor import Monitor
from gymnasium.wrappers import TimeLimit
import time

import mediapy as media

## Configurations

In [4]:
# Default camera settings for rendering.
# NOTE(review): nothing visible in this notebook passes this dict to the
# environment — confirm whether it is still needed.
DEFAULT_CAMERA_CONFIG = dict(trackbodyid=-1, distance=4.0)

## Environment

In [5]:
MODEL_PATH = "../models/lelamp/scene.xml"

class LeLampEnv(MujocoEnv, utils.EzPickle):
    """MuJoCo locomotion environment for the LeLamp model at ``MODEL_PATH``.

    The observation concatenates the joint positions (``qpos[7:]``), the
    joint velocities (``qvel[6:]``) and the first ``_N_SENSOR_VALUES`` sensor
    readings.  The reward is the weighted x velocity of the base body's
    subtree centre of mass plus a constant bonus while the lamp head height
    stays inside ``healthy_z_range``.

    Args:
        forward_reward_weight: Multiplier applied to the x velocity.
        healthy_reward: Bonus granted on every step the lamp is healthy.
        terminate_when_unhealthy: End the episode as soon as the lamp head
            leaves ``healthy_z_range``.
        healthy_z_range: ``(min_z, max_z)`` bounds for the lamp head height.
        reset_noise_scale: Half-width of the uniform noise applied to the
            initial state in ``reset_model``.
        **kwargs: Forwarded to ``MujocoEnv.__init__`` (e.g. ``render_mode``).

    Raises:
        ValueError: If a body needed by the reward is missing from the model.
    """

    metadata = {
        "render_modes": [
            "human",
            "rgb_array",
            "depth_array",
            "rgbd_tuple"
        ],
        "render_fps": 20,
    }

    # Body whose subtree centre of mass drives the forward reward.
    _BASE_BODY_NAME = "dc15_a01_dummy_assy_idle_asm"
    # Body whose height decides whether the lamp is healthy.
    _HEAD_BODY_NAME = "lamp_head"
    # Number of leading ``sensordata`` entries exposed in the observation.
    _N_SENSOR_VALUES = 6

    def __init__(
            self,
            forward_reward_weight=1.0,
            healthy_reward=5.0,
            terminate_when_unhealthy=True,
            healthy_z_range=(0.2, 1.0),
            reset_noise_scale=1e-2,
            **kwargs
    ):
        # Store reward parameters
        self._forward_reward_weight = forward_reward_weight
        self._healthy_reward = healthy_reward
        self._terminate_when_unhealthy = terminate_when_unhealthy
        self._healthy_z_range = healthy_z_range
        self._reset_noise_scale = reset_noise_scale

        # Initialize EzPickle with the constructor arguments
        utils.EzPickle.__init__(
            self,
            forward_reward_weight=forward_reward_weight,
            healthy_reward=healthy_reward,
            terminate_when_unhealthy=terminate_when_unhealthy,
            healthy_z_range=healthy_z_range,
            reset_noise_scale=reset_noise_scale,
            **kwargs
        )

        # Provisional observation space handed to MujocoEnv.__init__; it is
        # replaced below by one sized from the loaded model.
        self.observation_space = Box(
            low=-np.inf,
            high=np.inf,
            shape=(5 + 5 + self._N_SENSOR_VALUES,),
            dtype=np.float32,
        )

        # Initialize the Mujoco environment (frame_skip=5)
        MujocoEnv.__init__(
            self,
            MODEL_PATH,
            5,
            observation_space=self.observation_space,
            **kwargs
        )

        # Size the observation space from the model so that it always matches
        # what _get_obs() returns.
        self.observation_space = Box(
            low=-np.inf,
            high=np.inf,
            shape=(self._calculate_observation_size(),),
            dtype=np.float32,
        )

        # Resolve body IDs once: they do not change between steps, and a
        # missing name must fail loudly instead of yielding index -1.
        self._base_body_id = self._require_body_id(self._BASE_BODY_NAME)
        self._head_body_id = self._require_body_id(self._HEAD_BODY_NAME)

        # Previous COM position. NOTE(review): currently unused — step()
        # derives the velocity from the COM before/after the simulation call.
        self._prev_com = None

    def _require_body_id(self, name):
        """Return the MuJoCo body ID for ``name`` or raise ``ValueError``."""
        body_id = mujoco.mj_name2id(self.model, mujoco.mjtObj.mjOBJ_BODY, name)
        if body_id < 0:
            raise ValueError(
                f"Body {name!r} not found in model {MODEL_PATH!r}"
            )
        return body_id

    def _calculate_observation_size(self):
        """Return the length of the vector produced by ``_get_obs``."""
        # Mirrors the slices in _get_obs(): qpos[7:], qvel[6:], sensordata[:N].
        n_joints = self.model.nq - 7
        n_vel = self.model.nv - 6
        n_sensors = min(self.model.nsensordata, self._N_SENSOR_VALUES)

        return n_joints + n_vel + n_sensors

    def _get_obs(self):
        """Return the current observation as a float32 vector."""
        # Joint positions (excluding free joint if present)
        # You may need to adjust the slicing based on your model
        position = self.data.qpos[7:].copy()

        # Joint velocities (excluding free joint if present)
        velocities = self.data.qvel[6:].copy()

        # Leading sensor readings
        sensor_data = self.data.sensordata[:self._N_SENSOR_VALUES].copy()

        return np.concatenate([position, velocities, sensor_data]).astype(np.float32)

    def reset_model(self):
        """Reset the simulation to a noisy initial state and return the observation."""
        noise_low = -self._reset_noise_scale
        noise_high = self._reset_noise_scale

        # Initial joint positions plus uniform noise
        qpos = self.init_qpos + self.np_random.uniform(
            low=noise_low,
            high=noise_high,
            size=self.model.nq,
        )

        # Joint velocities drawn from the same uniform noise range
        qvel = self.np_random.uniform(
            low=noise_low,
            high=noise_high,
            size=self.model.nv,
        )

        # Set the state
        self.set_state(qpos, qvel)

        # Reset previous COM tracking
        self._prev_com = None

        return self._get_obs()

    def step(self, action):
        """Advance the simulation by one environment step.

        Args:
            action: Control vector passed to ``do_simulation``.

        Returns:
            ``(observation, reward, terminated, truncated, info)``;
            ``truncated`` is always ``False`` here.
        """
        # Centre of mass of the base subtree before and after simulating
        com_before = self.data.subtree_com[self._base_body_id].copy()
        self.do_simulation(action, self.frame_skip)
        com_after = self.data.subtree_com[self._base_body_id].copy()

        # Calculate velocity (COM change over time)
        velocity = (com_after - com_before) / self.dt

        # Forward reward: weighted velocity along x
        forward_reward = self._forward_reward_weight * velocity[0]

        # Healthy reward: lamp head height must stay inside the allowed range
        lamp_head_pos = self.data.xpos[self._head_body_id]
        min_z, max_z = self._healthy_z_range
        is_healthy = (min_z <= lamp_head_pos[2] <= max_z)
        healthy_reward = self._healthy_reward if is_healthy else 0.0

        # Calculate total reward
        reward = forward_reward + healthy_reward

        # Episode terminates only when configured to and the lamp is unhealthy
        terminated = bool(self._terminate_when_unhealthy and not is_healthy)

        # Get observation
        observation = self._get_obs()

        # Render if in human mode
        if self.render_mode == "human":
            self.render()

        # Create info dict with metrics
        info = {
            'forward_reward': forward_reward,
            'reward_linvel': forward_reward,
            'reward_alive': healthy_reward,
            'x_position': com_after[0],
            'y_position': com_after[1],
            'distance_from_origin': np.linalg.norm(com_after),
            'x_velocity': velocity[0],
            'y_velocity': velocity[1],
        }

        # Return (observation, reward, terminated, truncated, info)
        return observation, reward, terminated, False, info

## Test Environment

In [32]:
# Build the environment with a forward-reward weight of 10 and reset it once.
env = LeLampEnv(forward_reward_weight=10.0)

obs, info = env.reset()

# Last expression of the cell: displays the initial observation and info dict.
obs, info

(array([-0.00600509,  0.00430657,  0.00846351,  0.00714885,  0.00938395,
         0.00384329, -0.00017675,  0.0064778 ,  0.00616183,  0.00412362,
         0.05279579,  0.07466126,  0.1572197 , -0.00902396,  0.00803411,
         0.00136742], dtype=float32),
 {})

In [33]:
# Test single step: sample a random action from the action space, apply it,
# and print the reward, the termination flags and the info dict.
action = env.action_space.sample()
obs, reward, terminated, truncated, info = env.step(action)
print(f"✓ Step successful. Reward: {reward:.4f}")
print(f"  Terminated: {terminated}, Truncated: {truncated}")
print(f"  Info: {info}")

In [34]:
import mediapy as media

def render_env_video(max_steps=100, fps=20):
    """Roll out random actions and show the rendered frames as a video.

    Args:
        max_steps: Maximum number of environment steps to simulate.
        fps: Frame rate passed to ``media.show_video``.
    """
    env = LeLampEnv(render_mode="rgb_array")

    frames = []
    total_reward = 0.0

    # try/finally guarantees the env is closed even if stepping or rendering
    # raises part-way through the rollout.
    try:
        env.reset()

        for i in range(max_steps):
            action = env.action_space.sample()
            _, reward, terminated, truncated, _ = env.step(action)

            # Count every step's reward, whether or not a frame came back.
            total_reward += reward

            frame = env.render()
            if frame is not None:
                frames.append(frame)

            if terminated or truncated:
                print(f"Episode ended at step {i+1}")
                break
    finally:
        env.close()

    # Show video using media.show_video
    if frames:
        print(f"Collected {len(frames)} frames")
        print(f"Total reward: {total_reward:.3f}")
        media.show_video(frames, fps=fps)
    else:
        print("No frames collected!")

# Just run this in a Jupyter cell:
render_env_video()


0
This browser does not support the video tag.


# Train with PPO

In [6]:
# Create environment.
# Wrap the *same* instance in TimeLimit: building a second LeLampEnv() here
# would discard forward_reward_weight=10.0 and leave the first env unclosed.
env = LeLampEnv(forward_reward_weight=10.0)
env = TimeLimit(env, max_episode_steps=1000)

# Check environment is valid
check_env(env)
print("✓ Environment validation passed")

# Wrap environment for monitoring
env = Monitor(env, "./logs/")

✓ Environment validation passed




In [44]:
# Create model: PPO with the "MlpPolicy" policy, trained on `env`.
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    tensorboard_log="./tensorboard_logs/",  # training log below reports "Logging to ./tensorboard_logs/PPO_4"
    learning_rate=3e-4,
    n_steps=2048,  # training log below shows total_timesteps growing by 2048 per iteration
    batch_size=64,
    n_epochs=10,
    clip_range=0.2,
)

Using cuda device
Wrapping the env in a DummyVecEnv.


In [45]:
# Set up callbacks.
# The evaluation env mirrors the training env: same forward-reward weight and
# the same 1000-step TimeLimit. Without the time limit an episode only ends
# when the lamp becomes unhealthy, so evaluating a policy that stays healthy
# would never finish.
eval_env = Monitor(
    TimeLimit(
        LeLampEnv(forward_reward_weight=env.unwrapped._forward_reward_weight),
        max_episode_steps=1000,
    )
)
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./models/",
    log_path="./logs/",
    eval_freq=10000,
    deterministic=True,
    render=False,
)

In [46]:
# Periodically save intermediate models under ./checkpoints/ with the
# "lelamp_ppo" file-name prefix.
checkpoint_callback = CheckpointCallback(
    save_freq=50000,  # presumably counted in callback steps — confirm against the SB3 docs
    save_path="./checkpoints/",
    name_prefix="lelamp_ppo",
)

In [52]:
class DebugCallback(BaseCallback):
    """Print a heartbeat line at most once every ``print_interval`` seconds.

    Args:
        print_interval: Minimum number of seconds between two debug prints.
    """

    def __init__(self, print_interval: float = 5.0):
        super().__init__()
        self.print_interval = print_interval
        # Monotonic clock for the interval: unaffected by wall-clock changes.
        self.last_print = time.monotonic()

    def _on_step(self) -> bool:
        """Print the timestep and wall-clock time once the interval elapsed.

        Returns:
            Always ``True``.
        """
        now = time.monotonic()
        if now - self.last_print > self.print_interval:
            print(f"DEBUG: Timestep {self.num_timesteps}, "
                  f"Time: {time.time()}")
            self.last_print = now
        return True

# Add to your training:
debug_callback = DebugCallback()

In [53]:
# Train the model with the evaluation, checkpoint and debug callbacks attached.
model.learn(
    total_timesteps=1000000,  # 1M steps
    callback=[eval_callback, checkpoint_callback, debug_callback],
    progress_bar=True,
)

Logging to ./tensorboard_logs/PPO_4
DEBUG: Timestep 1, Time: 1754594234.1464956




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 12.7     |
|    ep_rew_mean     | 58.6     |
| time/              |          |
|    fps             | 1203     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 16.4        |
|    ep_rew_mean          | 76.6        |
| time/                   |             |
|    fps                  | 964         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011980383 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.2         |
|    entropy_loss         | -6.89       |
|    explained_variance   | 0.184       |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x72fe94bcadd0>

In [54]:
# Save the trained model; the test section reloads it with
# PPO.load("lelamp_ppo_final").
model.save("lelamp_ppo_final")

# Test Model

In [17]:
    
# Load model saved at the end of training
model = PPO.load("lelamp_ppo_final")

# Create environment: rgb_array rendering so frames can be collected, wrapped
# in a 1000-step TimeLimit.
env = LeLampEnv(forward_reward_weight=10.0, render_mode="rgb_array")
env = TimeLimit(env, max_episode_steps=1000)



In [10]:
# Roll out the trained policy for five episodes, logging a progress line
# every 100 steps and a summary line per episode.
for episode in range(5):
    obs, info = env.reset()
    total_reward, steps = 0, 0

    print(f"\nEpisode {episode + 1}:")

    # Hard cap of 1000 iterations per episode
    for step in range(1000):
        # Deterministic action from the trained policy
        action, _state = model.predict(obs, deterministic=True)

        # Advance the environment and update the running totals
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward = total_reward + reward
        steps = step + 1

        # Progress line on steps 0, 100, 200, ...
        if not step % 100:
            print(f"  Step {step}: reward={reward:.3f}, x_pos={info['x_position']:.3f}")

        if terminated or truncated:
            break

    print(f"  Episode finished: {steps} steps, total reward: {total_reward:.3f}")

env.close()


Episode 1:
  Step 0: reward=5.028, x_pos=-0.001
  Step 100: reward=5.045, x_pos=0.076
  Step 200: reward=5.004, x_pos=0.116
  Step 300: reward=5.043, x_pos=0.139
  Step 400: reward=4.998, x_pos=0.207
  Step 500: reward=4.860, x_pos=0.128
  Episode finished: 583 steps, total reward: 2912.536

Episode 2:
  Step 0: reward=4.995, x_pos=0.009
  Step 100: reward=5.023, x_pos=0.118
  Step 200: reward=4.963, x_pos=0.042
  Step 300: reward=5.020, x_pos=-0.100
  Step 400: reward=4.957, x_pos=-0.248
  Step 500: reward=4.989, x_pos=-0.330
  Step 600: reward=5.089, x_pos=-0.184
  Step 700: reward=5.026, x_pos=-0.068
  Step 800: reward=5.081, x_pos=0.006
  Step 900: reward=4.943, x_pos=-0.081
  Episode finished: 1000 steps, total reward: 4994.917

Episode 3:
  Step 0: reward=4.999, x_pos=0.001
  Episode finished: 72 steps, total reward: 355.537

Episode 4:
  Step 0: reward=5.000, x_pos=0.002
  Step 100: reward=4.997, x_pos=0.027
  Episode finished: 137 steps, total reward: 679.908

Episode 5:
  Ste

In [20]:
all_episodes = []

num_episodes = 5
fps = 20  # Frames per second for video rendering
max_steps = 1000  # Max steps per episode

# The preceding test cell ends with env.close(), so build a fresh rendering
# environment here rather than reusing one that may already be closed.
env = TimeLimit(
    LeLampEnv(forward_reward_weight=10.0, render_mode="rgb_array"),
    max_episode_steps=max_steps,
)

# try/finally so the environment is released even if a rollout raises.
try:
    for episode in range(num_episodes):
        print(f"\nTesting Episode {episode + 1}/{num_episodes}")

        # Reset environment
        obs, info = env.reset()

        frames = []
        rewards = []
        episode_reward = 0
        step_count = 0

        for step in range(max_steps):
            # Get action from trained model (deterministic for consistent behavior)
            action, _states = model.predict(obs, deterministic=True)

            # Take step in environment
            obs, reward, terminated, truncated, info = env.step(action)

            # Reward and step accounting must not depend on frame availability
            rewards.append(reward)
            episode_reward += reward
            step_count += 1

            # Collect the rendered frame, if any
            frame = env.render()
            if frame is not None:
                frames.append(frame)

            # Print progress occasionally
            if step % 50 == 0 and step > 0:
                print(f"  Step {step}: reward={reward:.3f}, "
                        f"x_pos={info.get('x_position', 0):.3f}, "
                        f"total_reward={episode_reward:.3f}")

            if terminated or truncated:
                reason = "terminated" if terminated else "truncated"
                print(f"  Episode ended ({reason}) after {step + 1} steps")
                break

        # Store episode data
        episode_data = {
            'frames': frames,
            'rewards': rewards,
            'total_reward': episode_reward,
            'steps': step_count,
            'final_x_pos': info.get('x_position', 0),
            'final_y_pos': info.get('y_position', 0),
        }
        all_episodes.append(episode_data)

        # Guard the average against an episode with zero steps
        average_reward = episode_reward / step_count if step_count else 0.0

        print(f"  Episode Summary:")
        print(f"    Steps: {step_count}")
        print(f"    Total Reward: {episode_reward:.3f}")
        print(f"    Average Reward: {average_reward:.3f}")
        print(f"    Final Position: ({episode_data['final_x_pos']:.3f}, {episode_data['final_y_pos']:.3f})")

        # Show video for this episode
        if frames:
            print(f"  Rendering video for Episode {episode + 1}...")
            media.show_video(frames, fps=fps)
        else:
            print("  No frames collected for this episode")
finally:
    env.close()


Testing Episode 1/5
  Step 50: reward=5.106, x_pos=0.016, total_reward=257.012
  Episode ended (terminated) after 89 steps
  Episode Summary:
    Steps: 89
    Total Reward: 441.524
    Average Reward: 4.961
    Final Position: (0.014, -0.003)
  Rendering video for Episode 1...


0
This browser does not support the video tag.



Testing Episode 2/5
  Step 50: reward=5.421, x_pos=0.049, total_reward=264.761
  Step 100: reward=5.212, x_pos=0.101, total_reward=525.239
  Step 150: reward=4.685, x_pos=0.143, total_reward=783.683
  Step 200: reward=5.021, x_pos=0.166, total_reward=1038.213
  Step 250: reward=4.222, x_pos=0.106, total_reward=1276.266
  Step 300: reward=5.231, x_pos=-0.006, total_reward=1503.803
  Step 350: reward=4.977, x_pos=0.006, total_reward=1756.264
  Step 400: reward=4.746, x_pos=-0.089, total_reward=1987.246
  Step 450: reward=4.762, x_pos=-0.135, total_reward=2227.956
  Step 500: reward=4.541, x_pos=-0.105, total_reward=2484.115
  Step 550: reward=4.770, x_pos=-0.102, total_reward=2734.622
  Step 600: reward=5.007, x_pos=-0.099, total_reward=2985.227
  Step 650: reward=5.384, x_pos=-0.056, total_reward=3243.877
  Step 700: reward=5.514, x_pos=0.035, total_reward=3511.958
  Step 750: reward=5.103, x_pos=0.119, total_reward=3778.879
  Step 800: reward=4.809, x_pos=0.167, total_reward=4038.384


0
This browser does not support the video tag.



Testing Episode 3/5
  Step 50: reward=4.911, x_pos=0.041, total_reward=264.467
  Step 100: reward=4.817, x_pos=0.061, total_reward=518.441
  Step 150: reward=4.889, x_pos=0.044, total_reward=765.011
  Step 200: reward=4.886, x_pos=0.051, total_reward=1016.547
  Step 250: reward=4.580, x_pos=0.059, total_reward=1268.104
  Step 300: reward=5.361, x_pos=0.080, total_reward=1522.207
  Step 350: reward=4.659, x_pos=0.009, total_reward=1758.086
  Step 400: reward=4.480, x_pos=-0.098, total_reward=1986.622
  Episode ended (terminated) after 404 steps
  Episode Summary:
    Steps: 404
    Total Reward: 1994.465
    Average Reward: 4.937
    Final Position: (-0.109, 0.384)
  Rendering video for Episode 3...


0
This browser does not support the video tag.



Testing Episode 4/5
  Step 50: reward=5.326, x_pos=0.039, total_reward=262.697
  Step 100: reward=5.251, x_pos=0.077, total_reward=520.219
  Episode ended (terminated) after 148 steps
  Episode Summary:
    Steps: 148
    Total Reward: 762.850
    Average Reward: 5.154
    Final Position: (0.140, 0.100)
  Rendering video for Episode 4...


0
This browser does not support the video tag.



Testing Episode 5/5
  Step 50: reward=5.235, x_pos=0.055, total_reward=265.631
  Step 100: reward=5.268, x_pos=0.119, total_reward=528.306
  Episode ended (terminated) after 135 steps
  Episode Summary:
    Steps: 135
    Total Reward: 692.855
    Average Reward: 5.132
    Final Position: (0.116, 0.173)
  Rendering video for Episode 5...


0
This browser does not support the video tag.
