# LeLamp with Gymnasium and MuJoCo

## Dependencies

In [1]:
# Select the EGL OpenGL backend for MuJoCo rendering.
# NOTE(review): presumably this must run before `mujoco` is imported (next cell)
# and is meant for headless/off-screen rendering — confirm for your machine.
%env MUJOCO_GL=egl

env: MUJOCO_GL=egl


In [2]:
import mujoco
import numpy as np

from gymnasium import utils
from gymnasium.envs.mujoco import MujocoEnv
from gymnasium.spaces import Box

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, BaseCallback
from stable_baselines3.common.monitor import Monitor
from gymnasium.wrappers import TimeLimit
import time

import mediapy as media

## Configurations

In [4]:
# Camera settings for rendering the scene.
# NOTE(review): nothing in the visible notebook reads this dict (LeLampEnv does
# not forward it to MujocoEnv) — confirm whether it should be passed as the
# environment's default camera configuration.
DEFAULT_CAMERA_CONFIG = dict(
    trackbodyid=-1,
    distance=4.0,
)

## Environment

In [43]:
# MuJoCo scene XML loaded by LeLampEnv. The path is relative, so it presumably
# resolves against the notebook's working directory — confirm before moving the file.
MODEL_PATH = "../models/lelamp/scene.xml"

class LeLampEnv(MujocoEnv, utils.EzPickle):
    """Gymnasium/MuJoCo environment for the LeLamp model loaded from ``MODEL_PATH``.

    Observation (float32 vector): ``qpos[7:]``, ``qvel[6:]`` and the first six
    entries of ``sensordata``, concatenated in that order.

    The per-step reward is the sum of five terms:

    * forward: ``forward_reward_weight`` times the x-velocity of the lamp body's
      subtree centre of mass over the step,
    * healthy: ``healthy_reward`` while the ``lamp_head`` body's z position lies
      inside ``healthy_z_range`` (0 otherwise),
    * jump: ``jump_reward_weight`` times the lamp body's height above a small
      offset (never negative),
    * in-air: ``in_air_reward_weight`` when the simulation reports no contacts,
    * energy: minus ``energy_consumption_weight`` times the sum of squared
      actuator forces times ``dt``.

    ``step`` never sets ``truncated``; wrap the environment in ``TimeLimit`` to
    bound the episode length.

    Args:
        forward_reward_weight: Weight of the forward-velocity term.
        jump_reward_weight: Weight of the body-height term.
        in_air_reward_weight: Reward granted on steps with zero contacts.
        healthy_reward: Reward granted on steps where the lamp head is healthy.
        energy_consumption_weight: Weight of the actuator-energy penalty.
        terminate_when_unhealthy: End the episode when the lamp head leaves
            ``healthy_z_range``.
        healthy_z_range: ``(min_z, max_z)`` bounds for the lamp head height.
        reset_noise_scale: Half-width of the uniform noise added on reset.
        **kwargs: Forwarded to ``MujocoEnv.__init__`` (e.g. ``render_mode``).

    Raises:
        ValueError: If a body required by the reward is missing from the model,
            or the model's state layout does not match the declared
            observation size.
    """

    metadata = {
        "render_modes": [
            "human",
            "rgb_array",
            "depth_array",
            "rgbd_tuple"
        ],
        "render_fps": 20,
    }

    # Body names, as spelled in the MuJoCo model, that the reward terms read.
    _LAMP_BODY_NAME = "dc15_a01_dummy_assy_idle_asm"
    _LAMP_HEAD_NAME = "lamp_head"

    # Number of leading sensordata entries included in the observation.
    _N_SENSOR_VALUES = 6

    # Height subtracted from the lamp body's z position before the jump reward.
    # NOTE(review): presumably the body's resting height — confirm against the model.
    _JUMP_HEIGHT_OFFSET = 0.02

    def __init__(
            self,
            forward_reward_weight=1.0,
            jump_reward_weight=1.0,
            in_air_reward_weight=1.0,
            healthy_reward=1.0,
            energy_consumption_weight=1.0,
            terminate_when_unhealthy=True,
            healthy_z_range=(0.1, 1.0),
            reset_noise_scale=1e-2,
            **kwargs
    ):
        # Store reward parameters
        self._forward_reward_weight = forward_reward_weight
        self._jump_reward_weight = jump_reward_weight
        self._in_air_reward_weight = in_air_reward_weight
        self._energy_consumption_weight = energy_consumption_weight
        self._healthy_reward = healthy_reward
        self._terminate_when_unhealthy = terminate_when_unhealthy
        self._healthy_z_range = healthy_z_range
        self._reset_noise_scale = reset_noise_scale

        # Record EVERY constructor argument: EzPickle rebuilds the environment
        # from these values, so an omitted one would silently fall back to its
        # default after pickling or copying.
        utils.EzPickle.__init__(
            self,
            forward_reward_weight=forward_reward_weight,
            jump_reward_weight=jump_reward_weight,
            in_air_reward_weight=in_air_reward_weight,
            healthy_reward=healthy_reward,
            energy_consumption_weight=energy_consumption_weight,
            terminate_when_unhealthy=terminate_when_unhealthy,
            healthy_z_range=healthy_z_range,
            reset_noise_scale=reset_noise_scale,
            **kwargs
        )

        # The observation space must exist before the model is loaded:
        # 5 joint positions + 5 joint velocities + 6 sensor values.
        obs_size = 5 + 5 + self._N_SENSOR_VALUES
        self.observation_space = Box(
            low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float32
        )

        # Initialize the MuJoCo environment (frame_skip=5)
        MujocoEnv.__init__(
            self,
            MODEL_PATH,
            5,
            observation_space=self.observation_space,
            **kwargs
        )

        # Now that the model is loaded, make sure the declared size matches
        # what _get_obs() will actually produce.
        model_obs_size = self._calculate_observation_size()
        if model_obs_size != obs_size:
            raise ValueError(
                f"Observation size mismatch: space declares {obs_size} values "
                f"but the model yields {model_obs_size}"
            )

        # Resolve body ids once instead of doing a name lookup on every step.
        self._lamp_body_id = self._lookup_body_id(self._LAMP_BODY_NAME)
        self._lamp_head_id = self._lookup_body_id(self._LAMP_HEAD_NAME)

    def _lookup_body_id(self, name):
        """Return the id of body ``name``; raise ``ValueError`` if it is missing."""
        body_id = mujoco.mj_name2id(self.model, mujoco.mjtObj.mjOBJ_BODY, name)
        # mj_name2id signals "not found" with -1, which would otherwise be
        # accepted as a valid (last-element) index by the data arrays.
        if body_id < 0:
            raise ValueError(f"Body {name!r} not found in model {MODEL_PATH!r}")
        return body_id

    def _calculate_observation_size(self):
        """Return the length of the vector built by ``_get_obs`` for the loaded model."""
        n_joints = self.model.nq - 7
        n_vel = self.model.nv - 6
        # _get_obs takes at most the first _N_SENSOR_VALUES sensordata entries.
        n_sensors = min(self.model.nsensordata, self._N_SENSOR_VALUES)

        return n_joints + n_vel + n_sensors

    def _get_obs(self):
        """Return the current observation as a float32 vector."""
        # Positions after the first 7 qpos entries
        # (assumes the model starts with a free joint — confirm if the model changes).
        position = self.data.qpos[7:].copy()

        # Velocities after the first 6 qvel entries (same free-joint assumption).
        velocities = self.data.qvel[6:].copy()

        # Leading sensor readings
        sensor_data = self.data.sensordata[:self._N_SENSOR_VALUES].copy()

        return np.concatenate([position, velocities, sensor_data]).astype(np.float32)

    def reset_model(self):
        """Reset to the initial pose plus uniform noise and return the observation."""
        noise_low = -self._reset_noise_scale
        noise_high = self._reset_noise_scale

        # Perturb the initial positions
        qpos = self.init_qpos + self.np_random.uniform(
            low=noise_low,
            high=noise_high,
            size=self.model.nq,
        )

        # Velocities are pure noise (not offset by the initial velocities)
        qvel = self.np_random.uniform(
            low=noise_low,
            high=noise_high,
            size=self.model.nv,
        )

        self.set_state(qpos, qvel)

        return self._get_obs()

    def step(self, action):
        """Advance the simulation by one environment step.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``truncated`` is always ``False`` and ``info`` carries the
            individual reward terms and diagnostic values.
        """
        # ------ Forward reward ------
        body_id = self._lamp_body_id

        # Centre of mass before and after applying the action
        com_before = self.data.subtree_com[body_id].copy()
        self.do_simulation(action, self.frame_skip)
        com_after = self.data.subtree_com[body_id].copy()

        # Velocity as COM displacement over the step duration
        velocity = (com_after - com_before) / self.dt

        forward_reward = self._forward_reward_weight * velocity[0]

        # ------ Healthy reward ------
        lamp_head_z = self.data.xpos[self._lamp_head_id][2]

        min_z, max_z = self._healthy_z_range
        is_healthy = bool(min_z <= lamp_head_z <= max_z)
        healthy_reward = self._healthy_reward if is_healthy else 0.0

        # ------ Jump reward ------
        body_pos = self.data.xpos[body_id]
        body_levitation = body_pos[2] - self._JUMP_HEIGHT_OFFSET
        jump_reward = max(0.0, body_levitation) * self._jump_reward_weight

        # ------ Flight reward ------
        # "In the air" means the simulation currently reports no contacts at all.
        is_in_air = bool(self.data.ncon == 0)
        in_air_reward = self._in_air_reward_weight * float(is_in_air)

        # ------ Energy consumption ------
        tau = self.data.actuator_force.copy()
        energy_consumption = np.sum(np.square(tau)) * self.dt
        energy_reward = -self._energy_consumption_weight * energy_consumption

        # Total reward
        reward = forward_reward + healthy_reward + jump_reward + in_air_reward + energy_reward

        # Terminate only when configured to and the lamp head left the healthy range
        terminated = bool(self._terminate_when_unhealthy and not is_healthy)

        observation = self._get_obs()

        # Render if in human mode
        if self.render_mode == "human":
            self.render()

        # Info dict with metrics
        info = {
            # Position & velocity
            'x_position': com_after[0],
            'y_position': com_after[1],
            'distance_from_origin': np.linalg.norm(com_after),
            'x_velocity': velocity[0],
            'y_velocity': velocity[1],
            'z_height': body_pos[2],

            # Reward components
            'forward_reward': forward_reward,
            'healthy_reward': healthy_reward,
            'jump_reward': jump_reward,
            'in_air_reward': in_air_reward,
            'energy_penalty': energy_reward,
            'total_reward': reward,

            # Energy details
            'energy_consumption': energy_consumption,
            'actuator_torques': tau,

            # State flags
            'is_healthy': is_healthy,
            'is_in_air': is_in_air
        }

        # Return (observation, reward, terminated, truncated, info)
        return observation, reward, terminated, False, info

## Test Environment

In [44]:
# Smoke test: build the environment with a larger forward-reward weight and reset it.
env = LeLampEnv(forward_reward_weight=10.0)

obs, info = env.reset()

# Last expression of the cell: display the initial observation and the reset info dict.
obs, info

(array([-0.00375568,  0.0012125 ,  0.00511799, -0.00219549, -0.0070742 ,
         0.00339591,  0.00913928, -0.00369656, -0.00604676, -0.00701995,
         0.0162988 ,  0.03310462,  0.0450488 , -0.00898036, -0.00532504,
         0.00478129], dtype=float32),
 {})

In [45]:
# Test single step: apply one randomly sampled action to the environment created
# in the previous cell and print the reward, the episode flags and the info dict.
action = env.action_space.sample()
obs, reward, terminated, truncated, info = env.step(action)
print(f"✓ Step successful. Reward: {reward:.4f}")
print(f"  Terminated: {terminated}, Truncated: {truncated}")
print(f"  Info: {info}")

✓ Step successful. Reward: -2.0156
  Terminated: False, Truncated: False
  Info: {'x_position': np.float64(0.008239513788759898), 'y_position': np.float64(-0.005137557805081651), 'distance_from_origin': np.float64(0.10662327554263651), 'x_velocity': np.float64(-0.020997256930089525), 'y_velocity': np.float64(-0.0180336192922558), 'z_height': np.float64(-0.0001127413154814723), 'forward_reward': np.float64(-0.20997256930089525), 'healthy_reward': 1.0, 'jump_reward': 0.0, 'in_air_reward': 0.0, 'energy_penalty': np.float64(-2.805625), 'total_reward': np.float64(-2.0155975693008954), 'energy_consumption': np.float64(2.805625), 'actuator_torques': array([-3.35,  3.35,  3.35,  3.35,  3.35]), 'is_healthy': np.True_, 'is_in_air': False}


In [46]:
import mediapy as media

def render_env_video(max_steps=100, fps=20):
    """Roll out random actions in a fresh LeLampEnv and show the frames as a video.

    Args:
        max_steps: Maximum number of environment steps to simulate.
        fps: Frame rate passed to ``media.show_video``.

    The environment is always closed, even if stepping or rendering raises.
    """
    env = LeLampEnv(render_mode="rgb_array")

    frames = []
    rewards = []

    try:
        env.reset()

        # Collect frames
        for i in range(max_steps):
            action = env.action_space.sample()
            _, reward, terminated, truncated, _ = env.step(action)

            # Keep the frame (and its reward) only when the renderer produced one
            frame = env.render()
            if frame is not None:
                frames.append(frame)
                rewards.append(reward)

            if terminated or truncated:
                print(f"Episode ended at step {i+1}")
                break
    finally:
        # Release the renderer even when the rollout fails part-way.
        env.close()

    # Show video using media.show_video
    if frames:
        print(f"Collected {len(frames)} frames")
        print(f"Total reward: {sum(rewards):.3f}")
        media.show_video(frames, fps=fps)
    else:
        print("No frames collected!")

# Just run this in a Jupyter cell:
render_env_video()


Episode ended at step 22
Collected 22 frames
Total reward: -36.918


0
This browser does not support the video tag.


# Train with PPO

In [61]:
# Training environment: LeLampEnv with the reward weights used for this run
# (energy penalty switched off), capped at 1000 steps per episode.
env = TimeLimit(
    LeLampEnv(
        forward_reward_weight=2.5,
        jump_reward_weight=5.0,
        in_air_reward_weight=3.0,
        healthy_reward=0.8,
        energy_consumption_weight=0,
    ),
    max_episode_steps=1000,
)

# Run the Stable-Baselines3 environment checker before training on it
check_env(env)
print("✓ Environment validation passed")

# Record episode statistics under ./logs/
env = Monitor(env, "./logs/")

✓ Environment validation passed


In [62]:
# Create model: PPO with an MLP policy on the monitored training environment.
# TensorBoard event files are written under ./tensorboard_logs/; the remaining
# keyword arguments are the PPO hyperparameters chosen for this run.
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    tensorboard_log="./tensorboard_logs/",
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    clip_range=0.2,
)

Using cuda device
Wrapping the env in a DummyVecEnv.


In [63]:
# Set up callbacks
# The evaluation environment mirrors the training environment:
#  * same reward weights, so "best model" is chosen on the reward being optimised;
#  * same 1000-step TimeLimit — LeLampEnv.step never truncates by itself, so
#    without it an evaluation episode only ends when the lamp becomes unhealthy
#    and a policy that stays healthy would keep evaluation running indefinitely.
eval_env = LeLampEnv(
    forward_reward_weight=2.5,
    healthy_reward=0.8,
    energy_consumption_weight=0,
    jump_reward_weight=5.0,
    in_air_reward_weight=3.0,
)
eval_env = Monitor(TimeLimit(eval_env, max_episode_steps=1000))
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./models/",
    log_path="./logs/",
    eval_freq=10000,
    deterministic=True,
    render=False,
)

In [64]:
# Periodically save the model under ./checkpoints/ with the given file-name prefix.
# NOTE(review): `save_freq` is presumably counted in callback steps rather than
# wall-clock time — confirm against the Stable-Baselines3 documentation.
checkpoint_callback = CheckpointCallback(
    save_freq=50000,
    save_path="./checkpoints/",
    name_prefix="lelamp_v2_ppo",
)

In [65]:
class DebugCallback(BaseCallback):
    """Training callback that prints the timestep counter at a throttled rate.

    Args:
        print_interval: Minimum number of wall-clock seconds between two prints.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, print_interval=5.0, verbose=0):
        super().__init__(verbose)
        self.print_interval = print_interval
        self.last_print = time.time()

    def _on_training_start(self) -> None:
        # The callback is usually constructed well before learn() is called;
        # restart the throttle window here so the first message is not
        # emitted on the very first step.
        self.last_print = time.time()

    def _on_step(self) -> bool:
        """Print progress if the interval has elapsed; always continue training."""
        current_time = time.time()
        if current_time - self.last_print > self.print_interval:
            print(f"DEBUG: Timestep {self.num_timesteps}, "
                  f"Time: {current_time}")
            self.last_print = current_time
        return True

# Add to your training:
debug_callback = DebugCallback()

In [68]:
# Train the model with evaluation, checkpointing and debug-print callbacks attached
model.learn(
    total_timesteps=200000,  # 200k steps
    callback=[eval_callback, checkpoint_callback, debug_callback],
    progress_bar=True,
)

KeyboardInterrupt: 

In [52]:
# Persist the trained policy; the "Test Model" section reloads it from this path.
model.save("lelamp_ppo_v4")

# Test Model

In [53]:
    
# Reload the saved policy from disk
model = PPO.load("lelamp_ppo_v4")

# Evaluation environment with off-screen rendering, capped at 1000 steps per episode.
# NOTE(review): energy_consumption_weight is 0.2 here but 0 in the training cell,
# so rewards reported below are not directly comparable to training rewards —
# confirm this is intended.
env = TimeLimit(
    LeLampEnv(
        render_mode="rgb_array",
        forward_reward_weight=2.5,
        jump_reward_weight=5.0,
        in_air_reward_weight=3.0,
        healthy_reward=0.8,
        energy_consumption_weight=0.2,
    ),
    max_episode_steps=1000,
)

In [54]:
# Test for multiple episodes
for episode in range(5):
    obs, info = env.reset()
    total_reward = 0
    steps = 0

    print(f"\nEpisode {episode + 1}:")

    for step in range(1000):  # Max 1000 steps per episode
        # Get action from trained policy
        action, _state = model.predict(obs, deterministic=True)

        # Step environment
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        steps += 1

        # Print progress
        if step % 100 == 0:
            print(f"  Step {step}: reward={reward:.3f}, x_pos={info['x_position']:.3f}")

        if terminated or truncated:
            break

    print(f"  Episode finished: {steps} steps, total reward: {total_reward:.3f}")

# `env` is deliberately NOT closed here: the following cell keeps resetting,
# stepping and rendering the same environment. Close it after the last rollout.


Episode 1:
ERROR! Session/line number was not unique in database. History logging moved to new session 45
  Step 0: reward=0.466, x_pos=0.003
  Episode finished: 100 steps, total reward: 146.923

Episode 2:
  Step 0: reward=0.415, x_pos=0.001
  Episode finished: 96 steps, total reward: 146.524

Episode 3:
  Step 0: reward=0.474, x_pos=-0.007
  Step 100: reward=3.096, x_pos=-0.141
  Episode finished: 102 steps, total reward: 100.769

Episode 4:
  Step 0: reward=0.476, x_pos=-0.010
  Episode finished: 96 steps, total reward: 155.801

Episode 5:
  Step 0: reward=0.454, x_pos=-0.001
  Step 100: reward=0.546, x_pos=-0.146
  Episode finished: 104 steps, total reward: 154.992


In [55]:
all_episodes = []

num_episodes = 5
fps = 20  # Frames per second for video rendering
max_steps = 1000  # Max steps per episode

for episode in range(num_episodes):
    print(f"\nTesting Episode {episode + 1}/{num_episodes}")

    # Reset environment
    obs, info = env.reset()

    frames = []
    rewards = []
    episode_reward = 0
    step_count = 0

    for step in range(max_steps):
        # Get action from trained model (deterministic for consistent behavior)
        action, _states = model.predict(obs, deterministic=True)

        # Take step in environment
        obs, reward, terminated, truncated, info = env.step(action)

        # Reward/step accounting must not depend on whether a frame was
        # rendered, otherwise steps without a frame silently vanish from the totals.
        rewards.append(reward)
        episode_reward += reward
        step_count += 1

        # Collect the frame when the renderer produced one
        frame = env.render()
        if frame is not None:
            frames.append(frame)

        # Print progress occasionally
        if step % 50 == 0 and step > 0:
            print(f"  Step {step}: reward={reward:.3f}, "
                    f"x_pos={info.get('x_position', 0):.3f}, "
                    f"total_reward={episode_reward:.3f}")

        if terminated or truncated:
            reason = "terminated" if terminated else "truncated"
            print(f"  Episode ended ({reason}) after {step + 1} steps")
            break

    # Store episode data
    episode_data = {
        'frames': frames,
        'rewards': rewards,
        'total_reward': episode_reward,
        'steps': step_count,
        'final_x_pos': info.get('x_position', 0),
        'final_y_pos': info.get('y_position', 0),
    }
    all_episodes.append(episode_data)

    # Guard the average against an episode in which no step was taken
    average_reward = episode_reward / step_count if step_count else 0.0

    print(f"  Episode Summary:")
    print(f"    Steps: {step_count}")
    print(f"    Total Reward: {episode_reward:.3f}")
    print(f"    Average Reward: {average_reward:.3f}")
    print(f"    Final Position: ({episode_data['final_x_pos']:.3f}, {episode_data['final_y_pos']:.3f})")

    # Show video for this episode
    if frames:
        print(f"  Rendering video for Episode {episode + 1}...")
        media.show_video(frames, fps=fps)
    else:
        print("  No frames collected for this episode")


Testing Episode 1/5
  Step 50: reward=0.173, x_pos=-0.072, total_reward=77.373
  Episode ended (terminated) after 92 steps
  Episode Summary:
    Steps: 92
    Total Reward: 96.643
    Average Reward: 1.050
    Final Position: (-0.141, 0.196)
  Rendering video for Episode 1...


0
This browser does not support the video tag.



Testing Episode 2/5
  Step 50: reward=3.064, x_pos=-0.104, total_reward=84.612
  Episode ended (terminated) after 96 steps
  Episode Summary:
    Steps: 96
    Total Reward: 153.576
    Average Reward: 1.600
    Final Position: (-0.044, -0.030)
  Rendering video for Episode 2...


0
This browser does not support the video tag.



Testing Episode 3/5
  Step 50: reward=0.427, x_pos=-0.050, total_reward=69.278
  Episode ended (terminated) after 88 steps
  Episode Summary:
    Steps: 88
    Total Reward: 79.821
    Average Reward: 0.907
    Final Position: (-0.083, 0.146)
  Rendering video for Episode 3...


0
This browser does not support the video tag.



Testing Episode 4/5
  Step 50: reward=3.129, x_pos=-0.112, total_reward=87.185
  Episode ended (terminated) after 98 steps
  Episode Summary:
    Steps: 98
    Total Reward: 150.922
    Average Reward: 1.540
    Final Position: (-0.053, -0.067)
  Rendering video for Episode 4...


0
This browser does not support the video tag.



Testing Episode 5/5
  Step 50: reward=3.124, x_pos=-0.086, total_reward=88.579
  Step 100: reward=3.518, x_pos=-0.049, total_reward=156.039
  Episode ended (terminated) after 102 steps
  Episode Summary:
    Steps: 102
    Total Reward: 155.662
    Average Reward: 1.526
    Final Position: (-0.048, -0.020)
  Rendering video for Episode 5...


0
This browser does not support the video tag.
