In [None]:
import os
import cv2
import numpy as np
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
import torch

# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------

# PPO settings, forwarded to the PPO constructor in train().
GAMMA = 0.99          # -> PPO(gamma=...)
LAMBDA = 0.95         # -> PPO(gae_lambda=...)
CLIP_EPSILON = 0.2    # -> PPO(clip_range=...)
ACTOR_LR = 3e-4       # -> PPO(learning_rate=...)
CRITIC_LR = 1e-3      # Not directly used; Stable Baselines3 uses a single learning rate
EPOCHS = 10           # -> PPO(n_epochs=...)
BATCH_SIZE = 512      # -> PPO(batch_size=...)
ENTROPY_COEF = 0.01   # -> PPO(ent_coef=...)

# Environment settings.
CONTROL_PENALTY = 0.01      # coefficient used by ControlPenaltyWrapper
MAX_EPISODE_STEPS = 1_000   # episode time limit; also the PPO rollout length
ENV_NAME = "HumanoidStandup-v4"

# Training budget and early-stopping criteria used by train().
TOTAL_TIMESTEPS = 10_000_000
REWARD_THRESHOLD = 45_000    # 10-episode average reward that counts as "good enough"
THRESHOLD_EPISODES = 3       # consecutive episodes the threshold must hold
PLATEAU_WINDOW = 50          # number of recent averages compared for plateau detection
PLATEAU_IMPROVEMENT = 0.001  # minimum relative improvement to keep training

# Defaults for evaluate().
EVAL_EPISODES = 5
EVAL_MAX_STEPS = 2_000
EVAL_SAVE_VIDEO = True

class RunningStat:
    """Running per-feature mean and standard deviation over batches of samples.

    Each call to :meth:`update` merges one batch (samples along axis 0) into
    the accumulated statistics using the pairwise mean/variance merge, so the
    result matches the population statistics of all samples seen so far.

    Attributes:
        mean: Running mean, array of the given ``shape``.
        std: Running population standard deviation, array of ``shape``.
            Starts at one so that ``normalize`` is usable before any update.
        count: Number of samples merged so far.
    """

    # Lower bound on the divisor in ``normalize`` so constant features
    # (std == 0) do not produce a division by zero.
    _MIN_STD = 1e-6

    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float32)
        self.std = np.ones(shape, dtype=np.float32)
        self.count = 0

    def update(self, x):
        """Merge a batch of samples into the running statistics.

        Args:
            x: Array-like of shape ``(batch, *shape)``. An empty batch is a
                no-op.
        """
        x = np.asarray(x)
        batch_count = x.shape[0]
        if batch_count == 0:
            return

        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)

        prev_count = self.count
        total_count = prev_count + batch_count
        delta = batch_mean - self.mean

        # Sums of squared deviations of the old data and of the new batch.
        m_a = np.square(self.std) * prev_count
        m_b = batch_var * batch_count
        # The correction for the shift between the two means is weighted by
        # the *previous* count; weighting it by the new total overstates it.
        cross = np.square(delta) * prev_count * batch_count / total_count

        self.mean += delta * batch_count / total_count
        self.std = np.sqrt((m_a + m_b + cross) / total_count)
        self.count = total_count

    def normalize(self, x):
        """Standardise ``x`` with the running statistics and clip to [-5, 5]."""
        normalized = (x - self.mean) / np.maximum(self.std, self._MIN_STD)
        return np.clip(normalized, -5, 5)

class NormalizeObservationWrapper(gym.ObservationWrapper):
    """Observation wrapper that normalises observations with running statistics.

    A ``RunningStat`` sized to the wrapped env's observation shape is updated
    with every observation that passes through, and the observation is then
    normalised with the updated statistics.
    """

    def __init__(self, env):
        super().__init__(env)
        obs_shape = env.observation_space.shape
        self.running_stat = RunningStat(obs_shape)

    def observation(self, obs):
        """Fold ``obs`` into the running statistics and return it normalised."""
        stats = self.running_stat
        # The statistics expect a batch axis, so wrap the single observation.
        single_sample_batch = np.array([obs])
        stats.update(single_sample_batch)
        return stats.normalize(obs)

class ControlPenaltyWrapper(gym.Wrapper):
    """Reward wrapper that subtracts a quadratic cost on the action.

    The shaped reward is ``reward - penalty_coeff * sum(action ** 2)``; all
    other values returned by the wrapped env's ``step`` pass through unchanged.
    """

    def __init__(self, env, penalty_coeff=CONTROL_PENALTY):
        super().__init__(env)
        self.penalty_coeff = penalty_coeff

    def step(self, action):
        """Step the wrapped env and charge the control cost against the reward."""
        step_result = self.env.step(action)
        obs, reward, terminated, truncated, info = step_result
        control_cost = self.penalty_coeff * np.sum(action ** 2)
        reward -= control_cost
        return obs, reward, terminated, truncated, info

def _torso_height(env, obs):
    """Return the torso z-coordinate for the current step.

    The value is read from the simulator state (``env.unwrapped.data.qpos[2]``)
    when the environment exposes it, because the observation handed to the
    policy may have been rescaled by an observation wrapper and can then no
    longer be compared with a threshold in simulator units.

    Falls back to ``obs[0]``: according to the Gymnasium documentation for the
    MuJoCo humanoid tasks, the observation drops the root x/y position, so the
    torso z-coordinate is element 0 (not element 2). The fallback is only
    meaningful when ``obs`` is not normalised.
    """
    sim_data = getattr(getattr(env, "unwrapped", None), "data", None)
    qpos = getattr(sim_data, "qpos", None)
    if qpos is not None and len(qpos) > 2:
        return float(qpos[2])
    return float(obs[0])


def evaluate(model, env, episodes=EVAL_EPISODES, max_steps=EVAL_MAX_STEPS, save_video=EVAL_SAVE_VIDEO):
    """Evaluate the policy and check if the humanoid stands.

    Args:
        model: Policy object whose ``predict(obs, deterministic=...)`` returns
            ``(action, state)``.
        env: Gymnasium-style environment, i.e. ``reset()`` returns
            ``(obs, info)`` and ``step()`` returns
            ``(obs, reward, terminated, truncated, info)``. A Stable-Baselines3
            ``VecEnv`` does not follow this API and is not supported here.
        episodes: Number of evaluation episodes; must be positive.
        max_steps: Upper bound on steps per episode.
        save_video: When true, each episode is written to
            ``evaluation_videos/eval_episode_<n>.mp4`` if ``env.render()``
            returns frames.

    Returns:
        Tuple ``(avg_reward, standing_ratio)``: the mean episode reward and
        the fraction of episodes whose maximum torso height exceeded the
        standing threshold.

    Raises:
        ValueError: If ``episodes`` is not positive.
    """
    if episodes <= 0:
        raise ValueError("episodes must be a positive integer")

    print("\nStarting evaluation...")
    total_reward = 0
    standing_episodes = 0
    height_threshold = 1.1  # Approximate height for standing (z-position)

    video_folder = "evaluation_videos"
    if save_video:
        os.makedirs(video_folder, exist_ok=True)

    for ep in range(episodes):
        obs, _ = env.reset()
        episode_reward = 0
        steps = 0
        done = False
        max_height = 0
        # Reset per episode so a writer released in an earlier episode is
        # never written to again when rendering fails later on.
        video_writer = None

        if save_video:
            video_path = f"{video_folder}/eval_episode_{ep + 1}.mp4"
            frame = env.render()
            if frame is not None:
                video_writer = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*'mp4v'), 30,
                                               (frame.shape[1], frame.shape[0]))
                # Include the initial frame that was rendered to size the video.
                video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
            else:
                print("Warning: Render returned None, skipping video")

        try:
            while not done and steps < max_steps:
                # Actions are sampled (deterministic=False), so repeated
                # evaluations of the same model can differ.
                action, _ = model.predict(obs, deterministic=False)
                obs, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                episode_reward += reward
                steps += 1

                max_height = max(max_height, _torso_height(env, obs))

                if video_writer is not None:
                    frame = env.render()
                    if frame is not None:
                        # The env renders RGB; OpenCV expects BGR.
                        video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        finally:
            # Always finalise the file, even if the rollout raised.
            if video_writer is not None:
                video_writer.release()

        total_reward += episode_reward
        if max_height > height_threshold:
            standing_episodes += 1

        print(f"Evaluation Episode {ep + 1}: Reward: {episode_reward:.2f}, Max Height: {max_height:.2f}")

    avg_reward = total_reward / episodes
    standing_ratio = standing_episodes / episodes
    print(f"Average Evaluation Reward: {avg_reward:.2f}")
    print(f"Standing Success Rate: {standing_ratio:.2f} ({standing_episodes}/{episodes} episodes)")
    return avg_reward, standing_ratio

def _plateau_improvement(avg_history, window):
    """Return the relative gain of the newest ``window`` averages over the 10 before them.

    Returns ``None`` until ``window + 10`` averages are available, so the
    caller never averages an empty slice. Returns 0 when the older average is
    exactly zero, because a relative change is undefined there.
    """
    if len(avg_history) < window + 10:
        return None
    old_avg = np.mean(avg_history[-window - 10:-window])
    new_avg = np.mean(avg_history[-window:])
    if old_avg == 0:
        return 0
    # Divide by the magnitude so an increase stays positive even when the
    # older average is negative.
    return (new_avg - old_avg) / abs(old_avg)


def train():
    """Train PPO on ``ENV_NAME`` with early stopping, then save and evaluate.

    Training runs one rollout of ``MAX_EPISODE_STEPS`` steps at a time. After
    each rollout, the episodes the training env has completed are examined
    and training stops early when either

    * the 10-episode average reward has been at least ``REWARD_THRESHOLD``
      for ``THRESHOLD_EPISODES`` consecutive episodes, or
    * the average reward improved by less than ``PLATEAU_IMPROVEMENT``
      (relative) over the last ``PLATEAU_WINDOW`` episodes.

    The model is written to ``ppo_humanoid_standup_mujoco`` and evaluated
    once at the end. The environment is closed even if training fails.
    """
    # Imported here so the block carries its own dependency; Monitor records
    # the return of every episode completed in the wrapped env.
    from stable_baselines3.common.monitor import Monitor

    # Create environment with normalization and control penalty
    base_env = gym.make(ENV_NAME, render_mode="rgb_array", max_episode_steps=MAX_EPISODE_STEPS)
    base_env = NormalizeObservationWrapper(base_env)
    base_env = ControlPenaltyWrapper(base_env, penalty_coeff=CONTROL_PENALTY)
    monitored_env = Monitor(base_env)
    # The lambda captures a name that is never rebound, so it keeps returning
    # the Gymnasium env rather than the vectorised wrapper built around it.
    vec_env = DummyVecEnv([lambda: monitored_env])  # Wrap in DummyVecEnv for Stable Baselines3

    try:
        # Initialize PPO model
        # NOTE(review): n_steps (1000) is not a multiple of batch_size (512);
        # Stable Baselines3 is expected to warn about a truncated mini-batch.
        model = PPO(
            policy="MlpPolicy",
            env=vec_env,
            learning_rate=ACTOR_LR,
            n_steps=MAX_EPISODE_STEPS,  # Steps per rollout
            batch_size=BATCH_SIZE,
            n_epochs=EPOCHS,
            gamma=GAMMA,
            gae_lambda=LAMBDA,
            clip_range=CLIP_EPSILON,
            ent_coef=ENTROPY_COEF,
            verbose=1,
            device="cuda" if torch.cuda.is_available() else "cpu"
        )

        total_steps = 0
        episode_rewards = []
        avg_history = []
        threshold_count = 0
        stop_training = False

        # Custom training loop to mimic original stopping conditions
        while total_steps < TOTAL_TIMESTEPS and not stop_training:
            model.learn(total_timesteps=MAX_EPISODE_STEPS, reset_num_timesteps=False)
            total_steps += MAX_EPISODE_STEPS

            # Read the returns of episodes finished during training from the
            # Monitor wrapper instead of stepping the training env by hand:
            # a manual rollout would leave the model's cached observation out
            # of sync with the env, and the vectorised env's step() returns
            # (obs, rewards, dones, infos) rather than the Gymnasium 5-tuple.
            finished_returns = monitored_env.get_episode_rewards()
            new_returns = finished_returns[len(episode_rewards):]

            for episode_reward in new_returns:
                episode_rewards.append(episode_reward)
                avg_reward = np.mean(episode_rewards[-10:])
                print(f"Episode: {len(episode_rewards)}, Reward: {episode_reward:.2f}, Avg Reward: {avg_reward:.2f}")

                if len(episode_rewards) >= 10 and avg_reward >= REWARD_THRESHOLD:
                    threshold_count += 1
                    if threshold_count >= THRESHOLD_EPISODES:
                        print(f"Stopping training: Avg reward {avg_reward:.2f} >= {REWARD_THRESHOLD}")
                        stop_training = True
                        break
                else:
                    threshold_count = 0

                # Record the average for every episode so the plateau check
                # has a full history by the time it becomes active.
                avg_history.append(avg_reward)
                improvement = _plateau_improvement(avg_history, PLATEAU_WINDOW)
                if improvement is not None and improvement < PLATEAU_IMPROVEMENT:
                    print(f"Stopping training: Reward plateaued (improvement {improvement:.4f} < {PLATEAU_IMPROVEMENT})")
                    stop_training = True
                    break

        # Save model and evaluate once at the end
        model.save("ppo_humanoid_standup_mujoco")
        # evaluate() uses the Gymnasium reset/step API, so hand it the
        # underlying env (which also carries the learned normalisation
        # statistics) rather than the vectorised wrapper.
        evaluate(model, monitored_env)
    finally:
        vec_env.close()

# Entry point: start training only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    train()

In [None]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Baseline run: PPO with its default hyperparameters on HumanoidStandup.

# Build the task as a vectorised environment containing a single instance.
env = make_vec_env("HumanoidStandup-v4", n_envs=1)

# Multi-layer perceptron policy; verbose=1 enables training log output.
model = PPO(policy="MlpPolicy", env=env, verbose=1)

# Train for one million timesteps.
model.learn(total_timesteps=1_000_000)

# Persist the trained model to disk.
model.save("ppo_humanoid_standup_mujoco")