In [4]:
import os
import cv2
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal
import cloudpickle

# Hyperparameters
GAMMA = 0.99  # discount factor applied to bootstrapped values in PPO.compute_gae
LAMBDA = 0.95  # GAE lambda: decay applied to the accumulated advantage in PPO.compute_gae
CLIP_EPSILON = 0.2  # PPO clipping range; the probability ratio is clamped to [1 - eps, 1 + eps]
ACTOR_LR = 3e-5  # Adam learning rate for the actor network
CRITIC_LR = 1e-4  # Adam learning rate for the critic network
EPOCHS = 10  # number of passes over each rollout in PPO.update
BATCH_SIZE = 1024  # mini-batch size used when slicing the shuffled rollout in PPO.update
ENTROPY_COEF = 0.1  # weight of the entropy bonus subtracted from the actor loss
CONTROL_PENALTY = 0.0  # NOTE(review): not referenced anywhere in the visible code
GRAD_CLIP = 0.5  # max gradient norm passed to clip_grad_norm_ for both networks
MIN_STD = 1e-6  # small epsilon used by RunningStat to keep standard deviations away from zero
MAX_EPISODE_STEPS = 1000  # upper bound on the number of steps collected per training episode
ENV_NAME = "HumanoidStandup-v4"  # Gymnasium environment id passed to gym.make
TOTAL_TIMESTEPS = 20000000  # training loop runs while the cumulative step count is below this
REWARD_THRESHOLD = 45000  # 10-episode average reward that counts towards early stopping
THRESHOLD_EPISODES = 3  # consecutive episodes at/above REWARD_THRESHOLD required to stop training
PLATEAU_WINDOW = 50  # size of the "recent" window in the plateau check
PLATEAU_IMPROVEMENT = 0.005  # minimum relative improvement; below this training stops as plateaued
EVAL_EPISODES = 5  # default number of episodes in PPO.evaluate
EVAL_MAX_STEPS = 2000  # default per-episode step cap in PPO.evaluate
EVAL_SAVE_VIDEO = True  # default for whether PPO.evaluate writes an .mp4 per episode


class RunningStat:
    """Running per-feature mean and (population) standard deviation.

    Batches are merged with the parallel-variance formula (Chan et al.), so
    the statistics after any sequence of ``update`` calls equal the statistics
    of all rows seen so far, up to float32 rounding of the stored values.

    Attributes:
        mean: float32 array of shape ``shape`` holding the running mean.
        std: float32 array of shape ``shape`` holding the running std.
        count: number of rows merged so far.
    """

    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float32)
        # Placeholder of ones so that normalize() is the identity (up to
        # clipping) before any data arrives; it is ignored by the first merge.
        self.std = np.ones(shape, dtype=np.float32)
        self.count = 0

    def update(self, x):
        """Merge a batch of rows into the running statistics.

        Args:
            x: array-like of shape ``(batch, *shape)``. An empty batch is a
                no-op.
        """
        x = np.asarray(x, dtype=np.float64)
        batch_count = x.shape[0]
        if batch_count == 0:
            return

        batch_mean = x.mean(axis=0)
        batch_var = x.var(axis=0)

        prev_count = self.count
        total = prev_count + batch_count
        delta = batch_mean - self.mean

        # Sum of squared deviations of each part about its own mean. With
        # prev_count == 0 the placeholder std is multiplied away.
        m_a = np.square(self.std.astype(np.float64)) * prev_count
        m_b = batch_var * batch_count
        # The cross term weights delta^2 by n_a * n_b / (n_a + n_b); n_a must
        # be the count *before* this batch was added.
        m2 = m_a + m_b + np.square(delta) * prev_count * batch_count / total

        self.mean = (self.mean + delta * batch_count / total).astype(np.float32)
        self.std = np.sqrt(m2 / total).astype(np.float32)
        self.count = total

    def normalize(self, x):
        """Return ``(x - mean) / std`` clipped to ``[-10, 10]``.

        The divisor is floored at ``MIN_STD`` so that features which have been
        constant so far produce 0 rather than NaN or inf.
        """
        safe_std = np.maximum(self.std, MIN_STD)
        normalized = (x - self.mean) / safe_std
        return np.clip(normalized, -10, 10)


class Actor(nn.Module):
    """Policy network producing a tanh-squashed diagonal Gaussian.

    The trunk is three Linear(512) -> LayerNorm -> ReLU stages followed by a
    linear head of width ``2 * act_dim`` whose output is split into a mean and
    a log-std for each action dimension.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        hidden = 512
        layers = []
        in_features = obs_dim
        for _ in range(3):
            layers.extend((nn.Linear(in_features, hidden), nn.LayerNorm(hidden), nn.ReLU()))
            in_features = hidden
        layers.append(nn.Linear(hidden, act_dim * 2))
        self.net = nn.Sequential(*layers)

        # Small-gain orthogonal head with zero bias.
        head = self.net[-1]
        nn.init.orthogonal_(head.weight, gain=0.01)
        nn.init.zeros_(head.bias)

    def forward(self, state):
        """Return ``(mean, log_std)`` with log_std clamped to [-10, 2]."""
        mean, raw_log_std = torch.chunk(self.net(state), 2, dim=-1)
        return mean, raw_log_std.clamp(-10, 2)

    def sample(self, state):
        """Draw a reparameterised action.

        Returns:
            ``(action, log_prob, entropy)`` where ``action`` is the tanh of the
            Gaussian sample, ``log_prob`` includes the tanh change-of-variables
            term and is summed over the last dimension, and ``entropy`` is the
            summed entropy of the underlying Gaussian. If the network output
            contains NaN, ``(None, None, None)`` is returned instead.
        """
        mu, log_sigma = self.forward(state)
        if torch.isnan(mu).any() or torch.isnan(log_sigma).any():
            return None, None, None

        gaussian = Normal(mu, log_sigma.exp())
        raw = gaussian.rsample()
        squashed = torch.tanh(raw)
        # Log-determinant of the tanh squashing, with 1e-6 for stability.
        squash_term = torch.log(1 - squashed.pow(2) + 1e-6)
        log_prob = (gaussian.log_prob(raw) - squash_term).sum(dim=-1)
        entropy = gaussian.entropy().sum(dim=-1)
        return squashed, log_prob, entropy

    def get_action(self, state, deterministic=False):
        """Return a squashed action: tanh(mean) if deterministic, else a sample."""
        mu, log_sigma = self.forward(state)
        pre_squash = mu if deterministic else Normal(mu, log_sigma.exp()).sample()
        return torch.tanh(pre_squash)


class Critic(nn.Module):
    """State-value network: three Linear(512) -> LayerNorm -> ReLU stages and a scalar head."""

    def __init__(self, obs_dim):
        super().__init__()
        hidden = 512
        stages = []
        for in_features in (obs_dim, hidden, hidden):
            stages += [nn.Linear(in_features, hidden), nn.LayerNorm(hidden), nn.ReLU()]
        stages.append(nn.Linear(hidden, 1))
        self.net = nn.Sequential(*stages)

        # Unit-gain orthogonal head with zero bias.
        value_head = self.net[-1]
        nn.init.orthogonal_(value_head.weight, gain=1.0)
        nn.init.zeros_(value_head.bias)

    def forward(self, state):
        """Return the value estimate with a trailing dimension of size 1."""
        value = self.net(state)
        return value


class PPO:
    """PPO agent bundling an actor, a critic, their optimisers and an observation normaliser."""

    # Environment actions are the tanh-squashed policy output multiplied by
    # this factor, i.e. they lie in [-ACTION_SCALE, ACTION_SCALE].
    ACTION_SCALE = 0.4

    def __init__(self, obs_dim, act_dim, device):
        self.actor = Actor(obs_dim, act_dim).to(device)
        self.critic = Critic(obs_dim).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)
        self.device = device
        self.obs_normalizer = RunningStat(obs_dim)

    def compute_gae(self, rewards, values, next_value, dones):
        """Compute GAE advantages and returns for one rollout.

        Args:
            rewards: per-step rewards.
            values: critic estimates for the states the rewards were earned in.
            next_value: critic estimate for the state after the last step.
            dones: per-step flags; a truthy flag stops bootstrapping and
                advantage accumulation across that step.

        Returns:
            ``(advantages, returns)`` as float64 arrays, where
            ``returns = advantages + values``.
        """
        # Force float arrays so integer rewards cannot truncate the advantages.
        rewards = np.asarray(rewards, dtype=np.float64)
        values = np.asarray(values, dtype=np.float64)
        advantages = np.zeros_like(rewards)
        gae = 0.0
        for t in reversed(range(len(rewards))):
            not_done = 1.0 - float(dones[t])
            delta = rewards[t] + GAMMA * next_value * not_done - values[t]
            gae = delta + GAMMA * LAMBDA * not_done * gae
            advantages[t] = gae
            next_value = values[t]
        returns = advantages + values
        return advantages, returns

    def _evaluate_actions(self, states, actions):
        """Score stored environment actions under the current policy.

        The pre-tanh Gaussian sample is recovered from the stored action so the
        log-probability uses the same formula as ``Actor.sample``. Actions that
        saturated the tanh are clamped just inside (-1, 1) before ``atanh``.

        Returns:
            ``(log_prob, entropy)`` summed over action dimensions, or
            ``(None, None)`` if the actor output contains NaN.
        """
        mean, log_std = self.actor(states)
        if torch.isnan(mean).any() or torch.isnan(log_std).any():
            return None, None
        dist = Normal(mean, log_std.exp())
        squashed = (actions / self.ACTION_SCALE).clamp(-1 + 1e-6, 1 - 1e-6)
        pre_tanh = torch.atanh(squashed)
        log_prob = dist.log_prob(pre_tanh) - torch.log(1 - squashed.pow(2) + 1e-6)
        return log_prob.sum(dim=-1), dist.entropy().sum(dim=-1)

    def update(self, states, actions, old_log_probs, returns, advantages):
        """Run EPOCHS passes of clipped-surrogate PPO over one rollout.

        Args:
            states: the observations exactly as they were fed to the actor when
                ``old_log_probs`` were recorded.
            actions: the environment actions taken (policy output scaled by
                ``ACTION_SCALE``).
            old_log_probs: log-probabilities recorded at collection time.
            returns: regression targets for the critic.
            advantages: advantage estimates; normalised here to zero mean and
                unit std when the rollout has more than one step.
        """
        if len(states) == 0:
            return

        def to_tensor(data):
            return torch.as_tensor(np.asarray(data, dtype=np.float32), device=self.device)

        states = to_tensor(states)
        actions = to_tensor(actions)
        old_log_probs = to_tensor(old_log_probs)
        returns = to_tensor(returns)
        advantages = to_tensor(advantages)
        # std() of a single element is NaN, which would poison every gradient.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        for _ in range(EPOCHS):
            # Shuffle indices for mini-batches
            indices = np.random.permutation(len(states))
            for start in range(0, len(states), BATCH_SIZE):
                batch_idx = indices[start:start + BATCH_SIZE]
                batch_states = states[batch_idx]
                batch_actions = actions[batch_idx]
                batch_old_log_probs = old_log_probs[batch_idx]
                batch_returns = returns[batch_idx]
                batch_advantages = advantages[batch_idx]

                # Actor update: the ratio must compare the new and old
                # probability of the *stored* action, not of a fresh sample.
                log_prob, entropy = self._evaluate_actions(batch_states, batch_actions)
                if log_prob is None:
                    continue
                ratio = torch.exp(log_prob - batch_old_log_probs)
                surr1 = ratio * batch_advantages
                surr2 = torch.clamp(ratio, 1 - CLIP_EPSILON, 1 + CLIP_EPSILON) * batch_advantages
                actor_loss = -torch.min(surr1, surr2).mean() - ENTROPY_COEF * entropy.mean()

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), GRAD_CLIP)
                self.actor_optimizer.step()

                # Critic update. squeeze(-1) keeps the batch dimension even for
                # a mini-batch of one.
                value = self.critic(batch_states).squeeze(-1)
                critic_loss = (batch_returns - value).pow(2).mean()

                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.critic.parameters(), GRAD_CLIP)
                self.critic_optimizer.step()

    def save(self, path):
        """Serialise the whole agent object to ``path`` with cloudpickle."""
        with open(path, 'wb') as f:
            cloudpickle.dump(self, f)

    def evaluate(self, env, episodes=EVAL_EPISODES, max_steps=EVAL_MAX_STEPS, save_video=EVAL_SAVE_VIDEO):
        """Roll out the current (stochastic) policy and report reward statistics.

        Args:
            env: environment exposing ``reset``, ``step`` and ``render``.
            episodes: number of episodes to run.
            max_steps: per-episode step cap.
            save_video: when true, write one .mp4 per episode into
                ``evaluation_videos/`` from the frames returned by ``env.render()``.

        Returns:
            ``(avg_reward, standing_ratio)``; both are 0.0 when ``episodes`` is 0.
        """
        print("\nStarting evaluation...")
        total_reward = 0
        standing_episodes = 0
        height_threshold = 1.1
        video_folder = "evaluation_videos"

        if save_video:
            os.makedirs(video_folder, exist_ok=True)

        for ep in range(episodes):
            obs, _ = env.reset()
            episode_reward = 0
            steps = 0
            done = False
            max_height = 0

            # Reset per episode so a writer released in an earlier episode is
            # never written to again.
            video_writer = None
            if save_video:
                video_path = f"{video_folder}/eval_episode_{ep + 1}.mp4"
                frame = env.render()
                if frame is not None:
                    video_writer = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*'mp4v'), 30,
                                                   (frame.shape[1], frame.shape[0]))
                else:
                    print("Warning: Render returned None, skipping video")

            try:
                while not done and steps < max_steps:
                    norm_obs = self.obs_normalizer.normalize(obs)
                    obs_tensor = torch.FloatTensor(norm_obs).to(self.device).unsqueeze(0)
                    with torch.no_grad():
                        action, _, _ = self.actor.sample(obs_tensor)
                    if action is None:
                        print("Warning: Invalid action, breaking episode")
                        break
                    action_np = action.detach().cpu().numpy()[0] * self.ACTION_SCALE

                    obs, reward, terminated, truncated, info = env.step(action_np)
                    done = terminated or truncated
                    episode_reward += reward
                    steps += 1

                    # NOTE(review): obs[2] is treated as the torso height here;
                    # confirm the index against the environment's observation
                    # layout (the z-coordinate may be at a different index).
                    height = obs[2]
                    max_height = max(max_height, height)

                    if video_writer is not None:
                        frame = env.render()
                        if frame is not None:
                            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                            video_writer.write(frame)
            finally:
                # Always finalise the file, even if stepping or rendering raised.
                if video_writer is not None:
                    video_writer.release()

            total_reward += episode_reward
            if max_height > height_threshold:
                standing_episodes += 1

            print(f"Evaluation Episode {ep + 1}: Reward: {episode_reward:.2f}, Max Height: {max_height:.2f}")

        avg_reward = total_reward / episodes if episodes else 0.0
        standing_ratio = standing_episodes / episodes if episodes else 0.0
        print(f"Average Evaluation Reward: {avg_reward:.2f}")
        print(f"Standing Success Rate: {standing_ratio:.2f} ({standing_episodes}/{episodes} episodes)")
        return avg_reward, standing_ratio


def train():
    """Train a PPO agent on ENV_NAME, then save and evaluate it.

    Each iteration collects one episode (at most MAX_EPISODE_STEPS steps) and
    runs a PPO update on it. Training stops when TOTAL_TIMESTEPS is reached,
    when the 10-episode average reward stays at or above REWARD_THRESHOLD for
    THRESHOLD_EPISODES consecutive episodes, when the averaged reward plateaus,
    or when the policy produces NaN on the first step of an episode.
    """
    env = gym.make(ENV_NAME, render_mode="rgb_array")
    try:
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        agent = PPO(obs_dim, act_dim, device)

        total_steps = 0
        episode_rewards = []
        threshold_count = 0
        plateau_rewards = []

        while total_steps < TOTAL_TIMESTEPS:
            state, _ = env.reset()
            # Defined up front so it exists even if the very first policy call fails.
            next_state = state
            episode_reward = 0
            max_height = 0
            states, actions, rewards, log_probs, values, dones = [], [], [], [], [], []

            for _ in range(MAX_EPISODE_STEPS):
                agent.obs_normalizer.update(np.array([state]))
                norm_state = agent.obs_normalizer.normalize(state)
                state_tensor = torch.FloatTensor(norm_state).to(device).unsqueeze(0)

                # No gradients are needed while collecting experience.
                with torch.no_grad():
                    action, log_prob, _ = agent.actor.sample(state_tensor)
                    if action is None:
                        break
                    value = agent.critic(state_tensor).item()

                action_np = action.detach().cpu().numpy()[0] * 0.4  # Scale actions to [-0.4, 0.4]
                next_state, reward, terminated, truncated, _ = env.step(action_np)
                done = terminated or truncated

                # Store the observation exactly as the policy saw it; the
                # recorded log-prob and value are only valid for that input.
                states.append(norm_state)
                actions.append(action_np)
                rewards.append(reward)
                log_probs.append(log_prob.item())
                values.append(value)
                # Only true termination ends the return. On a time-limit
                # truncation the value of the next state is still bootstrapped.
                dones.append(terminated)

                state = next_state
                episode_reward += reward
                total_steps += 1
                # NOTE(review): state[2] is logged as the height; confirm the
                # index against the environment's observation layout.
                max_height = max(max_height, state[2])  # Track max height

                if done:
                    break

            if not states:
                # NaN on a freshly reset state means the policy or normaliser
                # is corrupted; looping again would never advance total_steps.
                print("Stopping training: policy produced NaN outputs")
                break

            if np.isnan(next_state).any():
                continue

            agent.obs_normalizer.update(np.array([next_state]))
            norm_next_state = agent.obs_normalizer.normalize(next_state)
            with torch.no_grad():
                next_value = agent.critic(torch.FloatTensor(norm_next_state).to(device).unsqueeze(0)).item()

            advantages, returns = agent.compute_gae(rewards, values, next_value, dones)
            agent.update(states, actions, log_probs, returns, advantages)

            episode_rewards.append(episode_reward)
            avg_reward = np.mean(episode_rewards[-10:]) if len(episode_rewards) >= 10 else np.mean(episode_rewards)
            print(
                f"Episode: {len(episode_rewards)}, Reward: {episode_reward:.2f}, Avg Reward: {avg_reward:.2f}, Max Height: {max_height:.2f}")

            if len(episode_rewards) >= 10 and avg_reward >= REWARD_THRESHOLD:
                threshold_count += 1
                if threshold_count >= THRESHOLD_EPISODES:
                    print(f"Stopping training: Avg reward {avg_reward:.2f} >= {REWARD_THRESHOLD}")
                    break
            else:
                threshold_count = 0

            # Record every episode's average so both windows below are always
            # populated by the time the check runs.
            plateau_rewards.append(avg_reward)
            if len(plateau_rewards) >= PLATEAU_WINDOW + 10:
                old_avg = np.mean(plateau_rewards[-PLATEAU_WINDOW - 10:-PLATEAU_WINDOW])
                new_avg = np.mean(plateau_rewards[-PLATEAU_WINDOW:])
                # abs() keeps the sign of the improvement right for a negative baseline.
                improvement = (new_avg - old_avg) / abs(old_avg) if old_avg != 0 else 0
                if improvement < PLATEAU_IMPROVEMENT:
                    print(f"Stopping training: Reward plateaued (improvement {improvement:.4f} < {PLATEAU_IMPROVEMENT})")
                    break

        agent.save("ppo_humanoid_standup_mujoco")
        agent.evaluate(env)
    finally:
        # Release the simulator even if training or evaluation raised.
        env.close()


# Run training only when executed as a script, not when imported.
if __name__ == "__main__":
    train()

Episode: 1, Reward: 31807.09, Avg Reward: 31807.09, Max Height: 0.26
Episode: 2, Reward: 32456.98, Avg Reward: 32132.04, Max Height: 0.13
Episode: 3, Reward: 36494.93, Avg Reward: 33586.34, Max Height: 0.05
Episode: 4, Reward: 32348.89, Avg Reward: 33276.97, Max Height: 0.27
Episode: 5, Reward: 33236.40, Avg Reward: 33268.86, Max Height: 0.11
Episode: 6, Reward: 31343.53, Avg Reward: 32947.97, Max Height: 0.24
Episode: 7, Reward: 32750.48, Avg Reward: 32919.76, Max Height: 0.23
Episode: 8, Reward: 33604.78, Avg Reward: 33005.39, Max Height: 0.21
Episode: 9, Reward: 34307.34, Avg Reward: 33150.05, Max Height: 0.32
Episode: 10, Reward: 36698.17, Avg Reward: 33504.86, Max Height: 0.07
Episode: 11, Reward: 32233.83, Avg Reward: 33547.53, Max Height: 0.26
Episode: 12, Reward: 31595.73, Avg Reward: 33461.41, Max Height: 0.23
Episode: 13, Reward: 33120.82, Avg Reward: 33124.00, Max Height: 0.52
Episode: 14, Reward: 30993.87, Avg Reward: 32988.49, Max Height: 0.16
Episode: 15, Reward: 36741.01