In [1]:
import gymnasium as gym
import ale_py
import cv2
import numpy as np

gym.register_envs(ale_py)
env = gym.make("Humanoid-v5", render_mode="rgb_array")
state_size = env.observation_space.shape[0]
action_size = env.action_space
state_size, action_size

(348, Box(-0.4, 0.4, (17,), float32))

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np
import gymnasium as gym
from collections import deque
import random

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
import time
# Hyperparameters
class Config:
    def __init__(self):
        self.env_name = "Humanoid-v4"
        self.seed = 42
        self.gamma = 0.99
        self.gae_lambda = 0.95
        self.entropy_coef = 0.01
        self.value_loss_coef = 0.5
        self.max_grad_norm = 0.5
        self.num_steps = 2048
        self.num_envs = 1
        self.ppo_epochs = 10
        self.num_minibatches = 32
        self.clip_param = 0.2
        self.lr = 3e-4
        self.eps = 1e-5
        self.hidden_size = 64
        self.max_episodes = 10000
        self.save_interval = 100

config = Config()

# Set random seeds
torch.manual_seed(config.seed)
np.random.seed(config.seed)

In [4]:

# Actor-Critic Network
class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size=64):
        super(ActorCritic, self).__init__()

        # Shared network
        self.shared = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh()
        )

        # Actor (policy) network
        self.actor = nn.Linear(hidden_size, num_actions)
        self.actor_logstd = nn.Parameter(torch.zeros(1, num_actions))

        # Critic (value) network
        self.critic = nn.Linear(hidden_size, 1)

        # Initialize weights
        self.apply(self.init_weights)

    def init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.orthogonal_(m.weight.data)
            nn.init.constant_(m.bias.data, 0)

    def forward(self, x):
        x = self.shared(x)
        return self.actor(x), self.critic(x)

    def act(self, state):
        mean, value = self.forward(state)
        logstd = self.actor_logstd.expand_as(mean)
        std = torch.exp(logstd)

        dist = Normal(mean, std)
        action = dist.sample()
        log_prob = dist.log_prob(action).sum(-1, keepdim=True)

        return action, log_prob, value

    def evaluate(self, state, action):
        mean, value = self.forward(state)
        logstd = self.actor_logstd.expand_as(mean)
        std = torch.exp(logstd)

        dist = Normal(mean, std)
        log_prob = dist.log_prob(action).sum(-1, keepdim=True)
        entropy = dist.entropy().sum(-1, keepdim=True)

        return log_prob, value, entropy

In [5]:

# PPO Agent
class PPO:
    def __init__(self, num_inputs, num_actions, config):
        self.config = config

        self.model = ActorCritic(num_inputs, num_actions, config.hidden_size).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=config.lr, eps=config.eps)

        # Storage for rollout data
        self.states = torch.zeros(config.num_steps, config.num_envs, num_inputs).to(device)
        self.actions = torch.zeros(config.num_steps, config.num_envs, num_actions).to(device)
        self.log_probs = torch.zeros(config.num_steps, config.num_envs, 1).to(device)
        self.values = torch.zeros(config.num_steps, config.num_envs, 1).to(device)
        self.rewards = torch.zeros(config.num_steps, config.num_envs, 1).to(device)
        self.masks = torch.zeros(config.num_steps, config.num_envs, 1).to(device)
        self.returns = torch.zeros(config.num_steps, config.num_envs, 1).to(device)
        self.advantages = torch.zeros(config.num_steps, config.num_envs, 1).to(device)

    def compute_gae(self, next_value):
        gae = 0
        for step in reversed(range(self.config.num_steps)):
            if step == self.config.num_steps - 1:
                next_non_terminal = 1.0 - self.masks[step]
                next_values = next_value
            else:
                next_non_terminal = 1.0 - self.masks[step + 1]
                next_values = self.values[step + 1]

            delta = self.rewards[step] + self.config.gamma * next_values * next_non_terminal - self.values[step]
            gae = delta + self.config.gamma * self.config.gae_lambda * next_non_terminal * gae
            self.advantages[step] = gae

        self.returns = self.advantages + self.values

    def update(self):
        # Flatten the batch
        states = self.states.view(-1, self.states.size(-1))
        actions = self.actions.view(-1, self.actions.size(-1))
        log_probs_old = self.log_probs.view(-1, 1)
        returns = self.returns.view(-1, 1)
        advantages = self.advantages.view(-1, 1)

        # Normalize advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # Optimize policy for K epochs
        for _ in range(self.config.ppo_epochs):
            # Minibatch update
            for batch_idx in self._get_batches(states.size(0)):
                batch_states = states[batch_idx]
                batch_actions = actions[batch_idx]
                batch_log_probs_old = log_probs_old[batch_idx]
                batch_returns = returns[batch_idx]
                batch_advantages = advantages[batch_idx]

                # Evaluate actions
                log_probs, values, entropy = self.model.evaluate(batch_states, batch_actions)

                # Compute ratio (pi_theta / pi_theta_old)
                ratio = torch.exp(log_probs - batch_log_probs_old)

                # Compute surrogate losses
                surr1 = ratio * batch_advantages
                surr2 = torch.clamp(ratio, 1.0 - self.config.clip_param, 1.0 + self.config.clip_param) * batch_advantages

                # Policy and value losses
                policy_loss = -torch.min(surr1, surr2).mean()
                value_loss = F.mse_loss(values, batch_returns)
                entropy_loss = -entropy.mean()

                # Total loss
                loss = policy_loss + self.config.value_loss_coef * value_loss + self.config.entropy_coef * entropy_loss

                # Gradient step
                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm)
                self.optimizer.step()

    def _get_batches(self, num_samples):
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        batch_size = num_samples // self.config.num_minibatches

        for i in range(self.config.num_minibatches):
            yield indices[i*batch_size : (i+1)*batch_size]

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np
import gymnasium as gym
import time
from collections import deque

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Config:
    def __init__(self):
        self.env_name = "Humanoid-v5"  # Updated to v5
        self.seed = 42
        self.gamma = 0.99
        self.gae_lambda = 0.95
        self.entropy_coef = 0.01
        self.value_loss_coef = 0.5
        self.max_grad_norm = 0.5
        self.num_steps = 4096  # Increased from 2048
        self.num_envs = 1
        self.ppo_epochs = 10
        self.num_minibatches = 32
        self.clip_param = 0.2
        self.lr = 1e-4  # Reduced from 3e-4
        self.eps = 1e-5
        self.hidden_size = 256  # Increased from 64
        self.max_episodes = 10000
        self.save_interval = 100
        self.early_stop_threshold = 3000  # Target reward for early success

config = Config()

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size=256):
        super(ActorCritic, self).__init__()
        self.shared = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh()
        )
        self.actor = nn.Linear(hidden_size, num_actions)
        self.actor_logstd = nn.Parameter(torch.zeros(1, num_actions))
        self.critic = nn.Linear(hidden_size, 1)
        self.apply(self.init_weights)

    def init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.orthogonal_(m.weight.data)
            nn.init.constant_(m.bias.data, 0)

    def forward(self, x):
        x = self.shared(x)
        return self.actor(x), self.critic(x)

    def act(self, state):
        mean, value = self.forward(state)
        logstd = self.actor_logstd.expand_as(mean)
        std = torch.exp(logstd)
        dist = Normal(mean, std)
        action = dist.sample()
        log_prob = dist.log_prob(action).sum(-1, keepdim=True)
        return action, log_prob, value

    def evaluate(self, state, action):
        mean, value = self.forward(state)
        logstd = self.actor_logstd.expand_as(mean)
        std = torch.exp(logstd)
        dist = Normal(mean, std)
        log_prob = dist.log_prob(action).sum(-1, keepdim=True)
        entropy = dist.entropy().sum(-1, keepdim=True)
        return log_prob, value, entropy

class PPO:
    def __init__(self, num_inputs, num_actions, config):
        self.config = config
        self.model = ActorCritic(num_inputs, num_actions, config.hidden_size).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=config.lr, eps=config.eps)
        self.states = torch.zeros(config.num_steps, config.num_envs, num_inputs).to(device)
        self.actions = torch.zeros(config.num_steps, config.num_envs, num_actions).to(device)
        self.log_probs = torch.zeros(config.num_steps, config.num_envs, 1).to(device)
        self.values = torch.zeros(config.num_steps, config.num_envs, 1).to(device)
        self.rewards = torch.zeros(config.num_steps, config.num_envs, 1).to(device)
        self.masks = torch.zeros(config.num_steps, config.num_envs, 1).to(device)
        self.returns = torch.zeros(config.num_steps, config.num_envs, 1).to(device)
        self.advantages = torch.zeros(config.num_steps, config.num_envs, 1).to(device)

    def compute_gae(self, next_value):
        gae = 0
        for step in reversed(range(self.config.num_steps)):
            if step == self.config.num_steps - 1:
                next_non_terminal = 1.0 - self.masks[step]
                next_values = next_value
            else:
                next_non_terminal = 1.0 - self.masks[step + 1]
                next_values = self.values[step + 1]
            delta = self.rewards[step] + self.config.gamma * next_values * next_non_terminal - self.values[step]
            gae = delta + self.config.gamma * self.config.gae_lambda * next_non_terminal * gae
            self.advantages[step] = gae
        self.returns = self.advantages + self.values

    def update(self):
        states = self.states.view(-1, self.states.size(-1))
        actions = self.actions.view(-1, self.actions.size(-1))
        log_probs_old = self.log_probs.view(-1, 1)
        returns = self.returns.view(-1, 1)
        advantages = self.advantages.view(-1, 1)
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

        for _ in range(self.config.ppo_epochs):
            for batch_idx in self._get_batches(states.size(0)):
                batch_states = states[batch_idx]
                batch_actions = actions[batch_idx]
                batch_log_probs_old = log_probs_old[batch_idx]
                batch_returns = returns[batch_idx]
                batch_advantages = advantages[batch_idx]

                log_probs, values, entropy = self.model.evaluate(batch_states, batch_actions)
                ratio = torch.exp(log_probs - batch_log_probs_old)
                surr1 = ratio * batch_advantages
                surr2 = torch.clamp(ratio, 1.0 - self.config.clip_param, 1.0 + self.config.clip_param) * batch_advantages
                policy_loss = -torch.min(surr1, surr2).mean()
                value_loss = F.mse_loss(values, batch_returns)
                entropy_loss = -entropy.mean()
                loss = policy_loss + self.config.value_loss_coef * value_loss + self.config.entropy_coef * entropy_loss

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm)
                self.optimizer.step()

    def _get_batches(self, num_samples):
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        batch_size = num_samples // self.config.num_minibatches
        for i in range(self.config.num_minibatches):
            yield indices[i*batch_size : (i+1)*batch_size]

def train():
    env = gym.make(config.env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    agent = PPO(num_inputs, num_actions, config)

    # Training metrics
    episode_rewards = []
    running_reward = 0
    best_reward = -np.inf
    start_time = time.time()
    episode_counter = 0
    group_size = 25

    # Early stopping
    no_improvement = 0
    early_stop_patience = 100

    # Observation normalization
    obs_mean = torch.zeros(num_inputs).to(device)
    obs_var = torch.ones(num_inputs).to(device)
    obs_count = 1e-4

    state, _ = env.reset()
    state = torch.FloatTensor(state).unsqueeze(0).to(device)

    print("Starting training...")
    print(f"Observation space: {num_inputs}, Action space: {num_actions}")
    print(f"Training for max {config.max_episodes} episodes")

    try:
        while episode_counter < config.max_episodes:
            group_rewards = []
            group_start_time = time.time()

            for _ in range(group_size):
                episode_reward = 0
                for step in range(config.num_steps):
                    # Normalize state
                    normalized_state = (state - obs_mean) / torch.sqrt(obs_var + 1e-8)

                    with torch.no_grad():
                        action, log_prob, value = agent.model.act(normalized_state)

                    next_state, reward, terminated, truncated, _ = env.step(action.cpu().numpy()[0])
                    done = terminated or truncated
                    mask = 1.0 - float(done)
                    reward = np.clip(reward, -10, 10)  # Reward clipping

                    agent.states[step] = normalized_state
                    agent.actions[step] = action
                    agent.log_probs[step] = log_prob
                    agent.values[step] = value
                    agent.rewards[step] = torch.FloatTensor([reward/10.0]).unsqueeze(1).to(device)
                    agent.masks[step] = torch.FloatTensor([mask]).unsqueeze(1).to(device)

                    state = torch.FloatTensor(next_state).unsqueeze(0).to(device)
                    episode_reward += reward

                    if done:
                        state, _ = env.reset()
                        state = torch.FloatTensor(state).unsqueeze(0).to(device)
                        episode_rewards.append(episode_reward)
                        group_rewards.append(episode_reward)
                        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward
                        episode_counter += 1
                        break

                # Update normalization stats
                with torch.no_grad():
                    obs_mean = (obs_count * obs_mean + state.sum(0)) / (obs_count + state.size(0))
                    obs_var = (obs_count * obs_var + ((state - obs_mean)**2).sum(0)) / (obs_count + state.size(0))
                    obs_count += state.size(0)

                # Update policy
                with torch.no_grad():
                    normalized_state = (state - obs_mean) / torch.sqrt(obs_var + 1e-8)
                    _, _, next_value = agent.model.act(normalized_state)
                agent.compute_gae(next_value)
                agent.update()

            # Print group statistics
            group_avg = np.mean(group_rewards)
            group_std = np.std(group_rewards)
            group_min = np.min(group_rewards)
            group_max = np.max(group_rewards)
            time_per_episode = (time.time() - group_start_time) / group_size

            print(f"\nEpisodes {episode_counter-group_size+1}-{episode_counter}:")
            print(f"  Avg: {group_avg:.1f} ± {group_std:.1f}")
            print(f"  Range: {group_min:.1f} to {group_max:.1f}")
            print(f"  Running Avg: {running_reward:.1f}")
            print(f"  Time/episode: {time_per_episode:.2f}s")

            # Save and early stopping
            if running_reward > best_reward:
                best_reward = running_reward
                torch.save(agent.model.state_dict(), "humanoid_ppo_best.pth")
                no_improvement = 0
            else:
                no_improvement += group_size

            if running_reward >= config.early_stop_threshold:
                print(f"\nEarly success! Reached target reward {config.early_stop_threshold}")
                break

            if no_improvement >= early_stop_patience:
                print(f"\nEarly stopping - no improvement for {early_stop_patience} episodes")
                break

            if episode_counter % config.save_interval == 0:
                torch.save(agent.model.state_dict(), f"humanoid_ppo_{episode_counter}.pth")

    except KeyboardInterrupt:
        print("\nTraining interrupted by user")

    finally:
        total_time = time.time() - start_time
        print(f"\nTraining completed in {total_time/3600:.2f} hours")
        print(f"Final running reward: {running_reward:.1f}")
        print(f"Best running reward: {best_reward:.1f}")
        env.close()
        return episode_rewards

if __name__ == "__main__":
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    rewards = train()

Starting training...
Observation space: 348, Action space: 17
Training for max 10000 episodes

Episodes 1-25:
  Avg: -12.3 ± 42.2
  Range: -101.2 to 46.6
  Running Avg: -19.2
  Time/episode: 2.79s

Episodes 26-50:
  Avg: -167.2 ± 72.8
  Range: -324.6 to -59.2
  Running Avg: -139.7
  Time/episode: 2.82s

Episodes 51-75:
  Avg: -221.0 ± 53.2
  Range: -355.4 to -170.0
  Running Avg: -199.5
  Time/episode: 3.28s

Episodes 76-100:
  Avg: -204.0 ± 52.6
  Range: -350.0 to -160.0
  Running Avg: -206.7
  Time/episode: 4.22s

Episodes 101-125:
  Avg: -222.0 ± 63.3
  Range: -420.0 to -160.0
  Running Avg: -220.5
  Time/episode: 4.18s

Early stopping - no improvement for 100 episodes

Training completed in 0.12 hours
Final running reward: -220.5
Best running reward: -19.2
