In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Normal
import gymnasium as gym  # or use `import gym` if not using Gymnasium

# Define the Actor-Critic network
class ActorCritic(nn.Module):
    """Actor-critic network with a shared trunk for continuous actions.

    The trunk is a two-layer ReLU MLP (256 units per layer). Two linear heads
    sit on top of it: ``actor_mean`` outputs one mean per action dimension and
    ``critic`` outputs a single state value. ``log_std`` is a learnable vector
    (one entry per action dimension, initialised to zero) that is independent
    of the input state.

    Args:
        state_dim: Number of features in an observation vector.
        action_dim: Number of action dimensions.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        width = 256
        trunk_layers = [
            nn.Linear(state_dim, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*trunk_layers)
        # Heads are created in the same order as before so that seeding the
        # RNG yields the same initial weights.
        self.actor_mean = nn.Linear(width, action_dim)
        self.critic = nn.Linear(width, 1)
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        """Return ``(action_mean, state_value)`` for a batch of observations."""
        features = self.net(x)
        return self.actor_mean(features), self.critic(features)

# Initialize environment and model
env = gym.make("HumanoidStandup-v4", render_mode=None)  # or gym.make(...)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
model = ActorCritic(state_dim, action_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# Hyperparameters
gamma = 0.99         # discount factor
lam = 0.95           # GAE lambda
clip_eps = 0.2       # PPO probability-ratio clipping range
value_coef = 0.5     # weight of the value loss in the total loss
entropy_coef = 0.01  # weight of the entropy bonus in the total loss
n_steps = 2048       # environment steps collected per PPO update
ppo_epochs = 10      # optimisation passes over each rollout
batch_size = 64      # minibatch size

# Trackers (episode statistics use the raw, unclipped environment reward)
episode_rewards = []
episode_lengths = []
ep_reward = 0
ep_len = 0

# Reset exactly once before training. The rollout below carries `state`
# across updates, so an episode that straddles two updates is continued
# rather than discarded, and ep_reward/ep_len always describe one episode.
# reset() is called a single time; a tuple means the (obs, info) API.
reset_out = env.reset()
state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
state = state.astype(np.float32)

# Training loop
for update in range(500):  # PPO updates
    states, actions, logprobs, rewards, dones, values = [], [], [], [], [], []

    for step in range(n_steps):
        state_tensor = torch.from_numpy(state).float().unsqueeze(0)
        # Rollout collection needs no gradients: sample the action and its
        # log-probability entirely inside no_grad.
        with torch.no_grad():
            mean, value = model(state_tensor)
            dist = Normal(mean, torch.exp(model.log_std))
            action = dist.sample()
            log_prob = dist.log_prob(action).sum(dim=1)
        action_np = action.cpu().numpy()[0]
        # The env receives the clipped action; the unclipped sample is what
        # is stored, so the stored log-prob matches the stored action.
        action_clamped = np.clip(action_np, env.action_space.low, env.action_space.high)

        # Step the environment
        result = env.step(action_clamped)
        if len(result) == 5:  # Gymnasium API
            next_state, reward, terminated, truncated, _ = result
        else:  # legacy Gym API: the time limit is reported through `info`
            next_state, reward, done_flag, info = result
            truncated = bool(info.get("TimeLimit.truncated", False))
            terminated = bool(done_flag) and not truncated
        done = terminated or truncated

        # Reward used for learning: clipped, and on a pure time-limit
        # truncation augmented with the discounted value of the state the
        # episode was cut off in. The done mask below zeroes the usual
        # bootstrap term, so without this a truncation would be learned as
        # if the task had really ended.
        learn_reward = float(np.clip(reward, -100.0, 100.0))
        if truncated and not terminated:
            next_tensor = torch.from_numpy(next_state.astype(np.float32)).unsqueeze(0)
            with torch.no_grad():
                _, next_value = model(next_tensor)
            learn_reward += gamma * next_value.item()

        # Store rollout
        states.append(state.copy())
        actions.append(action_np)
        logprobs.append(log_prob.item())
        rewards.append(learn_reward)
        dones.append(float(done))
        values.append(value.item())

        # Track episode reward
        ep_reward += reward
        ep_len += 1

        if done:
            episode_rewards.append(ep_reward)
            episode_lengths.append(ep_len)
            ep_reward = 0
            ep_len = 0
            reset_out = env.reset()
            state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
            state = state.astype(np.float32)
        else:
            state = next_state.astype(np.float32)

    # Compute value of last state (only used when the final step was not done)
    state_tensor = torch.from_numpy(state).float().unsqueeze(0)
    with torch.no_grad():
        _, last_value = model(state_tensor)
    last_value = last_value.item()

    # Convert to tensors
    states = torch.tensor(np.stack(states), dtype=torch.float32)
    actions = torch.tensor(np.stack(actions), dtype=torch.float32)
    old_logprobs = torch.tensor(logprobs, dtype=torch.float32)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    dones = torch.tensor(dones, dtype=torch.float32)
    values = torch.tensor(values + [last_value], dtype=torch.float32)

    # Compute GAE advantages and returns (backwards over the rollout)
    returns = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(n_steps)):
        mask = 1.0 - dones[t]  # 0 at episode boundaries: no bootstrap, no carry-over
        delta = rewards[t] + gamma * values[t + 1] * mask - values[t]
        gae = delta + gamma * lam * gae * mask
        advantages[t] = gae
        returns[t] = advantages[t] + values[t]

    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # PPO update
    num_samples = n_steps
    for epoch in range(ppo_epochs):
        idxs = np.random.permutation(num_samples)
        for start in range(0, num_samples, batch_size):
            end = start + batch_size
            batch_idx = idxs[start:end]

            batch_states = states[batch_idx]
            batch_actions = actions[batch_idx]
            batch_old_logp = old_logprobs[batch_idx]
            batch_returns = returns[batch_idx]
            batch_adv = advantages[batch_idx]

            mean, value = model(batch_states)
            dist = Normal(mean, torch.exp(model.log_std))
            new_logp = dist.log_prob(batch_actions).sum(dim=1)
            entropy = dist.entropy().sum(dim=1).mean()

            # Clipped surrogate objective
            ratio = (new_logp - batch_old_logp).exp()
            surr1 = ratio * batch_adv
            surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * batch_adv
            policy_loss = -torch.min(surr1, surr2).mean()

            value_pred = value.squeeze(1)
            value_loss = (value_pred - batch_returns).pow(2).mean()

            loss = policy_loss + value_coef * value_loss - entropy_coef * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Print progress
    if len(episode_rewards) >= 10:
        avg_reward = np.mean(episode_rewards[-10:])
        print(f"Update {update+1} | Episodes: {len(episode_rewards)} | Avg Reward (last 10): {avg_reward:.2f} | Max: {np.max(episode_rewards[-10:]):.1f} | Min: {np.min(episode_rewards[-10:]):.1f}")
    elif len(episode_rewards) > 0:
        print(f"Update {update+1} | Episodes: {len(episode_rewards)} | Latest Reward: {episode_rewards[-1]:.2f}")
    else:
        print(f"Update {update+1} | No completed episodes yet.")

# Release the simulator now that training is finished
env.close()

# Save the trained model
torch.save(model.state_dict(), "ppo_humanoid_standup.pth")
print("✅ Model training complete and saved as 'ppo_humanoid_standup.pth'")


  logger.deprecation(


Update 1 | Episodes: 2 | Latest Reward: 34836.85
Update 2 | Episodes: 4 | Latest Reward: 49514.46
Update 3 | Episodes: 6 | Latest Reward: 41788.51
Update 4 | Episodes: 8 | Latest Reward: 39258.62
Update 5 | Episodes: 10 | Avg Reward (last 10): 41351.11 | Max: 51115.3 | Min: 34456.7
Update 6 | Episodes: 12 | Avg Reward (last 10): 43067.99 | Max: 51115.3 | Min: 37242.7
Update 7 | Episodes: 14 | Avg Reward (last 10): 41762.56 | Max: 44744.5 | Min: 37242.7
Update 8 | Episodes: 16 | Avg Reward (last 10): 44892.31 | Max: 59393.1 | Min: 37242.7
Update 9 | Episodes: 18 | Avg Reward (last 10): 45061.91 | Max: 59393.1 | Min: 37242.7
Update 10 | Episodes: 20 | Avg Reward (last 10): 45727.00 | Max: 59393.1 | Min: 40384.2
Update 11 | Episodes: 22 | Avg Reward (last 10): 51221.10 | Max: 72091.5 | Min: 40384.2
Update 12 | Episodes: 24 | Avg Reward (last 10): 63397.30 | Max: 106979.6 | Min: 40384.2
Update 13 | Episodes: 26 | Avg Reward (last 10): 66462.69 | Max: 106979.6 | Min: 40384.2
Update 14 | Epi

In [3]:
import numpy as np
import torch
from IPython.display import Video, display
import gymnasium as gym
import imageio

# Create environment with RGB array rendering
eval_env = gym.make("HumanoidStandup-v4", render_mode="rgb_array")

frames = []
try:
    # Recreate model and load weights. The network sizes are read from the
    # evaluation environment itself, so this cell does not rely on
    # `state_dim` / `action_dim` left behind by another cell.
    model = ActorCritic(
        eval_env.observation_space.shape[0],
        eval_env.action_space.shape[0],
    )
    # map_location keeps loading working when the checkpoint was saved from
    # a different device than the one available here.
    model.load_state_dict(torch.load("ppo_humanoid_standup.pth", map_location="cpu"))
    model.eval()

    # reset() is called once; a tuple means the (obs, info) API.
    reset_out = eval_env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    state = state.astype(np.float32)

    done = False
    while not done:
        state_tensor = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            mean, _ = model(state_tensor)
        # Deterministic evaluation: act with the policy mean
        action = mean.cpu().numpy()[0]
        # Clamp action to valid range
        action = np.clip(action, eval_env.action_space.low, eval_env.action_space.high)

        result = eval_env.step(action)
        if len(result) == 5:  # Gymnasium API
            state, reward, terminated, truncated, _ = result
            done = terminated or truncated
        else:  # Gym API
            state, reward, done, _ = result
        state = state.astype(np.float32)

        # Render and store frame
        frame = eval_env.render()
        frames.append(frame)
finally:
    # Always release the simulator/renderer, even if the rollout fails
    eval_env.close()

# Save as MP4 video
video_filename = "humanoid_standup_slow.mp4"
imageio.mimsave(video_filename, frames, fps=30)

# Display video
display(Video(video_filename, embed=True, width=600))


  logger.deprecation(


## FAST Training

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Normal
import gymnasium as gym

class ActorCriticSmall(nn.Module):
    """Compact shared-trunk actor-critic for continuous actions.

    A two-layer ReLU MLP feeds two linear heads: ``actor_mean`` (one mean per
    action dimension) and ``critic`` (a single state value). ``log_std`` is a
    learnable, state-independent vector initialised to zero.

    Args:
        state_dim: Number of features in an observation vector.
        action_dim: Number of action dimensions.
        hidden_dim: Width of both hidden layers. Defaults to 64, the size
            this class previously hard-coded.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        self.actor_mean = nn.Linear(hidden_dim, action_dim)
        self.critic = nn.Linear(hidden_dim, 1)
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        """Return ``(action_mean, state_value)`` for a batch of observations."""
        x = self.net(x)
        return self.actor_mean(x), self.critic(x)

# Environment, small model and optimizer for the fast run
env = gym.make("HumanoidStandup-v4", render_mode=None)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
model_fast = ActorCriticSmall(state_dim, action_dim)
optimizer = torch.optim.Adam(model_fast.parameters(), lr=3e-4)

# Hyperparameters (shorter rollouts and fewer epochs than the full run)
gamma = 0.99         # discount factor
lam = 0.95           # GAE lambda
clip_eps = 0.2       # PPO probability-ratio clipping range
value_coef = 0.5     # weight of the value loss in the total loss
entropy_coef = 0.01  # weight of the entropy bonus in the total loss
n_steps = 1024       # environment steps collected per PPO update
ppo_epochs = 4       # optimisation passes over each rollout
batch_size = 64      # minibatch size

# Reset exactly once before training; `state` is carried across updates so
# an episode that straddles two updates is continued instead of discarded.
# reset() is called a single time; a tuple means the (obs, info) API.
reset_out = env.reset()
state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
state = state.astype(np.float32)

for update in range(200):  # faster: fewer updates, ~200 updates * 1024 steps
    states, actions, logprobs, rewards, dones, values = [], [], [], [], [], []

    for step in range(n_steps):
        state_tensor = torch.from_numpy(state).float().unsqueeze(0)
        # Rollout collection needs no gradients: sample the action and its
        # log-probability entirely inside no_grad.
        with torch.no_grad():
            mean, value = model_fast(state_tensor)
            dist = Normal(mean, torch.exp(model_fast.log_std))
            action = dist.sample()
            log_prob = dist.log_prob(action).sum(dim=1)
        action_np = action.cpu().numpy()[0]
        # The env receives the clipped action; the unclipped sample is what
        # is stored, so the stored log-prob matches the stored action.
        action_clamped = np.clip(action_np, env.action_space.low, env.action_space.high)

        result = env.step(action_clamped)
        if len(result) == 5:  # Gymnasium API
            next_state, reward, terminated, truncated, _ = result
        else:  # legacy Gym API: the time limit is reported through `info`
            next_state, reward, done_flag, info = result
            truncated = bool(info.get("TimeLimit.truncated", False))
            terminated = bool(done_flag) and not truncated
        done = terminated or truncated

        # Reward used for learning: clipped, and on a pure time-limit
        # truncation augmented with the discounted value of the state the
        # episode was cut off in (the done mask below removes the usual
        # bootstrap term at that step).
        learn_reward = float(np.clip(reward, -100.0, 100.0))  # clip rewards
        if truncated and not terminated:
            next_tensor = torch.from_numpy(next_state.astype(np.float32)).unsqueeze(0)
            with torch.no_grad():
                _, next_value = model_fast(next_tensor)
            learn_reward += gamma * next_value.item()

        states.append(state.copy())
        actions.append(action_np)
        logprobs.append(log_prob.item())
        values.append(value.item())
        rewards.append(learn_reward)
        dones.append(float(done))

        if done:
            reset_out = env.reset()
            state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
            state = state.astype(np.float32)
        else:
            state = next_state.astype(np.float32)

    # Value of the state after the last collected step (bootstrap for GAE;
    # only used when the final step was not done)
    state_tensor = torch.from_numpy(state).float().unsqueeze(0)
    with torch.no_grad():
        _, last_value = model_fast(state_tensor)
    last_value = last_value.item()

    states = torch.tensor(np.stack(states), dtype=torch.float32)
    actions = torch.tensor(np.stack(actions), dtype=torch.float32)
    old_logprobs = torch.tensor(logprobs, dtype=torch.float32)
    values = torch.tensor(values + [last_value], dtype=torch.float32)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    dones = torch.tensor(dones, dtype=torch.float32)

    # GAE advantages and returns (backwards over the rollout)
    returns = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(n_steps)):
        mask = 1.0 - dones[t]  # 0 at episode boundaries: no bootstrap, no carry-over
        delta = rewards[t] + gamma * values[t+1] * mask - values[t]
        gae = delta + gamma * lam * gae * mask
        advantages[t] = gae
        returns[t] = advantages[t] + values[t]
    # Normalize advantages over the whole rollout
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    num_samples = n_steps
    for epoch in range(ppo_epochs):
        idxs = np.random.permutation(num_samples)
        for start in range(0, num_samples, batch_size):
            end = start + batch_size
            batch_idx = idxs[start:end]

            batch_states = states[batch_idx]
            batch_actions = actions[batch_idx]
            batch_old_logp = old_logprobs[batch_idx]
            batch_returns = returns[batch_idx]
            batch_adv = advantages[batch_idx]

            mean, value = model_fast(batch_states)
            dist = Normal(mean, torch.exp(model_fast.log_std))
            new_logp = dist.log_prob(batch_actions).sum(dim=1)
            entropy = dist.entropy().sum(dim=1).mean()

            # Clipped surrogate objective
            ratio = (new_logp - batch_old_logp).exp()
            surr1 = ratio * batch_adv
            surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * batch_adv
            policy_loss = -torch.min(surr1, surr2).mean()
            value_pred = value.squeeze(1)
            value_loss = (value_pred - batch_returns).pow(2).mean()
            loss = policy_loss + value_coef * value_loss - entropy_coef * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

# Release the simulator now that training is finished
env.close()

torch.save(model_fast.state_dict(), "ppo_humanoid_fast.pth")
print("Fast model training complete and saved.")


# Modular PPO implementation


In [4]:
import os
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Normal
from collections import deque

# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# === Actor Network ===
# A Gaussian policy network (MLP) mapping states to action means (and log_std).
class Actor(nn.Module):
    """Diagonal-Gaussian policy network.

    An MLP with ReLU activations maps a state to the mean of each action
    dimension. The log standard deviation is a learnable vector (one entry
    per action dimension, initialised to -0.5) that does not depend on the
    state.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of action dimensions.
        hidden_sizes: Width of each hidden layer, in order.
    """

    def __init__(self, state_dim, action_dim, hidden_sizes=(256,256)):
        super().__init__()
        # Consecutive pairs of widths describe each hidden Linear layer
        widths = [state_dim, *hidden_sizes]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        self.net = nn.Sequential(*modules)
        # Head producing the action mean from the last hidden width
        self.mean_layer = nn.Linear(widths[-1], action_dim)
        # State-independent log standard deviation, one value per action dim
        self.log_std = nn.Parameter(torch.zeros(action_dim) - 0.5)

    def forward(self, state):
        """Return ``(mean, std)`` of the action distribution for ``state``."""
        hidden = self.net(state)
        # exp() keeps the standard deviation strictly positive
        return self.mean_layer(hidden), self.log_std.exp()

    def get_action(self, state):
        """Sample an action and return it with its summed log-probability."""
        policy = Normal(*self.forward(state))
        sampled = policy.sample()
        return sampled, policy.log_prob(sampled).sum(dim=1)

    def get_det_action(self, state):
        """Return the distribution mean as a deterministic action."""
        mean, _ = self.forward(state)
        return mean


# === Critic Network ===
# A value network (MLP) mapping states to scalar values.
class Critic(nn.Module):
    """State-value network.

    An MLP with ReLU activations followed by a linear head that outputs one
    scalar value per input state.

    Args:
        state_dim: Number of features in a state vector.
        hidden_sizes: Width of each hidden layer, in order.
    """

    def __init__(self, state_dim, hidden_sizes=(256,256)):
        super().__init__()
        # Consecutive pairs of widths describe each hidden Linear layer
        widths = [state_dim, *hidden_sizes]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        self.net = nn.Sequential(*modules)
        # Scalar value head on top of the last hidden width
        self.value_layer = nn.Linear(widths[-1], 1)

    def forward(self, state):
        """Return the estimated value of ``state`` with a trailing dim of 1."""
        return self.value_layer(self.net(state))


# === PPO Agent ===
class PPO:
    """PPO agent with separate actor and critic networks.

    The actor is a diagonal-Gaussian policy and the critic a state-value
    network; each has its own Adam optimizer. ``update`` performs the clipped
    surrogate policy update together with a (optionally clipped) value loss
    and an entropy bonus.
    """

    def __init__(self, state_dim, action_dim, args):
        """
        args: dictionary of hyperparameters. ``lr`` is required; the others
        are optional and fall back to the defaults shown:
          gamma (0.99), lam (0.95), clip_eps (0.2), ent_coef (1e-3),
          vf_coef (0.5), max_grad_norm (0.5), n_epochs (10), batch_size (64),
          vf_clip_eps (defaults to clip_eps; pass None to disable value
          clipping).
        """
        # Actor and Critic networks
        self.actor = Actor(state_dim, action_dim).to(device)
        self.critic = Critic(state_dim).to(device)
        # Optimizers
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=args['lr'])
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=args['lr'])
        # PPO hyperparameters
        self.gamma = args.get('gamma', 0.99)
        self.lam = args.get('lam', 0.95)
        self.clip_eps = args.get('clip_eps', 0.2)
        self.ent_coef = args.get('ent_coef', 1e-3)
        self.vf_coef = args.get('vf_coef', 0.5)
        self.max_grad_norm = args.get('max_grad_norm', 0.5)
        self.n_epochs = args.get('n_epochs', 10)
        self.batch_size = args.get('batch_size', 64)
        # Value-clipping range. It is expressed in the units of the value
        # function, so reusing the policy ratio clip only makes sense when
        # returns are of order one; it is kept as the default purely for
        # backward compatibility.
        self.vf_clip_eps = args.get('vf_clip_eps', self.clip_eps)

    def select_action(self, state):
        """Given a state (numpy), select action and compute logprob and value.

        Returns a tuple ``(action, log_prob, value)`` where ``action`` is a
        numpy array and the other two are Python floats.
        """
        state = torch.tensor(state, dtype=torch.float32).to(device).unsqueeze(0)
        # Acting never needs gradients; no_grad avoids building autograd
        # graphs for every environment step.
        with torch.no_grad():
            mean, std = self.actor(state)
            dist = Normal(mean, std)
            action = dist.sample()
            log_prob = dist.log_prob(action).sum(dim=1)
            value = self.critic(state)
        # Convert to numpy for environment
        return action.cpu().numpy()[0], log_prob.item(), value.item()

    def evaluate(self, states, actions):
        """
        For a batch of states and actions (torch tensors), compute log-probs,
        entropies, and state-values under the current policy.

        Returns ``(log_probs, values, entropy)``, each with one entry per
        sample in the batch.
        """
        mean, std = self.actor(states)
        dist = Normal(mean, std)
        log_probs = dist.log_prob(actions).sum(dim=1)
        entropy = dist.entropy().sum(dim=1)
        values = self.critic(states).squeeze(1)
        return log_probs, values, entropy

    def update(self, memory):
        """
        Update the policy and value networks using collected trajectories.

        memory is a dict with lists: states, actions, log_probs, values,
        rewards, dones. An optional ``last_value`` entry supplies the value
        estimate of the state following the final transition; it is used as
        the bootstrap when that transition is not marked done, and defaults
        to 0.0. An empty memory is a no-op.
        """
        n_samples = len(memory['rewards'])
        if n_samples == 0:
            # Nothing was collected; mean/std of an empty tensor would be NaN
            return

        # Convert memory to tensors
        states = torch.tensor(np.array(memory['states']), dtype=torch.float32).to(device)
        actions = torch.tensor(np.array(memory['actions']), dtype=torch.float32).to(device)
        old_log_probs = torch.tensor(memory['log_probs'], dtype=torch.float32).to(device)
        # Built once here rather than once per minibatch
        old_values = torch.tensor(memory['values'], dtype=torch.float32).to(device)
        rewards = memory['rewards']
        values = memory['values']
        dones = memory['dones']

        # Compute GAE advantages backwards in time, writing into a
        # preallocated list (front-insertion would be quadratic).
        advantages = [0.0] * n_samples
        gae = 0.0
        next_value = memory.get('last_value', 0.0)
        for step in reversed(range(n_samples)):
            mask = 1.0 - float(dones[step])  # 0 at episode ends
            delta = rewards[step] + self.gamma * next_value * mask - values[step]
            gae = delta + self.gamma * self.lam * mask * gae
            advantages[step] = gae
            next_value = values[step]
        advantages = torch.tensor(advantages, dtype=torch.float32).to(device)
        returns = advantages + old_values

        # Normalize advantages. The sample standard deviation of a single
        # element is NaN, so a one-step rollout is left unnormalized.
        if n_samples > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # Optimize policy for K epochs
        dataset_size = states.size(0)
        for _ in range(self.n_epochs):
            # Shuffle indices
            indices = np.arange(dataset_size)
            np.random.shuffle(indices)
            for start in range(0, dataset_size, self.batch_size):
                end = start + self.batch_size
                batch_idx = indices[start:end]
                batch_states = states[batch_idx]
                batch_actions = actions[batch_idx]
                batch_old_log_probs = old_log_probs[batch_idx]
                batch_returns = returns[batch_idx]
                batch_advantages = advantages[batch_idx]
                batch_old_values = old_values[batch_idx]

                # Evaluate current policy
                log_probs, new_values, entropy = self.evaluate(batch_states, batch_actions)
                # Policy (actor) loss: clipped surrogate objective
                ratios = torch.exp(log_probs - batch_old_log_probs)
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - self.clip_eps, 1 + self.clip_eps) * batch_advantages
                actor_loss = -torch.min(surr1, surr2).mean()

                # Value (critic) loss, optionally clipped around the value
                # predicted at collection time
                loss_unclipped = (new_values - batch_returns).pow(2)
                if self.vf_clip_eps is None:
                    critic_loss = 0.5 * loss_unclipped.mean()
                else:
                    value_clipped = batch_old_values + torch.clamp(
                        new_values - batch_old_values, -self.vf_clip_eps, self.vf_clip_eps
                    )
                    loss_clipped = (value_clipped - batch_returns).pow(2)
                    critic_loss = 0.5 * torch.max(loss_unclipped, loss_clipped).mean()

                # Entropy (to encourage exploration)
                entropy_loss = entropy.mean()

                # Combined loss
                loss = actor_loss + self.vf_coef * critic_loss - self.ent_coef * entropy_loss

                # Backpropagate
                self.actor_optimizer.zero_grad()
                self.critic_optimizer.zero_grad()
                loss.backward()
                # Gradient clipping
                nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
                nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
                # Update parameters
                self.actor_optimizer.step()
                self.critic_optimizer.step()


def train(env_name="HumanoidStandup-v4", total_episodes=1000,
          eval_interval=20, reward_threshold=None):
    """
    Train PPO on the specified environment.
    Prints episode stats and saves the trained model.

    One PPO update is performed after every episode, using that episode's
    transitions. Training stops early when the 20-episode average reward
    reaches ``reward_threshold`` or has not improved for 50 episodes.

    Args:
        env_name: Gymnasium environment id.
        total_episodes: Maximum number of episodes to run.
        eval_interval: Currently unused; kept so existing calls keep working.
        reward_threshold: Optional average-reward target for early stopping.
            ``None`` disables the check; a threshold of 0 is honoured.

    Returns:
        ``(agent, reward_history, avg_rewards)``: the trained agent, the raw
        reward of every episode, and the running 20-episode average.
    """
    # Hyperparameters (tuned for HumanoidStandup)
    args = dict(
        gamma=0.99, lam=0.9, clip_eps=0.2, ent_coef=1e-4,
        vf_coef=0.5, lr=2.5e-5, batch_size=64, n_epochs=10,
        max_grad_norm=0.5
    )
    # Create environment
    env = gym.make(env_name)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        act_low, act_high = env.action_space.low, env.action_space.high
        agent = PPO(state_dim, action_dim, args)

        # For logging
        reward_history = []
        avg_rewards = []
        best_avg = -np.inf
        plateau_count = 0

        for ep in range(1, total_episodes+1):
            state, _ = env.reset()
            done = False
            ep_reward = 0
            max_height = 0.0
            # Memory for this episode
            memory = {'states': [], 'actions': [], 'log_probs': [], 'values': [], 'rewards': [], 'dones': []}

            # Generate one episode
            while not done:
                # Select action
                action, logp, value = agent.select_action(state)
                # The env receives the action clipped to its bounds; the
                # unclipped sample is stored so it matches the stored
                # log-probability.
                env_action = np.clip(action, act_low, act_high)
                next_state, reward, terminated, truncated, info = env.step(env_action)
                done = terminated or truncated

                # Track highest z (height) from observation (state[0] holds height)
                max_height = max(max_height, state[0], next_state[0])

                # On a pure time-limit truncation the task has not really
                # ended, so add the critic's discounted estimate of the
                # cut-off state to the learning reward. The done flag
                # suppresses the usual bootstrap for that step.
                learn_reward = reward
                if truncated and not terminated:
                    next_tensor = torch.tensor(next_state, dtype=torch.float32).to(device).unsqueeze(0)
                    with torch.no_grad():
                        learn_reward = reward + agent.gamma * agent.critic(next_tensor).item()

                # Store transition
                memory['states'].append(state)
                memory['actions'].append(action)
                memory['log_probs'].append(logp)
                memory['values'].append(value)
                memory['rewards'].append(learn_reward)
                memory['dones'].append(done)

                state = next_state
                ep_reward += reward  # logged reward stays the raw env reward

            # End of episode: train on collected data
            agent.update(memory)

            # Logging
            reward_history.append(ep_reward)
            avg_reward = np.mean(reward_history[-20:])
            avg_rewards.append(avg_reward)

            print(f"Episode {ep:4d} | Reward: {ep_reward:8.2f} | "
                  f"AvgReward(20): {avg_reward:8.2f} | MaxHeight: {max_height:6.2f}")

            # Early stopping if plateau or threshold
            if avg_reward > best_avg + 1e-3:
                best_avg = avg_reward
                plateau_count = 0
            else:
                plateau_count += 1

            # `is not None` so that a threshold of 0 is not silently ignored
            if reward_threshold is not None and avg_reward >= reward_threshold:
                print(f"Reached reward threshold of {reward_threshold}. Stopping early.")
                break
            if plateau_count >= 50:
                print(f"No improvement for 50 episodes (avg {best_avg:.2f}). Stopping early.")
                break
    finally:
        # Release the simulator even if training raised
        env.close()

    # Save the trained model
    os.makedirs("models", exist_ok=True)
    torch.save({
        'actor_state_dict': agent.actor.state_dict(),
        'critic_state_dict': agent.critic.state_dict(),
    }, f"models/ppo_{env_name}.pth")
    print("Model saved.")

    return agent, reward_history, avg_rewards


def evaluate(agent, env_name="HumanoidStandup-v4", episodes=5, save_video=False):
    """
    Evaluate the trained agent for a number of episodes.
    Optionally save videos of the rollouts.

    The agent acts deterministically (policy mean, clipped to the action
    bounds). Per-episode reward and final height are printed, followed by the
    average reward and the share of episodes whose final ``state[0]`` exceeds
    1.0.

    Args:
        agent: Object exposing ``actor.get_det_action(state_tensor)``.
        env_name: Gymnasium environment id.
        episodes: Number of evaluation episodes; nothing is run when this is
            not positive.
        save_video: When True, record every episode into ``videos/``.
    """
    if episodes <= 0:
        # Avoid creating an environment and dividing by zero below
        print("No evaluation episodes requested.")
        return

    eval_env = gym.make(env_name, render_mode='rgb_array')
    try:
        if save_video:
            from gymnasium.wrappers import RecordVideo
            video_folder = "videos"
            eval_env = RecordVideo(eval_env, video_folder, episode_trigger=lambda x: True)
            print(f"Recording videos to {video_folder}/")

        act_low, act_high = eval_env.action_space.low, eval_env.action_space.high
        total_reward = 0.0
        success_count = 0
        for ep in range(episodes):
            state, _ = eval_env.reset()
            done = False
            ep_reward = 0
            while not done:
                # Deterministic action (mean of policy)
                state_tensor = torch.tensor(state, dtype=torch.float32).to(device).unsqueeze(0)
                with torch.no_grad():
                    action = agent.actor.get_det_action(state_tensor).cpu().numpy()[0]
                # Keep the action inside the environment's valid range
                action = np.clip(action, act_low, act_high)
                next_state, reward, terminated, truncated, info = eval_env.step(action)
                done = terminated or truncated
                state = next_state
                ep_reward += reward
            total_reward += ep_reward

            # Determine success (e.g., height above threshold)
            final_height = state[0]
            if final_height > 1.0:  # success threshold
                success_count += 1

            print(f"Eval Episode {ep+1}: Reward = {ep_reward:.2f}, FinalHeight = {final_height:.2f}")

        avg_reward = total_reward / episodes
        success_rate = success_count / episodes * 100
        print(f"Average Reward: {avg_reward:.2f} over {episodes} episodes")
        print(f"Success Rate (height>1.0): {success_rate:.1f}%")
    finally:
        # Closing also lets the video recorder finish writing its files
        eval_env.close()


# === Main execution ===
if __name__ == "__main__":
    # Step 1: train for up to 500 episodes; per-episode stats are printed
    ppo_agent, rewards, avgs = train("HumanoidStandup-v4", total_episodes=500)

    # Step 2: run 5 deterministic evaluation episodes and record each one
    evaluate(ppo_agent, "HumanoidStandup-v4", episodes=5, save_video=True)


Episode    1 | Reward: 25430.04 | AvgReward(20): 25430.04 | MaxHeight:   0.18
Episode    2 | Reward: 26130.28 | AvgReward(20): 25780.16 | MaxHeight:   0.17
Episode    3 | Reward: 25825.52 | AvgReward(20): 25795.28 | MaxHeight:   0.21
Episode    4 | Reward: 24764.91 | AvgReward(20): 25537.69 | MaxHeight:   0.16
Episode    5 | Reward: 27621.27 | AvgReward(20): 25954.40 | MaxHeight:   0.18
Episode    6 | Reward: 27168.43 | AvgReward(20): 26156.74 | MaxHeight:   0.19
Episode    7 | Reward: 27851.01 | AvgReward(20): 26398.78 | MaxHeight:   0.21
Episode    8 | Reward: 34430.27 | AvgReward(20): 27402.72 | MaxHeight:   0.20
Episode    9 | Reward: 26737.66 | AvgReward(20): 27328.82 | MaxHeight:   0.17
Episode   10 | Reward: 27503.65 | AvgReward(20): 27346.30 | MaxHeight:   0.18
Episode   11 | Reward: 23863.92 | AvgReward(20): 27029.72 | MaxHeight:   0.17
Episode   12 | Reward: 26210.93 | AvgReward(20): 26961.49 | MaxHeight:   0.16
Episode   13 | Reward: 27385.43 | AvgReward(20): 26994.10 | MaxH