In [2]:
import os
import cv2
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import cloudpickle

# Hyperparameters
# --- Advantage estimation / PPO objective ---
GAMMA = 0.99  # discount factor used in the GAE recursion
LAMBDA = 0.95  # GAE lambda (multiplies GAMMA when accumulating the advantage)
CLIP_EPSILON = 0.2  # probability ratio is clamped to [1 - eps, 1 + eps]
# --- Optimisation ---
ACTOR_LR = 3e-4  # Adam learning rate for the actor
CRITIC_LR = 1e-3  # Adam learning rate for the critic
EPOCHS = 10  # passes over the collected rollout per PPO.update call
BATCH_SIZE = 64  # minibatch size within each epoch
ENTROPY_COEF = 0.01  # weight of the entropy term subtracted from the actor loss
GRAD_CLIP = 0.5  # max gradient norm passed to clip_grad_norm_
# --- Training loop ---
MAX_EPISODE_STEPS = 1000  # cap on steps collected per episode in train()
ENV_NAME = "LunarLander-v3"  # environment id passed to gym.make
TOTAL_TIMESTEPS = 1000000  # train() loops while total env steps stay below this
TARGET_REWARD = 200  # average-reward threshold for early stopping
# --- Evaluation (defaults of PPO.evaluate) ---
EVAL_EPISODES = 10  # number of evaluation episodes
EVAL_MAX_STEPS = 1000  # per-episode step cap during evaluation
EVAL_SAVE_VIDEO = True  # whether evaluation writes one mp4 per episode
SOLVED_EPISODES = 100  # window of most recent episodes averaged for the stop check

class RunningStat:
    """Running per-feature mean and (population) standard deviation.

    Statistics are accumulated batch by batch with the parallel
    mean/variance merge, so after any number of ``update`` calls ``mean``
    and ``std`` match the statistics of all rows seen so far.

    Attributes:
        mean: float32 array of shape ``shape`` with the running mean.
        std: float32 array of shape ``shape`` with the running std.
        count: number of rows folded in so far.
    """

    # Lower bound on the divisor in ``normalize`` so that constant features
    # (std == 0) never cause a division by zero.
    _MIN_STD = 1e-6

    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float32)
        self.std = np.ones(shape, dtype=np.float32)
        self.count = 0

    def update(self, x):
        """Fold a batch of observations into the running statistics.

        Args:
            x: array-like whose first axis indexes samples, i.e. shape
                ``(batch, *shape)``. An empty batch is ignored.
        """
        x = np.asarray(x, dtype=np.float64)
        batch_count = x.shape[0]
        if batch_count == 0:
            return

        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)

        prev_count = self.count
        total = prev_count + batch_count
        prev_mean = self.mean.astype(np.float64)
        prev_var = np.square(self.std.astype(np.float64))
        delta = batch_mean - prev_mean

        # Sum of squared deviations of the merged set: the two partial sums
        # plus a correction for the distance between the two means.
        m2 = (
            prev_var * prev_count
            + batch_var * batch_count
            + np.square(delta) * prev_count * batch_count / total
        )

        self.mean = (prev_mean + delta * batch_count / total).astype(np.float32)
        self.std = np.sqrt(m2 / total).astype(np.float32)
        self.count = total

    def normalize(self, x):
        """Return ``(x - mean) / std`` clipped element-wise to ``[-5, 5]``."""
        safe_std = np.maximum(self.std, self._MIN_STD)
        normalized = (x - self.mean) / safe_std
        return np.clip(normalized, -5, 5)

class Actor(nn.Module):
    """Policy network mapping observations to a categorical action distribution.

    Two 128-unit ReLU hidden layers feed a linear layer followed by a
    softmax, so ``forward`` returns action probabilities. The final linear
    layer is initialised orthogonally with gain 0.01 and a zero bias.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        hidden = 128
        layers = [
            nn.Linear(obs_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, act_dim),
            nn.Softmax(dim=-1),
        ]
        self.net = nn.Sequential(*layers)

        # Index 4 is the last Linear (the softmax sits after it).
        logits_layer = self.net[4]
        nn.init.orthogonal_(logits_layer.weight, gain=0.01)
        nn.init.zeros_(logits_layer.bias)

    def forward(self, state):
        """Return action probabilities for ``state``."""
        return self.net(state)

    def sample(self, state):
        """Sample an action for ``state``.

        Returns:
            ``(action, log_prob, entropy)`` tensors, or ``(None, None, None)``
            when the network output contains NaN.
        """
        probs = self.forward(state)
        if torch.isnan(probs).any():
            return (None,) * 3

        policy = Categorical(probs)
        chosen = policy.sample()
        return chosen, policy.log_prob(chosen), policy.entropy()

class Critic(nn.Module):
    """State-value network: two 128-unit ReLU hidden layers and a scalar head.

    The output layer is initialised orthogonally with gain 1.0 and a zero bias.
    """

    def __init__(self, obs_dim):
        super().__init__()
        width = 128
        layers = [
            nn.Linear(obs_dim, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, 1),
        ]
        self.net = nn.Sequential(*layers)

        value_head = self.net[4]
        nn.init.orthogonal_(value_head.weight, gain=1.0)
        nn.init.zeros_(value_head.bias)

    def forward(self, state):
        """Return the value estimate for ``state`` with a trailing dimension of 1."""
        return self.net(state)

class PPO:
    """Clipped-surrogate PPO agent with separate actor and critic networks.

    Holds the two networks, one Adam optimiser for each, and a running
    observation normaliser used when evaluating the policy.
    """

    def __init__(self, obs_dim, act_dim, device):
        self.actor = Actor(obs_dim, act_dim).to(device)
        self.critic = Critic(obs_dim).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)
        self.device = device
        self.obs_normalizer = RunningStat(obs_dim)

    def compute_gae(self, rewards, values, next_value, dones):
        """Compute GAE advantages and value targets for one rollout.

        Args:
            rewards: per-step rewards.
            values: per-step critic estimates, same length as ``rewards``.
            next_value: critic estimate for the state after the last step.
            dones: per-step flags; a truthy entry stops bootstrapping and
                advantage accumulation across that step.

        Returns:
            ``(advantages, returns)`` as float arrays, where
            ``returns = advantages + values``.
        """
        # Force float arrays so integer rewards cannot truncate advantages.
        rewards = np.asarray(rewards, dtype=np.float64)
        values = np.asarray(values, dtype=np.float64)
        not_done = 1.0 - np.asarray(dones, dtype=np.float64)

        advantages = np.zeros_like(rewards)
        gae = 0.0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + GAMMA * next_value * not_done[t] - values[t]
            gae = delta + GAMMA * LAMBDA * not_done[t] * gae
            advantages[t] = gae
            next_value = values[t]
        returns = advantages + values
        return advantages, returns

    def update(self, states, actions, old_log_probs, returns, advantages):
        """Run ``EPOCHS`` passes of minibatch PPO updates over one rollout.

        Args:
            states: observations exactly as they were fed to the actor when
                ``actions`` were sampled.
            actions: actions taken during the rollout.
            old_log_probs: log-probabilities of ``actions`` under the policy
                that collected the rollout.
            returns: value targets for the critic.
            advantages: advantage estimates (normalised here when the rollout
                has more than one step).
        """
        if len(states) == 0:
            return

        states = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(np.asarray(actions), dtype=torch.long, device=self.device)
        old_log_probs = torch.as_tensor(np.asarray(old_log_probs), dtype=torch.float32, device=self.device)
        returns = torch.as_tensor(np.asarray(returns), dtype=torch.float32, device=self.device)
        advantages = torch.as_tensor(np.asarray(advantages), dtype=torch.float32, device=self.device)
        # The std of a single element is NaN, which would poison every
        # parameter, so only normalise when there is more than one sample.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        num_samples = states.shape[0]
        for _ in range(EPOCHS):
            indices = np.random.permutation(num_samples)
            for start in range(0, num_samples, BATCH_SIZE):
                batch_idx = indices[start:start + BATCH_SIZE]
                batch_states = states[batch_idx]
                batch_actions = actions[batch_idx]
                batch_old_log_probs = old_log_probs[batch_idx]
                batch_returns = returns[batch_idx]
                batch_advantages = advantages[batch_idx]

                probs = self.actor(batch_states)
                if torch.isnan(probs).any():
                    continue
                dist = Categorical(probs)
                # The ratio must compare the new and old probability of the
                # action that was actually taken, not of a freshly sampled one.
                log_prob = dist.log_prob(batch_actions)
                entropy = dist.entropy()

                ratio = torch.exp(log_prob - batch_old_log_probs)
                surr1 = ratio * batch_advantages
                surr2 = torch.clamp(ratio, 1 - CLIP_EPSILON, 1 + CLIP_EPSILON) * batch_advantages
                actor_loss = -torch.min(surr1, surr2).mean() - ENTROPY_COEF * entropy.mean()

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), GRAD_CLIP)
                self.actor_optimizer.step()

                # squeeze(-1) keeps the batch axis even for a minibatch of one.
                value = self.critic(batch_states).squeeze(-1)
                critic_loss = (batch_returns - value).pow(2).mean()

                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.critic.parameters(), GRAD_CLIP)
                self.critic_optimizer.step()

    def save(self, path):
        """Serialise the whole agent to ``path`` with cloudpickle.

        NOTE: the file is a pickle; only load it back from trusted sources.
        """
        with open(path, 'wb') as f:
            cloudpickle.dump(self, f)

    def evaluate(self, env, episodes=EVAL_EPISODES, max_steps=EVAL_MAX_STEPS, save_video=EVAL_SAVE_VIDEO):
        """Roll out the current (stochastic) policy and report rewards.

        Args:
            env: environment to evaluate in; ``env.render()`` must return RGB
                frames for video recording to work.
            episodes: number of episodes to run.
            max_steps: per-episode step cap.
            save_video: when true, write one mp4 per episode into
                ``evaluation_videos/``.

        Returns:
            ``(avg_reward, success_rate)`` where success means an episode
            reward of at least ``TARGET_REWARD``.
        """
        print("\nStarting evaluation...")
        total_reward = 0
        successful_landings = 0
        reward_threshold = TARGET_REWARD

        video_folder = "evaluation_videos"
        if save_video:
            os.makedirs(video_folder, exist_ok=True)

        for ep in range(episodes):
            obs, _ = env.reset()
            episode_reward = 0
            steps = 0
            done = False
            # Reset per episode so a released writer from an earlier episode
            # is never written to again.
            video_writer = None

            if save_video:
                video_path = f"{video_folder}/eval_episode_{ep + 1}.mp4"
                frame = env.render()
                if frame is not None:
                    video_writer = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*'mp4v'), 30,
                                                   (frame.shape[1], frame.shape[0]))
                    # Include the initial frame, not just post-step frames.
                    video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                else:
                    print("Warning: Render returned None, skipping video")

            try:
                while not done and steps < max_steps:
                    norm_obs = self.obs_normalizer.normalize(obs)
                    obs_tensor = torch.as_tensor(norm_obs, dtype=torch.float32, device=self.device).unsqueeze(0)
                    with torch.no_grad():
                        probs = self.actor(obs_tensor)
                    # Categorical rejects NaN probabilities, so bail out first.
                    if torch.isnan(probs).any():
                        print("Warning: Invalid action, breaking episode")
                        break
                    action_np = Categorical(probs).sample().item()

                    obs, reward, terminated, truncated, info = env.step(action_np)
                    done = terminated or truncated
                    episode_reward += reward
                    steps += 1

                    if video_writer is not None:
                        frame = env.render()
                        if frame is not None:
                            video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
            finally:
                # Always finalise the file, even if the episode raised.
                if video_writer is not None:
                    video_writer.release()

            total_reward += episode_reward
            if episode_reward >= reward_threshold:
                successful_landings += 1

            print(f"Evaluation Episode {ep + 1}: Reward: {episode_reward:.2f}")

        # Guard the averages against episodes == 0.
        denom = max(episodes, 1)
        avg_reward = total_reward / denom
        success_rate = successful_landings / denom
        print(f"Average Evaluation Reward: {avg_reward:.2f}")
        print(f"Success Rate (>= {reward_threshold}): {success_rate:.2f} ({successful_landings}/{episodes} episodes)")
        return avg_reward, success_rate

def train():
    """Train a PPO agent on ``ENV_NAME``, then save and evaluate it.

    One policy update is performed per collected episode. Training stops
    after ``TOTAL_TIMESTEPS`` environment steps, or earlier once the mean
    reward of the last ``SOLVED_EPISODES`` episodes has reached
    ``TARGET_REWARD`` on three consecutive episodes.

    Raises:
        RuntimeError: if the actor outputs NaN probabilities for a NaN-free
            observation, which means the network has diverged.
    """
    env = gym.make(ENV_NAME, render_mode="rgb_array")
    try:
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.n
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        agent = PPO(obs_dim, act_dim, device)

        total_steps = 0
        episode_rewards = []
        solved_count = 0

        while total_steps < TOTAL_TIMESTEPS:
            state, _ = env.reset()
            episode_reward = 0
            terminated = False
            states, actions, rewards, log_probs, values, dones = [], [], [], [], [], []

            for _ in range(MAX_EPISODE_STEPS):
                # Never feed a NaN observation into the running statistics;
                # the episode is discarded below.
                if np.isnan(state).any():
                    break

                agent.obs_normalizer.update(np.array([state]))
                norm_state = agent.obs_normalizer.normalize(state)
                state_tensor = torch.FloatTensor(norm_state).to(device).unsqueeze(0)

                with torch.no_grad():
                    action, log_prob, _ = agent.actor.sample(state_tensor)
                    if action is None:
                        # The input is clipped and NaN-free, so the weights
                        # themselves are NaN and training cannot recover.
                        raise RuntimeError(
                            "Actor produced NaN action probabilities; training has diverged"
                        )
                    value = agent.critic(state_tensor).item()

                action_np = action.item()
                next_state, reward, terminated, truncated, _ = env.step(action_np)

                # Store the observation exactly as the networks saw it so the
                # update evaluates the policy on the same inputs.
                states.append(norm_state)
                actions.append(action_np)
                rewards.append(reward)
                log_probs.append(log_prob.item())
                values.append(value)
                # Only true termination stops bootstrapping; a time-limit
                # truncation still has future value.
                dones.append(terminated)

                state = next_state
                episode_reward += reward
                total_steps += 1

                if terminated or truncated:
                    break

            # After the loop ``state`` is the last observation received.
            if np.isnan(state).any():
                continue

            agent.obs_normalizer.update(np.array([state]))
            if terminated:
                next_value = 0.0
            else:
                norm_final_state = agent.obs_normalizer.normalize(state)
                with torch.no_grad():
                    next_value = agent.critic(
                        torch.FloatTensor(norm_final_state).to(device).unsqueeze(0)
                    ).item()

            advantages, returns = agent.compute_gae(rewards, values, next_value, dones)
            agent.update(states, actions, log_probs, returns, advantages)

            episode_rewards.append(episode_reward)
            # Slicing already handles histories shorter than the window.
            avg_reward = np.mean(episode_rewards[-SOLVED_EPISODES:])
            print(f"Episode: {len(episode_rewards)}, Reward: {episode_reward:.2f}, Avg Reward: {avg_reward:.2f}")

            if len(episode_rewards) >= SOLVED_EPISODES and avg_reward >= TARGET_REWARD:
                solved_count += 1
                if solved_count >= 3:
                    print(f"Stopping training: Avg reward {avg_reward:.2f} >= {TARGET_REWARD} for 3 consecutive checks")
                    break
            else:
                solved_count = 0

        agent.save("ppo_lunar_lander")
        agent.evaluate(env)
    finally:
        # Release the environment even if training or evaluation raised.
        env.close()

if __name__ == "__main__":
    train()

DependencyNotInstalled: Box2D is not installed, you can install it by run `pip install swig` followed by `pip install "gymnasium[box2d]"`