In [7]:
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit1/requirements-unit1.txt



In [10]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Environment the agent is trained on; frames are rendered as RGB arrays.
env = gym.make("LunarLander-v2", render_mode="rgb_array")

# PPO hyperparameters, gathered in one mapping so they can be tuned in one place.
ppo_hyperparams = dict(
    learning_rate=0.0028,  # lowered slightly from an earlier value of 0.0036
    n_steps=20480,         # the training log advances total_timesteps by this much per iteration
    batch_size=512,        # raised for stability and efficiency
    gamma=0.995,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.04,         # tuned for exploration
    n_epochs=10,
)

# Build the agent with an MLP policy; verbose=1 prints the training tables.
model = PPO("MlpPolicy", env, verbose=1, **ppo_hyperparams)

# Train for 1.2M timesteps, showing a progress bar while learning.
model.learn(total_timesteps=1_200_000, progress_bar=True)

# Persist the trained agent under the name "ppo_lunar" so it can be reloaded later.
model.save("ppo_lunar")

Output()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90       |
|    ep_rew_mean     | -175     |
| time/              |          |
|    fps             | 4628     |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 20480    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 94.8        |
|    ep_rew_mean          | -133        |
| time/                   |             |
|    fps                  | 4257        |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 40960       |
| train/                  |             |
|    approx_kl            | 0.009656714 |
|    clip_fraction        | 0.127       |
|    clip_range           | 0.2         |
|    entropy_loss   

In [9]:
import os

import gymnasium as gym
import imageio
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Load the trained agent and attach it to a fresh environment that renders RGB frames.
env = gym.make("LunarLander-v2", render_mode="rgb_array")
model = PPO.load("ppo_lunar", env=env)

# Loading wraps `env` (see the "Wrapping the env ..." log lines). Fetch that wrapped
# env once and use it everywhere below instead of mixing `model.env` and `vec_env`.
vec_env = model.get_env()

# Evaluate the agent over 10 episodes and report mean / std of the episode reward.
mean_reward, std_reward = evaluate_policy(model, vec_env, n_eval_episodes=10)
print('Mean reward:', mean_reward, 'Std. reward:', std_reward)

# Recording settings.
GIF_DIR = "./images"   # output folder for the per-episode GIFs
MAX_STEPS = 2000       # total environment steps to roll out
FRAME_STRIDE = 2       # keep every 2nd frame to shrink the GIFs
GIF_FPS = 29

# `imageio.mimsave` writes into GIF_DIR; create it first so the first save
# does not fail on a fresh checkout where the folder is missing.
os.makedirs(GIF_DIR, exist_ok=True)

# Test the trained agent: roll out the deterministic policy and save one GIF per
# finished episode. Frames of an episode still running after MAX_STEPS are dropped.
images = []
j = 0  # number of completed episodes so far
obs = vec_env.reset()
img = vec_env.render(mode="rgb_array")
for i in range(MAX_STEPS):
    images.append(img)
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = vec_env.step(action)
    # `dones` is an array with one entry per env; index it explicitly rather than
    # relying on array truthiness, which raises for arrays with several elements.
    if dones[0]:
        j = j + 1
        # NOTE: `rewards` is only the reward of this last step; the full episode
        # return is reported under info[0]['episode']['r'].
        print('Episode:', j, 'Rewards:', rewards, 'Info:', info, 'Dones:', dones)
        imageio.mimsave(
            f"{GIF_DIR}/ppo_lander_{j}.gif",
            [np.asarray(frame) for frame in images[::FRAME_STRIDE]],
            fps=GIF_FPS,
        )
        images = []
    # NOTE(review): the info dict carries a 'terminal_observation', which suggests the
    # env has already been reset after a terminal step, so this frame presumably
    # belongs to the next episode — confirm if the final landing frame matters.
    img = vec_env.render(mode="rgb_array")
vec_env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Mean reward: 276.05030539999996 Std. reward: 23.95621366326137
Episode: 1 Rewards: [100.] Info: [{'episode': {'r': 295.930974, 'l': 227, 't': 1.009827}, 'TimeLimit.truncated': False, 'terminal_observation': array([ 3.5619259e-02, -7.0737838e-04,  0.0000000e+00,  0.0000000e+00,
       -3.8757985e-03,  0.0000000e+00,  1.0000000e+00,  1.0000000e+00],
      dtype=float32)}] Dones: [ True]
Episode: 2 Rewards: [100.] Info: [{'episode': {'r': 259.368553, 'l': 210, 't': 2.37183}, 'TimeLimit.truncated': False, 'terminal_observation': array([-2.1464253e-02, -1.1508012e-03,  0.0000000e+00,  0.0000000e+00,
       -2.9669402e-04,  0.0000000e+00,  1.0000000e+00,  1.0000000e+00],
      dtype=float32)}] Dones: [ True]
Episode: 3 Rewards: [100.] Info: [{'episode': {'r': 297.968328, 'l': 309, 't': 3.681198}, 'TimeLimit.truncated': False, 'terminal_observation': array([ 0.02845259, -0.00112766,  0.        ,  0.        ,  0.00241