# Unit 1 Intro: LunarLander

Use Stable-Baselines3 (PPO) and the Hugging Face Hub tooling to train, evaluate, and publish an agent for the OpenAI Gym `LunarLander-v2` environment.

In [1]:
%%html
<!-- Replay clip served from the ThomasSimonini/ppo-LunarLander-v2 repository on the Hugging Face Hub. -->
<video controls autoplay>
  <source src="https://huggingface.co/ThomasSimonini/ppo-LunarLander-v2/resolve/main/replay.mp4" type="video/mp4">
</video>

### Create a virtual display for video rendering

In [1]:
import os
import time

import gym
import numpy as np

from math import pi

from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub
from huggingface_hub import notebook_login
from pyvirtualdisplay import Display
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv

In [2]:
# Start an off-screen 1400x900 display; the handle is kept in `display`.
display = Display(
    visible=0,
    size=(1400, 900),
)
# Bind the return value so the cell does not echo it.
_ = display.start()

# Environment demo

In [3]:
# Build a single LunarLander-v2 environment and inspect its spaces.
env = gym.make("LunarLander-v2")

# --- Observations: print the space's shape and one randomly drawn observation.
print("_____OBSERVATION SPACE_____ \n")
print(
    "Observation Space Shape:",
    env.observation_space.shape,
)
print(
    "Sample observation:",
    env.observation_space.sample(),
)

# --- Actions: print the number of discrete actions (`.n`) and one random action.
print("\n _____ACTION SPACE_____ \n")
print(
    "Action Space Shape:",
    env.action_space.n,
)
print(
    "Action Space Sample:",
    env.action_space.sample(),
)

_____OBSERVATION SPACE_____ 

Observation Space Shape: (8,)
Sample observation: [-1.7677496e-01  6.4206737e-01  4.7862220e-01 -1.4629478e-02
 -1.6274517e+00 -9.5955265e-01  5.3424883e-04  1.6856719e+00]

 _____ACTION SPACE_____ 

Action Space Shape: 4
Action Space Sample: 1


In [5]:
# Begin an episode and keep the initial observation.
observation = env.reset()

# Drive the environment with five uniformly sampled actions.
for _ in range(5):
    action = env.action_space.sample()
    print(f"Action taken: {action}")

    observation, reward, done, info = env.step(action)

    # Nothing more to do unless the episode just ended.
    if not done:
        continue

    print("Finished! Resetting environment")
    observation = env.reset()

Action taken: 2
Action taken: 3
Action taken: 3
Action taken: 2
Action taken: 1


# Train an agent

In [21]:
# max_lr = 0.00055
# def lr(x):
#     if x < 0.02:
#         return 50 * x * max_lr
#     else:
#         return 0.5 * (1 + np.cos(9 * pi * (x - 0.02))) * max_lr * (0.2 + 0.8 * (x < 0.5))
    
    
max_lr = 0.0005
def lr(x):
    """Piecewise learning-rate schedule, bounded above by ``max_lr``.

    Stable-Baselines3 calls a callable ``learning_rate`` with the remaining
    training progress, which runs from 1 (start) down to 0 (end).

    Segments, in terms of ``x``:
      * ``x < 0.05``             -- linear ramp, 0 at ``x == 0`` up to ``max_lr``
                                    at the 0.05 boundary;
      * ``0.05 <= x < 0.333333`` -- constant ``max_lr``;
      * ``x >= 0.333333``        -- cosine term that equals ``max_lr`` at the
                                    boundary and oscillates in ``[0, max_lr]``.

    Args:
        x: Schedule position (progress remaining, in ``[0, 1]``).

    Returns:
        The learning rate for position ``x``; never greater than ``max_lr``.
    """
    ramp_end = 0.05
    plateau_end = 0.333333

    if x < ramp_end:
        # The slope must be 1 / ramp_end so the ramp meets the plateau exactly.
        # The previous hard-coded factor of 50 belonged to a 0.02 threshold and
        # made this segment peak at 2.5 * max_lr.
        return (x / ramp_end) * max_lr
    elif x < plateau_end:
        return max_lr
    else:
        return 0.5 * (1 + np.cos(1.3 * pi * 3 * (x - plateau_end))) * max_lr

In [4]:
# 16 copies of LunarLander-v2 wrapped as one vectorised environment.
env = make_vec_env("LunarLander-v2", n_envs=16)

# PPO with an MLP policy; 16 envs x 1024 steps = 16384 transitions per rollout.
model = PPO(
    "MlpPolicy",
    env,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.99,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,
)

Using cuda device


In [22]:
# 16 copies of LunarLander-v2 wrapped as one vectorised environment.
env = make_vec_env("LunarLander-v2", n_envs=16)

# PPO variant driven by the custom `lr` schedule; n_epochs is not passed here.
# Earlier experiment, kept for reference:
#     learning_rate=lambda x: 0.00033 * (1 - 3.5 * (x - 0.47)**2)
model = PPO(
    "MlpPolicy",
    env,
    n_steps=1024,
    batch_size=64,
    learning_rate=lr,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.03,
    verbose=1,
)


Using cuda device


In [6]:
# Name of a previously saved model; `model_name` is also read by the upload cell.
model_name = "ppo-LunarLander-v2_v3"

# Replace the current model's parameters with those stored under that name.
model.set_parameters(model_name)

In [5]:
# Name used for the saved model of this training run.
model_name = "ppo-LunarLander-v2_tjdebug"

# Train for one million environment steps, then save under `model_name`.
model.learn(total_timesteps=1_000_000)
model.save(model_name)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.1     |
|    ep_rew_mean     | -170     |
| time/              |          |
|    fps             | 7368     |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 16384    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 91.4        |
|    ep_rew_mean          | -150        |
| time/                   |             |
|    fps                  | 5265        |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.005990468 |
|    clip_fraction        | 0.0323      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -4.2e-05    |
|    learning_rate        | 0.

### Evaluate trained model

In [24]:
# Evaluate on a separate single LunarLander-v2 instance, not the training env.
eval_env = gym.make('LunarLander-v2')

# 100 episodes with deterministic action selection; returns mean and std of episode reward.
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100, deterministic=True)

# Report both statistics at the same precision (std was previously a raw float repr).
print(f'Reward = {mean_reward:.2f} +/- {std_reward:.2f}')

Reward = 272.11 +/- 33.54414533036526


### Upload trained model

In [11]:
# Show the Hugging Face login widget in the notebook
# (presumably required before the upload cell can push to the Hub -- confirm).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
# Identify the upload: environment, algorithm, Hub account, and model version.
env_id = "LunarLander-v2"
model_architecture = "PPO"
user_name = "matt-guay"
version = 4

# Repository id has the form "<user>/<architecture>-<env>-<version>".
repo_id = "{}/{}-{}-{}".format(user_name, model_architecture, env_id, version)
commit_message = "Upload PPO LunarLander-v2 trained agent"

# One-environment vectorised wrapper, passed to package_to_hub as its eval_env.
eval_env = DummyVecEnv([lambda: gym.make(env_id)])

# Last expression of the cell, so the call's return value is shown as the cell output.
package_to_hub(
    model=model,
    model_name=model_name,
    model_architecture=model_architecture,
    env_id=env_id,
    eval_env=eval_env,
    video_length=0,
    repo_id=repo_id,
    commit_message=commit_message,
)

[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m


Unknown encoder 'libx264'
[31mERROR: VideoRecorder encoder failed: None[0m
[31mERROR: VideoRecorder encoder exited with status 1[0m
ffmpeg version b4f2eb3 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.3.0-1ubuntu1~22.04)
  configuration: --enable-cuda --enable-cuvid --enable-shared --disable-static --disable-doc --extra-cflags=-I/usr/local/cuda/include --extra-ldflags=-L/usr/local/cuda/lib64 --enable-gpl --extra-libs=-lpthread --nvccflags='-gencode arch=compute_86,code=sm_86'
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  5.100 / 55.  5.100
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x56115166b280] Format mov,mp4,m4a,3gp,3g2,mj2 detected only with low score of 1, misdetection possible!
[mov,mp4,m4a,3gp,3g2,mj2

Saving video to /tmp/tmpb8qe42b4/-step-0-to-step-0.mp4
[38;5;4mℹ Pushing repo matt-guay/PPO-LunarLander-v2-4 to the Hugging Face
Hub[0m
[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/matt-guay/PPO-LunarLander-v2-4/tree/main/[0m


'https://huggingface.co/matt-guay/PPO-LunarLander-v2-4/tree/main/'