In [None]:
import gymnasium

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

## Learning Environment

This section explores how the Gymnasium library works. Gymnasium provides ready-made environments in which RL algorithms can be tested.

In [None]:
import gymnasium as gym

# Build a single LunarLander environment so its spaces can be inspected
env = gym.make("LunarLander-v2")

# Reset once before querying the environment (the returned values are not needed here)
env.reset()

# Bind the observation space once and reuse it for both printouts
obs_space = env.observation_space

print("_____OBSERVATION SPACE_____ \n")
print("Observation Space Shape", obs_space.shape)
print("Sample observation", obs_space.sample())  # a randomly sampled observation

In [None]:
# Bind the action space once and reuse it for both printouts
act_space = env.action_space

print("\n _____ACTION SPACE_____ \n")
# Note: the label says "Shape", but the value printed is the space's `n` attribute
print("Action Space Shape", act_space.n)
print("Action Space Sample", act_space.sample())  # a randomly sampled action

In [None]:
# Start a fresh episode before stepping
observation, info = env.reset()

# Drive the environment with 20 randomly sampled actions
for _step in range(20):
    # pick a random action from the action space
    action = env.action_space.sample()
    print("Action taken: ", action)

    # apply the action and unpack the five values returned by `step`
    observation, reward, terminated, truncated, info = env.step(action)

    # keep stepping while the episode is still running
    episode_over = terminated or truncated
    if not episode_over:
        continue

    # the episode ended (terminated or truncated): begin a new one
    print("Environment reset")
    observation, info = env.reset()

# Release the environment once the rollout is finished
env.close()

Create a vectorized environment, which stacks multiple independent environments into a single one. This way the agent collects more diverse experiences during training.

In [None]:
# Stack 16 independent LunarLander copies into one vectorized environment
env = make_vec_env(
    "LunarLander-v2",
    n_envs=16,
)

## Modeling

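Using Stable-Baselines3, we train a PPO (Proximal Policy Optimization) agent. PPO is an actor-critic method: it combines policy-based RL (a policy that selects actions) with value-based RL (a value function that evaluates them).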

In [None]:
from stable_baselines3 import PPO

# PPO hyperparameters, gathered in one place so they are easy to read and tune
ppo_hyperparams = dict(
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,
)

# Instantiate the agent with an MLP policy on `env`
model = PPO('MlpPolicy', env, **ppo_hyperparams)

In [None]:
# Location (relative to the notebook) where the trained agent is written
save_path = "../exp/ppo-LunarLander-v2"

# Training budget in environment steps. Written as an integer literal rather
# than the float `1e6`, because `learn` documents `total_timesteps` as an int.
total_timesteps = 1_000_000

# Train the model, showing a progress bar while it runs
model.learn(total_timesteps=total_timesteps, progress_bar=True)

# Persist the trained model to `save_path`
model.save(save_path)

## Evaluation

After training the model, we can evaluate it and print the mean performance. For this purpose, we create a new environment.

In [1]:
import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

# Location the trained agent was saved to
save_path = "../exp/ppo-LunarLander-v2"

# Restore the trained agent from disk
model = PPO.load(save_path)

# Fresh, single LunarLander environment wrapped in Monitor for evaluation
eval_env = Monitor(gym.make("LunarLander-v2"))

# Evaluation settings: 10 episodes, deterministic actions, rendering disabled
eval_settings = dict(n_eval_episodes=10, deterministic=True, render=False)
mean_reward, std_reward = evaluate_policy(model, eval_env, **eval_settings)
print(f"Mean reward: {mean_reward} | Std reward: {std_reward}")

# Release the evaluation environment
eval_env.close()

Mean reward: 262.12135 | Std reward: 22.212599357883985


## Push Model to Hub

This model can be pushed to the Hugging Face Hub, where users can interact with it, download it, and see its associated metadata.

In [2]:
from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.

# Show the Hugging Face login widget in the notebook so the account can be authenticated.
notebook_login()
# Shell command (IPython `!` syntax): set git's global credential helper to `store`.
# NOTE(review): presumably this lets later git pushes to the Hub reuse the login;
# the `store` helper keeps credentials unencrypted on disk — confirm that is
# acceptable on a shared machine.
!git config --global credential.helper store

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import gymnasium as gym
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env

from huggingface_sb3 import package_to_hub

# Imported here so this cell does not depend on an earlier cell having imported Monitor
from stable_baselines3.common.monitor import Monitor

# Remote location for the model, in the form {user_or_organization}/{repo_name}
repo_id = 'jsmithdlc/ppo-lunarlander-v2'

# Environment in which the model was trained
env_id = 'LunarLander-v2'

# Single-environment vectorized evaluation env that renders to rgb_array frames.
# The inner env is wrapped in Monitor, as in the evaluation cell, so that
# evaluation reads episode rewards and lengths from the Monitor wrapper.
eval_env = DummyVecEnv([lambda: Monitor(gym.make(env_id, render_mode='rgb_array'))])

# Name of the algorithm used to train the model
model_architecture = "PPO"

commit_message = 'lunar lander PPO from hugging face deep RL tutorial'

model_name = 'ppo-lunarlander-v2'

# Save, evaluate, generate a model card and record a replay video of the agent,
# then push the resulting repo to the Hub
package_to_hub(
    model=model,  # the trained model
    model_name=model_name,  # the name of the trained model
    model_architecture=model_architecture,  # the algorithm used: in our case PPO
    env_id=env_id,  # name of the environment
    eval_env=eval_env,  # evaluation environment
    repo_id=repo_id,  # id of the model repository on the Hugging Face Hub
    commit_message=commit_message,
)

[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m




Saving video to /tmp/tmp2k_6sgyf/-step-0-to-step-1000.mp4
Moviepy - Building video /tmp/tmp2k_6sgyf/-step-0-to-step-1000.mp4.
Moviepy - Writing video /tmp/tmp2k_6sgyf/-step-0-to-step-1000.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /tmp/tmp2k_6sgyf/-step-0-to-step-1000.mp4


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

[38;5;4mℹ Pushing repo jsmithdlc/ppo-lunarlander-v2 to the Hugging Face Hub[0m


ppo-lunarlander-v2.zip:   0%|          | 0.00/147k [00:00<?, ?B/s]

[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/jsmithdlc/ppo-lunarlander-v2/tree/main/[0m


'https://huggingface.co/jsmithdlc/ppo-lunarlander-v2/tree/main/'