<a href="https://colab.research.google.com/github/komazawa-deep-learning/komazawa-deep-learning.github.io/blob/master/2023notebooks/2023_0619stable_baselines3_demo_LunaLander_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
date: 2023_0619
author: 浅川伸一
---

# Demo for `LunarLander-v3` from stable-baselines3

Source: [https://araffin.github.io/post/sb3/](https://araffin.github.io/post/sb3/)

<center>
<div>
<video controls src="https://youtu.be/M1_qCqvW-u4" muted="false">
</video>
</div>
</center>


In [None]:
from IPython import get_ipython
isColab =  'google.colab' in str(get_ipython())
if isColab:
    !pip install jupyter-black

In [None]:
# Load the jupyter_black IPython extension, used here for automatic code formatting.
# NOTE(review): the package is pip-installed by the previous cell only when
# `isColab` is true; elsewhere it presumably must already be installed — confirm.
%load_ext jupyter_black

In [None]:
if isColab:
    !apt-get install ffmpeg freeglut3-dev xvfb  # For visualization
    !pip install "stable-baselines3[extra]>=2.0.0a4" --upgrade
    !pip install swig --upgrade
    !pip install 'gymnasium[box2d]' --upgrade

In [None]:
import gymnasium as gym
import numpy as np

In [None]:
# Adapted from <https://araffin.github.io/post/sb3/>

from stable_baselines3 import (
    A2C,  # Advantage Actor Critic (synchronous variant of A3C)
    DDPG,  # Deep Deterministic Policy Gradient
    DQN,  # Deep Q Network
    PPO,  # Proximal Policy Optimization
    SAC,  # Soft Actor Critic
)
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.monitor import Monitor

# Separate environment used only for evaluation; the Monitor wrapper reports
# episode statistics.
eval_env = Monitor(gym.make("LunarLander-v3"))

# Evaluate the model periodically (eval_freq=2000) with deterministic actions;
# the best model and the evaluation logs are written under ./logs/.
eval_callback = EvalCallback(
    eval_env,
    eval_freq=2000,
    deterministic=True,
    render=False,
    log_path="./logs/",
    best_model_save_path="./logs/",
)

# Save an intermediate checkpoint every 5000 steps.
checkpoint_callback = CheckpointCallback(
    name_prefix="rl_model", save_path="./logs/", save_freq=5000
)

# Train the agent.
model = A2C("MlpPolicy", "LunarLander-v3", verbose=1)
model.learn(
    total_timesteps=20000,
    callback=[checkpoint_callback, eval_callback],
)

# Retrieve the environment the model was trained on and reset it.
env = model.get_env()
obs = env.reset()

# Ask the agent for one action (stochastic, since deterministic=False).
action, _ = model.predict(obs, deterministic=False)

### 動画撮影の準備 <!-- ### Prepare video recording -->

In [None]:
# Set up a fake (virtual) screen; otherwise rendering fails.
import os

# Start Xvfb on display :1 with a 1024x768x24 screen. The trailing "&" puts the
# command in the background of the shell; the status returned by os.system is
# not checked, so a failed launch goes unnoticed here.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point DISPLAY at the virtual display started above.
os.environ["DISPLAY"] = ":1"

In [None]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay


def show_videos(video_path="", prefix=""):
    """
    Display the ``.mp4`` files of a folder inline in the notebook.

    Adapted from https://github.com/eleurent/highway-env

    Every file matching ``<prefix>*.mp4`` directly inside ``video_path`` is read,
    base64-encoded and embedded in a ``<video>`` tag as a ``data:`` URI; the tags
    are joined with ``<br>`` and shown through ``IPython.display``. Files are
    shown in sorted path order. When nothing matches, an empty HTML object is
    displayed.

    :param video_path: (str) path of the folder containing the videos
    :param prefix: (str) only videos whose file name starts with this prefix are shown
    """
    from html import escape  # local import keeps this cell self-contained

    tags = []
    # Path.glob yields entries in arbitrary order; sort for a reproducible display order.
    for mp4 in sorted(Path(video_path).glob("{}*.mp4".format(prefix))):
        video_b64 = base64.b64encode(mp4.read_bytes()).decode("ascii")
        tags.append(
            """<video alt="{}" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>""".format(
                # The path lands inside an HTML attribute, so escape &, <, > and quotes.
                escape(str(mp4), quote=True),
                video_b64,
            )
        )
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(tags)))

In [None]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv


def record_video(env_id, model, video_length=500, prefix="", video_folder="videos/"):
    """
    Roll out ``model`` in a freshly created environment and record it as a video.

    The environment is built with ``gym.make(env_id, render_mode="rgb_array")``,
    wrapped in a ``DummyVecEnv`` and then in a ``VecVideoRecorder`` that starts
    recording at step 0. The environment is always closed, even when the rollout
    raises, so the recorder is not left open.

    :param env_id: (str) environment id passed to ``gym.make``
    :param model: (RL model) object whose ``predict(obs)`` returns ``(action, state)``
    :param video_length: (int) number of steps to run and to record
    :param prefix: (str) name prefix handed to the video recorder
    :param video_folder: (str) folder handed to the video recorder
    """
    vec_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])
    try:
        # Start the video at step 0 and record `video_length` steps.
        # `vec_env` is rebound only if the recorder was created successfully, so
        # the `finally` clause closes whichever wrapper is outermost.
        vec_env = VecVideoRecorder(
            vec_env,
            video_folder=video_folder,
            record_video_trigger=lambda step: step == 0,
            video_length=video_length,
            name_prefix=prefix,
        )

        obs = vec_env.reset()
        for _ in range(video_length):
            action, _ = model.predict(obs)
            obs, _, _, _ = vec_env.step(action)
    finally:
        # Close the video recorder (and the environment) on success and on error.
        vec_env.close()

In [None]:
# Record a 500-step rollout of the trained `model`; no video_folder is passed, so
# record_video's default ("videos/") is used and the files are prefixed "LunarLander-v3".
record_video("LunarLander-v3", model, video_length=500, prefix="LunarLander-v3")

In [None]:
# Show inline the .mp4 files in "videos" whose names start with the prefix used when recording.
show_videos(video_path="videos", prefix="LunarLander-v3")

# 別のゲームタイトルやモデルを試す

In [None]:
# Game to train on; the alternatives are kept below as a commented-out menu.
# Other ids mentioned by the author: 'CartPole-v2', 'LunarLander-v2'.
# NOTE(review): these bare titles carry no version suffix, unlike the
# "LunarLander-v3" id used earlier in this notebook. Atari ids are usually written
# like "ALE/Tennis-v5" — confirm that gym.make() accepts the selected id with the
# installed packages before relying on this cell.
# game_id = "Breakout"
# game_id = "Boxing"
# game_id = "Gopher"
game_id = "Tennis"

# Separate environment used only for evaluation; the Monitor wrapper reports
# episode statistics.
eval_env = Monitor(gym.make(game_id))

# Evaluate the model periodically (eval_freq=2000) with deterministic actions;
# the best model and the evaluation logs are written under ./logs/.
eval_callback = EvalCallback(
    eval_env,
    eval_freq=2000,
    deterministic=True,
    render=False,
    log_path="./logs/",
    best_model_save_path="./logs/",
)

# Save an intermediate checkpoint every 5000 steps.
checkpoint_callback = CheckpointCallback(
    name_prefix="rl_model", save_path="./logs/", save_freq=5000
)

# Train the agent. Algorithms imported in this notebook: PPO, A2C, DDPG, DQN, SAC.
model = PPO("MlpPolicy", game_id, verbose=1)
model.learn(
    total_timesteps=20000,
    callback=[checkpoint_callback, eval_callback],
)

# Retrieve the environment the model was trained on and reset it.
env = model.get_env()
obs = env.reset()

# Ask the agent for one action (stochastic, since deterministic=False).
action, _ = model.predict(obs, deterministic=False)

In [None]:
# Record a 500-step rollout of the model trained above, using game_id as the video
# name prefix, then display the videos in "videos" that start with that prefix.
record_video(game_id, model, video_length=500, prefix=game_id)
show_videos(video_path="videos", prefix=game_id)

# さらに別のゲームタイトルやモデルを試す

In [None]:
# Game to train on; the alternatives are kept below as a commented-out menu.
# Other ids mentioned by the author: 'CartPole-v2', 'LunarLander-v2'.
# NOTE(review): this cell currently selects the same game ("Tennis") and the same
# algorithm (PPO) as the previous training cell — change one of them to actually
# try something different.
# NOTE(review): these bare titles carry no version suffix, unlike the
# "LunarLander-v3" id used earlier in this notebook. Atari ids are usually written
# like "ALE/Tennis-v5" — confirm that gym.make() accepts the selected id with the
# installed packages before relying on this cell.
# game_id = "Breakout"
# game_id = "Boxing"
# game_id = "Gopher"
game_id = "Tennis"

# Separate environment used only for evaluation; the Monitor wrapper reports
# episode statistics.
eval_env = Monitor(gym.make(game_id))

# Evaluate the model periodically (eval_freq=2000) with deterministic actions;
# the best model and the evaluation logs are written under ./logs/.
eval_callback = EvalCallback(
    eval_env,
    eval_freq=2000,
    deterministic=True,
    render=False,
    log_path="./logs/",
    best_model_save_path="./logs/",
)

# Save an intermediate checkpoint every 5000 steps.
checkpoint_callback = CheckpointCallback(
    name_prefix="rl_model", save_path="./logs/", save_freq=5000
)

# Train the agent. Algorithms imported in this notebook: PPO, A2C, DDPG, DQN, SAC.
model = PPO("MlpPolicy", game_id, verbose=1)
model.learn(
    total_timesteps=20000,
    callback=[checkpoint_callback, eval_callback],
)

# Retrieve the environment the model was trained on and reset it.
env = model.get_env()
obs = env.reset()

# Ask the agent for one action (stochastic, since deterministic=False).
action, _ = model.predict(obs, deterministic=False)
