In [1]:
import gym

from stable_baselines3 import TD3
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Two independent instances of the same task: one is stepped by the learner,
# the other is reserved for periodic evaluation so evaluation rollouts never
# touch the training environment's state.
ENV_ID = "LunarLanderContinuous-v2"

train_env = gym.make(ENV_ID)

# TD3 wraps its training env in a Monitor automatically (see the
# "Wrapping the env with a `Monitor` wrapper" log line), but the evaluation
# env gets no such treatment. Wrap it explicitly so evaluation reports the
# true episode returns/lengths and does not emit the missing-Monitor warning.
eval_env = Monitor(gym.make(ENV_ID))

In [3]:
# Fixed seed so that repeated runs of this notebook are comparable.
# NOTE(review): with the CUDA device in use, bit-exact reproducibility is not
# guaranteed; the seed still removes the run-to-run variation coming from
# network initialisation, exploration noise and replay sampling.
SEED = 0

# TD3 agent with an MLP actor/critic (two hidden layers: 400 and 300 units).
# Training metrics are written to the TensorBoard directory below.
model = TD3(
    "MlpPolicy",
    train_env,
    gamma=0.98,
    buffer_size=200000,
    # Collect 10k transitions before the first gradient update.
    learning_starts=10000,
    # Update once per finished episode; -1 asks SB3 to run as many gradient
    # steps as environment steps were collected in that rollout.
    gradient_steps=-1,
    train_freq=(1, "episode"),
    learning_rate=1e-3,
    policy_kwargs=dict(net_arch=[400, 300]),
    verbose=1,
    tensorboard_log="./runs/td3_lunarcontinuous_tensorboard/",
    seed=SEED,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [4]:
# Periodic evaluation during training: every `eval_freq` callback calls the
# current policy is run for 10 deterministic episodes on `eval_env`.
# The best-scoring model and the evaluation log are both stored under
# ./logs/continuous/td3.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/continuous/td3",
    log_path="./logs/continuous/td3",
    # `eval_freq` is an integer step count; `3e4` alone is a float, so cast it
    # (same convention as `int(3e5)` used for `total_timesteps`).
    eval_freq=int(3e4),
    n_eval_episodes=10,
    deterministic=True,
    render=False,
)


In [5]:
# Total training budget in environment steps; evaluation runs alongside
# training through `eval_callback`.
TOTAL_TIMESTEPS = 300_000

model.learn(callback=eval_callback, total_timesteps=TOTAL_TIMESTEPS)

Logging to ./runs/td3_lunarcontinuous_tensorboard/TD3_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 98.8     |
|    ep_rew_mean     | -174     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 3405     |
|    time_elapsed    | 0        |
|    total_timesteps | 395      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 104      |
|    ep_rew_mean     | -202     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 3460     |
|    time_elapsed    | 0        |
|    total_timesteps | 834      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 101      |
|    ep_rew_mean     | -172     |
| time/              |          |
|    episodes        | 12       |
|    fps             | 3382     |
|    time_elapsed    | 0  



Eval num_timesteps=30000, episode_reward=-54.53 +/- 52.62
Episode length: 787.10 +/- 311.50
---------------------------------
| eval/              |          |
|    mean_ep_length  | 787      |
|    mean_reward     | -54.5    |
| time/              |          |
|    total_timesteps | 30000    |
| train/             |          |
|    actor_loss      | -11.1    |
|    critic_loss     | 32.1     |
|    learning_rate   | 0.001    |
|    n_updates       | 19953    |
---------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 276      |
|    ep_rew_mean     | -197     |
| time/              |          |
|    episodes        | 132      |
|    fps             | 126      |
|    time_elapsed    | 243      |
|    total_timesteps | 30743    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 299      |
|    ep_rew_mean     | -187     |
| 

KeyboardInterrupt: 