In [3]:
import gym

from sb3_contrib import TQC
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor

In [4]:
# Both environments are built from the same Gym task id.
env_id = "LunarLanderContinuous-v2"

# 16 copies of the task, handed to the agent as its training environment.
train_env = make_vec_env(env_id, n_envs=16)

# A single Monitor-wrapped instance, handed to the evaluation callback.
eval_env = Monitor(gym.make(env_id))

In [5]:
from typing import Callable

def linear_schedule(initial_value) -> Callable[[float], float]:
    """
    Build a linearly decaying learning-rate schedule.

    :param initial_value: (float or str) value returned when progress is 1.
        A string (e.g. read from a config file) is converted with ``float``.
    :return: (function) maps the remaining progress to a learning rate
    :raises ValueError: if ``initial_value`` is a string that ``float``
        cannot parse
    """
    # The contract above advertises string input; without this conversion
    # ``progress * initial_value`` would raise TypeError for a str.
    if isinstance(initial_value, str):
        initial_value = float(initial_value)

    def func(progress: float) -> float:
        """
        Scale the initial value by the remaining progress.

        Progress will decrease from 1 (beginning) to 0.
        :param progress: (float) remaining fraction of training
        :return: (float) ``progress * initial_value``
        """
        return progress * initial_value

    return func

In [6]:
# Hyperparameters for the TQC agent, gathered in one place so they can be
# inspected or tweaked without touching the constructor call.
tqc_hyperparams = dict(
    # Learning rate starts at 7.3e-4 and is scaled by the remaining progress.
    learning_rate=linear_schedule(7.3e-4),
    buffer_size=1000000,
    batch_size=256,
    ent_coef='auto',
    gamma=0.99,
    tau=0.01,
    train_freq=1,
    gradient_steps=1,
    learning_starts=10000,
    policy_kwargs=dict(net_arch=[400, 300]),
    verbose=1,
    tensorboard_log="./runs/tqc_lunarcontinuous_tensorboard/",
)

# MLP-policy TQC agent bound to the training environment.
model = TQC("MlpPolicy", train_env, **tqc_hyperparams)

Using cuda device


In [7]:
# Evaluation logs and the best checkpoint share one output directory.
eval_output_dir = "./logs/continuous/tqc"

# Callback that evaluates on eval_env with deterministic actions,
# 10 episodes per evaluation, without rendering.
# NOTE(review): train_env is built with n_envs=16; eval_freq is presumably
# counted in callback calls rather than total timesteps — confirm against
# the EvalCallback documentation.
eval_callback = EvalCallback(
    eval_env,
    n_eval_episodes=10,
    eval_freq=1000,
    log_path=eval_output_dir,
    best_model_save_path=eval_output_dir,
    deterministic=True,
    render=False,
)


In [8]:
# First training run: 500k timesteps with the evaluation callback attached.
model.learn(total_timesteps=500_000, callback=eval_callback)

Logging to ./runs/tqc_lunar_tensorboard/TQC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 73.8     |
|    ep_rew_mean     | -77.1    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 3668     |
|    time_elapsed    | 0        |
|    total_timesteps | 1264     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 84.1     |
|    ep_rew_mean     | -160     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 3780     |
|    time_elapsed    | 0        |
|    total_timesteps | 1552     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90.9     |
|    ep_rew_mean     | -221     |
| time/              |          |
|    episodes        | 12       |
|    fps             | 3851     |
|    time_elapsed    | 0        |
| 



Eval num_timesteps=8000, episode_reward=-158.61 +/- 13.86
Episode length: 61.40 +/- 7.53
---------------------------------
| eval/              |          |
|    mean_ep_length  | 61.4     |
|    mean_reward     | -159     |
| time/              |          |
|    total_timesteps | 8000     |
---------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 109      |
|    ep_rew_mean     | -216     |
| time/              |          |
|    episodes        | 64       |
|    fps             | 2420     |
|    time_elapsed    | 3        |
|    total_timesteps | 8160     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 108      |
|    ep_rew_mean     | -210     |
| time/              |          |
|    episodes        | 68       |
|    fps             | 2438     |
|    time_elapsed    | 3        |
|    total_timesteps | 8384     |
-----

<sb3_contrib.tqc.tqc.TQC at 0x174a1c7fb20>

In [10]:
# Write the agent to disk as it stands after the training run, then drop
# the notebook's reference to it.
training_policy_path = "data/policies/LunarLander-v2#tqc#training"
model.save(training_policy_path)
del model

In [11]:
# Reload the best checkpoint written by the evaluation callback. The path
# must match the callback's best_model_save_path ("./logs/continuous/tqc");
# the previous "logs/tqc/best_model" pointed at a directory this notebook
# never writes to.
model = TQC.load("./logs/continuous/tqc/best_model", env=train_env)

In [12]:
# Continue training the reloaded agent for 1M timesteps, reusing the same
# evaluation callback.
model.learn(total_timesteps=1_000_000, callback=eval_callback)

Logging to ./runs/sac_lunar_tensorboard/SAC_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 69.8     |
|    ep_rew_mean     | -108     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 4455     |
|    time_elapsed    | 0        |
|    total_timesteps | 1152     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 80       |
|    ep_rew_mean     | -162     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 4498     |
|    time_elapsed    | 0        |
|    total_timesteps | 1584     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90.6     |
|    ep_rew_mean     | -195     |
| time/              |          |
|    episodes        | 12       |
|    fps             | 4471     |
|    time_elapsed    | 0        |
| 

<stable_baselines3.sac.sac.SAC at 0x252239700a0>

In [13]:
# Write the further-trained agent to disk, then drop the notebook's
# reference to it.
train_best_policy_path = "data/policies/LunarLander-v2#tqc#train_best"
model.save(train_best_policy_path)
del model