In [5]:
import numpy as np
from stable_baselines3 import TD3
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import EvalCallback

from flock import FlockEnv

In [10]:
def train_td3(n_agents=2, total_timesteps=125000, model_path="../models/flock_td3_2",
              log_path="../models/", seed=None):
    """Train a TD3 agent on an obstacle-free 300x300 ``FlockEnv`` and save it.

    Called with no arguments this reproduces the original hard-coded run.

    Args:
        n_agents: Number of agents passed to ``FlockEnv``; also sizes the
            action-noise arrays as ``(n_agents, 2)``.
        total_timesteps: Training budget passed to ``model.learn``.
        model_path: Destination passed to ``model.save``.
        log_path: Directory handed to ``EvalCallback`` as ``log_path``.
        seed: Optional seed forwarded to ``TD3``. ``None`` (the default)
            leaves the run unseeded, as before.
    """
    def make_env():
        # Explicit factory instead of ``lambda: env`` closing over a name that
        # is rebound on the same line; each call builds an independent env.
        return FlockEnv(num_agents=n_agents, num_obstacles=0, width=300, height=300)

    train_env = DummyVecEnv([make_env])
    eval_env = None
    try:
        # NOTE(review): the (n_agents, 2) noise shape presumably mirrors
        # FlockEnv's action space -- confirm against the environment.
        action_noise = NormalActionNoise(
            mean=np.zeros((n_agents, 2)),
            sigma=0.1 * np.ones((n_agents, 2)),
        )

        model = TD3(
            "MlpPolicy",
            train_env,
            verbose=1,
            learning_rate=1e-3,
            buffer_size=int(1e6),
            action_noise=action_noise,
            learning_starts=25000,
            batch_size=128,
            gamma=0.99,
            tau=0.005,
            seed=seed,
        )

        # Separate environment for periodic evaluation so evaluation episodes
        # do not interfere with the training environment's episode state.
        eval_env = make_env()

        eval_callback = EvalCallback(eval_env, log_path=log_path, eval_freq=1000,
            deterministic=True, render=False, n_eval_episodes=1)

        model.learn(total_timesteps=total_timesteps, callback=eval_callback)
        model.save(model_path)
    finally:
        # Release both environments even if training is interrupted or fails.
        train_env.close()
        if eval_env is not None:
            eval_env.close()

def evaluate(model_path="../models/flock_td3_2", num_agents=2, n_eval_episodes=10):
    """Score a saved TD3 model on ``FlockEnv`` and render one episode.

    Loads the model, prints the mean and standard deviation of the episode
    reward returned by ``evaluate_policy``, then plays a single rendered
    episode. Called with no arguments this matches the original behaviour.

    Args:
        model_path: Path passed to ``TD3.load``.
        num_agents: Number of agents passed to ``FlockEnv``.
        n_eval_episodes: Number of episodes passed to ``evaluate_policy``.
    """
    env = FlockEnv(num_agents=num_agents, num_obstacles=0, width=300, height=300)
    try:
        model = TD3.load(model_path)

        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)

        print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")

        # NOTE(review): the loop below uses the classic Gym API (reset() -> obs,
        # step() -> 4-tuple) -- confirm this matches FlockEnv's interface.
        obs = env.reset()
        done = False

        while not done:
            # Request the deterministic action explicitly so the rendered
            # episode uses the same action selection as the scored evaluation.
            action, _ = model.predict(obs, deterministic=True)
            obs, _reward, done, _info = env.step(action)
            env.render()
    finally:
        # Always release the environment, even if loading, stepping or
        # rendering raises.
        env.close()

In [11]:
# Run training with the default settings; the model is saved to ../models/flock_td3_2.
train_td3()



Using cuda device




Eval num_timesteps=1000, episode_reward=-123.82 +/- 0.00
Episode length: 1000.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | -124     |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
Eval num_timesteps=2000, episode_reward=-123.82 +/- 0.00
Episode length: 1000.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | -124     |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
Eval num_timesteps=3000, episode_reward=-123.82 +/- 0.00
Episode length: 1000.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | -124     |
| time/              |          |
|    total_timesteps | 3000     |
----------------------------

In [12]:
# Load the saved model, print mean/std episode reward, then render one episode.
evaluate()

Mean reward: 94.12176662794081, Std reward: 0.0
