In [1]:
%load_ext autoreload
%autoreload 2

import argparse
import os
import sys
import pathlib
from typing import Type, Callable

import gymnasium as gym

sys.modules["gym"] = gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from stable_baselines3.common.utils import safe_mean
from stable_baselines3.common.logger import Image
import numpy as np
import matplotlib.pyplot as plt


from swarm.bas import Agent, BASEnv, Blueprint, RenderWrapper, Swarm, wrappers
from swarm.analysis import policy,trajectories, reward
from swarm.training.train import create_parallel_env, SuccessRateCallback, DrawTrajectoriesCallback, RewardHeatmapCallback


In [2]:
class RelativeGoalInsideGridEnv(gym.Wrapper):
    """Single-agent environment with a goal placed inside a 10x10 grid of boids.

    The constructor assembles a ``BASEnv`` on a 200x200 world with one agent and
    100 boids whose reset positions form a regular 10x10 lattice spanning
    55..145 on both axes, then stacks the observation, reward, collision,
    rendering, rotation, flattening, time-limit and trajectory wrappers around
    it. The goal sits at ``(100, 100)``, i.e. in the middle of the lattice.

    Args:
        collision_termination: Forwarded to ``wrappers.BoidCollisionWrapper``.
        collision_reward: Forwarded to ``wrappers.BoidCollisionWrapper``.
        distance_reward_transform: Callable applied to the agent-target
            distance; forwarded to ``wrappers.TargetRewardWrapper``.
        target_radius: Forwarded to ``wrappers.TargetRewardWrapper``.
        window_scale: When not ``None`` the environment is wrapped in a
            ``RenderWrapper`` using this scale; ``None`` skips rendering.
    """

    def __init__(
        self,
        collision_termination: bool = False,
        collision_reward: int = 0,
        distance_reward_transform: Callable[[float], float] = lambda d: -d,
        target_radius: float = 3,
        window_scale: float | None = 5,
    ):
        blueprint = Blueprint(
            world_size=np.array([200, 200]),
        )
        agent = Agent(
            radius=1,
            max_velocity=1,
            max_acceleration=0.2,
        )
        # Static grid: boids are reset onto a 10x10 lattice.
        # NOTE(review): max_velocity=None presumably keeps the boids in place
        # -- confirm against Swarm.
        swarm = Swarm(
            num_boids=100,
            radius=2,
            max_velocity=None,
            reset_positions=np.stack(
                np.meshgrid(np.linspace(55, 145, 10), np.linspace(55, 145, 10)), -1
            ).reshape(-1, 2),
            max_acceleration=1,
            separation_range=1,
            cohesion_range=1,
            alignment_range=1,
            steering_weights=(1, 1, 1),
            obstacle_margin=1,
        )
        env = BASEnv(blueprint, agent, swarm)

        target = np.array([100, 100])
        env = wrappers.TargetDirectionAndSectionObservationWrapper(
            env, num_sections=8, max_range=20, position=target, subtract_radius=True
        )
        env = wrappers.TargetRewardWrapper(
            env,
            position=target,
            target_radius=target_radius,
            target_reward=100,
            # Use the caller-supplied shaping function. This was previously
            # hard-coded to ``lambda d: -d``, which silently ignored the
            # constructor argument.
            distance_reward_transform=distance_reward_transform,
        )
        env = wrappers.BoidCollisionWrapper(
            env,
            collision_termination=collision_termination,
            collision_reward=collision_reward,
            add_reward=True,
        )

        # Rendering is optional so that training workers can run without it.
        if window_scale is not None:
            env = RenderWrapper(env, window_scale=window_scale)

        env = wrappers.RelativeRotationWrapper(env)
        env = wrappers.FlattenObservationWrapper(env)

        # Episodes are truncated after 500 steps.
        env = gym.wrappers.TimeLimit(env, 500)

        env = wrappers.TrajectoryWrapper(env)

        super().__init__(env)


In [3]:
def _curriculum_stage(
    num_steps,
    *,
    collision_termination,
    collision_reward,
    distance_reward_transform,
    target_radius,
):
    """Build one ``(num_steps, env_factory)`` entry of the training curriculum.

    The returned factory takes an optional ``window_scale`` (positional or
    keyword, default ``None``) and constructs a ``RelativeGoalInsideGridEnv``
    with the stage settings captured here.
    """

    def create_env(window_scale=None):
        return RelativeGoalInsideGridEnv(
            collision_termination=collision_termination,
            collision_reward=collision_reward,
            distance_reward_transform=distance_reward_transform,
            target_radius=target_radius,
            window_scale=window_scale,
        )

    return num_steps, create_env


# Curriculum: one stage without collision handling and the raw negative
# distance, followed by six stages with collision termination, a -1 collision
# reward, the distance scaled by 1/500 and the target radius stepping
# 40 -> 40 -> 30 -> 20 -> 10 -> 3. Every stage runs for 500000 steps.
train_steps = [
    _curriculum_stage(
        500000,
        collision_termination=False,
        collision_reward=0,
        distance_reward_transform=lambda d: -d,
        target_radius=3,
    ),
    *(
        _curriculum_stage(
            500000,
            collision_termination=True,
            collision_reward=-1,
            distance_reward_transform=lambda d: -d/500,
            target_radius=stage_radius,
        )
        for stage_radius in (40, 40, 30, 20, 10, 3)
    ),
]


In [4]:
# Output locations: checkpoints and TensorBoard logs both live under ./runs.
runs_directory = pathlib.Path("runs")
tensorboard_directory = runs_directory / "tensorboard"

experiment_name = "relative_goal_inside_grid/fifth_try"
experiment_path = runs_directory / experiment_name

# Number of parallel environments handed to create_parallel_env.
num_processes = 12


# Curriculum training: a single PPO model is carried through every stage in
# `train_steps`; only the environment is swapped between stages.
model = None
env = None
for i, (num_steps, create_env) in enumerate(train_steps):
    # Release the previous stage's environments before building new ones.
    # Without this every stage leaves its `num_processes` environments
    # (presumably subprocess workers -- see create_parallel_env) alive until
    # the kernel exits. The final stage's env is left open for later cells.
    if env is not None:
        env.close()

    env = create_parallel_env(
        create_env,
        num_processes,
        ["is_success", "agent_trajectory"],
    )

    if model is None:
        model = PPO(
            "MlpPolicy",
            env,
            verbose=1,
            tensorboard_log=tensorboard_directory / experiment_name,
        )
    else:
        model.set_env(env)

    # The diagnostic callbacks each get their own environment; the first one
    # is created with window_scale=5 and the second with the factory default.
    trajectory_env = create_env(5)
    heatmap_env = create_env()
    try:
        model.learn(
            total_timesteps=num_steps,
            callback=[
                SuccessRateCallback(),
                DrawTrajectoriesCallback(trajectory_env),
                RewardHeatmapCallback(heatmap_env),
            ],
            # Keep the global step counter running across curriculum stages.
            reset_num_timesteps=False,
        )
    finally:
        # The callbacks are discarded after `learn`, so their environments
        # can be released even if training is interrupted.
        trajectory_env.close()
        heatmap_env.close()

    # Overwrites the same checkpoint after every stage.
    model.save(experiment_path)

    


Using cpu device
Logging to runs/tensorboard/relative_goal_inside_grid/fifth_try/PPO_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | -133     |
|    success_rate    | 0        |
| time/              |          |
|    fps             | 1451     |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 24576    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 489          |
|    ep_rew_mean          | -128         |
|    success_rate         | 0.0309       |
| time/                   |              |
|    fps                  | 1670         |
|    iterations           | 2            |
|    time_elapsed         | 29           |
|    total_timesteps      | 49152        |
| train/                  |              |
|    approx_kl            | 0.0040256013 |
|    clip_fraction     