In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from functools import partial

import jax
import jax.numpy as jnp

from calculate_metric import get_stats_for_state
from visualize_actor import get_state_traj


In [3]:
# Which saved runner state to evaluate, and how many episodes to request.
artifact_version = "286"
num_episodes = 100
model_artifact_remote_name = f"josssdan/JaxInforMARL/PPO_RNN_Runner_State:v{artifact_version}"

# get_state_traj returns three values, bound here as the trajectory batch,
# the run config and the environment.
traj_batch, config, env = get_state_traj(
    model_artifact_remote_name,
    artifact_version,
    num_episodes,
)

Config:
{'derived_values': {'minibatch_size': 12800,
                    'num_actors': 200,
                    'num_updates': 78,
                    'scaled_clip_eps': 0.2},
 'env_config': {'env_cls_name': 'TargetMPEEnvironment',
                'env_kwargs': {'agent_communication_type': None,
                               'agent_control_noise_std': 0.0,
                               'agent_max_speed': -1,
                               'agent_visibility_radius': [0],
                               'dist_to_goal_reward_ratio': 0.9,
                               'entities_initial_coord_radius': [1],
                               'entity_acceleration': 5,
                               'max_steps': 25,
                               'num_agents': 2,
                               'one_time_death_reward': 15}},
 'network_config': {'actor_num_hidden_linear_layer': 2,
                    'critic_num_hidden_linear_layer': 2,
                    'entity_type_embedding_dim': 4,
         

In [4]:
# Sizes taken from the loaded run config; num_steps is the environment's max_steps.
_env_kwargs = config.env_config.env_kwargs
num_envs = config.training_config.num_envs
num_agents = _env_kwargs.num_agents
num_steps = _env_kwargs.max_steps

In [5]:
# Rearrange every leaf so its leading axes are (num_envs, num_steps, num_agents, ...).
# Assumes each incoming leaf is laid out as (num_steps, num_agents * num_envs, ...)
# -- TODO confirm against get_state_traj.
# The reshape splits the second axis into (num_agents, num_envs); moving the
# env axis (2) to the front leaves steps and agents in that order behind it.
# NOTE: this rebinds traj_batch, so re-running the cell without first re-running
# the loading cell would reshape data that is already in the new layout.
traj_batch = jax.tree.map(
    lambda leaf: jnp.moveaxis(
        leaf.reshape(num_steps, num_agents, num_envs, *leaf.shape[2:]),
        2,
        0,
    ),
    traj_batch,
)


In [6]:
# Sanity check: show the shape of every leaf in the trajectory batch.
jax.tree.map(lambda x: x.shape, traj_batch)

TransitionWithEnvState(global_done=(100, 25, 2), done=(100, 25, 2), action=(100, 25, 2), value=(100, 25, 2), reward=(100, 25, 2), log_prob=(100, 25, 2), obs=(100, 25, 2, 6), graph=GraphsTupleWithAgentIndex(nodes=(100, 25, 2, 4, 7), edges=(100, 25, 2, 14, 1), receivers=(100, 25, 2, 14), senders=(100, 25, 2, 14), globals=None, n_node=(100, 25, 2), n_edge=(100, 25, 2), agent_indices=(100, 25, 2)), world_state=(100, 25, 2, 12), info={'returned_episode': (100, 25, 2), 'returned_episode_lengths': (100, 25, 2), 'returned_episode_returns': (100, 25, 2)}, env_state=LogEnvState(env_state=MPEState(dones=(100, 25, 2, 2), step=(100, 25, 2), entity_positions=(100, 25, 2, 4, 2), entity_velocities=(100, 25, 2, 4, 2), did_agent_die_this_time_step=(100, 25, 2, 2), agent_communication_message=(100, 25, 2, 0), agent_visibility_radius=(100, 25, 2, 2)), episode_returns=(100, 25, 2, 2), episode_lengths=(100, 25, 2, 2), returned_episode_returns=(100, 25, 2, 2), returned_episode_lengths=(100, 25, 2, 2)))

In [7]:
# Episode return: rewards added up over the step axis (1) and the agent axis (2),
# then averaged over episodes and converted to a Python float.
total_reward = traj_batch.reward.sum(axis=(1, 2))
avg_reward_per_episode = jnp.mean(total_reward).item()

In [8]:
# Display the mean per-episode return (rewards summed over steps and agents).
avg_reward_per_episode

180.56466674804688

In [9]:
# Per-agent time until the first `done`, expressed as a fraction of the episode length.
done = jnp.swapaxes(traj_batch.done, 1, 2)  # so that it becomes num_env, num_agents, num_steps
agents_that_didnt_reach_goal = jnp.all(~done, axis=-1)

# argmax over a boolean axis yields the index of the first True; +1 turns the
# zero-based index into a step count.
first_done_step = jnp.argmax(done, axis=-1) + 1

# argmax is also 0 when a row has no True at all, so agents that never finish
# are explicitly assigned the full episode (fraction 1.0) rather than 1 / num_steps.
goal_reach_time_fraction_per_agent = jnp.where(
    agents_that_didnt_reach_goal,
    1.0,
    first_done_step / num_steps,
)

# Mean over all episodes and agents, as a Python float.
avg_goal_reach_time_in_episode_fraction = jnp.mean(goal_reach_time_fraction_per_agent).item()

In [10]:
# Display the mean fraction of the episode elapsed before an agent's first `done`
# (agents that never finish count as 1.0).
avg_goal_reach_time_in_episode_fraction

0.6273999810218811

In [11]:
# An agent counts as having reached its goal if `done` is set at any step.
reached_goal = done.any(axis=-1)
# An episode counts only when every agent in it reached its goal.
all_agents_reached_goal = reached_goal.all(axis=-1)

# Share of such episodes as a percentage, converted to a Python float.
episode_percent_all_agents_reached_goals = (jnp.mean(all_agents_reached_goal) * 100).item()

In [12]:
# Display the percentage of episodes in which every agent reached its goal.
episode_percent_all_agents_reached_goals

41.0

In [13]:
@partial(jax.jit, static_argnums=(0,))
def compute_stats_for_all_episode(env, state):
    """Apply ``get_stats_for_state`` to every state in a doubly-batched pytree.

    Args:
        env: Environment handed unchanged to every call. It is a static jit
            argument, so it must be hashable, and passing a different env
            triggers recompilation.
        state: Pytree whose leaves share two leading batch axes (this notebook
            passes episodes first, then steps).

    Returns:
        Whatever ``get_stats_for_state`` returns, with the two batch axes
        prepended to every output leaf.
    """
    # Inner map walks the second batch axis, outer map walks the first;
    # env is broadcast (in_axes=None) at both levels.
    over_steps = jax.vmap(get_stats_for_state, in_axes=(None, 0))
    over_episodes = jax.vmap(over_steps, in_axes=(None, 0))
    return over_episodes(env, state)

In [14]:
# The environment state is the same for all agents, so index 0 along the
# agent axis (axis 2) is enough; this drops that axis from every leaf.
env_state = jax.tree.map(
    lambda leaf: leaf[:, :, 0],
    traj_batch.env_state.env_state,
)

In [15]:
# Evaluate get_stats_for_state on every recorded state; the result unpacks into
# a collision statistic and an agent-death statistic.
num_collisions, num_agent_died = compute_stats_for_all_episode(env, env_state)

In [16]:
# No axis is given, so each mean runs over every element of the stats array.
# NOTE(review): if these arrays keep a per-step axis, the values are per-step
# means rather than per-episode totals -- confirm this matches the intended metric.
avg_num_collision_across_all_episodes = jnp.mean(num_collisions).item()
avg_num_deaths_across_all_episodes = jnp.mean(num_agent_died).item()

In [17]:
# Summary tuple: (mean episode return, mean goal-reach time fraction,
# % of episodes where all agents reached their goals, mean collisions).
# NOTE(review): avg_num_deaths_across_all_episodes is computed earlier but is
# not shown here -- confirm whether it should be included.
avg_reward_per_episode, avg_goal_reach_time_in_episode_fraction, episode_percent_all_agents_reached_goals, avg_num_collision_across_all_episodes

(180.56466674804688, 0.6273999810218811, 41.0, 0.025599999353289604)