In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Switch matplotlib to seaborn's default theme for any figures drawn later.
sns.set()



In [2]:
from mlrl.experiments.train_maze_agent import (
    create_maze_meta_env, RestrictedActionsMazeState, create_batched_tf_meta_env, 
    create_agent, create_training_run, get_maze_name
)
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.environments.gym_wrapper import GymWrapper
from tf_agents.environments.batched_py_environment import BatchedPyEnvironment
from tf_agents.train.utils import spec_utils

# Settings forwarded to `create_maze_meta_env` for every environment copy.
args = {'agent': 'ppo_agent', 'meta_time_limit': 500}
# How many environment copies each batched environment holds.
env_batch_size = 2


def _build_batched_maze_env(n_copies):
    """Create `n_copies` Gym-wrapped maze meta envs and batch them together."""
    wrapped_envs = []
    for _ in range(n_copies):
        maze_env = create_maze_meta_env(RestrictedActionsMazeState, args)
        wrapped_envs.append(GymWrapper(maze_env))
    return BatchedPyEnvironment(wrapped_envs)


# Separate batched environments for data collection and for evaluation.
env = _build_batched_maze_env(env_batch_size)
eval_env = _build_batched_maze_env(env_batch_size)

env.reset()

# Tensor specs derived from the collection environment; used to build the networks and agent.
(observation_tensor_spec,
 action_tensor_spec,
 time_step_tensor_spec) = spec_utils.get_tensor_specs(env)

pygame 2.1.0 (SDL 2.0.16, Python 3.8.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default
ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: N

In [3]:
from tf_agents.agents.ppo.ppo_agent import PPOAgent
from tf_agents.networks.sequential import Sequential
from tf_agents.networks.value_network import ValueNetwork
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.train.utils import train_utils

from mlrl.meta.search_networks import SearchActorNetwork, SearchValueNetwork
from mlrl.meta.search_networks import SearchTransformer, CategoricalNetwork, SearchActorLogitsNetwork


# Custom layer classes made visible to Keras while the actor network is built.
custom_objects = dict(
    SearchActorLogitsNetwork=SearchActorLogitsNetwork,
    SearchTransformer=SearchTransformer,
)

with tf.keras.utils.custom_object_scope(custom_objects):
    actor_logits_layer = SearchActorLogitsNetwork()
    # No fully connected layers: the preprocessing layer's output feeds the
    # projection network directly.
    actor_dist_net = ActorDistributionNetwork(
        observation_tensor_spec,
        action_tensor_spec,
        preprocessing_layers=actor_logits_layer,
        fc_layer_params=None,
        discrete_projection_net=CategoricalNetwork,
    )


def _sum_first_input(inputs):
    """Take the first combiner input and sum it over its second-to-last axis."""
    return tf.reduce_sum(inputs[0], axis=-2)


value_transformer = SearchTransformer(3, 16, 2)
value_combiner = tf.keras.layers.Lambda(_sum_first_input)

value_net = ValueNetwork(
    observation_tensor_spec,
    preprocessing_layers=value_transformer,
    preprocessing_combiner=value_combiner,
    batch_squash=True,
    fc_layer_params=None,
)

# Global step counter shared by the agent, the actors and the learner.
train_step = train_utils.create_train_step()

agent = PPOAgent(
    time_step_tensor_spec,
    action_tensor_spec,
    optimizer=tf.keras.optimizers.Adam(),
    actor_net=actor_dist_net,
    value_net=value_net,
    num_epochs=1,  # deprecated param
    discount_factor=0.99,
    normalize_observations=False,
    # Advantage computation and normalizer updates are disabled inside
    # `agent.train`; sequences are preprocessed by the dataset pipeline instead.
    compute_value_and_advantage_in_train=False,
    update_normalizers_in_train=False,
    train_step_counter=train_step,
)

2022-10-04 02:20:12.863269: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-10-04 02:20:12.877261: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-10-04 02:20:12.877838: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-10-04 02:20:12.878984: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

In [4]:
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# The buffer's batch dimension matches the collection environment's batch
# size, falling back to 1 when `env.batch_size` is falsy.
buffer_batch_size = env.batch_size or 1

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    batch_size=buffer_batch_size,
    max_length=10000,
)


def preprocess_seq(experience, info):
    """Run `agent.preprocess_sequence` on a sampled batch.

    Args:
        experience: Experience sampled from the replay buffer dataset.
        info: Sampling info paired with `experience`; passed through untouched.

    Returns:
        A `(preprocessed_experience, info)` tuple.
    """
    preprocessed = agent.preprocess_sequence(experience)
    return preprocessed, info


def dataset_fn():
    """Build the training dataset from the replay buffer.

    Returns:
        A dataset of replay-buffer samples (batches of 8 sequences, 32 steps
        each) with `preprocess_seq` mapped over every element.
    """
    sampled = replay_buffer.as_dataset(sample_batch_size=8, num_steps=32)
    preprocessed = sampled.map(preprocess_seq)
    return preprocessed

In [5]:
from tf_agents.train import actor
from tf_agents.train import learner
from tf_agents.metrics import py_metrics
from tf_agents.train import triggers

import os

# Root directory for everything this run writes (saved policies, summaries).
root_dir = './runs/ppo_agent/test'

# Run settings. For a quick smoke test, try summary_interval = 1,
# collect_sequence_length = 256 and policy_save_interval = 2 instead.
summary_interval = 1000
collect_sequence_length = 2048
policy_save_interval = 5000

saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
# Counts collected environment steps; also stored as saved-policy metadata.
collect_env_step_metric = py_metrics.EnvironmentSteps()

policy_save_trigger = triggers.PolicySavedModelTrigger(
    saved_model_dir,
    agent,
    train_step,
    interval=policy_save_interval,
    metadata_metrics={triggers.ENV_STEP_METADATA_KEY: collect_env_step_metric},
)
steps_per_second_trigger = triggers.StepPerSecondLogTrigger(
    train_step, interval=summary_interval)
learning_triggers = [policy_save_trigger, steps_per_second_trigger]

train_summary_dir = os.path.join(root_dir, learner.TRAIN_DIR)

# Runs the collect policy in `env` and adds every batch to the replay buffer.
collect_actor = actor.Actor(
    env,
    agent.collect_policy,
    train_step,
    steps_per_run=collect_sequence_length,
    observers=[replay_buffer.add_batch],
    metrics=actor.collect_metrics(buffer_size=collect_sequence_length),
    reference_metrics=[collect_env_step_metric],
    summary_dir=train_summary_dir,
    summary_interval=summary_interval,
)

# Initial collection run so the replay buffer holds data before training.
collect_actor.run()

In [6]:
from tf_agents.policies import py_tf_eager_policy

# Wrap `agent.policy` so it can be run against the Python evaluation environment.
eval_greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
    agent.policy,
    use_tf_function=True,
)

# Number of environment steps per evaluation run.
eval_steps = 1000
eval_summary_dir = os.path.join(root_dir, 'eval')

eval_actor = actor.Actor(
    eval_env,
    eval_greedy_policy,
    train_step,
    steps_per_run=eval_steps,
    metrics=actor.eval_metrics(buffer_size=10),
    reference_metrics=[collect_env_step_metric],
    summary_dir=eval_summary_dir,
)

# Evaluate once before any training.
eval_actor.run_and_log()

In [7]:
from tf_agents.train.ppo_learner import PPOLearner

# The same dataset factory supplies both the training experience and the
# data used to update normalizers.
ppo_learner = PPOLearner(
    root_dir,
    train_step,
    agent,
    experience_dataset_fn=dataset_fn,
    normalization_dataset_fn=dataset_fn,
    # num samples * num epochs = num iterations per run call
    num_samples=1,
    num_epochs=10,
    triggers=learning_triggers,
)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [8]:
import wandb
from mlrl.utils.render_utils import create_policy_eval_video


from pathlib import Path

# Evaluation videos are written beneath the run directory; create it up front.
videos_dir = f'{root_dir}/videos'
Path(videos_dir).mkdir(parents=True, exist_ok=True)

# Total collect/train iterations, and how often (in iterations) to evaluate.
num_iterations = 100
eval_interval = 5

# Main loop: collect experience, train PPO, periodically evaluate and record a
# video, and log everything to Weights & Biases. The `finally` clause closes
# the wandb run even if an iteration raises.
try:
    # Hyper-parameters stored alongside the wandb run.
    config = {
        'eval_interval': eval_interval,
        'num_iterations': num_iterations,
        'summary_interval': summary_interval,
        'collect_sequence_length': collect_sequence_length,
        'policy_save_interval': policy_save_interval,
        'num_samples': ppo_learner._num_samples,
        'num_epochs': ppo_learner._num_epochs,
        'meta_discount_factor': agent._discount_factor,
        'max_tree_size': env.envs[0].max_tree_size,
        'env_batch_size': env.batch_size,
    }

    wandb.init(project='mlrl', entity='drcope', reinit=True, config=config)

    for i in range(num_iterations):
        iteration_logs = {'iteration': i}

        print(f'Iteration: {i}')
        # Start every iteration with fresh metric accumulators.
        for metric in eval_actor.metrics:
            metric.reset()
        for metric in collect_actor.metrics:
            metric.reset()

        collect_actor.run()
        print('Collect stats:')
        print(', '.join([f'{metric.name}: {metric.result():.3f}' for metric in collect_actor.metrics]))
        iteration_logs.update({metric.name: metric.result() for metric in collect_actor.metrics})

        # NOTE(review): the replay buffer is never cleared, so samples drawn
        # here may come from older policies. PPO is normally trained on-policy;
        # confirm whether the buffer should be cleared after each learner run.
        loss_info = ppo_learner.run()
        print('Training info:')
        print(f'Loss: {loss_info.loss:.5f}, '
              f'KL Penalty Loss: {loss_info.extra.kl_penalty_loss:.5f}, '
              f'Entropy: {loss_info.extra.entropy_regularization_loss:.5f}, '
              f'Value Estimation Loss: {loss_info.extra.value_estimation_loss:.5f}, '
              f'PG Loss {loss_info.extra.policy_gradient_loss:.5f}')

        iteration_logs.update({
            'loss': loss_info.loss.numpy(),
            **tf.nest.map_structure(lambda x: x.numpy(), loss_info.extra._asdict())
        })

        if i % eval_interval == 0:
            eval_actor.run_and_log()
            print('Evaluation stats:')
            print(', '.join([f'{metric.name}: {metric.result():.3f}' for metric in eval_actor.metrics]))
            iteration_logs.update({f'Eval{metric.name}': metric.result() for metric in eval_actor.metrics})

            video_file = f'{videos_dir}/video_{i}.mp4'
            # Record on `eval_env`, not the collection `env`: the video helper
            # presumably steps the environment it is given, and stepping `env`
            # outside `collect_actor` would leave the actor's cached time step
            # out of sync with the environment, corrupting the next transition
            # written to the replay buffer.
            create_policy_eval_video(agent.policy, eval_env, max_steps=120, filename=video_file, max_envs_to_show=1)
            # The recording advanced `eval_env` outside `eval_actor`; reset the
            # actor so its cached time step matches the environment again.
            eval_actor.reset()
            iteration_logs['video'] = wandb.Video(video_file, fps=30, format="mp4")
            print()

        wandb.log(iteration_logs)
finally:
    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mdrcope[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.13.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Iteration: 0
Collect stats:
NumberOfEpisodes: 26.000, EnvironmentSteps: 2048.000, AverageReturn: 0.930, AverageEpisodeLength: 73.346
Training info:
Loss: 5.59590, KL Penalty Loss: 0.02784, Entropy: 0.00000, Value Estimation Loss: 5.64213, PG Loss -0.07407
Evaluation stats:
AverageReturn: 0.900, AverageEpisodeLength: 80.000





Iteration: 1
Collect stats:
NumberOfEpisodes: 27.000, EnvironmentSteps: 2048.000, AverageReturn: 0.920, AverageEpisodeLength: 75.704
Training info:
Loss: 2.46232, KL Penalty Loss: 0.02856, Entropy: 0.00000, Value Estimation Loss: 2.52418, PG Loss -0.09042
Iteration: 2
Collect stats:
NumberOfEpisodes: 25.000, EnvironmentSteps: 2048.000, AverageReturn: 0.924, AverageEpisodeLength: 78.720
Training info:
Loss: 6.40696, KL Penalty Loss: 0.00095, Entropy: 0.00000, Value Estimation Loss: 6.47580, PG Loss -0.06979
Iteration: 3
Collect stats:
NumberOfEpisodes: 28.000, EnvironmentSteps: 2048.000, AverageReturn: 0.933, AverageEpisodeLength: 71.893
Training info:
Loss: 2.17495, KL Penalty Loss: 0.01606, Entropy: 0.00000, Value Estimation Loss: 2.23141, PG Loss -0.07251
Iteration: 4
Collect stats:
NumberOfEpisodes: 27.000, EnvironmentSteps: 2049.000, AverageReturn: 0.931, AverageEpisodeLength: 70.593
Training info:
Loss: 4.42586, KL Penalty Loss: 0.00567, Entropy: 0.00000, Value Estimation Loss: 4