In [1]:
# default_exp actorcritic.dads

In [2]:
import os
# Set in its own cell so that it runs before the torch/fastai imports in the following cell.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (synchronous kernel
# launches, per the CUDA docs) — presumably it should be removed for real training runs; confirm.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [4]:
#export
import torch.nn.utils as nn_utils
from fastai.torch_basics import *
import torch.nn.functional as F
from fastai.data.all import *
from fastai.basics import *
from dataclasses import field,asdict
from typing import List,Any,Dict,Callable
from collections import deque
import gym
import torch.multiprocessing as mp
from torch.optim import *
from dataclasses import dataclass

from fastrl.data import *
from fastrl.async_data import *
from fastrl.basic_agents import *
from fastrl.learner import *
from fastrl.metrics import *
from fastai.callback.progress import *
from fastrl.ptan_extension import *
from fastrl.actorcritic.sac import *

from torch.distributions import *

if IN_NOTEBOOK:
    from IPython import display
    import PIL.Image

  return torch._C._cuda_getDeviceCount() > 0


In [4]:
# hide
from nbdev.showdoc import *
from nbdev.imports import *
from nbdev.export2html import *
# When the IN_TEST environment variable is unset (or empty), insist that this
# notebook is being run interactively: inside IPython, inside a notebook, and
# not on Colab. The flags themselves come from the star-imports above.
if not os.environ.get("IN_TEST"):
    assert IN_NOTEBOOK
    assert not IN_COLAB
    assert IN_IPYTHON

# DADS

> Notes: Temporarily here. The goal is conversion to PyTorch.

In [10]:
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp



ModuleNotFoundError: No module named 'tensorflow'

In [9]:

# TODO(architsh): Implement the dynamics with last K step input
class SkillDynamics:
    """Skill-conditioned dynamics model built as a TensorFlow (compat.v1) graph.

    Given the current observation (``timesteps``) and a skill/latent
    (``actions``) the network outputs a distribution over the *change* in
    observation (``next_timesteps - timesteps``). The distribution is either a
    single diagonal Gaussian or, when ``num_components > 1``, a mixture of
    diagonal Gaussians.

    Typical call order (as used by ``DADSAgent`` in this notebook):
    ``make_placeholders`` -> ``build_graph`` -> ``increase_prob_op`` ->
    ``create_saver`` -> ``set_session`` -> ``train`` / ``get_log_prob`` /
    ``predict_state``.
    """

    def __init__(self,
                 observation_size,
                 action_size,
                 restrict_observation=0,
                 normalize_observations=False,
                 # network properties
                 fc_layer_params=(256, 256),
                 network_type='default',
                 num_components=1,
                 fix_variance=False,
                 reweigh_batches=False,
                 graph=None,
                 scope_name='skill_dynamics'):
        """Store the configuration; no TensorFlow ops are created here.

        Args:
            observation_size: width of the observation / prediction vectors.
            action_size: width of the skill (latent) vector.
            restrict_observation: if > 0, the first ``restrict_observation``
                columns are dropped from the network *input* (targets keep
                the full width).
            normalize_observations: batch-normalise inputs and targets.
            fc_layer_params: hidden layer sizes.
            network_type: ``'default'`` (concatenate inputs) or ``'separate'``
                (one small pipe per input before the joint layers).
            num_components: number of mixture components (1 = plain Gaussian).
            fix_variance: use unit standard deviation instead of a learned one.
            reweigh_batches: weight the per-sample log-probabilities with a
                ``batch_weights`` placeholder when optimising.
            graph: graph to build into; defaults to the current default graph.
            scope_name: variable scope that owns all variables of this model.
        """
        self._observation_size = observation_size
        self._action_size = action_size
        self._normalize_observations = normalize_observations
        self._restrict_observation = restrict_observation
        self._reweigh_batches = reweigh_batches

        # tensorflow requirements
        if graph is not None:
            self._graph = graph
        else:
            self._graph = tf.compat.v1.get_default_graph()
        self._scope_name = scope_name

        # dynamics network properties
        self._fc_layer_params = fc_layer_params
        self._network_type = network_type
        self._num_components = num_components
        self._fix_variance = fix_variance
        if not self._fix_variance:
            self._std_lower_clip = 0.3
            self._std_upper_clip = 10.0

        self._use_placeholders = False
        self.log_probability = None
        self.dyn_max_op = None
        self.dyn_min_op = None
        self._session = None
        self._use_modal_mean = False

        # saving/restoring variables; _save_prefix is filled in by create_saver
        # and is initialised here so set_session can be called before it.
        self._saver = None
        self._save_prefix = None

    def _stddev_head(self, out, name):
        """Return the per-dimension standard deviation tensor for one head.

        Unit variance when ``fix_variance`` is set, otherwise a softplus dense
        layer clipped to ``[_std_lower_clip, _std_upper_clip]``.
        """
        if self._fix_variance:
            return tf.fill([tf.shape(out)[0], self._observation_size], 1.0)
        return tf.clip_by_value(
            tf.compat.v1.layers.dense(
                out,
                self._observation_size,
                activation=tf.nn.softplus,
                name=name,
                reuse=tf.compat.v1.AUTO_REUSE),
            self._std_lower_clip,
            self._std_upper_clip)

    def _get_distribution(self, out):
        """Attach the output heads to ``out`` and return the distribution."""
        if self._num_components > 1:
            self.logits = tf.compat.v1.layers.dense(
                out,
                self._num_components,
                name='logits',
                reuse=tf.compat.v1.AUTO_REUSE)
            means, scale_diags = [], []
            for component_id in range(self._num_components):
                means.append(
                    tf.compat.v1.layers.dense(
                        out,
                        self._observation_size,
                        name='mean_' + str(component_id),
                        reuse=tf.compat.v1.AUTO_REUSE))
                # One stddev head per component (this must live inside the
                # loop so that scale_diags lines up with means).
                scale_diags.append(
                    self._stddev_head(out, 'stddev_' + str(component_id)))

            self.means = tf.stack(means, axis=1)
            self.scale_diags = tf.stack(scale_diags, axis=1)
            return tfp.distributions.MixtureSameFamily(
                mixture_distribution=tfp.distributions.Categorical(
                    logits=self.logits),
                components_distribution=tfp.distributions.MultivariateNormalDiag(
                    loc=self.means, scale_diag=self.scale_diags))

        mean = tf.compat.v1.layers.dense(
            out,
            self._observation_size,
            name='mean',
            reuse=tf.compat.v1.AUTO_REUSE)
        stddev = self._stddev_head(out, 'stddev')
        return tfp.distributions.MultivariateNormalDiag(
            loc=mean, scale_diag=stddev)

    # dynamics graph with separate pipeline for skills and timesteps
    def _graph_with_separate_skill_pipe(self, timesteps, actions):
        """Build the 'separate' network: one half-width layer per input, then joint layers."""
        pipe_sizes = (self._fc_layer_params[0] // 2,)

        skill_out = actions
        with tf.compat.v1.variable_scope('action_pipe'):
            for idx, layer_size in enumerate(pipe_sizes):
                skill_out = tf.compat.v1.layers.dense(
                    skill_out,
                    layer_size,
                    activation=tf.nn.relu,
                    name='hid_' + str(idx),
                    reuse=tf.compat.v1.AUTO_REUSE)

        ts_out = timesteps
        with tf.compat.v1.variable_scope('ts_pipe'):
            for idx, layer_size in enumerate(pipe_sizes):
                ts_out = tf.compat.v1.layers.dense(
                    ts_out,
                    layer_size,
                    activation=tf.nn.relu,
                    name='hid_' + str(idx),
                    reuse=tf.compat.v1.AUTO_REUSE)

        # out = tf.compat.v1.layers.flatten(tf.einsum('ai,aj->aij', ts_out, skill_out))
        out = tf.concat([ts_out, skill_out], axis=1)
        with tf.compat.v1.variable_scope('joint'):
            # Remaining hidden layers (everything after the first entry).
            for idx, layer_size in enumerate(self._fc_layer_params[1:]):
                out = tf.compat.v1.layers.dense(
                    out,
                    layer_size,
                    activation=tf.nn.relu,
                    name='hid_' + str(idx),
                    reuse=tf.compat.v1.AUTO_REUSE)

        return self._get_distribution(out)

    # simple dynamics graph
    def _default_graph(self, timesteps, actions):
        """Build the 'default' network: concatenate inputs and apply the hidden layers."""
        out = tf.concat([timesteps, actions], axis=1)
        for idx, layer_size in enumerate(self._fc_layer_params):
            out = tf.compat.v1.layers.dense(
                out,
                layer_size,
                activation=tf.nn.relu,
                name='hid_' + str(idx),
                reuse=tf.compat.v1.AUTO_REUSE)

        return self._get_distribution(out)

    def _get_dict(self,
                  input_data,
                  input_actions,
                  target_data,
                  batch_size=-1,
                  batch_weights=None,
                  batch_norm=False,
                  noise_targets=False,
                  noise_std=0.5):
        """Build the feed dict for a training / log-prob evaluation step.

        With ``batch_size > 0`` a random subset of at most ``batch_size`` rows
        is drawn without replacement; otherwise all rows are used in order.
        ``noise_targets`` adds Gaussian noise (std ``noise_std``) to the
        targets. The caller's arrays are never modified.
        """
        if batch_size > 0:
            shuffled_batch = np.random.permutation(len(input_data))[:batch_size]
        else:
            shuffled_batch = np.arange(len(input_data))

        # Fancy indexing yields fresh copies, so noising below cannot leak
        # into the caller's arrays.
        batched_input = input_data[shuffled_batch, :]
        batched_skills = input_actions[shuffled_batch, :]
        batched_targets = target_data[shuffled_batch, :]

        if noise_targets:
            # Out-of-place add so integer-typed targets are promoted to float
            # instead of raising a casting error.
            batched_targets = batched_targets + (
                np.random.randn(*batched_targets.shape) * noise_std)

        return_dict = {
            self.timesteps_pl: batched_input,
            self.actions_pl: batched_skills,
            self.next_timesteps_pl: batched_targets
        }
        if self._normalize_observations:
            return_dict[self.is_training_pl] = batch_norm
        if self._reweigh_batches and batch_weights is not None:
            return_dict[self.batch_weights] = batch_weights[shuffled_batch]

        return return_dict

    def _get_run_dict(self, input_data, input_actions):
        """Build the feed dict for inference (batch-norm in inference mode)."""
        return_dict = {
            self.timesteps_pl: input_data,
            self.actions_pl: input_actions
        }
        if self._normalize_observations:
            return_dict[self.is_training_pl] = False

        return return_dict

    def make_placeholders(self):
        """Create the input placeholders and switch the model to placeholder mode."""
        self._use_placeholders = True
        with self._graph.as_default(), tf.compat.v1.variable_scope(self._scope_name):
            self.timesteps_pl = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self._observation_size),
                name='timesteps_pl')
            self.actions_pl = tf.compat.v1.placeholder(
                tf.float32, shape=(None, self._action_size), name='actions_pl')
            self.next_timesteps_pl = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self._observation_size),
                name='next_timesteps_pl')
            if self._normalize_observations:
                self.is_training_pl = tf.compat.v1.placeholder(
                    tf.bool, name='batch_norm_pl')
            if self._reweigh_batches:
                self.batch_weights = tf.compat.v1.placeholder(
                    tf.float32, shape=(None,), name='importance_sampled_weights')

    def set_session(self, session=None, initialize_or_restore_variables=False):
        """Attach a session (creating one on this model's graph if none is given).

        With ``initialize_or_restore_variables`` the latest checkpoint is
        restored when the save directory exists, and every variable that is
        still uninitialised afterwards is initialised.
        """
        if session is None:
            # compat.v1 like the rest of the class; plain tf.Session is not
            # available under TensorFlow 2.
            self._session = tf.compat.v1.Session(graph=self._graph)
        else:
            self._session = session

        # only initialize uninitialized variables
        if initialize_or_restore_variables:
            # _save_prefix is None until create_saver has been called.
            if self._save_prefix is not None and tf.io.gfile.exists(
                    self._save_prefix):
                self.restore_variables()
            with self._graph.as_default():
                var_list = (tf.compat.v1.global_variables() +
                            tf.compat.v1.local_variables())
                is_initialized = self._session.run(
                    [tf.compat.v1.is_variable_initialized(v) for v in var_list])
                uninitialized_vars = [
                    v for flag, v in zip(is_initialized, var_list) if not flag
                ]

                if uninitialized_vars:
                    self._session.run(
                        tf.compat.v1.variables_initializer(uninitialized_vars))

    def build_graph(self,
                    timesteps=None,
                    actions=None,
                    next_timesteps=None,
                    is_training=None):
        """Build the network and return the log-probability tensor.

        In placeholder mode the arguments are ignored and the placeholders
        from ``make_placeholders`` are used instead.

        Raises:
            ValueError: if ``network_type`` is neither 'default' nor 'separate'.
        """
        with self._graph.as_default(), tf.compat.v1.variable_scope(
                self._scope_name, reuse=tf.compat.v1.AUTO_REUSE):
            if self._use_placeholders:
                timesteps = self.timesteps_pl
                actions = self.actions_pl
                next_timesteps = self.next_timesteps_pl
                # The training-flag placeholder only exists in placeholder
                # mode; otherwise the caller-supplied is_training is used.
                if self._normalize_observations:
                    is_training = self.is_training_pl

            # predict deltas instead of observations
            next_timesteps = next_timesteps - timesteps

            if self._restrict_observation > 0:
                timesteps = timesteps[:, self._restrict_observation:]

            if self._normalize_observations:
                timesteps = tf.compat.v1.layers.batch_normalization(
                    timesteps,
                    training=is_training,
                    name='input_normalization',
                    reuse=tf.compat.v1.AUTO_REUSE)
                # Kept as an attribute: predict_state reads its moving
                # statistics to undo the target normalisation.
                self.output_norm_layer = tf.compat.v1.layers.BatchNormalization(
                    scale=False, center=False, name='output_normalization')
                next_timesteps = self.output_norm_layer(
                    next_timesteps, training=is_training)

            if self._network_type == 'default':
                self.base_distribution = self._default_graph(timesteps, actions)
            elif self._network_type == 'separate':
                self.base_distribution = self._graph_with_separate_skill_pipe(
                    timesteps, actions)
            else:
                raise ValueError(
                    "Unknown network_type %r; expected 'default' or 'separate'" %
                    (self._network_type,))

            # if building multiple times, be careful about which log_prob you are optimizing
            self.log_probability = self.base_distribution.log_prob(next_timesteps)
            self.mean = self.base_distribution.mean()

            return self.log_probability

    def _mean_log_prob(self, weights=None):
        """Return the (optionally weighted) mean log-probability tensor.

        The ``batch_weights`` placeholder takes precedence over ``weights``
        when ``reweigh_batches`` is enabled.
        """
        if self._reweigh_batches:
            return tf.reduce_mean(self.log_probability * self.batch_weights)
        if weights is not None:
            return tf.reduce_mean(self.log_probability * weights)
        return tf.reduce_mean(self.log_probability)

    def increase_prob_op(self, learning_rate=3e-4, weights=None):
        """Create, store and return the Adam op that maximises the log-probability."""
        with self._graph.as_default():
            # Run batch-norm moving-average updates together with the train op.
            update_ops = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self.dyn_max_op = tf.compat.v1.train.AdamOptimizer(
                    learning_rate=learning_rate,
                    name='adam_max').minimize(-self._mean_log_prob(weights))
                return self.dyn_max_op

    def decrease_prob_op(self, learning_rate=3e-4, weights=None):
        """Create, store and return the Adam op that minimises the log-probability."""
        with self._graph.as_default():
            update_ops = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self.dyn_min_op = tf.compat.v1.train.AdamOptimizer(
                    learning_rate=learning_rate,
                    name='adam_min').minimize(self._mean_log_prob(weights))
                return self.dyn_min_op

    def create_saver(self, save_prefix):
        """Create (once) and return the Saver for this model's variables.

        ``save_prefix`` is the checkpoint directory. When a saver already
        exists it is returned unchanged and ``save_prefix`` is ignored.
        """
        if self._saver is None:
            with self._graph.as_default():
                self._variable_list = {}
                for var in tf.compat.v1.get_collection(
                        tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,
                        scope=self._scope_name):
                    self._variable_list[var.name] = var
                self._saver = tf.compat.v1.train.Saver(
                    self._variable_list, save_relative_paths=True)
                self._save_prefix = save_prefix
        return self._saver

    def save_variables(self, global_step):
        """Write a checkpoint under ``<save_prefix>/ckpt`` (requires create_saver and set_session)."""
        if not tf.io.gfile.exists(self._save_prefix):
            tf.io.gfile.makedirs(self._save_prefix)

        self._saver.save(
            self._session,
            os.path.join(self._save_prefix, 'ckpt'),
            global_step=global_step)

    def restore_variables(self):
        """Restore the latest checkpoint found in the save directory."""
        self._saver.restore(
            self._session,
            tf.compat.v1.train.latest_checkpoint(self._save_prefix))

    # all functions here-on require placeholders----------------------------------
    def train(self,
              timesteps,
              actions,
              next_timesteps,
              batch_weights=None,
              batch_size=512,
              num_steps=1,
              increase_probs=True):
        """Run ``num_steps`` optimiser steps on random mini-batches.

        Does nothing unless placeholders are in use. ``increase_probs``
        selects between the op from ``increase_prob_op`` and the one from
        ``decrease_prob_op``; the chosen op must have been created already.
        """
        if not self._use_placeholders:
            return

        run_op = self.dyn_max_op if increase_probs else self.dyn_min_op

        for _ in range(num_steps):
            self._session.run(
                run_op,
                feed_dict=self._get_dict(
                    timesteps,
                    actions,
                    next_timesteps,
                    batch_weights=batch_weights,
                    batch_size=batch_size,
                    batch_norm=True))

    def get_log_prob(self, timesteps, actions, next_timesteps):
        """Evaluate the log-probability of every transition (None without placeholders)."""
        if not self._use_placeholders:
            return

        return self._session.run(
            self.log_probability,
            feed_dict=self._get_dict(
                timesteps, actions, next_timesteps, batch_norm=False))

    def predict_state(self, timesteps, actions):
        """Predict the next observation for each row (None without placeholders).

        Uses the distribution mean, or the mean of the most likely mixture
        component when ``_use_modal_mean`` is set. The predicted delta is
        de-normalised if needed and added back onto ``timesteps``.
        """
        if not self._use_placeholders:
            return

        if self._use_modal_mean:
            all_means, modal_mean_indices = self._session.run(
                [self.means, tf.argmax(self.logits, axis=1)],
                feed_dict=self._get_run_dict(timesteps, actions))
            # Tuple (not list) index: pick, per row, the mean of its modal
            # component. A list of arrays is no longer read as a tuple index
            # by current NumPy.
            pred_state = all_means[np.arange(all_means.shape[0]),
                                   modal_mean_indices]
        else:
            pred_state = self._session.run(
                self.mean, feed_dict=self._get_run_dict(timesteps, actions))

        if self._normalize_observations:
            with self._session.as_default(), self._graph.as_default():
                mean_correction, variance_correction = (
                    self.output_norm_layer.get_weights())

            # Undo the output batch-norm (1e-3 mirrors the layer's epsilon —
            # NOTE(review): confirm against the BatchNormalization default).
            pred_state = pred_state * np.sqrt(variance_correction +
                                              1e-3) + mean_correction

        pred_state += timesteps
        return pred_state


ModuleNotFoundError: No module named 'tensorflow'

In [3]:
class DADSAgent(SAC):
    """SAC agent extended with a skill-dynamics model and the DADS intrinsic reward.

    The agent owns a ``SkillDynamics`` instance and uses its log-probabilities
    to compute the intrinsic reward in ``compute_dads_reward``.

    NOTE(review): the graph/placeholder methods rely on ``self.collect_data_spec``
    and ``self.train`` being provided by the base class — confirm that the
    ``SAC`` class imported in this notebook actually exposes them.
    """

    def __init__(self,
                 save_directory,
                 skill_dynamics_observation_size,
                 observation_modify_fn=None,
                 restrict_input_size=0,
                 latent_size=2,
                 latent_prior='cont_uniform',
                 prior_samples=100,
                 fc_layer_params=(256, 256),
                 normalize_observations=True,
                 network_type='default',
                 num_mixture_components=4,
                 fix_variance=True,
                 skill_dynamics_learning_rate=3e-4,
                 reweigh_batches=False,
                 agent_graph=None,
                 skill_dynamics_graph=None,
                 *sac_args,
                 **sac_kwargs):
        """Create the skill-dynamics model, then initialise the SAC base class.

        Args:
            save_directory: root directory; dynamics checkpoints go to
                ``<save_directory>/dynamics``.
            skill_dynamics_observation_size: observation width seen by the
                skill-dynamics model.
            observation_modify_fn: optional function applied to observations
                before they are passed to the skill-dynamics model.
            restrict_input_size: number of leading observation columns hidden
                from the dynamics input and stripped before agent training.
            latent_size: width of the skill vector.
            latent_prior: 'discrete_uniform', 'gaussian' or 'cont_uniform'.
            prior_samples: number of alternative skills sampled for the
                reward's denominator (0 enumerates the other discrete skills).
            fc_layer_params, normalize_observations, network_type,
            num_mixture_components, fix_variance, reweigh_batches: forwarded
                to ``SkillDynamics``.
            skill_dynamics_learning_rate: learning rate of the dynamics optimiser.
            agent_graph: graph for the agent (default: current default graph).
            skill_dynamics_graph: graph for the dynamics (default: agent graph).
            *sac_args, **sac_kwargs: forwarded to the SAC base class.
        """
        self._skill_dynamics_learning_rate = skill_dynamics_learning_rate
        self._latent_size = latent_size
        self._latent_prior = latent_prior
        self._prior_samples = prior_samples
        self._save_directory = save_directory
        self._restrict_input_size = restrict_input_size
        self._process_observation = observation_modify_fn

        if agent_graph is None:
            self._graph = tf.compat.v1.get_default_graph()
        else:
            self._graph = agent_graph

        if skill_dynamics_graph is None:
            skill_dynamics_graph = self._graph

        # instantiate the skill dynamics (the class is defined in this
        # notebook; there is no separate `skill_dynamics` module here)
        self._skill_dynamics = SkillDynamics(
            observation_size=skill_dynamics_observation_size,
            action_size=self._latent_size,
            restrict_observation=self._restrict_input_size,
            normalize_observations=normalize_observations,
            fc_layer_params=fc_layer_params,
            network_type=network_type,
            num_components=num_mixture_components,
            fix_variance=fix_variance,
            reweigh_batches=reweigh_batches,
            graph=skill_dynamics_graph)

        super(DADSAgent, self).__init__(*sac_args, **sac_kwargs)
        self._placeholders_in_place = False

    def compute_dads_reward(self, input_obs, cur_skill, target_obs):
        """Compute the DADS intrinsic reward for a batch of transitions.

        For each transition the log-probability under the executed skill is
        compared with the log-probabilities under ``num_reps`` alternative
        skills drawn from the prior.

        Returns:
            ``(intrinsic_reward, info)`` where ``info`` holds ``'logp'`` and
            the flattened ``'logp_altz'``.

        Raises:
            ValueError: if ``latent_prior`` is not a supported prior name.
        """
        if self._process_observation is not None:
            input_obs, target_obs = self._process_observation(
                input_obs), self._process_observation(target_obs)

        num_reps = (self._prior_samples
                    if self._prior_samples > 0 else self._latent_size - 1)
        # Tile the batch num_reps times; block i pairs with alternative skill i.
        input_obs_altz = np.concatenate([input_obs] * num_reps, axis=0)
        target_obs_altz = np.concatenate([target_obs] * num_reps, axis=0)
        num_alt = input_obs_altz.shape[0]

        # for marginalization of the denominator
        if self._latent_prior == 'discrete_uniform' and not self._prior_samples:
            # Enumerate every other skill by rolling the skill vector.
            alt_skill = np.concatenate(
                [np.roll(cur_skill, i, axis=1) for i in range(1, num_reps + 1)],
                axis=0)
        elif self._latent_prior == 'discrete_uniform':
            alt_skill = np.random.multinomial(
                1, [1. / self._latent_size] * self._latent_size, size=num_alt)
        elif self._latent_prior == 'gaussian':
            alt_skill = np.random.multivariate_normal(
                np.zeros(self._latent_size),
                np.eye(self._latent_size),
                size=num_alt)
        elif self._latent_prior == 'cont_uniform':
            alt_skill = np.random.uniform(
                low=-1.0, high=1.0, size=(num_alt, self._latent_size))
        else:
            raise ValueError(
                'Unknown latent_prior %r' % (self._latent_prior,))

        logp = self._skill_dynamics.get_log_prob(input_obs, cur_skill, target_obs)

        # denominator may require more memory than that of a GPU, break computation
        split_group = 20 * 4000
        if num_alt <= split_group:
            logp_altz = self._skill_dynamics.get_log_prob(
                input_obs_altz, alt_skill, target_obs_altz)
        else:
            # Consecutive chunks of at most split_group rows; the last chunk
            # holds the remainder.
            logp_altz = np.concatenate([
                self._skill_dynamics.get_log_prob(
                    input_obs_altz[start:start + split_group],
                    alt_skill[start:start + split_group],
                    target_obs_altz[start:start + split_group])
                for start in range(0, num_alt, split_group)
            ])
        logp_altz = np.array(np.array_split(logp_altz, num_reps))

        # final DADS reward
        intrinsic_reward = np.log(num_reps + 1) - np.log(1 + np.exp(
            np.clip(logp_altz - logp.reshape(1, -1), -50, 50)).sum(axis=0))

        return intrinsic_reward, {'logp': logp, 'logp_altz': logp_altz.flatten()}

    def get_experience_placeholder(self):
        """Create one placeholder per leaf of ``collect_data_spec`` and return the packed structure.

        Each placeholder has a leading batch dimension followed by a
        dimension of size 2.
        """
        self._placeholders_in_place = True
        self._placeholders = []
        for item in tf.nest.flatten(self.collect_data_spec):
            self._placeholders += [
                tf.compat.v1.placeholder(
                    item.dtype,
                    shape=(None, 2) if len(item.shape) == 0 else
                    (None, 2, item.shape[-1]),
                    name=item.name)
            ]
        self._policy_experience_ph = tf.nest.pack_sequence_as(
            self.collect_data_spec, self._placeholders)
        return self._policy_experience_ph

    def build_agent_graph(self):
        """Build the agent's train op on the experience placeholders and return it."""
        with self._graph.as_default():
            self.get_experience_placeholder()
            self.agent_train_op = self.train(self._policy_experience_ph)
            self.summary_ops = tf.compat.v1.summary.all_v2_summary_ops()
            return self.agent_train_op

    def build_skill_dynamics_graph(self):
        """Create placeholders, network and maximisation op of the skill dynamics."""
        self._skill_dynamics.make_placeholders()
        self._skill_dynamics.build_graph()
        self._skill_dynamics.increase_prob_op(
            learning_rate=self._skill_dynamics_learning_rate)

    def create_savers(self):
        """Create the checkpoint saver of the skill dynamics."""
        self._skill_dynamics.create_saver(
            save_prefix=os.path.join(self._save_directory, 'dynamics'))

    def set_sessions(self, initialize_or_restore_skill_dynamics, session=None):
        """Attach sessions to the agent and to the skill dynamics.

        When ``session`` is None the agent creates a session on its own graph
        and the skill dynamics creates a separate one on its graph.
        """
        if session is not None:
            self._session = session
        else:
            self._session = tf.compat.v1.Session(graph=self._graph)
        self._skill_dynamics.set_session(
            initialize_or_restore_variables=initialize_or_restore_skill_dynamics,
            session=session)

    def save_variables(self, global_step):
        """Checkpoint the skill-dynamics variables."""
        self._skill_dynamics.save_variables(global_step=global_step)

    def _get_dict(self, trajectories, batch_size=-1):
        """Map the experience placeholders to (a random subset of) ``trajectories``."""
        tf.nest.assert_same_structure(self.collect_data_spec, trajectories)
        num_items = trajectories.observation.shape[0]
        if batch_size > 0:
            shuffled_batch = np.random.permutation(num_items)[:batch_size]
        else:
            shuffled_batch = np.arange(num_items)

        return {
            placeholder: val[shuffled_batch]
            for placeholder, val in zip(self._placeholders,
                                        tf.nest.flatten(trajectories))
        }

    def train_loop(self,
                   trajectories,
                   recompute_reward=False,
                   batch_size=-1,
                   num_steps=1):
        """Run ``num_steps`` agent updates on ``trajectories``.

        With ``recompute_reward`` the first-step reward is replaced by a
        freshly computed DADS reward; the last ``latent_size`` observation
        columns are taken to be the skill.

        Returns:
            ``(new_reward, info)`` when the reward was recomputed,
            ``(None, None)`` otherwise, and ``None`` when the experience
            placeholders have not been created yet.
        """
        if not self._placeholders_in_place:
            return

        if recompute_reward:
            input_obs = trajectories.observation[:, 0, :-self._latent_size]
            cur_skill = trajectories.observation[:, 0, -self._latent_size:]
            target_obs = trajectories.observation[:, 1, :-self._latent_size]
            new_reward, info = self.compute_dads_reward(input_obs, cur_skill,
                                                        target_obs)
            trajectories = trajectories._replace(
                reward=np.concatenate(
                    [np.expand_dims(new_reward, axis=1),
                     trajectories.reward[:, 1:]],
                    axis=1))

        # TODO(architsh):all agent specs should be the same as env specs, shift preprocessing to actor/critic networks
        if self._restrict_input_size > 0:
            trajectories = trajectories._replace(
                observation=trajectories.observation[
                    :, :, self._restrict_input_size:])

        for _ in range(num_steps):
            self._session.run([self.agent_train_op, self.summary_ops],
                              feed_dict=self._get_dict(
                                  trajectories, batch_size=batch_size))

        if recompute_reward:
            return new_reward, info
        return None, None

    @property
    def skill_dynamics(self):
        """The underlying ``SkillDynamics`` model."""
        return self._skill_dynamics

NameError: name 'SAC' is not defined

In [None]:
class DADSLearner(SACLearner):
    "Learner for the DADS agent; currently adds nothing on top of `SACLearner`."
    pass

In [None]:
# Smoke-test cell: build a DADS agent, a data pipeline and a learner on one gym env.
env='Pendulum-v0'
# NOTE(review): DADSAgent.__init__ declares (save_directory, skill_dynamics_observation_size, ...)
# as its first positional parameters, so `3` binds to save_directory and the action space
# to skill_dynamics_observation_size; gamma/tau/alpha are forwarded through **sac_kwargs.
# This call looks carried over from the SAC example — confirm the intended arguments.
agent=DADSAgent(3,gym.make(env).action_space,gamma=0.99,tau=0.005,alpha=0.2)

# Experience source: a single environment with bs=1 and no worker processes.
block=FirstLastExperienceBlock(agent=agent,seed=0,n_steps=2,exclude_nones=True,
                               dls_kwargs={'bs':1,'num_workers':0,'verbose':False,'indexed':True,'shuffle_train':False})
# NOTE(review): `(block)` is just `block`, not a 1-tuple — presumably the data block
# accepts a single block here; confirm.
# The splitter returns False for every item — presumably so that nothing lands in the
# validation split; verify against FuncSplitter.
blk=IterableDataBlock(blocks=(block),
                      splitter=FuncSplitter(lambda x:False),
#                       batch_tfms=lambda x:(x['s'],x),
                     )
dls=blk.dataloaders([env]*1,n=1000,device=default_device())

# Replay-buffer and critic-trainer callbacks are reused from the SAC implementation;
# a second env instance is created only to read its `_max_episode_steps`.
learner=DADSLearner(dls,agent=agent,cbs=[ExperienceReplay(sz=1000000,bs=64,starting_els=1000,max_steps=gym.make(env)._max_episode_steps),SACCriticTrainer],
                   metrics=[AvgEpisodeRewardMetric()])
learner.fit(30,lr=0.001,wd=0)