# Custom LSTM Policy

Source code: <https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/policies.py>

In [1]:
import tensorflow as tf
import numpy as np
from stable_baselines.common.tf_util import batch_to_seq, seq_to_batch
from stable_baselines.common.tf_layers import conv, linear, conv_to_fc, lstm
from stable_baselines.common.policies import RecurrentActorCriticPolicy, ActorCriticPolicy,register_policy, nature_cnn

  for external in metadata.entry_points().get(self.group, []):


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



  "stable-baselines is in maintenance mode, please use [Stable-Baselines3 (SB3)](https://github.com/DLR-RM/stable-baselines3) for an up-to-date version. You can find a [migration guide](https://stable-baselines3.readthedocs.io/en/master/guide/migration.html) in SB3 documentation."


## LSTM Policy 1
 + 1 Lstm Layer with 128 nodes (default: 256)

In [9]:
class LstmPolicy1(RecurrentActorCriticPolicy):
    """
    Policy object that implements actor critic, using LSTMs.
    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param n_lstm: (int) The number of LSTM cells (for recurrent policies)
    :param reuse: (bool) If the policy is reusable or not
    :param layers: ([int]) The size of the Neural network before the LSTM layer  (if None, default to [64, 64])
    :param net_arch: (list) Specification of the actor-critic policy network architecture. Notation similar to the
        format described in mlp_extractor but with additional support for a 'lstm' entry in the shared network part.
    :param act_fun: (tf.func) the activation function to use in the neural network.
    :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction
    :param layer_norm: (bool) Whether or not to use layer normalizing LSTMs
    :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp")
    :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    recurrent = True

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=128, reuse=False, layers=None,
                 net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="mlp",
                 **kwargs):
        # state_shape = [n_lstm * 2] dim because of the cell and hidden states of the LSTM
        super(LstmPolicy1, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                         state_shape=(2 * n_lstm, ), reuse=reuse,
                                         scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        if net_arch is None:  # Legacy mode
            if layers is None:
                layers = [64, 64]
            else:
                warnings.warn("The layers parameter is deprecated. Use the net_arch parameter instead.")

            with tf.variable_scope("model", reuse=reuse):
                if feature_extraction == "cnn":
                    extracted_features = cnn_extractor(self.processed_obs, **kwargs)
                else:
                    extracted_features = tf.layers.flatten(self.processed_obs)
                    for i, layer_size in enumerate(layers):
                        extracted_features = act_fun(linear(extracted_features, 'pi_fc' + str(i), n_hidden=layer_size,
                                                            init_scale=np.sqrt(2)))
                input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps)
                masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                             layer_norm=layer_norm)
                rnn_output, self.snew = lstm(rnn_output, masks, self.snew, 'lstm2', n_hidden=n_lstm,
                                             layer_norm=layer_norm)
                rnn_output = seq_to_batch(rnn_output)
                value_fn = linear(rnn_output, 'vf', 1)

                self._proba_distribution, self._policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

            self._value_fn = value_fn
        else:  # Use the new net_arch parameter
            if layers is not None:
                warnings.warn("The new net_arch parameter overrides the deprecated layers parameter.")
            if feature_extraction == "cnn":
                raise NotImplementedError()

            with tf.variable_scope("model", reuse=reuse):
                latent = tf.layers.flatten(self.processed_obs)
                policy_only_layers = []  # Layer sizes of the network that only belongs to the policy network
                value_only_layers = []  # Layer sizes of the network that only belongs to the value network

                # Iterate through the shared layers and build the shared parts of the network
                lstm_layer_constructed = False
                for idx, layer in enumerate(net_arch):
                    if isinstance(layer, int):  # Check that this is a shared layer
                        layer_size = layer
                        latent = act_fun(linear(latent, "shared_fc{}".format(idx), layer_size, init_scale=np.sqrt(2)))
                    elif layer == "lstm":
                        if lstm_layer_constructed:
                            raise ValueError("The net_arch parameter must only contain one occurrence of 'lstm'!")
                        input_sequence = batch_to_seq(latent, self.n_env, n_steps)
                        masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                        rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                                     layer_norm=layer_norm)
                        latent = seq_to_batch(rnn_output)
                        lstm_layer_constructed = True
                    else:
                        assert isinstance(layer, dict), "Error: the net_arch list can only contain ints and dicts"
                        if 'pi' in layer:
                            assert isinstance(layer['pi'],
                                              list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                            policy_only_layers = layer['pi']

                        if 'vf' in layer:
                            assert isinstance(layer['vf'],
                                              list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                            value_only_layers = layer['vf']
                        break  # From here on the network splits up in policy and value network

                # Build the non-shared part of the policy-network
                latent_policy = latent
                for idx, pi_layer_size in enumerate(policy_only_layers):
                    if pi_layer_size == "lstm":
                        raise NotImplementedError("LSTMs are only supported in the shared part of the policy network.")
                    assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
                    latent_policy = act_fun(
                        linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2)))

                # Build the non-shared part of the value-network
                latent_value = latent
                for idx, vf_layer_size in enumerate(value_only_layers):
                    if vf_layer_size == "lstm":
                        raise NotImplementedError("LSTMs are only supported in the shared part of the value function "
                                                  "network.")
                    assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
                    latent_value = act_fun(
                        linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2)))

                if not lstm_layer_constructed:
                    raise ValueError("The net_arch parameter must contain at least one occurrence of 'lstm'!")

                self._value_fn = linear(latent_value, 'vf', 1)
                # TODO: why not init_scale = 0.001 here like in the feedforward
                self._proba_distribution, self._policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(latent_policy, latent_value)
        self._setup_init()

    def step(self, obs, state=None, mask=None, deterministic=False):
        if deterministic:
            return self.sess.run([self.deterministic_action, self.value_flat, self.snew, self.neglogp],
                                 {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask})
        else:
            return self.sess.run([self.action, self.value_flat, self.snew, self.neglogp],
                                 {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask})

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask})

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self.value_flat, {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask})

In [11]:
import gym
from stable_baselines.common.policies import LstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines import A2C

env_1 = gym.make('CartPole-v1')
model_1 = A2C(LstmPolicy1, env_1, verbose = 1)
model_1.learn(total_timesteps = 20000)
model_1.save("a2c_cartpole_ver1")

Wrapping the env in a DummyVecEnv.


Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


---------------------------------
| explained_variance | -0.00161 |
| fps                | 4        |
| nupdates           | 1        |
| policy_entropy     | 0.693    |
| total_timesteps    | 5        |
| value_loss         | 10.7     |
---------------------------------
---------------------------------
| explained_variance | 0.000573 |
| fps                | 181      |
| nupdates           | 100      |
| policy_entropy     | 0.693    |
| total_timesteps    | 500      |
| value_loss         | 6.05     |
---------------------------------
---------------------------------
| explained_variance | 0.0014   

---------------------------------
| explained_variance | -0.0128  |
| fps                | 283      |
| nupdates           | 1700     |
| policy_entropy     | 0.693    |
| total_timesteps    | 8500     |
| value_loss         | 1.54     |
---------------------------------
---------------------------------
| explained_variance | -0.95    |
| fps                | 283      |
| nupdates           | 1800     |
| policy_entropy     | 0.482    |
| total_timesteps    | 9000     |
| value_loss         | 1.03e+03 |
---------------------------------
---------------------------------
| explained_variance | -0.984   |
| fps                | 283      |
| nupdates           | 1900     |
| policy_entropy     | 0.614    |
| total_timesteps    | 9500     |
| value_loss         | 1.04     |
---------------------------------
---------------------------------
| explained_variance | -0.518   |
| fps                | 283      |
| nupdates           | 2000     |
| policy_entropy     | 0.3      |
| total_timest

In [16]:
eval_env1 = DummyVecEnv([lambda: gym.make('CartPole-v1')])
model_1 = model_1.load('a2c_cartpole_ver1')
mean_reward_1, std_reward_1 = evaluate_policy(model_1, eval_env1, n_eval_episodes = 100, return_episode_rewards = False)
print(mean_reward_1)
print(std_reward_1)

Loading a model without an environment, this model cannot be trained until it has a valid environment.

9.34
0.7512656


## LSTM Policy 2
 + 2 Lstm Layers with 128 nodes each

In [17]:
class LstmPolicy2(RecurrentActorCriticPolicy):
    """
    Policy object that implements actor critic, using LSTMs.
    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param n_lstm: (int) The number of LSTM cells (for recurrent policies)
    :param reuse: (bool) If the policy is reusable or not
    :param layers: ([int]) The size of the Neural network before the LSTM layer  (if None, default to [64, 64])
    :param net_arch: (list) Specification of the actor-critic policy network architecture. Notation similar to the
        format described in mlp_extractor but with additional support for a 'lstm' entry in the shared network part.
    :param act_fun: (tf.func) the activation function to use in the neural network.
    :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction
    :param layer_norm: (bool) Whether or not to use layer normalizing LSTMs
    :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp")
    :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    recurrent = True

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=128, reuse=False, layers=None,
                 net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="mlp",
                 **kwargs):
        # state_shape = [n_lstm * 2] dim because of the cell and hidden states of the LSTM
        super(LstmPolicy2, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                         state_shape=(2 * n_lstm, ), reuse=reuse,
                                         scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)

        if net_arch is None:  # Legacy mode
            if layers is None:
                layers = [64, 64]
            else:
                warnings.warn("The layers parameter is deprecated. Use the net_arch parameter instead.")

            with tf.variable_scope("model", reuse=reuse):
                if feature_extraction == "cnn":
                    extracted_features = cnn_extractor(self.processed_obs, **kwargs)
                else:
                    extracted_features = tf.layers.flatten(self.processed_obs)
                    for i, layer_size in enumerate(layers):
                        extracted_features = act_fun(linear(extracted_features, 'pi_fc' + str(i), n_hidden=layer_size,
                                                            init_scale=np.sqrt(2)))
                input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps)
                masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                             layer_norm=layer_norm)
                rnn_output, self.snew = lstm(rnn_output, masks, self.snew, 'lstm2', n_hidden=n_lstm,
                                             layer_norm=layer_norm)
                rnn_output = seq_to_batch(rnn_output)
                value_fn = linear(rnn_output, 'vf', 1)

                self._proba_distribution, self._policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

            self._value_fn = value_fn
        else:  # Use the new net_arch parameter
            if layers is not None:
                warnings.warn("The new net_arch parameter overrides the deprecated layers parameter.")
            if feature_extraction == "cnn":
                raise NotImplementedError()

            with tf.variable_scope("model", reuse=reuse):
                latent = tf.layers.flatten(self.processed_obs)
                policy_only_layers = []  # Layer sizes of the network that only belongs to the policy network
                value_only_layers = []  # Layer sizes of the network that only belongs to the value network

                # Iterate through the shared layers and build the shared parts of the network
                lstm_layer_constructed = False
                for idx, layer in enumerate(net_arch):
                    if isinstance(layer, int):  # Check that this is a shared layer
                        layer_size = layer
                        latent = act_fun(linear(latent, "shared_fc{}".format(idx), layer_size, init_scale=np.sqrt(2)))
                    elif layer == "lstm":
                        if lstm_layer_constructed:
                            raise ValueError("The net_arch parameter must only contain one occurrence of 'lstm'!")
                        input_sequence = batch_to_seq(latent, self.n_env, n_steps)
                        masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                        rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                                     layer_norm=layer_norm)
                        rnn_output, self.snew = lstm(rnn_output, masks, self.states_ph, 'lstm2', n_hidden=n_lstm, 
                                             layer_norm=layer_norm)
                        latent = seq_to_batch(rnn_output)
                        lstm_layer_constructed = True
                    else:
                        assert isinstance(layer, dict), "Error: the net_arch list can only contain ints and dicts"
                        if 'pi' in layer:
                            assert isinstance(layer['pi'],
                                              list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                            policy_only_layers = layer['pi']

                        if 'vf' in layer:
                            assert isinstance(layer['vf'],
                                              list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                            value_only_layers = layer['vf']
                        break  # From here on the network splits up in policy and value network

                # Build the non-shared part of the policy-network
                latent_policy = latent
                for idx, pi_layer_size in enumerate(policy_only_layers):
                    if pi_layer_size == "lstm":
                        raise NotImplementedError("LSTMs are only supported in the shared part of the policy network.")
                    assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
                    latent_policy = act_fun(
                        linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2)))

                # Build the non-shared part of the value-network
                latent_value = latent
                for idx, vf_layer_size in enumerate(value_only_layers):
                    if vf_layer_size == "lstm":
                        raise NotImplementedError("LSTMs are only supported in the shared part of the value function "
                                                  "network.")
                    assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
                    latent_value = act_fun(
                        linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2)))

                if not lstm_layer_constructed:
                    raise ValueError("The net_arch parameter must contain at least one occurrence of 'lstm'!")

                self._value_fn = linear(latent_value, 'vf', 1)
                # TODO: why not init_scale = 0.001 here like in the feedforward
                self._proba_distribution, self._policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(latent_policy, latent_value)
        self._setup_init()

    def step(self, obs, state=None, mask=None, deterministic=False):
        if deterministic:
            return self.sess.run([self.deterministic_action, self.value_flat, self.snew, self.neglogp],
                                 {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask})
        else:
            return self.sess.run([self.action, self.value_flat, self.snew, self.neglogp],
                                 {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask})

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask})

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self.value_flat, {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask})

In [18]:
env_2 = gym.make('CartPole-v1')
model_2 = A2C(LstmPolicy2, env_2, verbose = 1)
model_2.learn(total_timesteps = 20000)
model_2.save("a2c_cartpole_ver2")

Wrapping the env in a DummyVecEnv.
---------------------------------
| explained_variance | 0.01     |
| fps                | 5        |
| nupdates           | 1        |
| policy_entropy     | 0.693    |
| total_timesteps    | 5        |
| value_loss         | 10.6     |
---------------------------------
---------------------------------
| explained_variance | 0.00771  |
| fps                | 201      |
| nupdates           | 100      |
| policy_entropy     | 0.693    |
| total_timesteps    | 500      |
| value_loss         | 10.6     |
---------------------------------
---------------------------------
| explained_variance | -0.0246  |
| fps                | 238      |
| nupdates           | 200      |
| policy_entropy     | 0.693    |
| total_timesteps    | 1000     |
| value_loss         | 10.7     |
---------------------------------
---------------------------------
| explained_variance | 0.0259   |
| fps                | 254      |
| nupdates           | 300      |
| policy_entr

---------------------------------
| explained_variance | -0.253   |
| fps                | 280      |
| nupdates           | 3000     |
| policy_entropy     | 0.00469  |
| total_timesteps    | 15000    |
| value_loss         | 40.7     |
---------------------------------
---------------------------------
| explained_variance | 0.262    |
| fps                | 280      |
| nupdates           | 3100     |
| policy_entropy     | 0.00383  |
| total_timesteps    | 15500    |
| value_loss         | 226      |
---------------------------------
---------------------------------
| explained_variance | 0.0809   |
| fps                | 280      |
| nupdates           | 3200     |
| policy_entropy     | 0.00946  |
| total_timesteps    | 16000    |
| value_loss         | 235      |
---------------------------------
---------------------------------
| explained_variance | -1.6     |
| fps                | 280      |
| nupdates           | 3300     |
| policy_entropy     | 0.000211 |
| total_timest

In [21]:
eval_env2 = DummyVecEnv([lambda:gym.make('CartPole-v1')])
model_2 = model_2.load("a2c_cartpole_ver2")
mean_reward_2, std_reward_2 = evaluate_policy(model_2, eval_env2, n_eval_episodes=100, return_episode_rewards=False)
print(mean_reward_2, std_reward_2)

Loading a model without an environment, this model cannot be trained until it has a valid environment.

9.31 0.77064914


## LSTM Policy 3
 + 2 Lstm layer (128 nodes each) + 2 MLP layers for actor and critic layer

In [25]:
env = gym.make('CartPole-v1')
model_3 = A2C(LstmPolicy2, env, verbose = 1, policy_kwargs = dict(net_arch = ['lstm', dict(pi = [64, 64], vf = [64, 64])]))
model_3.learn(total_timesteps = 20000)
model_3.save('a2c_cartpole_ver3')

Wrapping the env in a DummyVecEnv.
---------------------------------
| explained_variance | 0.00179  |
| fps                | 5        |
| nupdates           | 1        |
| policy_entropy     | 0.693    |
| total_timesteps    | 5        |
| value_loss         | 10.6     |
---------------------------------
---------------------------------
| explained_variance | 9.06e-06 |
| fps                | 198      |
| nupdates           | 100      |
| policy_entropy     | 0.693    |
| total_timesteps    | 500      |
| value_loss         | 10.7     |
---------------------------------
---------------------------------
| explained_variance | 0.000252 |
| fps                | 240      |
| nupdates           | 200      |
| policy_entropy     | 0.693    |
| total_timesteps    | 1000     |
| value_loss         | 10.6     |
---------------------------------
---------------------------------
| explained_variance | -0.0226  |
| fps                | 255      |
| nupdates           | 300      |
| policy_entr

---------------------------------
| explained_variance | -0.00166 |
| fps                | 297      |
| nupdates           | 3000     |
| policy_entropy     | 0.549    |
| total_timesteps    | 15000    |
| value_loss         | 2.05     |
---------------------------------
---------------------------------
| explained_variance | -0.0833  |
| fps                | 297      |
| nupdates           | 3100     |
| policy_entropy     | 0.687    |
| total_timesteps    | 15500    |
| value_loss         | 1.91     |
---------------------------------
---------------------------------
| explained_variance | -0.0134  |
| fps                | 297      |
| nupdates           | 3200     |
| policy_entropy     | 0.558    |
| total_timesteps    | 16000    |
| value_loss         | 1.65     |
---------------------------------
---------------------------------
| explained_variance | -0.0268  |
| fps                | 297      |
| nupdates           | 3300     |
| policy_entropy     | 0.488    |
| total_timest

In [27]:
eval_env3 = DummyVecEnv([lambda:gym.make('CartPole-v1')])
model_3 = model_3.load("a2c_cartpole_ver3")
print(model_3.get_parameter_list())
mean_reward_3, std_reward_3 = evaluate_policy(model_3, eval_env3, n_eval_episodes=100, return_episode_rewards=False)
print('\n')
print(mean_reward_3, std_reward_3)

Loading a model without an environment, this model cannot be trained until it has a valid environment.

[<tf.Variable 'model/lstm1/wx:0' shape=(4, 512) dtype=float32_ref>, <tf.Variable 'model/lstm1/wh:0' shape=(128, 512) dtype=float32_ref>, <tf.Variable 'model/lstm1/b:0' shape=(512,) dtype=float32_ref>, <tf.Variable 'model/lstm2/wx:0' shape=(128, 512) dtype=float32_ref>, <tf.Variable 'model/lstm2/wh:0' shape=(128, 512) dtype=float32_ref>, <tf.Variable 'model/lstm2/b:0' shape=(512,) dtype=float32_ref>, <tf.Variable 'model/pi_fc0/w:0' shape=(128, 64) dtype=float32_ref>, <tf.Variable 'model/pi_fc0/b:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'model/pi_fc1/w:0' shape=(64, 64) dtype=float32_ref>, <tf.Variable 'model/pi_fc1/b:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'model/vf_fc0/w:0' shape=(128, 64) dtype=float32_ref>, <tf.Variable 'model/vf_fc0/b:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'model/vf_fc1/w:0' shape=(64, 64) dtype=float32_ref>, <tf.Variable 'model/vf_fc1/b:0'

## Cutom Lstm Policy based MlpLstmPolicy

In [33]:
class MlpLstmPolicy2(LstmPolicy2):
    """
    Policy object that implements actor critic, using LSTMs with a MLP feature extraction
    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param n_lstm: (int) The number of LSTM cells (for recurrent policies)
    :param reuse: (bool) If the policy is reusable or not
    :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm = 128, reuse = False, **_kwargs):
        super(MlpLstmPolicy2, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
                                            layer_norm=False, feature_extraction = "mlp", **_kwargs)

In [34]:
env = gym.make('CartPole-v1')
model = A2C(MlpLstmPolicy2, env, verbose = 1)
model.learn(total_timesteps = 20000)
params = model.get_parameter_list()
print(params)

Wrapping the env in a DummyVecEnv.
---------------------------------
| explained_variance | -0.00054 |
| fps                | 5        |
| nupdates           | 1        |
| policy_entropy     | 0.693    |
| total_timesteps    | 5        |
| value_loss         | 10.7     |
---------------------------------
---------------------------------
| explained_variance | -0.00265 |
| fps                | 192      |
| nupdates           | 100      |
| policy_entropy     | 0.693    |
| total_timesteps    | 500      |
| value_loss         | 6.04     |
---------------------------------
---------------------------------
| explained_variance | 0.000757 |
| fps                | 229      |
| nupdates           | 200      |
| policy_entropy     | 0.693    |
| total_timesteps    | 1000     |
| value_loss         | 10.6     |
---------------------------------
---------------------------------
| explained_variance | 0.000269 |
| fps                | 244      |
| nupdates           | 300      |
| policy_entr

---------------------------------
| explained_variance | -31.3    |
| fps                | 287      |
| nupdates           | 3000     |
| policy_entropy     | 0.00774  |
| total_timesteps    | 15000    |
| value_loss         | 293      |
---------------------------------
---------------------------------
| explained_variance | -9.47    |
| fps                | 288      |
| nupdates           | 3100     |
| policy_entropy     | 8.01e-05 |
| total_timesteps    | 15500    |
| value_loss         | 0.338    |
---------------------------------
---------------------------------
| explained_variance | -0.494   |
| fps                | 287      |
| nupdates           | 3200     |
| policy_entropy     | 0.00211  |
| total_timesteps    | 16000    |
| value_loss         | 5.73     |
---------------------------------
---------------------------------
| explained_variance | -45      |
| fps                | 287      |
| nupdates           | 3300     |
| policy_entropy     | 0.00168  |
| total_timest

In [35]:
eval_env = DummyVecEnv([lambda:gym.make('CartPole-v1')])
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100, return_episode_rewards=False)
print('\n')
print(mean_reward, std_reward)



11.57 0.8631917
