In [20]:
import time
import gym
import random
import numpy as np
from tensorflow import keras
#import keras
#from keras import layers
from tensorflow.keras import layers
import tensorflow as tf
print(tf. __version__)

1.13.1


In [2]:
from baselines.ppo2 import ppo2
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

from baselines import bench
from baselines import logger
from baselines import deepq
from baselines.common.tf_util import make_session

In [3]:
# Model / data parameters
num_classes = 10
input_shape = (28, 28, 1)

In [4]:
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

In [5]:
# Scale images to the [0, 1] range
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

In [6]:
# Make sure images have shape (28, 28, 1)
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


In [7]:
# convert class vectors to binary class matrices
y_train_one_hot = keras.utils.to_categorical(y_train, num_classes)
y_test_one_hot = keras.utils.to_categorical(y_test, num_classes)

In [8]:
y_train

array([5, 0, 4, ..., 5, 6, 8], dtype=uint8)

In [9]:
y_train_one_hot

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)

In [18]:
# Baseline model traditional supervise learning uing NN classifier
def keras_train(batch_size=32, epochs=2):
    model = tf.keras.Sequential(
        [
            keras.Input(shape=input_shape),
            #keras.InputLayer(input_shape=input_shape),
            #model.add(InputLayer(input_shape=shape, name=name))
            layers.Flatten(),
            layers.Dense(64, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(num_classes, activation='softmax')
        ]
    )

    model.summary()

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    start_time = time.time()
    model.fit(x_train, y_train_one_hot, batch_size=batch_size, epochs=epochs, validation_split=0.1)
    end_time = time.time()

    score = model.evaluate(x_test, y_test_one_hot, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    print('Training Time:', end_time - start_time)

keras_train()

TypeError: The added layer must be an instance of class Layer. Found: Tensor("input_7:0", shape=(?, 28, 28, 1), dtype=float32)

In [22]:
class MnistEnv(gym.Env):
    def __init__(self, images_per_episode=1, dataset=(x_train, y_train), random=True):
        super().__init__()

        self.action_space = gym.spaces.Discrete(10)
        self.observation_space = gym.spaces.Box(low=0, high=1,
                                                shape=(28, 28, 1),
                                                dtype=np.float32)

        self.images_per_episode = images_per_episode
        self.step_count = 0

        self.x, self.y = dataset
        self.random = random
        self.dataset_idx = 0

    def step(self, action):
        done = False
        reward = int(action == self.expected_action)

        obs = self._next_obs()

        self.step_count += 1
        if self.step_count >= self.images_per_episode:
            done = True

        return obs, reward, done, {}

    def reset(self):
        self.step_count = 0

        obs = self._next_obs()
        return obs

    def _next_obs(self):
        if self.random:
            next_obs_idx = random.randint(0, len(self.x) - 1)
            self.expected_action = int(self.y[next_obs_idx])
            obs = self.x[next_obs_idx]

        else:
            obs = self.x[self.dataset_idx]
            self.expected_action = int(self.y[self.dataset_idx])

            self.dataset_idx += 1
            if self.dataset_idx >= len(self.x):
                raise StopIteration()

        return obs

In [23]:
def mnist_dqn():
    logger.configure(dir='./logs/mnist_dqn', format_strs=['stdout', 'tensorboard'])
    env = MnistEnv(images_per_episode=1)
    env = bench.Monitor(env, logger.get_dir())

    model = deepq.learn(
        env,
        "mlp",
        num_layers=1,
        num_hidden=64,
        activation=tf.nn.relu,
        hiddens=[32],
        dueling=True,
        lr=1e-4,
        total_timesteps=int(1.2e5),
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
    )

    model.save('dqn_mnist.pkl')
    env.close()

    return model

start_time = time.time()
dqn_model = mnist_dqn()
print("DQN Training Time:", time.time() - start_time)

Logging to ./logs/mnist_dqn
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.




Instructions for updating:
Use tf.cast instead.
--------------------------------------
| % time spent exploring  | 99       |
| episodes                | 100      |
| mean 100 episode reward | 0.1      |
| steps                   | 98       |
--------------------------------------
--------------------------------------
| % time spent exploring  | 98       |
| episodes                | 200      |
| mean 100 episode reward | 0.1      |
| steps                   | 198      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 97       |
| episodes                | 300      |
| mean 100 episode reward | 0.1      |
| steps                   | 298      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 96       |
| episodes                | 400      |
| mean 100 episode reward | 0.2      |
| steps                   | 398      |
--------------------------------------
----------------

--------------------------------------
| % time spent exploring  | 71       |
| episodes                | 3.5e+03  |
| mean 100 episode reward | 0        |
| steps                   | 3.5e+03  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 70       |
| episodes                | 3.6e+03  |
| mean 100 episode reward | 0.1      |
| steps                   | 3.6e+03  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 69       |
| episodes                | 3.7e+03  |
| mean 100 episode reward | 0.1      |
| steps                   | 3.7e+03  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 68       |
| episodes                | 3.8e+03  |
| mean 100 episode reward | 0.1      |
| steps                   | 3.8e+03  |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 41       |
| episodes                | 7.1e+03  |
| mean 100 episode reward | 0.1      |
| steps                   | 7.1e+03  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 40       |
| episodes                | 7.2e+03  |
| mean 100 episode reward | 0.1      |
| steps                   | 7.2e+03  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 39       |
| episodes                | 7.3e+03  |
| mean 100 episode reward | 0.2      |
| steps                   | 7.3e+03  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 38       |
| episodes                | 7.4e+03  |
| mean 100 episode reward | 0.1      |
| steps                   | 7.4e+03  |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 11       |
| episodes                | 1.07e+04 |
| mean 100 episode reward | 0.2      |
| steps                   | 1.07e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 10       |
| episodes                | 1.08e+04 |
| mean 100 episode reward | 0.2      |
| steps                   | 1.08e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 10       |
| episodes                | 1.09e+04 |
| mean 100 episode reward | 0.2      |
| steps                   | 1.09e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 9        |
| episodes                | 1.1e+04  |
| mean 100 episode reward | 0.2      |
| steps                   | 1.1e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.43e+04 |
| mean 100 episode reward | 0.5      |
| steps                   | 1.43e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.44e+04 |
| mean 100 episode reward | 0.7      |
| steps                   | 1.44e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.45e+04 |
| mean 100 episode reward | 0.6      |
| steps                   | 1.45e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.46e+04 |
| mean 100 episode reward | 0.7      |
| steps                   | 1.46e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.79e+04 |
| mean 100 episode reward | 0.7      |
| steps                   | 1.79e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.8e+04  |
| mean 100 episode reward | 0.8      |
| steps                   | 1.8e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.81e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 1.81e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.82e+04 |
| mean 100 episode reward | 0.8      |
| steps                   | 1.82e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.14e+04 |
| mean 100 episode reward | 0.8      |
| steps                   | 2.14e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.15e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 2.15e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.16e+04 |
| mean 100 episode reward | 0.8      |
| steps                   | 2.16e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.17e+04 |
| mean 100 episode reward | 0.8      |
| steps                   | 2.17e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.5e+04  |
| mean 100 episode reward | 0.9      |
| steps                   | 2.5e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.51e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 2.51e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.52e+04 |
| mean 100 episode reward | 0.8      |
| steps                   | 2.52e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.53e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 2.53e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.86e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 2.86e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.87e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 2.87e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.88e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 2.88e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 2.89e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 2.89e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.21e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 3.21e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.22e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 3.22e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.23e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 3.23e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.24e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 3.24e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.57e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 3.57e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.58e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 3.58e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.59e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 3.59e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.6e+04  |
| mean 100 episode reward | 0.9      |
| steps                   | 3.6e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.93e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 3.93e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.94e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 3.94e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.95e+04 |
| mean 100 episode reward | 1        |
| steps                   | 3.95e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 3.96e+04 |
| mean 100 episode reward | 1        |
| steps                   | 3.96e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 4.29e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 4.29e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 4.3e+04  |
| mean 100 episode reward | 0.8      |
| steps                   | 4.3e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 4.31e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 4.31e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 4.32e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 4.32e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 4.65e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 4.65e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 4.66e+04 |
| mean 100 episode reward | 1        |
| steps                   | 4.66e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 4.67e+04 |
| mean 100 episode reward | 1        |
| steps                   | 4.67e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 4.68e+04 |
| mean 100 episode reward | 1        |
| steps                   | 4.68e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.01e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 5.01e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.02e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 5.02e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.03e+04 |
| mean 100 episode reward | 1        |
| steps                   | 5.03e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.04e+04 |
| mean 100 episode reward | 1        |
| steps                   | 5.04e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.37e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 5.37e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.38e+04 |
| mean 100 episode reward | 1        |
| steps                   | 5.38e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.39e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 5.39e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.4e+04  |
| mean 100 episode reward | 1        |
| steps                   | 5.4e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.73e+04 |
| mean 100 episode reward | 1        |
| steps                   | 5.73e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.74e+04 |
| mean 100 episode reward | 1        |
| steps                   | 5.74e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.75e+04 |
| mean 100 episode reward | 1        |
| steps                   | 5.75e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 5.76e+04 |
| mean 100 episode reward | 1        |
| steps                   | 5.76e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.09e+04 |
| mean 100 episode reward | 1        |
| steps                   | 6.09e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.1e+04  |
| mean 100 episode reward | 1        |
| steps                   | 6.1e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.11e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 6.11e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.12e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 6.12e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.45e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 6.45e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.46e+04 |
| mean 100 episode reward | 1        |
| steps                   | 6.46e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.47e+04 |
| mean 100 episode reward | 1        |
| steps                   | 6.47e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.48e+04 |
| mean 100 episode reward | 1        |
| steps                   | 6.48e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.81e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 6.81e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.82e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 6.82e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.83e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 6.83e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 6.84e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 6.84e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.16e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 7.16e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.17e+04 |
| mean 100 episode reward | 1        |
| steps                   | 7.17e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.18e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 7.18e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.19e+04 |
| mean 100 episode reward | 1        |
| steps                   | 7.19e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.52e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 7.52e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.53e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 7.53e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.54e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 7.54e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.55e+04 |
| mean 100 episode reward | 1        |
| steps                   | 7.55e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.88e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 7.88e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.89e+04 |
| mean 100 episode reward | 1        |
| steps                   | 7.89e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.9e+04  |
| mean 100 episode reward | 0.9      |
| steps                   | 7.9e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 7.91e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 7.91e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.24e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 8.24e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.25e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 8.25e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.26e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 8.26e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.27e+04 |
| mean 100 episode reward | 1        |
| steps                   | 8.27e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.6e+04  |
| mean 100 episode reward | 1        |
| steps                   | 8.6e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.61e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 8.61e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.62e+04 |
| mean 100 episode reward | 1        |
| steps                   | 8.62e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.63e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 8.63e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.96e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 8.96e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.97e+04 |
| mean 100 episode reward | 1        |
| steps                   | 8.97e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.98e+04 |
| mean 100 episode reward | 1        |
| steps                   | 8.98e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 8.99e+04 |
| mean 100 episode reward | 1        |
| steps                   | 8.99e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 9.32e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 9.32e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 9.33e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 9.33e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 9.34e+04 |
| mean 100 episode reward | 0.9      |
| steps                   | 9.34e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 9.35e+04 |
| mean 100 episode reward | 1        |
| steps                   | 9.35e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 9.68e+04 |
| mean 100 episode reward | 1        |
| steps                   | 9.68e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 9.69e+04 |
| mean 100 episode reward | 1        |
| steps                   | 9.69e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 9.7e+04  |
| mean 100 episode reward | 1        |
| steps                   | 9.7e+04  |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 9.71e+04 |
| mean 100 episode reward | 1        |
| steps                   | 9.71e+04 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1e+05    |
| mean 100 episode reward | 1        |
| steps                   | 1e+05    |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.00e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1e+05    |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.01e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.01e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.01e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.01e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.04e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.04e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.04e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.04e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.04e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.04e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.04e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.04e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.08e+05 |
| mean 100 episode reward | 0.9      |
| steps                   | 1.08e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.08e+05 |
| mean 100 episode reward | 0.9      |
| steps                   | 1.08e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.08e+05 |
| mean 100 episode reward | 0.9      |
| steps                   | 1.08e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.08e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.08e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.11e+05 |
| mean 100 episode reward | 0.9      |
| steps                   | 1.11e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.11e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.11e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.11e+05 |
| mean 100 episode reward | 0.9      |
| steps                   | 1.11e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.12e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.11e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.15e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.15e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.15e+05 |
| mean 100 episode reward | 0.9      |
| steps                   | 1.15e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.15e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.15e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.15e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.15e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.18e+05 |
| mean 100 episode reward | 0.9      |
| steps                   | 1.18e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.18e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.18e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.19e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.19e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring  | 1        |
| episodes                | 1.19e+05 |
| mean 100 episode reward | 1        |
| steps                   | 1.19e+05 |
--------------------------------------
--------------------------------------
| % time spent exploring 

In [24]:
def mnist_dqn_eval(dqn_model):
    attempts, correct = 0,0

    env = MnistEnv(images_per_episode=1, dataset=(x_test, y_test), random=False)

    try:
        while True:
            obs, done = env.reset(), False
            while not done:
                obs, rew, done, _ = env.step(dqn_model(obs[None])[0])

                attempts += 1
                if rew > 0:
                    correct += 1

    except StopIteration:
        print()
        print('validation done...')
        print('Accuracy: {0}%'.format((float(correct) / attempts) * 100))

mnist_dqn_eval(dqn_model)


validation done...
Accuracy: 93.55871174234846%


In [30]:
def mnist_ppo():
    logger.configure(dir='./logs/mnist_ppo', format_strs=['stdout', 'tensorboard'])
    env = DummyVecEnv([lambda: bench.Monitor(MnistEnv(images_per_episode=1), logger.get_dir())])

    model = ppo2.learn(
        env=env,
        network='mlp',
        num_layers=2,
        num_hidden=64,
        nsteps=32,
        total_timesteps=int(1.2e5),
        seed=int(time.time()))

    return model

start_time = time.time()
ppo_model = mnist_ppo()
print('PPO Training Time:', time.time() - start_time)

Logging to ./logs/mnist_ppo
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.219    |
| fps                     | 79       |
| loss/approxkl           | 0.000137 |
| loss/clipfrac           | 0        |
| loss/policy_entropy     | 2.3      |
| loss/policy_loss        | -0.00857 |
| loss/value_loss         | 0.0874   |
| misc/explained_variance | -0.31    |
| misc/nupdates           | 1        |
| misc/serial_timesteps   | 32       |
| misc/time_elapsed       | 0.402    |
| misc/total_timesteps    | 32       |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.12     |
| fps                     | 172      |
| loss/approxkl           | 0.000397 |
| loss/clipfrac           | 0        |
| loss/policy_entropy     | 2.26     |
| loss

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.63     |
| fps                     | 182      |
| loss/approxkl           | 0.00956  |
| loss/clipfrac           | 0.109    |
| loss/policy_entropy     | 0.636    |
| loss/policy_loss        | -0.0427  |
| loss/value_loss         | 0.0562   |
| misc/explained_variance | 0.511    |
| misc/nupdates           | 130      |
| misc/serial_timesteps   | 4.16e+03 |
| misc/time_elapsed       | 20.7     |
| misc/total_timesteps    | 4.16e+03 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.75     |
| fps                     | 174      |
| loss/approxkl           | 0.00706  |
| loss/clipfrac           | 0.0781   |
| loss/policy_entropy     | 0.634    |
| loss/policy_loss        | -0.0279  |
| loss/value_loss         | 0.039    |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.74     |
| fps                     | 199      |
| loss/approxkl           | 0.00882  |
| loss/clipfrac           | 0.0859   |
| loss/policy_entropy     | 0.322    |
| loss/policy_loss        | -0.0249  |
| loss/value_loss         | 0.0493   |
| misc/explained_variance | 0.489    |
| misc/nupdates           | 270      |
| misc/serial_timesteps   | 8.64e+03 |
| misc/time_elapsed       | 41.7     |
| misc/total_timesteps    | 8.64e+03 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.67     |
| fps                     | 194      |
| loss/approxkl           | 0.0156   |
| loss/clipfrac           | 0.117    |
| loss/policy_entropy     | 0.517    |
| loss/policy_loss        | -0.0462  |
| loss/value_loss         | 0.0662   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.7      |
| fps                     | 327      |
| loss/approxkl           | 0.0113   |
| loss/clipfrac           | 0.109    |
| loss/policy_entropy     | 0.32     |
| loss/policy_loss        | -0.0361  |
| loss/value_loss         | 0.0435   |
| misc/explained_variance | 0.576    |
| misc/nupdates           | 410      |
| misc/serial_timesteps   | 1.31e+04 |
| misc/time_elapsed       | 62.2     |
| misc/total_timesteps    | 1.31e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.7      |
| fps                     | 189      |
| loss/approxkl           | 0.00909  |
| loss/clipfrac           | 0.0547   |
| loss/policy_entropy     | 0.255    |
| loss/policy_loss        | -0.027   |
| loss/value_loss         | 0.0552   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.77     |
| fps                     | 215      |
| loss/approxkl           | 0.0516   |
| loss/clipfrac           | 0.211    |
| loss/policy_entropy     | 0.267    |
| loss/policy_loss        | -0.0589  |
| loss/value_loss         | 0.052    |
| misc/explained_variance | 0.437    |
| misc/nupdates           | 550      |
| misc/serial_timesteps   | 1.76e+04 |
| misc/time_elapsed       | 82.4     |
| misc/total_timesteps    | 1.76e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.74     |
| fps                     | 211      |
| loss/approxkl           | 0.012    |
| loss/clipfrac           | 0.0859   |
| loss/policy_entropy     | 0.378    |
| loss/policy_loss        | -0.0219  |
| loss/value_loss         | 0.032    |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.81     |
| fps                     | 200      |
| loss/approxkl           | 0.0152   |
| loss/clipfrac           | 0.0859   |
| loss/policy_entropy     | 0.266    |
| loss/policy_loss        | -0.0228  |
| loss/value_loss         | 0.0278   |
| misc/explained_variance | 0.746    |
| misc/nupdates           | 690      |
| misc/serial_timesteps   | 2.21e+04 |
| misc/time_elapsed       | 103      |
| misc/total_timesteps    | 2.21e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.68     |
| fps                     | 212      |
| loss/approxkl           | 0.0158   |
| loss/clipfrac           | 0.0625   |
| loss/policy_entropy     | 0.219    |
| loss/policy_loss        | -0.0232  |
| loss/value_loss         | 0.0422   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.74     |
| fps                     | 203      |
| loss/approxkl           | 0.00143  |
| loss/clipfrac           | 0.0156   |
| loss/policy_entropy     | 0.163    |
| loss/policy_loss        | -0.00558 |
| loss/value_loss         | 0.0246   |
| misc/explained_variance | 0.519    |
| misc/nupdates           | 830      |
| misc/serial_timesteps   | 2.66e+04 |
| misc/time_elapsed       | 122      |
| misc/total_timesteps    | 2.66e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.75     |
| fps                     | 205      |
| loss/approxkl           | 0.0222   |
| loss/clipfrac           | 0.0703   |
| loss/policy_entropy     | 0.392    |
| loss/policy_loss        | -0.0343  |
| loss/value_loss         | 0.0425   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.92     |
| fps                     | 176      |
| loss/approxkl           | 0.0293   |
| loss/clipfrac           | 0.109    |
| loss/policy_entropy     | 0.202    |
| loss/policy_loss        | -0.0444  |
| loss/value_loss         | 0.0623   |
| misc/explained_variance | -0.108   |
| misc/nupdates           | 970      |
| misc/serial_timesteps   | 3.1e+04  |
| misc/time_elapsed       | 143      |
| misc/total_timesteps    | 3.1e+04  |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.87     |
| fps                     | 303      |
| loss/approxkl           | 0.0134   |
| loss/clipfrac           | 0.102    |
| loss/policy_entropy     | 0.205    |
| loss/policy_loss        | -0.025   |
| loss/value_loss         | 0.048    |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.88     |
| fps                     | 183      |
| loss/approxkl           | 0.0152   |
| loss/clipfrac           | 0.0547   |
| loss/policy_entropy     | 0.174    |
| loss/policy_loss        | -0.0291  |
| loss/value_loss         | 0.0412   |
| misc/explained_variance | 0.177    |
| misc/nupdates           | 1.11e+03 |
| misc/serial_timesteps   | 3.55e+04 |
| misc/time_elapsed       | 165      |
| misc/total_timesteps    | 3.55e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.9      |
| fps                     | 180      |
| loss/approxkl           | 0.0221   |
| loss/clipfrac           | 0.0781   |
| loss/policy_entropy     | 0.16     |
| loss/policy_loss        | -0.0255  |
| loss/value_loss         | 0.0467   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.94     |
| fps                     | 183      |
| loss/approxkl           | 0.0135   |
| loss/clipfrac           | 0.0391   |
| loss/policy_entropy     | 0.06     |
| loss/policy_loss        | -0.015   |
| loss/value_loss         | 0.0209   |
| misc/explained_variance | 0.181    |
| misc/nupdates           | 1.25e+03 |
| misc/serial_timesteps   | 4e+04    |
| misc/time_elapsed       | 188      |
| misc/total_timesteps    | 4e+04    |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.89     |
| fps                     | 185      |
| loss/approxkl           | 0.0604   |
| loss/clipfrac           | 0.0859   |
| loss/policy_entropy     | 0.145    |
| loss/policy_loss        | -0.0363  |
| loss/value_loss         | 0.0307   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.95     |
| fps                     | 184      |
| loss/approxkl           | 0.0307   |
| loss/clipfrac           | 0.0391   |
| loss/policy_entropy     | 0.0562   |
| loss/policy_loss        | -0.0169  |
| loss/value_loss         | 0.0245   |
| misc/explained_variance | 0.157    |
| misc/nupdates           | 1.39e+03 |
| misc/serial_timesteps   | 4.45e+04 |
| misc/time_elapsed       | 211      |
| misc/total_timesteps    | 4.45e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.92     |
| fps                     | 182      |
| loss/approxkl           | 0.0186   |
| loss/clipfrac           | 0.0547   |
| loss/policy_entropy     | 0.0761   |
| loss/policy_loss        | -0.0242  |
| loss/value_loss         | 0.0375   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.91     |
| fps                     | 206      |
| loss/approxkl           | 0.0109   |
| loss/clipfrac           | 0.0625   |
| loss/policy_entropy     | 0.0949   |
| loss/policy_loss        | -0.0226  |
| loss/value_loss         | 0.0353   |
| misc/explained_variance | 0.152    |
| misc/nupdates           | 1.53e+03 |
| misc/serial_timesteps   | 4.9e+04  |
| misc/time_elapsed       | 234      |
| misc/total_timesteps    | 4.9e+04  |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.91     |
| fps                     | 174      |
| loss/approxkl           | 0.00409  |
| loss/clipfrac           | 0.0391   |
| loss/policy_entropy     | 0.0584   |
| loss/policy_loss        | -0.0152  |
| loss/value_loss         | 0.0291   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.92     |
| fps                     | 169      |
| loss/approxkl           | 0.0138   |
| loss/clipfrac           | 0.0859   |
| loss/policy_entropy     | 0.0692   |
| loss/policy_loss        | -0.0335  |
| loss/value_loss         | 0.0306   |
| misc/explained_variance | 0.242    |
| misc/nupdates           | 1.67e+03 |
| misc/serial_timesteps   | 5.34e+04 |
| misc/time_elapsed       | 257      |
| misc/total_timesteps    | 5.34e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.9      |
| fps                     | 409      |
| loss/approxkl           | 0.0358   |
| loss/clipfrac           | 0.0703   |
| loss/policy_entropy     | 0.0622   |
| loss/policy_loss        | -0.0314  |
| loss/value_loss         | 0.0269   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.96     |
| fps                     | 278      |
| loss/approxkl           | 0.0123   |
| loss/clipfrac           | 0.0156   |
| loss/policy_entropy     | 0.0229   |
| loss/policy_loss        | -0.00982 |
| loss/value_loss         | 0.0104   |
| misc/explained_variance | 0.254    |
| misc/nupdates           | 1.81e+03 |
| misc/serial_timesteps   | 5.79e+04 |
| misc/time_elapsed       | 269      |
| misc/total_timesteps    | 5.79e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.91     |
| fps                     | 244      |
| loss/approxkl           | 0.0164   |
| loss/clipfrac           | 0.0547   |
| loss/policy_entropy     | 0.112    |
| loss/policy_loss        | -0.0263  |
| loss/value_loss         | 0.028    |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.89     |
| fps                     | 307      |
| loss/approxkl           | 0.0401   |
| loss/clipfrac           | 0.0391   |
| loss/policy_entropy     | 0.0751   |
| loss/policy_loss        | -0.0222  |
| loss/value_loss         | 0.0181   |
| misc/explained_variance | 0.367    |
| misc/nupdates           | 1.95e+03 |
| misc/serial_timesteps   | 6.24e+04 |
| misc/time_elapsed       | 281      |
| misc/total_timesteps    | 6.24e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.95     |
| fps                     | 184      |
| loss/approxkl           | 0.00815  |
| loss/clipfrac           | 0.0234   |
| loss/policy_entropy     | 0.0565   |
| loss/policy_loss        | -0.00424 |
| loss/value_loss         | 0.00585  |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.93     |
| fps                     | 182      |
| loss/approxkl           | 0.0108   |
| loss/clipfrac           | 0.0703   |
| loss/policy_entropy     | 0.0633   |
| loss/policy_loss        | -0.0221  |
| loss/value_loss         | 0.0342   |
| misc/explained_variance | 0.126    |
| misc/nupdates           | 2.09e+03 |
| misc/serial_timesteps   | 6.69e+04 |
| misc/time_elapsed       | 304      |
| misc/total_timesteps    | 6.69e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.89     |
| fps                     | 174      |
| loss/approxkl           | 0.0637   |
| loss/clipfrac           | 0.0938   |
| loss/policy_entropy     | 0.108    |
| loss/policy_loss        | -0.0365  |
| loss/value_loss         | 0.0293   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.94     |
| fps                     | 181      |
| loss/approxkl           | 1.4e-05  |
| loss/clipfrac           | 0        |
| loss/policy_entropy     | 0.017    |
| loss/policy_loss        | -0.00118 |
| loss/value_loss         | 0.0138   |
| misc/explained_variance | 0.000618 |
| misc/nupdates           | 2.23e+03 |
| misc/serial_timesteps   | 7.14e+04 |
| misc/time_elapsed       | 328      |
| misc/total_timesteps    | 7.14e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.94     |
| fps                     | 196      |
| loss/approxkl           | 0.0108   |
| loss/clipfrac           | 0.0312   |
| loss/policy_entropy     | 0.0346   |
| loss/policy_loss        | -0.0183  |
| loss/value_loss         | 0.0167   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.9      |
| fps                     | 183      |
| loss/approxkl           | 0.0811   |
| loss/clipfrac           | 0.0625   |
| loss/policy_entropy     | 0.0891   |
| loss/policy_loss        | -0.0348  |
| loss/value_loss         | 0.0497   |
| misc/explained_variance | 0.263    |
| misc/nupdates           | 2.37e+03 |
| misc/serial_timesteps   | 7.58e+04 |
| misc/time_elapsed       | 350      |
| misc/total_timesteps    | 7.58e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.91     |
| fps                     | 178      |
| loss/approxkl           | 0.0413   |
| loss/clipfrac           | 0.0938   |
| loss/policy_entropy     | 0.121    |
| loss/policy_loss        | -0.0411  |
| loss/value_loss         | 0.0445   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.93     |
| fps                     | 368      |
| loss/approxkl           | 0.0516   |
| loss/clipfrac           | 0.0703   |
| loss/policy_entropy     | 0.0706   |
| loss/policy_loss        | -0.0205  |
| loss/value_loss         | 0.0625   |
| misc/explained_variance | 0.105    |
| misc/nupdates           | 2.51e+03 |
| misc/serial_timesteps   | 8.03e+04 |
| misc/time_elapsed       | 373      |
| misc/total_timesteps    | 8.03e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.92     |
| fps                     | 173      |
| loss/approxkl           | 0.00605  |
| loss/clipfrac           | 0.0312   |
| loss/policy_entropy     | 0.044    |
| loss/policy_loss        | -0.0121  |
| loss/value_loss         | 0.0255   |
| mi

Stepping environment...
Done.
---------------------------------------
| eplenmean               | 1         |
| eprewmean               | 0.98      |
| fps                     | 189       |
| loss/approxkl           | 3.04e-07  |
| loss/clipfrac           | 0         |
| loss/policy_entropy     | 0.00458   |
| loss/policy_loss        | -2.43e-05 |
| loss/value_loss         | 0.00316   |
| misc/explained_variance | nan       |
| misc/nupdates           | 2.65e+03  |
| misc/serial_timesteps   | 8.48e+04  |
| misc/time_elapsed       | 392       |
| misc/total_timesteps    | 8.48e+04  |
---------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.95     |
| fps                     | 179      |
| loss/approxkl           | 0.0733   |
| loss/clipfrac           | 0.0938   |
| loss/policy_entropy     | 0.133    |
| loss/policy_loss        | -0.0373  |
| loss/value_loss         | 

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.96     |
| fps                     | 189      |
| loss/approxkl           | 0.0246   |
| loss/clipfrac           | 0.0938   |
| loss/policy_entropy     | 0.125    |
| loss/policy_loss        | -0.0394  |
| loss/value_loss         | 0.0305   |
| misc/explained_variance | -0.0842  |
| misc/nupdates           | 2.79e+03 |
| misc/serial_timesteps   | 8.93e+04 |
| misc/time_elapsed       | 413      |
| misc/total_timesteps    | 8.93e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.97     |
| fps                     | 183      |
| loss/approxkl           | 0.00996  |
| loss/clipfrac           | 0.0234   |
| loss/policy_entropy     | 0.0448   |
| loss/policy_loss        | -0.0154  |
| loss/value_loss         | 0.014    |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.97     |
| fps                     | 224      |
| loss/approxkl           | 0.0127   |
| loss/clipfrac           | 0.0312   |
| loss/policy_entropy     | 0.0431   |
| loss/policy_loss        | -0.00694 |
| loss/value_loss         | 0.00934  |
| misc/explained_variance | 0.305    |
| misc/nupdates           | 2.93e+03 |
| misc/serial_timesteps   | 9.38e+04 |
| misc/time_elapsed       | 433      |
| misc/total_timesteps    | 9.38e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.98     |
| fps                     | 172      |
| loss/approxkl           | 0.000874 |
| loss/clipfrac           | 0.0156   |
| loss/policy_entropy     | 0.042    |
| loss/policy_loss        | -0.0165  |
| loss/value_loss         | 0.028    |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.97     |
| fps                     | 176      |
| loss/approxkl           | 0.000444 |
| loss/clipfrac           | 0.0156   |
| loss/policy_entropy     | 0.0142   |
| loss/policy_loss        | -0.00567 |
| loss/value_loss         | 0.0154   |
| misc/explained_variance | -0.0441  |
| misc/nupdates           | 3.07e+03 |
| misc/serial_timesteps   | 9.82e+04 |
| misc/time_elapsed       | 454      |
| misc/total_timesteps    | 9.82e+04 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.97     |
| fps                     | 183      |
| loss/approxkl           | 0.0114   |
| loss/clipfrac           | 0.0156   |
| loss/policy_entropy     | 0.0184   |
| loss/policy_loss        | -0.00398 |
| loss/value_loss         | 0.00175  |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.96     |
| fps                     | 204      |
| loss/approxkl           | 0.00247  |
| loss/clipfrac           | 0.0234   |
| loss/policy_entropy     | 0.0194   |
| loss/policy_loss        | -0.00675 |
| loss/value_loss         | 0.00562  |
| misc/explained_variance | nan      |
| misc/nupdates           | 3.21e+03 |
| misc/serial_timesteps   | 1.03e+05 |
| misc/time_elapsed       | 477      |
| misc/total_timesteps    | 1.03e+05 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.93     |
| fps                     | 182      |
| loss/approxkl           | 0.134    |
| loss/clipfrac           | 0.0703   |
| loss/policy_entropy     | 0.0871   |
| loss/policy_loss        | -0.0367  |
| loss/value_loss         | 0.0203   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.95     |
| fps                     | 185      |
| loss/approxkl           | 0.0637   |
| loss/clipfrac           | 0.0625   |
| loss/policy_entropy     | 0.051    |
| loss/policy_loss        | -0.0205  |
| loss/value_loss         | 0.0288   |
| misc/explained_variance | -0.00267 |
| misc/nupdates           | 3.35e+03 |
| misc/serial_timesteps   | 1.07e+05 |
| misc/time_elapsed       | 500      |
| misc/total_timesteps    | 1.07e+05 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.94     |
| fps                     | 189      |
| loss/approxkl           | 0.0362   |
| loss/clipfrac           | 0.0703   |
| loss/policy_entropy     | 0.0651   |
| loss/policy_loss        | -0.0307  |
| loss/value_loss         | 0.0293   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.95     |
| fps                     | 182      |
| loss/approxkl           | 0.0585   |
| loss/clipfrac           | 0.0234   |
| loss/policy_entropy     | 0.0237   |
| loss/policy_loss        | -0.012   |
| loss/value_loss         | 0.0127   |
| misc/explained_variance | 0.13     |
| misc/nupdates           | 3.49e+03 |
| misc/serial_timesteps   | 1.12e+05 |
| misc/time_elapsed       | 522      |
| misc/total_timesteps    | 1.12e+05 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.89     |
| fps                     | 285      |
| loss/approxkl           | 0.281    |
| loss/clipfrac           | 0.0938   |
| loss/policy_entropy     | 0.0599   |
| loss/policy_loss        | -0.0397  |
| loss/value_loss         | 0.0461   |
| mi

Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.96     |
| fps                     | 192      |
| loss/approxkl           | 0.104    |
| loss/clipfrac           | 0.0234   |
| loss/policy_entropy     | 0.00681  |
| loss/policy_loss        | -0.0126  |
| loss/value_loss         | 0.0235   |
| misc/explained_variance | 0.171    |
| misc/nupdates           | 3.63e+03 |
| misc/serial_timesteps   | 1.16e+05 |
| misc/time_elapsed       | 545      |
| misc/total_timesteps    | 1.16e+05 |
--------------------------------------
Stepping environment...
Done.
--------------------------------------
| eplenmean               | 1        |
| eprewmean               | 0.94     |
| fps                     | 205      |
| loss/approxkl           | 0.0113   |
| loss/clipfrac           | 0.0391   |
| loss/policy_entropy     | 0.0375   |
| loss/policy_loss        | -0.0172  |
| loss/value_loss         | 0.0254   |
| mi

In [31]:
def mnist_ppo_eval(ppo_model):
    attempts, correct = 0,0

    env = DummyVecEnv([lambda: MnistEnv(images_per_episode=1, dataset=(x_test, y_test), random=False)])

    try:
        while True:
            obs, done = env.reset(), [False]
            while not done[0]:
                obs, rew, done, _ = env.step(ppo_model.step(obs[None])[0])

                attempts += 1
                if rew[0] > 0:
                    correct += 1

    except StopIteration:
        print()
        print('validation done...')
        print('Accuracy: {0}%'.format((float(correct) / attempts) * 100))

mnist_ppo_eval(ppo_model)


validation done...
Accuracy: 94.71947194719472%
