In [1]:
import base64
import IPython
import imageio

def embed_mp4(filename):
    """Embeds an mp4 file in the notebook.

    The whole file is read into memory, base64-encoded and inlined as a
    ``data:`` URI inside an HTML5 ``<video>`` element (640x480, with
    playback controls), so the video travels with the notebook output.

    Args:
        filename: Path of the mp4 file to embed.

    Returns:
        An ``IPython.display.HTML`` object wrapping the ``<video>`` tag.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector (ResourceWarning).
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    b64 = base64.b64encode(video)
    tag = '''
    <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return IPython.display.HTML(tag)

def record_game(model, env, num_episodes=5, video_filename='video.mp4'):
    """Play episodes with ``model`` in ``env`` and record every frame to a video.

    For each episode the environment is reset, the initial frame is written,
    and then the model's predicted action is applied step by step until the
    environment reports the episode as done; a frame is written after every
    step. The summed reward of each episode is printed as ``score: <value>``.

    Args:
        model: Object whose ``predict(obs)`` returns a 2-tuple; the first
            element is used as the action.
        env: Environment providing ``reset()``, ``step(action)`` (returning
            a 4-tuple of obs, reward, done, info) and ``render('rgb_array')``.
        num_episodes: Number of episodes to play and record.
        video_filename: Output path handed to ``imageio.get_writer``
            (the writer is opened with ``fps=60``).
    """
    def capture(writer):
        # Append the environment's current rendering to the video.
        writer.append_data(env.render('rgb_array'))

    with imageio.get_writer(video_filename, fps=60) as writer:
        for _episode in range(num_episodes):
            observation = env.reset()
            score = 0
            capture(writer)  # frame of the freshly reset environment

            # An episode always takes at least one step before it can end.
            while True:
                action, _state = model.predict(observation)
                observation, reward, finished, _info = env.step(action)
                score += reward
                capture(writer)
                if finished:
                    break

            print("score:", score)

In [14]:
import os
import gym
import slimevolleygym
from slimevolleygym import SurvivalRewardEnv

from stable_baselines.common.policies import MlpPolicy  # for everything else
# from stable_baselines.deepq.policies import MlpPolicy  # for DQN
from stable_baselines import logger
from stable_baselines.common.callbacks import EvalCallback

from stable_baselines.ppo1 import PPO1
from stable_baselines import A2C, ACER, ACKTR, DQN, HER, GAIL, TRPO

# Registry mapping the short name accepted by `experiment(...)` to the
# Stable Baselines algorithm class it should instantiate.
# NOTE(review): HER and GAIL usually need extra constructor arguments beyond
# (policy, env) — confirm they actually work through `experiment` before use.
algo = {
    'a2c': A2C,
    'acer': ACER,
    'acktr': ACKTR,
    'dqn': DQN,
    'her': HER,
    'gail': GAIL,
    'ppo1': PPO1,  # imported above but previously not selectable by name
    'trpo': TRPO,
}
# Models created by `experiment`, keyed by the same short algorithm name, so
# they stay reachable from later cells even if training is interrupted.
trained_model = {}

In [15]:
def experiment(model_str, timesteps=15_000_000):
    """Train one registered algorithm on ``SlimeVolley-v0`` with periodic evaluation.

    The algorithm class is looked up in the module-level ``algo`` registry,
    built with ``MlpPolicy`` and stored in ``trained_model[model_str]``
    before training starts. Logs, the callback's best-model checkpoint and
    the final model all go to the directory ``"sb-" + model_str``.

    Args:
        model_str: Key into ``algo`` (e.g. ``'trpo'``).
        timesteps: Total number of training timesteps; converted with ``int``.

    Raises:
        KeyError: If ``model_str`` is not a key of ``algo``.
    """
    NUM_TIMESTEPS = int(timesteps)
    SEED = 721
    EVAL_FREQ = 25000
    EVAL_EPISODES = 10  # was 1000
    LOGDIR = "sb-" + model_str

    logger.configure(folder=LOGDIR)
    env = gym.make("SlimeVolley-v0")
    # Evaluate on a dedicated environment: running evaluation episodes on the
    # training env would reset/step it in the middle of a training rollout
    # and corrupt the trajectories being collected.
    eval_env = gym.make("SlimeVolley-v0")
    try:
        env.seed(SEED)
        eval_env.seed(SEED)

        model = algo[model_str](MlpPolicy, env, verbose=2)
        # Register before learning so the model is reachable from other
        # cells even when the run is interrupted.
        trained_model[model_str] = model
        eval_callback = EvalCallback(eval_env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)
        model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)
        # Only reached when the full timestep budget completes uninterrupted
        # (probably never get to this point).
        model.save(os.path.join(LOGDIR, "final_model"))
    finally:
        # Release both environments even if training is interrupted or fails.
        env.close()
        eval_env.close()

In [None]:
# Train TRPO with the default budget (timesteps=15_000_000); output goes to "sb-trpo".
experiment('trpo')

Logging to sb-trpo
********** Iteration 0 ************
Optimizing Policy...


  "{} != {}".format(self.training_env, self.eval_env))


[35msampling[0m
[35mdone in 0.789 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.127 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0133          0
         1    0.00505      0.154
         2    0.00283      0.257
         3    0.00119      0.391
         4   0.000913      0.533
         5   0.000806      0.816
         6   0.000742       1.11
         7   0.000239       1.26
         8   0.000428       1.29
         9   8.96e-05       1.36
        10   1.92e-05       1.43
[35mdone in 0.152 seconds[0m
Expected: 0.034 Actual: 0.034
Stepsize OK!
[35mvf[0m
[35mdone in 0.084 seconds[0m
------------------------------------------
| EpLenMean               | 981          |
| EpRewMean               | -4           |
| EpThisIter              | 1            |
| EpisodesSoFar           | 1            |
| TimeElapsed             | 1.25         |
| TimestepsSoFar          | 1024         |
| entloss                 | 0.0          |
| entropy  

Stepsize OK!
[35mvf[0m
[35mdone in 0.045 seconds[0m
-----------------------------------------
| EpLenMean               | 537         |
| EpRewMean               | -4.92       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 13          |
| TimeElapsed             | 6.62        |
| TimestepsSoFar          | 7168        |
| entloss                 | 0.0         |
| entropy                 | 2.0336356   |
| explained_variance_t... | 0.507       |
| meankl                  | 0.007651411 |
| optimgain               | 0.03103182  |
| surrgain                | 0.03103182  |
-----------------------------------------
********** Iteration 7 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.672 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0301          0
         1      0.017     0.0748
         2    0.00699      0.261
         3     0.0315      0.426
  

********** Iteration 13 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.658 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.051          0
         1     0.0219     0.0856
         2    0.00959      0.182
         3     0.0259      0.297
         4     0.0114      0.573
         5     0.0179       0.93
         6     0.0246       1.13
         7    0.00982       1.48
         8    0.00387       1.79
         9    0.00611       2.01
        10    0.00283       2.23
[35mdone in 0.020 seconds[0m
Expected: 0.057 Actual: 0.054
Stepsize OK!
[35mvf[0m
[35mdone in 0.038 seconds[0m
------------------------------------------
| EpLenMean               | 534          |
| EpRewMean               | -4.96        |
| EpThisIter              | 1            |
| EpisodesSoFar           | 26           |
| TimeElapsed             | 12           |
| TimestepsSoFar          | 14336      

Stepsize OK!
[35mvf[0m
[35mdone in 0.044 seconds[0m
-----------------------------------------
| EpLenMean               | 553         |
| EpRewMean               | -4.89       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 37          |
| TimeElapsed             | 16.6        |
| TimestepsSoFar          | 20480       |
| entloss                 | 0.0         |
| entropy                 | 1.930401    |
| explained_variance_t... | 0.629       |
| meankl                  | 0.009155822 |
| optimgain               | 0.057049107 |
| surrgain                | 0.057049107 |
-----------------------------------------
********** Iteration 20 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.693 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.045          0
         1      0.021     0.0994
         2     0.0166       0.21
         3     0.0142      0.315
 

********** Iteration 26 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.713 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0433          0
         1     0.0223      0.119
         2      0.022      0.252
         3     0.0241      0.445
         4     0.0144      0.707
         5     0.0141      0.908
         6     0.0183       1.21
         7     0.0123       1.39
         8     0.0135        1.7
         9    0.00694       1.97
        10      0.005       2.34
[35mdone in 0.022 seconds[0m
Expected: 0.062 Actual: 0.068
Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
-----------------------------------------
| EpLenMean               | 567         |
| EpRewMean               | -4.92       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 48          |
| TimeElapsed             | 26.5        |
| TimestepsSoFar          | 27648       |
| e

Stepsize OK!
[35mvf[0m
[35mdone in 0.045 seconds[0m
-----------------------------------------
| EpLenMean               | 608         |
| EpRewMean               | -4.83       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 57          |
| TimeElapsed             | 31.2        |
| TimestepsSoFar          | 33792       |
| entloss                 | 0.0         |
| entropy                 | 1.8499289   |
| explained_variance_t... | 0.819       |
| meankl                  | 0.008657148 |
| optimgain               | 0.052255508 |
| surrgain                | 0.052255508 |
-----------------------------------------
********** Iteration 33 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.744 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0343          0
         1     0.0173      0.099
         2     0.0228      0.223
         3      0.012       0.39
 

********** Iteration 39 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.684 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0536          0
         1     0.0236     0.0814
         2     0.0188       0.16
         3     0.0234       0.35
         4     0.0185      0.571
         5     0.0224      0.939
         6     0.0103       1.12
         7     0.0121       1.35
         8     0.0115       1.53
         9    0.00842       1.71
        10    0.00738        1.9
[35mdone in 0.025 seconds[0m
Expected: 0.057 Actual: 0.051
Stepsize OK!
[35mvf[0m
[35mdone in 0.047 seconds[0m
-----------------------------------------
| EpLenMean               | 649         |
| EpRewMean               | -4.72       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 67          |
| TimeElapsed             | 36.7        |
| TimestepsSoFar          | 40960       |
| e

Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 676         |
| EpRewMean               | -4.67       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 76          |
| TimeElapsed             | 41.1        |
| TimestepsSoFar          | 47104       |
| entloss                 | 0.0         |
| entropy                 | 1.8005209   |
| explained_variance_t... | 0.767       |
| meankl                  | 0.007226281 |
| optimgain               | 0.054645263 |
| surrgain                | 0.054645263 |
-----------------------------------------
********** Iteration 46 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.668 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0484          0
         1     0.0262      0.135
         2     0.0206      0.288
         3     0.0224      0.436
 

********** Iteration 52 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.667 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0436          0
         1     0.0222     0.0469
         2     0.0241      0.161
         3     0.0204       0.23
         4     0.0183      0.423
         5     0.0266      0.634
         6     0.0118      0.786
         7    0.00945      0.974
         8    0.00581       1.33
         9    0.00722       1.58
        10    0.00457       1.88
[35mdone in 0.025 seconds[0m
Expected: 0.053 Actual: 0.047
Stepsize OK!
[35mvf[0m
[35mdone in 0.042 seconds[0m
-----------------------------------------
| EpLenMean               | 692         |
| EpRewMean               | -4.67       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 86          |
| TimeElapsed             | 51.2        |
| TimestepsSoFar          | 54272       |
| e

Stepsize OK!
[35mvf[0m
[35mdone in 0.072 seconds[0m
------------------------------------------
| EpLenMean               | 686          |
| EpRewMean               | -4.8         |
| EpThisIter              | 1            |
| EpisodesSoFar           | 96           |
| TimeElapsed             | 56.3         |
| TimestepsSoFar          | 60416        |
| entloss                 | 0.0          |
| entropy                 | 1.7303734    |
| explained_variance_t... | 0.844        |
| meankl                  | 0.0069021676 |
| optimgain               | 0.049050715  |
| surrgain                | 0.049050715  |
------------------------------------------
********** Iteration 59 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.753 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0334          0
         1     0.0275      0.101
         2     0.0228      0.178
         3     0.017

********** Iteration 65 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.672 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0397          0
         1     0.0115     0.0508
         2     0.0152       0.16
         3      0.017      0.245
         4     0.0147      0.422
         5     0.0218       0.65
         6     0.0166      0.848
         7    0.00656       1.04
         8     0.0103       1.33
         9    0.00427        1.6
        10    0.00492       1.85
[35mdone in 0.023 seconds[0m
Expected: 0.049 Actual: 0.048
Stepsize OK!
[35mvf[0m
[35mdone in 0.043 seconds[0m
-----------------------------------------
| EpLenMean               | 666         |
| EpRewMean               | -4.92       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 107         |
| TimeElapsed             | 62.4        |
| TimestepsSoFar          | 67584       |
| e

Stepsize OK!
[35mvf[0m
[35mdone in 0.052 seconds[0m
-----------------------------------------
| EpLenMean               | 652         |
| EpRewMean               | -5          |
| EpThisIter              | 2           |
| EpisodesSoFar           | 117         |
| TimeElapsed             | 67.3        |
| TimestepsSoFar          | 73728       |
| entloss                 | 0.0         |
| entropy                 | 1.8090236   |
| explained_variance_t... | 0.699       |
| meankl                  | 0.009445077 |
| optimgain               | 0.060879454 |
| surrgain                | 0.060879454 |
-----------------------------------------
********** Iteration 72 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.753 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0305          0
         1     0.0266     0.0709
         2     0.0152       0.19
         3     0.0135      0.287
 

********** Iteration 78 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.718 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0281          0
         1     0.0182     0.0616
         2     0.0193       0.22
         3       0.02      0.339
         4     0.0162      0.495
         5     0.0112      0.742
         6      0.014      0.898
         7     0.0117       1.19
         8     0.0105       1.39
         9    0.00609       1.63
        10    0.00807       1.74
[35mdone in 0.021 seconds[0m
Expected: 0.050 Actual: 0.049
Stepsize OK!
[35mvf[0m
[35mdone in 0.043 seconds[0m
-----------------------------------------
| EpLenMean               | 602         |
| EpRewMean               | -4.92       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 130         |
| TimeElapsed             | 77.5        |
| TimestepsSoFar          | 80896       |
| e

Stepsize OK!
[35mvf[0m
[35mdone in 0.043 seconds[0m
-----------------------------------------
| EpLenMean               | 601         |
| EpRewMean               | -4.85       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 140         |
| TimeElapsed             | 82.1        |
| TimestepsSoFar          | 87040       |
| entloss                 | 0.0         |
| entropy                 | 1.7178378   |
| explained_variance_t... | 0.711       |
| meankl                  | 0.008465742 |
| optimgain               | 0.056296166 |
| surrgain                | 0.056296166 |
-----------------------------------------
********** Iteration 85 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.648 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0478          0
         1     0.0247     0.0951
         2     0.0282      0.225
         3     0.0356      0.441
 

********** Iteration 91 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.661 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.035          0
         1     0.0297     0.0885
         2     0.0285      0.255
         3     0.0262      0.432
         4     0.0293      0.658
         5     0.0276      0.851
         6     0.0201       1.05
         7     0.0175       1.31
         8      0.012       1.55
         9    0.00967       1.77
        10     0.0168          2
[35mdone in 0.023 seconds[0m
Expected: 0.060 Actual: 0.059
Stepsize OK!
[35mvf[0m
[35mdone in 0.044 seconds[0m
-----------------------------------------
| EpLenMean               | 580         |
| EpRewMean               | -4.85       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 153         |
| TimeElapsed             | 87.5        |
| TimestepsSoFar          | 94208       |
| e

Stepsize OK!
[35mvf[0m
[35mdone in 0.037 seconds[0m
-----------------------------------------
| EpLenMean               | 584         |
| EpRewMean               | -4.83       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 163         |
| TimeElapsed             | 96.7        |
| TimestepsSoFar          | 100352      |
| entloss                 | 0.0         |
| entropy                 | 1.7271532   |
| explained_variance_t... | 0.611       |
| meankl                  | 0.009362048 |
| optimgain               | 0.05447779  |
| surrgain                | 0.05447779  |
-----------------------------------------
********** Iteration 98 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.657 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0444          0
         1     0.0302     0.0848
         2     0.0434      0.288
         3     0.0241      0.472
 

********** Iteration 104 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.637 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0524          0
         1     0.0284     0.0754
         2      0.034      0.205
         3     0.0267      0.384
         4     0.0393      0.508
         5     0.0424      0.779
         6     0.0217       1.02
         7     0.0198        1.3
         8    0.00861       1.58
         9     0.0257       1.76
        10    0.00854       2.05
[35mdone in 0.021 seconds[0m
Expected: 0.062 Actual: 0.059
Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
-----------------------------------------
| EpLenMean               | 610         |
| EpRewMean               | -4.8        |
| EpThisIter              | 1           |
| EpisodesSoFar           | 173         |
| TimeElapsed             | 102         |
| TimestepsSoFar          | 107520      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
-----------------------------------------
| EpLenMean               | 642         |
| EpRewMean               | -4.8        |
| EpThisIter              | 1           |
| EpisodesSoFar           | 182         |
| TimeElapsed             | 106         |
| TimestepsSoFar          | 113664      |
| entloss                 | 0.0         |
| entropy                 | 1.7442565   |
| explained_variance_t... | 0.797       |
| meankl                  | 0.010388942 |
| optimgain               | 0.061040025 |
| surrgain                | 0.061040025 |
-----------------------------------------
********** Iteration 111 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.639 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0351          0
         1     0.0308     0.0983
         2     0.0319      0.237
         3     0.0393      0.566


********** Iteration 117 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.717 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0334          0
         1     0.0398     0.0936
         2     0.0367      0.205
         3     0.0442      0.451
         4     0.0419       0.78
         5     0.0477       1.05
         6     0.0262       1.31
         7     0.0333       1.64
         8     0.0236       1.97
         9     0.0156       2.11
        10     0.0113       2.38
[35mdone in 0.022 seconds[0m
Expected: 0.068 Actual: 0.061
Stepsize OK!
[35mvf[0m
[35mdone in 0.044 seconds[0m
-----------------------------------------
| EpLenMean               | 657         |
| EpRewMean               | -4.8        |
| EpThisIter              | 1           |
| EpisodesSoFar           | 193         |
| TimeElapsed             | 112         |
| TimestepsSoFar          | 120832      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.053 seconds[0m
-----------------------------------------
| EpLenMean               | 693         |
| EpRewMean               | -4.85       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 201         |
| TimeElapsed             | 122         |
| TimestepsSoFar          | 126976      |
| entloss                 | 0.0         |
| entropy                 | 1.7028801   |
| explained_variance_t... | 0.618       |
| meankl                  | 0.00948697  |
| optimgain               | 0.060452543 |
| surrgain                | 0.060452543 |
-----------------------------------------
********** Iteration 124 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.903 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0722          0
         1     0.0277     0.0924
         2     0.0386      0.225
         3     0.0492      0.424


********** Iteration 130 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.723 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.012 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0248          0
         1     0.0326     0.0759
         2     0.0397      0.182
         3     0.0236      0.321
         4     0.0298      0.442
         5     0.0243      0.643
         6     0.0151      0.762
         7     0.0135       0.92
         8     0.0144       1.07
         9     0.0172       1.37
        10     0.0105       1.55
[35mdone in 0.023 seconds[0m
Expected: 0.051 Actual: 0.053
Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 675         |
| EpRewMean               | -4.92       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 213         |
| TimeElapsed             | 128         |
| TimestepsSoFar          | 134144      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 667         |
| EpRewMean               | -5          |
| EpThisIter              | 2           |
| EpisodesSoFar           | 222         |
| TimeElapsed             | 133         |
| TimestepsSoFar          | 140288      |
| entloss                 | 0.0         |
| entropy                 | 1.7223291   |
| explained_variance_t... | 0.678       |
| meankl                  | 0.010839554 |
| optimgain               | 0.062182475 |
| surrgain                | 0.062182475 |
-----------------------------------------
********** Iteration 137 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.735 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0456          0
         1     0.0381      0.113
         2     0.0513      0.318
         3       0.06      0.627


********** Iteration 143 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.696 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0668          0
         1     0.0228     0.0764
         2     0.0437      0.219
         3       0.03      0.375
         4     0.0382      0.562
         5     0.0169      0.744
         6     0.0208      0.972
         7     0.0299       1.18
         8      0.014       1.52
         9     0.0242       1.83
        10     0.0112       2.06
[35mdone in 0.021 seconds[0m
Expected: 0.062 Actual: 0.059
Stepsize OK!
[35mvf[0m
[35mdone in 0.045 seconds[0m
-----------------------------------------
| EpLenMean               | 660         |
| EpRewMean               | -5.03       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 234         |
| TimeElapsed             | 139         |
| TimestepsSoFar          | 147456      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
-----------------------------------------
| EpLenMean               | 631         |
| EpRewMean               | -5.03       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 243         |
| TimeElapsed             | 148         |
| TimestepsSoFar          | 153600      |
| entloss                 | 0.0         |
| entropy                 | 1.6691363   |
| explained_variance_t... | 0.728       |
| meankl                  | 0.010497427 |
| optimgain               | 0.055060852 |
| surrgain                | 0.055060852 |
-----------------------------------------
********** Iteration 150 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.668 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.121          0
         1     0.0275      0.102
         2     0.0529      0.206
         3     0.0404      0.433


********** Iteration 156 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.644 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0245          0
         1      0.033     0.0798
         2     0.0644       0.31
         3     0.0383      0.446
         4     0.0404      0.616
         5     0.0342      0.967
         6      0.026       1.09
         7     0.0233       1.25
         8     0.0246       1.47
         9     0.0123       1.64
        10     0.0165       1.86
[35mdone in 0.025 seconds[0m
Expected: 0.060 Actual: 0.054
Stepsize OK!
[35mvf[0m
[35mdone in 0.038 seconds[0m
-----------------------------------------
| EpLenMean               | 637         |
| EpRewMean               | -4.97       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 254         |
| TimeElapsed             | 154         |
| TimestepsSoFar          | 160768      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.039 seconds[0m
-----------------------------------------
| EpLenMean               | 629         |
| EpRewMean               | -4.9        |
| EpThisIter              | 1           |
| EpisodesSoFar           | 264         |
| TimeElapsed             | 158         |
| TimestepsSoFar          | 166912      |
| entloss                 | 0.0         |
| entropy                 | 1.5826237   |
| explained_variance_t... | 0.653       |
| meankl                  | 0.008424228 |
| optimgain               | 0.068850055 |
| surrgain                | 0.068850055 |
-----------------------------------------
********** Iteration 163 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.648 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0405          0
         1     0.0222     0.0688
         2      0.032      0.259
         3     0.0254      0.458


********** Iteration 169 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.705 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.068          0
         1     0.0334      0.098
         2     0.0277      0.249
         3     0.0365      0.399
         4     0.0211      0.626
         5      0.032      0.781
         6     0.0182       1.16
         7     0.0224       1.33
         8     0.0139       1.55
         9     0.0131       1.66
        10     0.0113       1.87
[35mdone in 0.023 seconds[0m
Expected: 0.063 Actual: 0.062
Stepsize OK!
[35mvf[0m
[35mdone in 0.042 seconds[0m
-----------------------------------------
| EpLenMean               | 643         |
| EpRewMean               | -4.88       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 275         |
| TimeElapsed             | 164         |
| TimestepsSoFar          | 174080      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
----------------------------------------
| EpLenMean               | 615        |
| EpRewMean               | -4.9       |
| EpThisIter              | 2          |
| EpisodesSoFar           | 286        |
| TimeElapsed             | 172        |
| TimestepsSoFar          | 180224     |
| entloss                 | 0.0        |
| entropy                 | 1.6041145  |
| explained_variance_t... | 0.805      |
| meankl                  | 0.00987006 |
| optimgain               | 0.06277317 |
| surrgain                | 0.06277317 |
----------------------------------------
********** Iteration 176 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.658 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.012 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0542          0
         1     0.0401      0.117
         2     0.0465      0.277
         3       0.05      0.455
         4    

********** Iteration 182 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.730 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.012 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0589          0
         1     0.0325      0.103
         2     0.0636      0.261
         3     0.0396      0.666
         4     0.0499      0.884
         5     0.0517       1.18
         6      0.038       1.48
         7     0.0302       1.85
         8     0.0272       2.18
         9     0.0161       2.49
        10     0.0255       2.81
[35mdone in 0.022 seconds[0m
Expected: 0.079 Actual: 0.091
Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 630         |
| EpRewMean               | -4.9        |
| EpThisIter              | 2           |
| EpisodesSoFar           | 297         |
| TimeElapsed             | 178         |
| TimestepsSoFar          | 187392      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.039 seconds[0m
-----------------------------------------
| EpLenMean               | 635         |
| EpRewMean               | -4.97       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 306         |
| TimeElapsed             | 183         |
| TimestepsSoFar          | 193536      |
| entloss                 | 0.0         |
| entropy                 | 1.5704598   |
| explained_variance_t... | 0.612       |
| meankl                  | 0.009810649 |
| optimgain               | 0.05830022  |
| surrgain                | 0.05830022  |
-----------------------------------------
********** Iteration 189 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.665 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0275          0
         1     0.0631      0.179
         2     0.0561      0.545
         3     0.0493      0.757


********** Iteration 195 ************
Optimizing Policy...
[35msampling[0m
Eval num_timesteps=199680, episode_reward=-4.90 +/- 0.30
Episode length: 676.50 +/- 100.27
[35mdone in 5.683 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.012 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0456          0
         1     0.0318      0.099
         2     0.0451      0.237
         3     0.0368      0.455
         4     0.0277      0.653
         5     0.0236      0.831
         6     0.0159       0.97
         7     0.0215       1.28
         8     0.0305       1.48
         9     0.0108       1.78
        10    0.00944       2.08
[35mdone in 0.021 seconds[0m
Expected: 0.064 Actual: 0.063
Stepsize OK!
[35mvf[0m
[35mdone in 0.043 seconds[0m
-----------------------------------------
| EpLenMean               | 648         |
| EpRewMean               | -5.03       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 317      

Stepsize OK!
[35mvf[0m
[35mdone in 0.047 seconds[0m
-----------------------------------------
| EpLenMean               | 675         |
| EpRewMean               | -4.9        |
| EpThisIter              | 1           |
| EpisodesSoFar           | 325         |
| TimeElapsed             | 198         |
| TimestepsSoFar          | 206848      |
| entloss                 | 0.0         |
| entropy                 | 1.5284855   |
| explained_variance_t... | 0.71        |
| meankl                  | 0.010913643 |
| optimgain               | 0.064967364 |
| surrgain                | 0.064967364 |
-----------------------------------------
********** Iteration 202 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.795 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0267          0
         1     0.0523      0.158
         2      0.033      0.388
         3     0.0331      0.587


********** Iteration 208 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.782 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0307          0
         1     0.0319     0.0839
         2     0.0396      0.258
         3     0.0376      0.445
         4     0.0311      0.728
         5     0.0319      0.911
         6     0.0291       1.18
         7     0.0292        1.5
         8     0.0163       1.78
         9     0.0253       2.08
        10     0.0175       2.52
[35mdone in 0.026 seconds[0m
Expected: 0.067 Actual: 0.070
Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 698         |
| EpRewMean               | -4.88       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 335         |
| TimeElapsed             | 204         |
| TimestepsSoFar          | 214016      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.046 seconds[0m
----------------------------------------
| EpLenMean               | 681        |
| EpRewMean               | -4.8       |
| EpThisIter              | 1          |
| EpisodesSoFar           | 345        |
| TimeElapsed             | 209        |
| TimestepsSoFar          | 220160     |
| entloss                 | 0.0        |
| entropy                 | 1.448329   |
| explained_variance_t... | 0.557      |
| meankl                  | 0.01131312 |
| optimgain               | 0.06410972 |
| surrgain                | 0.06410972 |
----------------------------------------
********** Iteration 215 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.710 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.012 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0324          0
         1      0.047     0.0867
         2     0.0725      0.288
         3     0.0434      0.466
         4    

********** Iteration 221 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.650 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0       0.06          0
         1     0.0555      0.148
         2     0.0548      0.328
         3     0.0431      0.529
         4     0.0314       0.72
         5     0.0268      0.984
         6     0.0205       1.19
         7     0.0284       1.64
         8     0.0335       1.94
         9     0.0153       2.27
        10     0.0176       2.52
[35mdone in 0.020 seconds[0m
Expected: 0.076 Actual: 0.076
Stepsize OK!
[35mvf[0m
[35mdone in 0.039 seconds[0m
-----------------------------------------
| EpLenMean               | 701         |
| EpRewMean               | -4.8        |
| EpThisIter              | 2           |
| EpisodesSoFar           | 355         |
| TimeElapsed             | 219         |
| TimestepsSoFar          | 227328      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.044 seconds[0m
-----------------------------------------
| EpLenMean               | 677         |
| EpRewMean               | -4.8        |
| EpThisIter              | 2           |
| EpisodesSoFar           | 365         |
| TimeElapsed             | 224         |
| TimestepsSoFar          | 233472      |
| entloss                 | 0.0         |
| entropy                 | 1.4805977   |
| explained_variance_t... | 0.728       |
| meankl                  | 0.011401998 |
| optimgain               | 0.05912342  |
| surrgain                | 0.05912342  |
-----------------------------------------
********** Iteration 228 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.640 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0628          0
         1     0.0382     0.0939
         2      0.055      0.305
         3      0.044      0.471


********** Iteration 234 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.695 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0586          0
         1     0.0277     0.0823
         2     0.0248      0.206
         3     0.0407      0.365
         4     0.0423      0.658
         5     0.0218      0.894
         6     0.0147       1.09
         7     0.0347       1.31
         8     0.0191        1.7
         9     0.0149       1.91
        10     0.0132       2.22
[35mdone in 0.031 seconds[0m
Expected: 0.064 Actual: 0.056
Stepsize OK!
[35mvf[0m
[35mdone in 0.052 seconds[0m
-----------------------------------------
| EpLenMean               | 667         |
| EpRewMean               | -4.85       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 375         |
| TimeElapsed             | 229         |
| TimestepsSoFar          | 240640      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
-----------------------------------------
| EpLenMean               | 685         |
| EpRewMean               | -4.9        |
| EpThisIter              | 2           |
| EpisodesSoFar           | 384         |
| TimeElapsed             | 234         |
| TimestepsSoFar          | 246784      |
| entloss                 | 0.0         |
| entropy                 | 1.4182637   |
| explained_variance_t... | 0.808       |
| meankl                  | 0.011559807 |
| optimgain               | 0.069661856 |
| surrgain                | 0.069661856 |
-----------------------------------------
********** Iteration 241 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.659 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0444          0
         1      0.028     0.0977
         2     0.0337      0.235
         3     0.0338      0.454


********** Iteration 247 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.696 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.012 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.042          0
         1     0.0621      0.105
         2     0.0502      0.244
         3     0.0447      0.384
         4     0.0786      0.623
         5     0.0303      0.933
         6     0.0197       1.09
         7     0.0171       1.27
         8     0.0141       1.49
         9     0.0205       1.65
        10    0.00873       1.87
[35mdone in 0.030 seconds[0m
Expected: 0.065 Actual: 0.058
Stepsize OK!
[35mvf[0m
[35mdone in 0.050 seconds[0m
-----------------------------------------
| EpLenMean               | 667         |
| EpRewMean               | -4.92       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 395         |
| TimeElapsed             | 244         |
| TimestepsSoFar          | 253952      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.043 seconds[0m
-----------------------------------------
| EpLenMean               | 690         |
| EpRewMean               | -4.9        |
| EpThisIter              | 1           |
| EpisodesSoFar           | 403         |
| TimeElapsed             | 248         |
| TimestepsSoFar          | 260096      |
| entloss                 | 0.0         |
| entropy                 | 1.3290308   |
| explained_variance_t... | 0.772       |
| meankl                  | 0.012266462 |
| optimgain               | 0.06214451  |
| surrgain                | 0.06214451  |
-----------------------------------------
********** Iteration 254 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.669 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0668          0
         1     0.0291     0.0856
         2     0.0437      0.288
         3     0.0269      0.458


********** Iteration 260 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.671 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.012 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0567          0
         1     0.0327     0.0997
         2     0.0655      0.299
         3     0.0347      0.543
         4     0.0314      0.788
         5     0.0404       1.05
         6     0.0286       1.26
         7     0.0233       1.46
         8     0.0127        1.7
         9     0.0201       1.94
        10     0.0145       2.19
[35mdone in 0.020 seconds[0m
Expected: 0.070 Actual: 0.067
Stepsize OK!
[35mvf[0m
[35mdone in 0.043 seconds[0m
-----------------------------------------
| EpLenMean               | 681         |
| EpRewMean               | -4.9        |
| EpThisIter              | 2           |
| EpisodesSoFar           | 414         |
| TimeElapsed             | 254         |
| TimestepsSoFar          | 267264      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.042 seconds[0m
-----------------------------------------
| EpLenMean               | 681         |
| EpRewMean               | -4.88       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 423         |
| TimeElapsed             | 258         |
| TimestepsSoFar          | 273408      |
| entloss                 | 0.0         |
| entropy                 | 1.2961203   |
| explained_variance_t... | 0.653       |
| meankl                  | 0.011247847 |
| optimgain               | 0.07494237  |
| surrgain                | 0.07494237  |
-----------------------------------------
********** Iteration 267 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.691 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0393          0
         1     0.0368      0.096
         2     0.0412      0.265
         3     0.0373      0.515


********** Iteration 273 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.701 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.056          0
         1     0.0553      0.162
         2      0.123      0.396
         3      0.102       0.73
         4     0.0794       1.11
         5      0.047       1.44
         6     0.0646       1.76
         7     0.0677       2.09
         8     0.0404       2.36
         9     0.0331       2.71
        10     0.0482       2.98
[35mdone in 0.024 seconds[0m
Expected: 0.091 Actual: 0.080
Stepsize OK!
[35mvf[0m
[35mdone in 0.042 seconds[0m
-----------------------------------------
| EpLenMean               | 670         |
| EpRewMean               | -4.85       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 435         |
| TimeElapsed             | 269         |
| TimestepsSoFar          | 280576      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.046 seconds[0m
-----------------------------------------
| EpLenMean               | 676         |
| EpRewMean               | -4.83       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 443         |
| TimeElapsed             | 274         |
| TimestepsSoFar          | 286720      |
| entloss                 | 0.0         |
| entropy                 | 1.3326565   |
| explained_variance_t... | 0.749       |
| meankl                  | 0.009189706 |
| optimgain               | 0.06491488  |
| surrgain                | 0.06491488  |
-----------------------------------------
********** Iteration 280 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.696 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0       0.07          0
         1     0.0832      0.175
         2     0.0561      0.373
         3     0.0767      0.554


********** Iteration 286 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.770 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0467          0
         1     0.0673      0.176
         2     0.0524      0.424
         3     0.0788      0.854
         4      0.041       1.18
         5      0.036       1.38
         6     0.0762       1.76
         7     0.0378       2.25
         8     0.0203       2.66
         9     0.0546       3.03
        10     0.0173       3.38
[35mdone in 0.023 seconds[0m
Expected: 0.090 Actual: 0.132
Stepsize OK!
[35mvf[0m
[35mdone in 0.045 seconds[0m
-----------------------------------------
| EpLenMean               | 671         |
| EpRewMean               | -4.85       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 454         |
| TimeElapsed             | 280         |
| TimestepsSoFar          | 293888      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
-----------------------------------------
| EpLenMean               | 658         |
| EpRewMean               | -4.88       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 464         |
| TimeElapsed             | 289         |
| TimestepsSoFar          | 300032      |
| entloss                 | 0.0         |
| entropy                 | 1.4539186   |
| explained_variance_t... | 0.789       |
| meankl                  | 0.010476616 |
| optimgain               | 0.0700099   |
| surrgain                | 0.0700099   |
-----------------------------------------
********** Iteration 293 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.653 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0347          0
         1     0.0327      0.118
         2     0.0265      0.231
         3     0.0639      0.595


********** Iteration 299 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.690 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0509          0
         1      0.042      0.105
         2     0.0371      0.335
         3     0.0347       0.62
         4     0.0427      0.891
         5      0.024       1.12
         6     0.0228       1.53
         7     0.0242       1.79
         8     0.0251       2.01
         9     0.0271       2.43
        10     0.0115        2.7
[35mdone in 0.022 seconds[0m
Expected: 0.075 Actual: 0.072
Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 660         |
| EpRewMean               | -4.83       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 474         |
| TimeElapsed             | 294         |
| TimestepsSoFar          | 307200      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 652         |
| EpRewMean               | -4.88       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 483         |
| TimeElapsed             | 299         |
| TimestepsSoFar          | 313344      |
| entloss                 | 0.0         |
| entropy                 | 1.2691257   |
| explained_variance_t... | 0.855       |
| meankl                  | 0.010651601 |
| optimgain               | 0.07521863  |
| surrgain                | 0.07521863  |
-----------------------------------------
********** Iteration 306 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.704 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0378          0
         1     0.0611      0.143
         2     0.0404       0.33
         3     0.0405      0.497


********** Iteration 312 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.683 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.104          0
         1     0.0432      0.118
         2     0.0857      0.285
         3     0.0706      0.619
         4      0.119      0.984
         5     0.0577        1.4
         6     0.0587       1.69
         7     0.0452       2.12
         8     0.0654        2.5
         9     0.0354       2.99
        10     0.0401       3.33
[35mdone in 0.021 seconds[0m
Expected: 0.094 Actual: 0.084
violated KL constraint. shrinking step.
Expected: 0.094 Actual: 0.045
Stepsize OK!
[35mvf[0m
[35mdone in 0.045 seconds[0m
------------------------------------------
| EpLenMean               | 668          |
| EpRewMean               | -4.85        |
| EpThisIter              | 2            |
| EpisodesSoFar           | 494          |
| TimeElaps

Stepsize OK!
[35mvf[0m
[35mdone in 0.043 seconds[0m
-----------------------------------------
| EpLenMean               | 669         |
| EpRewMean               | -4.92       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 503         |
| TimeElapsed             | 314         |
| TimestepsSoFar          | 326656      |
| entloss                 | 0.0         |
| entropy                 | 1.327767    |
| explained_variance_t... | 0.732       |
| meankl                  | 0.011389295 |
| optimgain               | 0.07862818  |
| surrgain                | 0.07862818  |
-----------------------------------------
********** Iteration 319 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.656 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0497          0
         1     0.0415      0.176
         2     0.0346      0.337
         3     0.0457      0.582


********** Iteration 325 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.670 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.012 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0553          0
         1     0.0416     0.0887
         2     0.0739      0.321
         3     0.0783      0.721
         4     0.0494      0.996
         5     0.0751       1.34
         6     0.0841       1.76
         7     0.0345        2.2
         8     0.0432       2.57
         9     0.0315       3.06
        10     0.0289        3.6
[35mdone in 0.022 seconds[0m
Expected: 0.091 Actual: 0.093
Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 686         |
| EpRewMean               | -4.95       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 513         |
| TimeElapsed             | 319         |
| TimestepsSoFar          | 333824      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
-----------------------------------------
| EpLenMean               | 684         |
| EpRewMean               | -4.95       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 522         |
| TimeElapsed             | 324         |
| TimestepsSoFar          | 339968      |
| entloss                 | 0.0         |
| entropy                 | 1.2876282   |
| explained_variance_t... | 0.723       |
| meankl                  | 0.013271882 |
| optimgain               | 0.065218545 |
| surrgain                | 0.065218545 |
-----------------------------------------
********** Iteration 332 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.659 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.071          0
         1     0.0473      0.153
         2     0.0657      0.288
         3     0.0586      0.458


********** Iteration 338 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.667 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0436          0
         1     0.0448      0.106
         2     0.0767      0.267
         3      0.056      0.607
         4     0.0759      0.791
         5     0.0555       1.15
         6     0.0434       1.44
         7     0.0326       1.73
         8     0.0282       1.98
         9     0.0202       2.21
        10     0.0152       2.63
[35mdone in 0.022 seconds[0m
Expected: 0.078 Actual: 0.069
Stepsize OK!
[35mvf[0m
[35mdone in 0.038 seconds[0m
-----------------------------------------
| EpLenMean               | 657         |
| EpRewMean               | -4.97       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 534         |
| TimeElapsed             | 329         |
| TimestepsSoFar          | 347136      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 677         |
| EpRewMean               | -5          |
| EpThisIter              | 2           |
| EpisodesSoFar           | 543         |
| TimeElapsed             | 339         |
| TimestepsSoFar          | 353280      |
| entloss                 | 0.0         |
| entropy                 | 1.2752566   |
| explained_variance_t... | 0.807       |
| meankl                  | 0.011531013 |
| optimgain               | 0.051583238 |
| surrgain                | 0.051583238 |
-----------------------------------------
********** Iteration 345 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.721 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0585          0
         1     0.0924      0.129
         2     0.0626      0.287
         3     0.0594      0.502


********** Iteration 351 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.642 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0397          0
         1     0.0533      0.113
         2       0.07      0.312
         3     0.0518      0.627
         4     0.0938      0.898
         5     0.0397       1.14
         6      0.032       1.38
         7     0.0284       1.68
         8     0.0228          2
         9     0.0256       2.22
        10     0.0215       2.67
[35mdone in 0.022 seconds[0m
Expected: 0.078 Actual: 0.080
Stepsize OK!
[35mvf[0m
[35mdone in 0.039 seconds[0m
-----------------------------------------
| EpLenMean               | 674         |
| EpRewMean               | -4.95       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 553         |
| TimeElapsed             | 344         |
| TimestepsSoFar          | 360448      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.037 seconds[0m
-----------------------------------------
| EpLenMean               | 670         |
| EpRewMean               | -4.97       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 562         |
| TimeElapsed             | 349         |
| TimestepsSoFar          | 366592      |
| entloss                 | 0.0         |
| entropy                 | 1.2399864   |
| explained_variance_t... | 0.672       |
| meankl                  | 0.012119148 |
| optimgain               | 0.07129601  |
| surrgain                | 0.07129601  |
-----------------------------------------
********** Iteration 358 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.647 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0364          0
         1     0.0638      0.177
         2     0.0294      0.338
         3     0.0519      0.667


********** Iteration 364 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.674 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0712          0
         1     0.0386     0.0994
         2     0.0427      0.186
         3     0.0445      0.399
         4     0.0552      0.598
         5     0.0441       0.77
         6     0.0428       1.09
         7     0.0274       1.61
         8     0.0308          2
         9     0.0231       2.27
        10     0.0209       2.64
[35mdone in 0.023 seconds[0m
Expected: 0.075 Actual: 0.111
Stepsize OK!
[35mvf[0m
[35mdone in 0.044 seconds[0m
-----------------------------------------
| EpLenMean               | 687         |
| EpRewMean               | -4.88       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 573         |
| TimeElapsed             | 354         |
| TimestepsSoFar          | 373760      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
-----------------------------------------
| EpLenMean               | 678         |
| EpRewMean               | -4.83       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 582         |
| TimeElapsed             | 364         |
| TimestepsSoFar          | 379904      |
| entloss                 | 0.0         |
| entropy                 | 1.2528739   |
| explained_variance_t... | 0.603       |
| meankl                  | 0.012788855 |
| optimgain               | 0.08791606  |
| surrgain                | 0.08791606  |
-----------------------------------------
********** Iteration 371 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.657 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0284          0
         1     0.0622      0.105
         2     0.0706      0.355
         3     0.0428      0.638


********** Iteration 377 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.675 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0322          0
         1     0.0485      0.165
         2     0.0545      0.399
         3     0.0364      0.565
         4     0.0228       0.71
         5     0.0256      0.907
         6     0.0233       1.15
         7     0.0197       1.37
         8     0.0169       1.57
         9     0.0136       1.81
        10     0.0133       1.99
[35mdone in 0.022 seconds[0m
Expected: 0.065 Actual: 0.063
Stepsize OK!
[35mvf[0m
[35mdone in 0.038 seconds[0m
-----------------------------------------
| EpLenMean               | 666         |
| EpRewMean               | -4.88       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 593         |
| TimeElapsed             | 369         |
| TimestepsSoFar          | 387072      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 659         |
| EpRewMean               | -4.88       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 602         |
| TimeElapsed             | 374         |
| TimestepsSoFar          | 393216      |
| entloss                 | 0.0         |
| entropy                 | 1.2484194   |
| explained_variance_t... | 0.613       |
| meankl                  | 0.012059789 |
| optimgain               | 0.08842527  |
| surrgain                | 0.08842527  |
-----------------------------------------
********** Iteration 384 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.693 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.078          0
         1     0.0483      0.126
         2     0.0768      0.308
         3      0.051      0.541


********** Iteration 390 ************
Optimizing Policy...
[35msampling[0m
Eval num_timesteps=399360, episode_reward=-4.60 +/- 0.49
Episode length: 728.10 +/- 188.40
[35mdone in 5.834 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.008 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.034          0
         1     0.0309     0.0905
         2     0.0363       0.28
         3     0.0372      0.421
         4     0.0369        0.7
         5     0.0351      0.992
         6     0.0299       1.23
         7     0.0345       1.54
         8      0.018       1.76
         9     0.0191       2.04
        10     0.0181       2.29
[35mdone in 0.020 seconds[0m
Expected: 0.066 Actual: 0.057
Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
-----------------------------------------
| EpLenMean               | 646         |
| EpRewMean               | -4.92       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 613      

Stepsize OK!
[35mvf[0m
[35mdone in 0.047 seconds[0m
-----------------------------------------
| EpLenMean               | 668         |
| EpRewMean               | -5          |
| EpThisIter              | 2           |
| EpisodesSoFar           | 622         |
| TimeElapsed             | 389         |
| TimestepsSoFar          | 406528      |
| entloss                 | 0.0         |
| entropy                 | 1.200547    |
| explained_variance_t... | 0.777       |
| meankl                  | 0.013397544 |
| optimgain               | 0.08706061  |
| surrgain                | 0.08706061  |
-----------------------------------------
********** Iteration 397 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.669 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0426          0
         1     0.0584     0.0995
         2     0.0694      0.219
         3     0.0544       0.46


********** Iteration 403 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.732 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.096          0
         1      0.114      0.154
         2      0.119      0.321
         3     0.0694      0.592
         4     0.0562      0.773
         5     0.0728       1.11
         6     0.0551       1.29
         7     0.0472       1.59
         8     0.0272       1.88
         9     0.0282       2.55
        10      0.034       2.96
[35mdone in 0.025 seconds[0m
Expected: 0.094 Actual: 0.103
Stepsize OK!
[35mvf[0m
[35mdone in 0.047 seconds[0m
-----------------------------------------
| EpLenMean               | 664         |
| EpRewMean               | -4.97       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 633         |
| TimeElapsed             | 395         |
| TimestepsSoFar          | 413696      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.047 seconds[0m
-----------------------------------------
| EpLenMean               | 670         |
| EpRewMean               | -4.95       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 642         |
| TimeElapsed             | 400         |
| TimestepsSoFar          | 419840      |
| entloss                 | 0.0         |
| entropy                 | 1.2164156   |
| explained_variance_t... | 0.717       |
| meankl                  | 0.012042198 |
| optimgain               | 0.079402015 |
| surrgain                | 0.079402015 |
-----------------------------------------
********** Iteration 410 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.751 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0588          0
         1     0.0592      0.109
         2     0.0537      0.267
         3     0.0425      0.424


********** Iteration 416 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.679 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0408          0
         1     0.0344     0.0998
         2     0.0311      0.185
         3     0.0326      0.341
         4     0.0233      0.545
         5     0.0308      0.765
         6     0.0552      0.992
         7     0.0286       1.38
         8      0.019       1.69
         9     0.0154       2.14
        10     0.0132       2.43
[35mdone in 0.023 seconds[0m
Expected: 0.066 Actual: 0.059
Stepsize OK!
[35mvf[0m
[35mdone in 0.038 seconds[0m
-----------------------------------------
| EpLenMean               | 693         |
| EpRewMean               | -4.9        |
| EpThisIter              | 1           |
| EpisodesSoFar           | 652         |
| TimeElapsed             | 410         |
| TimestepsSoFar          | 427008      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.037 seconds[0m
-----------------------------------------
| EpLenMean               | 667         |
| EpRewMean               | -4.83       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 662         |
| TimeElapsed             | 414         |
| TimestepsSoFar          | 433152      |
| entloss                 | 0.0         |
| entropy                 | 1.2427652   |
| explained_variance_t... | 0.616       |
| meankl                  | 0.014460784 |
| optimgain               | 0.06410503  |
| surrgain                | 0.06410503  |
-----------------------------------------
********** Iteration 423 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.671 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0671          0
         1     0.0571     0.0762
         2     0.0472      0.174
         3     0.0487      0.438


********** Iteration 429 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.669 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0703          0
         1     0.0528     0.0778
         2     0.0567      0.166
         3      0.051      0.344
         4     0.0596      0.554
         5     0.0835      0.892
         6     0.0266       1.19
         7      0.027       1.54
         8     0.0331       1.94
         9     0.0278        2.3
        10     0.0262        2.6
[35mdone in 0.021 seconds[0m
Expected: 0.077 Actual: 0.063
Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
-----------------------------------------
| EpLenMean               | 664         |
| EpRewMean               | -4.85       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 673         |
| TimeElapsed             | 420         |
| TimestepsSoFar          | 440320      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.049 seconds[0m
-----------------------------------------
| EpLenMean               | 660         |
| EpRewMean               | -4.9        |
| EpThisIter              | 2           |
| EpisodesSoFar           | 683         |
| TimeElapsed             | 424         |
| TimestepsSoFar          | 446464      |
| entloss                 | 0.0         |
| entropy                 | 1.2075257   |
| explained_variance_t... | 0.757       |
| meankl                  | 0.011946578 |
| optimgain               | 0.06735355  |
| surrgain                | 0.06735355  |
-----------------------------------------
********** Iteration 436 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.760 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.010 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0934          0
         1     0.0461     0.0836
         2     0.0563      0.177
         3     0.0343      0.342


********** Iteration 442 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.675 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.011 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0238          0
         1     0.0623      0.127
         2     0.0825      0.403
         3     0.0362      0.683
         4     0.0342      0.867
         5     0.0339       1.03
         6     0.0381       1.25
         7     0.0156       1.59
         8     0.0179       2.03
         9     0.0255       2.31
        10     0.0122       2.57
[35mdone in 0.022 seconds[0m
Expected: 0.072 Actual: 0.066
Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 632         |
| EpRewMean               | -4.97       |
| EpThisIter              | 1           |
| EpisodesSoFar           | 694         |
| TimeElapsed             | 434         |
| TimestepsSoFar          | 453632      |
| 

Stepsize OK!
[35mvf[0m
[35mdone in 0.043 seconds[0m
-----------------------------------------
| EpLenMean               | 633         |
| EpRewMean               | -4.97       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 704         |
| TimeElapsed             | 439         |
| TimestepsSoFar          | 459776      |
| entloss                 | 0.0         |
| entropy                 | 1.1582243   |
| explained_variance_t... | 0.805       |
| meankl                  | 0.009993459 |
| optimgain               | 0.065453276 |
| surrgain                | 0.065453276 |
-----------------------------------------
********** Iteration 449 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.725 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.008 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0829          0
         1     0.0468      0.133
         2     0.0491      0.258
         3     0.0769       0.41


********** Iteration 455 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.685 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0525          0
         1     0.0367     0.0748
         2     0.0339      0.178
         3     0.0416      0.332
         4     0.0239       0.45
         5     0.0148      0.573
         6     0.0317      0.739
         7     0.0136       1.02
         8     0.0208        1.2
         9    0.00937       1.56
        10    0.00789       1.74
[35mdone in 0.023 seconds[0m
Expected: 0.057 Actual: 0.064
Stepsize OK!
[35mvf[0m
[35mdone in 0.042 seconds[0m
-----------------------------------------
| EpLenMean               | 612         |
| EpRewMean               | -4.92       |
| EpThisIter              | 2           |
| EpisodesSoFar           | 716         |
| TimeElapsed             | 445         |
| TimestepsSoFar          | 466944      |
| 

violated KL constraint. shrinking step.
Expected: 0.075 Actual: 0.039
Stepsize OK!
[35mvf[0m
[35mdone in 0.047 seconds[0m
------------------------------------------
| EpLenMean               | 622          |
| EpRewMean               | -4.9         |
| EpThisIter              | 2            |
| EpisodesSoFar           | 726          |
| TimeElapsed             | 449          |
| TimestepsSoFar          | 473088       |
| entloss                 | 0.0          |
| entropy                 | 1.1718634    |
| explained_variance_t... | 0.68         |
| meankl                  | 0.0037844372 |
| optimgain               | 0.03917236   |
| surrgain                | 0.03917236   |
------------------------------------------
********** Iteration 462 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.688 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0603          0
         1     

********** Iteration 468 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.667 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.009 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0618          0
         1     0.0449      0.117
         2     0.0664      0.274
         3     0.0555       0.49
         4     0.0399      0.639
         5     0.0418       0.84
         6     0.0406      0.993
         7      0.039        1.3
         8     0.0185       1.55
         9     0.0225       1.87
        10     0.0153       2.13
[35mdone in 0.021 seconds[0m
Expected: 0.071 Actual: 0.058
Stepsize OK!
[35mvf[0m
[35mdone in 0.040 seconds[0m
------------------------------------------
| EpLenMean               | 630          |
| EpRewMean               | -4.85        |
| EpThisIter              | 2            |
| EpisodesSoFar           | 736          |
| TimeElapsed             | 459          |
| TimestepsSoFar          | 480256    

Stepsize OK!
[35mvf[0m
[35mdone in 0.041 seconds[0m
-----------------------------------------
| EpLenMean               | 642         |
| EpRewMean               | -4.8        |
| EpThisIter              | 1           |
| EpisodesSoFar           | 745         |
| TimeElapsed             | 464         |
| TimestepsSoFar          | 486400      |
| entloss                 | 0.0         |
| entropy                 | 1.1935837   |
| explained_variance_t... | 0.714       |
| meankl                  | 0.013716961 |
| optimgain               | 0.1026043   |
| surrgain                | 0.1026043   |
-----------------------------------------
********** Iteration 475 ************
Optimizing Policy...
[35msampling[0m


In [None]:
# Roll out the trained TRPO agent for a few episodes, write the rendered
# frames to an mp4 file, and show that video inline as this cell's output.
video_filename = 'trpo.mp4'

# Use a dedicated, named environment for recording so it can be closed
# afterwards; an env created inline in the call is never released.
trpo_record_env = gym.make("SlimeVolley-v0")
try:
    record_game(
        # experiment() stores each model in trained_model under its algo key.
        model=trained_model['trpo'],
        env=trpo_record_env,
        num_episodes=5,
        video_filename=video_filename,
    )
finally:
    # Release the environment even if recording fails partway through.
    trpo_record_env.close()

# Must stay the final top-level expression so the notebook renders the HTML.
embed_mp4(video_filename)