In [12]:
import gymnasium as gym
from imitation.algorithms.adversarial.gail import GAIL
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.rewards.reward_nets import BasicRewardNet
from stable_baselines3 import PPO
from stable_baselines3.a2c import MlpPolicy
from imitation.data import rollout
import numpy as np
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common import monitor
from Wordle import WordleEnv
import os

# Environment stack: a single WordleEnv wrapped in Monitor, then in
# RolloutInfoWrapper, exposed through a one-element DummyVecEnv.
# A single environment like this is only useful for simple testing.
env = monitor.Monitor(env=WordleEnv())
venv = DummyVecEnv(env_fns=[lambda: RolloutInfoWrapper(env)])
venv.render_mode = "human"

# Expert demonstrations are read from disk.
# (To use in-memory demonstrations instead: rollouts = trajectories)
# NOTE(review): allow_pickle=True unpickles arbitrary objects on load --
# only point this at a file you trust.
rollouts = np.load(file="data/trajectories_all.npy", allow_pickle=True)

# Flatten the loaded trajectories into reward-carrying transitions for GAIL.
transitions = rollout.flatten_trajectories_with_rew(rollouts)

In [13]:
# Fixed seed so policy initialisation and rollouts are repeatable across
# "Restart Kernel -> Run All" (the original used seed=None).
SEED = 42

# Generator: the PPO policy that GAIL trains to imitate the demonstrations.
# It is built on `venv` -- the same vectorized env handed to GAIL below --
# instead of the raw `env`, so a single env instance is not wrapped by two
# separate VecEnvs and the seed is applied to the env that is actually stepped.
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0004,
    gamma=0.95,
    n_epochs=5,
    seed=SEED,
)

# Discriminator: the reward network that GAIL trains to tell expert
# transitions from generated ones; its output provides the learner's reward.
reward_net = BasicRewardNet(
    observation_space=env.observation_space,
    action_space=env.action_space,
    normalize_input_layer=RunningNorm,
)

# GAIL trainer tying the generator, the discriminator and the expert data together.
gail_trainer = GAIL(
    demonstrations=transitions,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=8,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
    # Episodes do not all have the same length, so the fixed-horizon check is
    # disabled. The library warns that this can bias or confound results.
    allow_variable_horizon=True,
)

Running with `allow_variable_horizon` set to True. Some algorithms are biased towards shorter or longer episodes, which may significantly confound results. Additionally, even unbiased algorithms can exploit the information leak from the termination condition, producing spuriously high performance. See https://imitation.readthedocs.io/en/latest/getting-started/variable-horizon.html for more information.


In [16]:
# Baseline: per-episode returns of the untrained learner over 100 episodes.
# With return_episode_rewards=True the call returns (rewards, lengths);
# the episode lengths are discarded here.
learner_rewards_before_training, _ = evaluate_policy(
    model=learner,
    env=venv,
    n_eval_episodes=100,
    return_episode_rewards=True,
)

In [20]:
# Train GAIL for 100_000 environment steps, then evaluate the learner again
# on 100 episodes so the returns can be compared with the pre-training ones.
# (The previous comment claimed 800_000 steps, which did not match the call.)
gail_trainer.train(total_timesteps=100_000)
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 100, return_episode_rewards=True,
)

round:   0%|          | 0/48 [00:00<?, ?it/s]

------------------------------------------
| raw/                        |          |
|    gen/rollout/ep_len_mean  | 6        |
|    gen/rollout/ep_rew_mean  | -8.25    |
|    gen/time/fps             | 72       |
|    gen/time/iterations      | 1        |
|    gen/time/time_elapsed    | 28       |
|    gen/time/total_timesteps | 2048     |
------------------------------------------
--------------------------------------------------
| raw/                                |          |
|    disc/disc_acc                    | 0.499    |
|    disc/disc_acc_expert             | 0.998    |
|    disc/disc_acc_gen                | 0        |
|    disc/disc_entropy                | 0.692    |
|    disc/disc_loss                   | 0.695    |
|    disc/disc_proportion_expert_pred | 0.999    |
|    disc/disc_proportion_expert_true | 0.5      |
|    disc/global_step                 | 1        |
|    disc/n_expert                    | 1.02e+03 |
|    disc/n_generated                 | 1.02e+03 |
-

round:   2%|▏         | 1/48 [00:37<29:16, 37.37s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.12      |
|    gen/rollout/ep_rew_wrapped_mean | 4.38       |
|    gen/time/fps                    | 71         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 28         |
|    gen/time/total_timesteps        | 4096       |
|    gen/train/approx_kl             | 0.04380278 |
|    gen/train/clip_fraction         | 0.492      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.47      |
|    gen/train/explained_variance    | 0.0228     |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | -0.131     |
|    gen/train/n_updates             | 5          |
|    gen/train/policy_gradient_loss  | -0.113     |
|    gen/train/value_loss            | 0.383      |
------------

round:   4%|▍         | 2/48 [01:15<28:46, 37.53s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.17       |
|    gen/rollout/ep_rew_wrapped_mean | 4.47        |
|    gen/time/fps                    | 67          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 30          |
|    gen/time/total_timesteps        | 6144        |
|    gen/train/approx_kl             | 0.050360154 |
|    gen/train/clip_fraction         | 0.665       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -9.46       |
|    gen/train/explained_variance    | 0.875       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.0802     |
|    gen/train/n_updates             | 10          |
|    gen/train/policy_gradient_loss  | -0.118      |
|    gen/train/value_loss            | 0.112  

round:   6%|▋         | 3/48 [01:54<28:45, 38.35s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.31      |
|    gen/rollout/ep_rew_wrapped_mean | 4.31       |
|    gen/time/fps                    | 66         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 30         |
|    gen/time/total_timesteps        | 8192       |
|    gen/train/approx_kl             | 0.05569288 |
|    gen/train/clip_fraction         | 0.705      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.46      |
|    gen/train/explained_variance    | 0.895      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | -0.105     |
|    gen/train/n_updates             | 15         |
|    gen/train/policy_gradient_loss  | -0.121     |
|    gen/train/value_loss            | 0.122      |
------------

round:   8%|▊         | 4/48 [02:34<28:36, 39.02s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.36      |
|    gen/rollout/ep_rew_wrapped_mean | 3.9        |
|    gen/time/fps                    | 57         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 35         |
|    gen/time/total_timesteps        | 10240      |
|    gen/train/approx_kl             | 0.07030367 |
|    gen/train/clip_fraction         | 0.708      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.45      |
|    gen/train/explained_variance    | 0.848      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | -0.0725    |
|    gen/train/n_updates             | 20         |
|    gen/train/policy_gradient_loss  | -0.121     |
|    gen/train/value_loss            | 0.182      |
------------

round:  10%|█         | 5/48 [03:21<29:57, 41.79s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.43      |
|    gen/rollout/ep_rew_wrapped_mean | 3.53       |
|    gen/time/fps                    | 53         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 38         |
|    gen/time/total_timesteps        | 12288      |
|    gen/train/approx_kl             | 0.07712114 |
|    gen/train/clip_fraction         | 0.689      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.44      |
|    gen/train/explained_variance    | 0.696      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.0198     |
|    gen/train/n_updates             | 25         |
|    gen/train/policy_gradient_loss  | -0.113     |
|    gen/train/value_loss            | 0.405      |
------------

round:  12%|█▎        | 6/48 [04:12<31:29, 44.99s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.41      |
|    gen/rollout/ep_rew_wrapped_mean | 2.95       |
|    gen/time/fps                    | 55         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 36         |
|    gen/time/total_timesteps        | 14336      |
|    gen/train/approx_kl             | 0.08671473 |
|    gen/train/clip_fraction         | 0.696      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.43      |
|    gen/train/explained_variance    | 0.56       |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.176      |
|    gen/train/n_updates             | 30         |
|    gen/train/policy_gradient_loss  | -0.113     |
|    gen/train/value_loss            | 0.599      |
------------

round:  15%|█▍        | 7/48 [04:59<31:09, 45.60s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.1       |
|    gen/rollout/ep_rew_wrapped_mean | 2.54       |
|    gen/time/fps                    | 58         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 34         |
|    gen/time/total_timesteps        | 16384      |
|    gen/train/approx_kl             | 0.09061104 |
|    gen/train/clip_fraction         | 0.681      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.42      |
|    gen/train/explained_variance    | 0.378      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.68       |
|    gen/train/n_updates             | 35         |
|    gen/train/policy_gradient_loss  | -0.105     |
|    gen/train/value_loss            | 1.2        |
------------

round:  17%|█▋        | 8/48 [05:43<30:10, 45.26s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.41       |
|    gen/rollout/ep_rew_wrapped_mean | 2.57        |
|    gen/time/fps                    | 64          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 31          |
|    gen/time/total_timesteps        | 18432       |
|    gen/train/approx_kl             | 0.091727495 |
|    gen/train/clip_fraction         | 0.663       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -9.41       |
|    gen/train/explained_variance    | 0.285       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.476       |
|    gen/train/n_updates             | 40          |
|    gen/train/policy_gradient_loss  | -0.103      |
|    gen/train/value_loss            | 1.98   

round:  19%|█▉        | 9/48 [06:24<28:29, 43.83s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.12       |
|    gen/rollout/ep_rew_wrapped_mean | 2.68        |
|    gen/time/fps                    | 68          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 29          |
|    gen/time/total_timesteps        | 20480       |
|    gen/train/approx_kl             | 0.095568925 |
|    gen/train/clip_fraction         | 0.671       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -9.39       |
|    gen/train/explained_variance    | 0.313       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.384       |
|    gen/train/n_updates             | 45          |
|    gen/train/policy_gradient_loss  | -0.108      |
|    gen/train/value_loss            | 1.61   

round:  21%|██        | 10/48 [07:03<26:46, 42.27s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 5.97       |
|    gen/rollout/ep_rew_mean         | -8.22      |
|    gen/rollout/ep_rew_wrapped_mean | 2.62       |
|    gen/time/fps                    | 62         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 32         |
|    gen/time/total_timesteps        | 22528      |
|    gen/train/approx_kl             | 0.10266513 |
|    gen/train/clip_fraction         | 0.669      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.37      |
|    gen/train/explained_variance    | 0.259      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.56       |
|    gen/train/n_updates             | 50         |
|    gen/train/policy_gradient_loss  | -0.106     |
|    gen/train/value_loss            | 1.7        |
------------

round:  23%|██▎       | 11/48 [07:45<26:04, 42.29s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.17      |
|    gen/rollout/ep_rew_wrapped_mean | 2.29       |
|    gen/time/fps                    | 62         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 32         |
|    gen/time/total_timesteps        | 24576      |
|    gen/train/approx_kl             | 0.11291435 |
|    gen/train/clip_fraction         | 0.673      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.34      |
|    gen/train/explained_variance    | 0.26       |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.633      |
|    gen/train/n_updates             | 55         |
|    gen/train/policy_gradient_loss  | -0.101     |
|    gen/train/value_loss            | 1.68       |
------------

round:  25%|██▌       | 12/48 [08:27<25:22, 42.28s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.27      |
|    gen/rollout/ep_rew_wrapped_mean | 2.41       |
|    gen/time/fps                    | 66         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 30         |
|    gen/time/total_timesteps        | 26624      |
|    gen/train/approx_kl             | 0.11188843 |
|    gen/train/clip_fraction         | 0.669      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.3       |
|    gen/train/explained_variance    | 0.26       |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.263      |
|    gen/train/n_updates             | 60         |
|    gen/train/policy_gradient_loss  | -0.102     |
|    gen/train/value_loss            | 1.37       |
------------

round:  27%|██▋       | 13/48 [09:07<24:12, 41.50s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.38      |
|    gen/rollout/ep_rew_wrapped_mean | 1.9        |
|    gen/time/fps                    | 60         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 33         |
|    gen/time/total_timesteps        | 28672      |
|    gen/train/approx_kl             | 0.12613377 |
|    gen/train/clip_fraction         | 0.697      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.26      |
|    gen/train/explained_variance    | 0.27       |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.344      |
|    gen/train/n_updates             | 65         |
|    gen/train/policy_gradient_loss  | -0.103     |
|    gen/train/value_loss            | 1.2        |
------------

round:  29%|██▉       | 14/48 [09:51<23:55, 42.23s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.23      |
|    gen/rollout/ep_rew_wrapped_mean | 2.16       |
|    gen/time/fps                    | 64         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 31         |
|    gen/time/total_timesteps        | 30720      |
|    gen/train/approx_kl             | 0.12869218 |
|    gen/train/clip_fraction         | 0.692      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.21      |
|    gen/train/explained_variance    | 0.251      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.603      |
|    gen/train/n_updates             | 70         |
|    gen/train/policy_gradient_loss  | -0.101     |
|    gen/train/value_loss            | 1.27       |
------------

round:  31%|███▏      | 15/48 [10:33<23:09, 42.10s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.09       |
|    gen/rollout/ep_rew_wrapped_mean | 1.99        |
|    gen/time/fps                    | 59          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 34          |
|    gen/time/total_timesteps        | 32768       |
|    gen/train/approx_kl             | 0.118172236 |
|    gen/train/clip_fraction         | 0.672       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -9.14       |
|    gen/train/explained_variance    | 0.281       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.234       |
|    gen/train/n_updates             | 75          |
|    gen/train/policy_gradient_loss  | -0.101      |
|    gen/train/value_loss            | 1.08   

round:  33%|███▎      | 16/48 [11:17<22:52, 42.89s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.15      |
|    gen/rollout/ep_rew_wrapped_mean | 1.99       |
|    gen/time/fps                    | 64         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 31         |
|    gen/time/total_timesteps        | 34816      |
|    gen/train/approx_kl             | 0.11931125 |
|    gen/train/clip_fraction         | 0.678      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9.08      |
|    gen/train/explained_variance    | 0.28       |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.292      |
|    gen/train/n_updates             | 80         |
|    gen/train/policy_gradient_loss  | -0.0992    |
|    gen/train/value_loss            | 1.38       |
------------

round:  35%|███▌      | 17/48 [11:58<21:47, 42.17s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 5.96       |
|    gen/rollout/ep_rew_mean         | -7.75      |
|    gen/rollout/ep_rew_wrapped_mean | 2.05       |
|    gen/time/fps                    | 68         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 29         |
|    gen/time/total_timesteps        | 36864      |
|    gen/train/approx_kl             | 0.12228093 |
|    gen/train/clip_fraction         | 0.659      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -9         |
|    gen/train/explained_variance    | 0.3        |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.239      |
|    gen/train/n_updates             | 85         |
|    gen/train/policy_gradient_loss  | -0.099     |
|    gen/train/value_loss            | 1.15       |
------------

round:  38%|███▊      | 18/48 [12:37<20:34, 41.15s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.45       |
|    gen/rollout/ep_rew_wrapped_mean | 2.06        |
|    gen/time/fps                    | 65          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 31          |
|    gen/time/total_timesteps        | 38912       |
|    gen/train/approx_kl             | 0.123983055 |
|    gen/train/clip_fraction         | 0.654       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -8.92       |
|    gen/train/explained_variance    | 0.341       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.41        |
|    gen/train/n_updates             | 90          |
|    gen/train/policy_gradient_loss  | -0.0993     |
|    gen/train/value_loss            | 0.912  

round:  40%|███▉      | 19/48 [13:17<19:46, 40.93s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.17       |
|    gen/rollout/ep_rew_wrapped_mean | 2.01        |
|    gen/time/fps                    | 69          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 29          |
|    gen/time/total_timesteps        | 40960       |
|    gen/train/approx_kl             | 0.112066545 |
|    gen/train/clip_fraction         | 0.642       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -8.85       |
|    gen/train/explained_variance    | 0.312       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.225       |
|    gen/train/n_updates             | 95          |
|    gen/train/policy_gradient_loss  | -0.0952     |
|    gen/train/value_loss            | 0.979  

round:  42%|████▏     | 20/48 [13:56<18:45, 40.20s/it]

--------------------------------------------------
| raw/                               |           |
|    gen/rollout/ep_len_mean         | 6         |
|    gen/rollout/ep_rew_mean         | -8.04     |
|    gen/rollout/ep_rew_wrapped_mean | 2.23      |
|    gen/time/fps                    | 67        |
|    gen/time/iterations             | 1         |
|    gen/time/time_elapsed           | 30        |
|    gen/time/total_timesteps        | 43008     |
|    gen/train/approx_kl             | 0.1161676 |
|    gen/train/clip_fraction         | 0.661     |
|    gen/train/clip_range            | 0.2       |
|    gen/train/entropy_loss          | -8.78     |
|    gen/train/explained_variance    | 0.286     |
|    gen/train/learning_rate         | 0.0004    |
|    gen/train/loss                  | 0.397     |
|    gen/train/n_updates             | 100       |
|    gen/train/policy_gradient_loss  | -0.0945   |
|    gen/train/value_loss            | 1.02      |
-------------------------------

round:  44%|████▍     | 21/48 [14:35<17:57, 39.90s/it]

--------------------------------------------------
| raw/                               |           |
|    gen/rollout/ep_len_mean         | 6         |
|    gen/rollout/ep_rew_mean         | -8.44     |
|    gen/rollout/ep_rew_wrapped_mean | 1.86      |
|    gen/time/fps                    | 69        |
|    gen/time/iterations             | 1         |
|    gen/time/time_elapsed           | 29        |
|    gen/time/total_timesteps        | 45056     |
|    gen/train/approx_kl             | 0.1224586 |
|    gen/train/clip_fraction         | 0.672     |
|    gen/train/clip_range            | 0.2       |
|    gen/train/entropy_loss          | -8.67     |
|    gen/train/explained_variance    | 0.305     |
|    gen/train/learning_rate         | 0.0004    |
|    gen/train/loss                  | 0.199     |
|    gen/train/n_updates             | 105       |
|    gen/train/policy_gradient_loss  | -0.0996   |
|    gen/train/value_loss            | 0.935     |
-------------------------------

round:  46%|████▌     | 22/48 [15:13<17:07, 39.52s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -7.9        |
|    gen/rollout/ep_rew_wrapped_mean | 1.96        |
|    gen/time/fps                    | 67          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 30          |
|    gen/time/total_timesteps        | 47104       |
|    gen/train/approx_kl             | 0.107632995 |
|    gen/train/clip_fraction         | 0.643       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -8.59       |
|    gen/train/explained_variance    | 0.305       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.524       |
|    gen/train/n_updates             | 110         |
|    gen/train/policy_gradient_loss  | -0.0926     |
|    gen/train/value_loss            | 1.31   

round:  48%|████▊     | 23/48 [15:53<16:28, 39.53s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.35       |
|    gen/rollout/ep_rew_wrapped_mean | 2.05        |
|    gen/time/fps                    | 67          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 30          |
|    gen/time/total_timesteps        | 49152       |
|    gen/train/approx_kl             | 0.105569065 |
|    gen/train/clip_fraction         | 0.605       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -8.51       |
|    gen/train/explained_variance    | 0.301       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.374       |
|    gen/train/n_updates             | 115         |
|    gen/train/policy_gradient_loss  | -0.0896     |
|    gen/train/value_loss            | 0.896  

round:  50%|█████     | 24/48 [16:32<15:48, 39.52s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.05      |
|    gen/rollout/ep_rew_wrapped_mean | 1.69       |
|    gen/time/fps                    | 68         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 29         |
|    gen/time/total_timesteps        | 51200      |
|    gen/train/approx_kl             | 0.10723608 |
|    gen/train/clip_fraction         | 0.625      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -8.43      |
|    gen/train/explained_variance    | 0.325      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.161      |
|    gen/train/n_updates             | 120        |
|    gen/train/policy_gradient_loss  | -0.0889    |
|    gen/train/value_loss            | 0.752      |
------------

round:  52%|█████▏    | 25/48 [17:11<15:04, 39.34s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.14       |
|    gen/rollout/ep_rew_wrapped_mean | 1.3         |
|    gen/time/fps                    | 67          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 30          |
|    gen/time/total_timesteps        | 53248       |
|    gen/train/approx_kl             | 0.100925334 |
|    gen/train/clip_fraction         | 0.605       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -8.35       |
|    gen/train/explained_variance    | 0.324       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.294       |
|    gen/train/n_updates             | 125         |
|    gen/train/policy_gradient_loss  | -0.0842     |
|    gen/train/value_loss            | 0.725  

round:  54%|█████▍    | 26/48 [17:51<14:24, 39.29s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.13      |
|    gen/rollout/ep_rew_wrapped_mean | 2.07       |
|    gen/time/fps                    | 68         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 29         |
|    gen/time/total_timesteps        | 55296      |
|    gen/train/approx_kl             | 0.09823075 |
|    gen/train/clip_fraction         | 0.616      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -8.29      |
|    gen/train/explained_variance    | 0.313      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.415      |
|    gen/train/n_updates             | 130        |
|    gen/train/policy_gradient_loss  | -0.0862    |
|    gen/train/value_loss            | 1.11       |
------------

round:  56%|█████▋    | 27/48 [18:29<13:41, 39.12s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.26      |
|    gen/rollout/ep_rew_wrapped_mean | 1.47       |
|    gen/time/fps                    | 67         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 30         |
|    gen/time/total_timesteps        | 57344      |
|    gen/train/approx_kl             | 0.09759509 |
|    gen/train/clip_fraction         | 0.593      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -8.2       |
|    gen/train/explained_variance    | 0.279      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.37       |
|    gen/train/n_updates             | 135        |
|    gen/train/policy_gradient_loss  | -0.0825    |
|    gen/train/value_loss            | 0.823      |
------------

round:  58%|█████▊    | 28/48 [19:09<13:04, 39.22s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.5        |
|    gen/rollout/ep_rew_wrapped_mean | 1.69        |
|    gen/time/fps                    | 67          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 30          |
|    gen/time/total_timesteps        | 59392       |
|    gen/train/approx_kl             | 0.097358644 |
|    gen/train/clip_fraction         | 0.609       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -8.14       |
|    gen/train/explained_variance    | 0.256       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.229       |
|    gen/train/n_updates             | 140         |
|    gen/train/policy_gradient_loss  | -0.0829     |
|    gen/train/value_loss            | 1.06   

round:  60%|██████    | 29/48 [19:48<12:26, 39.27s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.32      |
|    gen/rollout/ep_rew_wrapped_mean | 1.58       |
|    gen/time/fps                    | 66         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 30         |
|    gen/time/total_timesteps        | 61440      |
|    gen/train/approx_kl             | 0.09000831 |
|    gen/train/clip_fraction         | 0.593      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -8.08      |
|    gen/train/explained_variance    | 0.283      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.298      |
|    gen/train/n_updates             | 145        |
|    gen/train/policy_gradient_loss  | -0.0817    |
|    gen/train/value_loss            | 1.06       |
------------

round:  62%|██████▎   | 30/48 [20:28<11:49, 39.44s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8          |
|    gen/rollout/ep_rew_wrapped_mean | 1.72        |
|    gen/time/fps                    | 68          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 29          |
|    gen/time/total_timesteps        | 63488       |
|    gen/train/approx_kl             | 0.088633135 |
|    gen/train/clip_fraction         | 0.571       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -8.01       |
|    gen/train/explained_variance    | 0.265       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.279       |
|    gen/train/n_updates             | 150         |
|    gen/train/policy_gradient_loss  | -0.079      |
|    gen/train/value_loss            | 1.06   

round:  65%|██████▍   | 31/48 [21:07<11:08, 39.32s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.18       |
|    gen/rollout/ep_rew_wrapped_mean | 1.4         |
|    gen/time/fps                    | 66          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 30          |
|    gen/time/total_timesteps        | 65536       |
|    gen/train/approx_kl             | 0.090699464 |
|    gen/train/clip_fraction         | 0.566       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -7.94       |
|    gen/train/explained_variance    | 0.296       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.232       |
|    gen/train/n_updates             | 155         |
|    gen/train/policy_gradient_loss  | -0.0769     |
|    gen/train/value_loss            | 0.828  

round:  67%|██████▋   | 32/48 [21:47<10:31, 39.46s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.11      |
|    gen/rollout/ep_rew_wrapped_mean | 1.68       |
|    gen/time/fps                    | 68         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 29         |
|    gen/time/total_timesteps        | 67584      |
|    gen/train/approx_kl             | 0.07953444 |
|    gen/train/clip_fraction         | 0.564      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -7.86      |
|    gen/train/explained_variance    | 0.306      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.414      |
|    gen/train/n_updates             | 160        |
|    gen/train/policy_gradient_loss  | -0.0753    |
|    gen/train/value_loss            | 1.14       |
------------

round:  69%|██████▉   | 33/48 [22:25<09:47, 39.20s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.16      |
|    gen/rollout/ep_rew_wrapped_mean | 1.17       |
|    gen/time/fps                    | 67         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 30         |
|    gen/time/total_timesteps        | 69632      |
|    gen/train/approx_kl             | 0.07654163 |
|    gen/train/clip_fraction         | 0.534      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -7.8       |
|    gen/train/explained_variance    | 0.278      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.853      |
|    gen/train/n_updates             | 165        |
|    gen/train/policy_gradient_loss  | -0.0728    |
|    gen/train/value_loss            | 1.11       |
------------

round:  71%|███████   | 34/48 [23:05<09:08, 39.20s/it]

--------------------------------------------------
| raw/                               |           |
|    gen/rollout/ep_len_mean         | 5.95      |
|    gen/rollout/ep_rew_mean         | -7.6      |
|    gen/rollout/ep_rew_wrapped_mean | 1.59      |
|    gen/time/fps                    | 66        |
|    gen/time/iterations             | 1         |
|    gen/time/time_elapsed           | 30        |
|    gen/time/total_timesteps        | 71680     |
|    gen/train/approx_kl             | 0.0766107 |
|    gen/train/clip_fraction         | 0.549     |
|    gen/train/clip_range            | 0.2       |
|    gen/train/entropy_loss          | -7.72     |
|    gen/train/explained_variance    | 0.292     |
|    gen/train/learning_rate         | 0.0004    |
|    gen/train/loss                  | 0.316     |
|    gen/train/n_updates             | 170       |
|    gen/train/policy_gradient_loss  | -0.0784   |
|    gen/train/value_loss            | 1.24      |
-------------------------------

round:  73%|███████▎  | 35/48 [23:45<08:33, 39.50s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -8.03      |
|    gen/rollout/ep_rew_wrapped_mean | 1.65       |
|    gen/time/fps                    | 68         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 30         |
|    gen/time/total_timesteps        | 73728      |
|    gen/train/approx_kl             | 0.07796724 |
|    gen/train/clip_fraction         | 0.53       |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -7.67      |
|    gen/train/explained_variance    | 0.271      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.443      |
|    gen/train/n_updates             | 175        |
|    gen/train/policy_gradient_loss  | -0.0738    |
|    gen/train/value_loss            | 1.38       |
------------

round:  75%|███████▌  | 36/48 [24:24<07:51, 39.33s/it]

--------------------------------------------------
| raw/                               |           |
|    gen/rollout/ep_len_mean         | 6         |
|    gen/rollout/ep_rew_mean         | -8.03     |
|    gen/rollout/ep_rew_wrapped_mean | 1.62      |
|    gen/time/fps                    | 69        |
|    gen/time/iterations             | 1         |
|    gen/time/time_elapsed           | 29        |
|    gen/time/total_timesteps        | 75776     |
|    gen/train/approx_kl             | 0.0665568 |
|    gen/train/clip_fraction         | 0.507     |
|    gen/train/clip_range            | 0.2       |
|    gen/train/entropy_loss          | -7.55     |
|    gen/train/explained_variance    | 0.321     |
|    gen/train/learning_rate         | 0.0004    |
|    gen/train/loss                  | 0.934     |
|    gen/train/n_updates             | 180       |
|    gen/train/policy_gradient_loss  | -0.0773   |
|    gen/train/value_loss            | 1.76      |
-------------------------------

round:  77%|███████▋  | 37/48 [25:02<07:10, 39.11s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -8.15       |
|    gen/rollout/ep_rew_wrapped_mean | 1.94        |
|    gen/time/fps                    | 65          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 31          |
|    gen/time/total_timesteps        | 77824       |
|    gen/train/approx_kl             | 0.073014185 |
|    gen/train/clip_fraction         | 0.537       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -7.23       |
|    gen/train/explained_variance    | 0.331       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.614       |
|    gen/train/n_updates             | 185         |
|    gen/train/policy_gradient_loss  | -0.0763     |
|    gen/train/value_loss            | 1.68   

round:  79%|███████▉  | 38/48 [25:43<06:34, 39.48s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -7.87      |
|    gen/rollout/ep_rew_wrapped_mean | 1.71       |
|    gen/time/fps                    | 67         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 30         |
|    gen/time/total_timesteps        | 79872      |
|    gen/train/approx_kl             | 0.07320762 |
|    gen/train/clip_fraction         | 0.536      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -6.77      |
|    gen/train/explained_variance    | 0.281      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.648      |
|    gen/train/n_updates             | 190        |
|    gen/train/policy_gradient_loss  | -0.0756    |
|    gen/train/value_loss            | 1.42       |
------------

round:  81%|████████▏ | 39/48 [26:22<05:54, 39.37s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -7.93      |
|    gen/rollout/ep_rew_wrapped_mean | 2.13       |
|    gen/time/fps                    | 70         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 29         |
|    gen/time/total_timesteps        | 81920      |
|    gen/train/approx_kl             | 0.06404711 |
|    gen/train/clip_fraction         | 0.541      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -6.25      |
|    gen/train/explained_variance    | 0.298      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.446      |
|    gen/train/n_updates             | 195        |
|    gen/train/policy_gradient_loss  | -0.0795    |
|    gen/train/value_loss            | 1.89       |
------------

round:  83%|████████▎ | 40/48 [27:01<05:13, 39.24s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -7.79       |
|    gen/rollout/ep_rew_wrapped_mean | 3.16        |
|    gen/time/fps                    | 67          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 30          |
|    gen/time/total_timesteps        | 83968       |
|    gen/train/approx_kl             | 0.053910382 |
|    gen/train/clip_fraction         | 0.577       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -5.72       |
|    gen/train/explained_variance    | 0.322       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.962       |
|    gen/train/n_updates             | 200         |
|    gen/train/policy_gradient_loss  | -0.0871     |
|    gen/train/value_loss            | 2.91   

round:  85%|████████▌ | 41/48 [27:40<04:34, 39.28s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -7.65       |
|    gen/rollout/ep_rew_wrapped_mean | 2.7         |
|    gen/time/fps                    | 66          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 30          |
|    gen/time/total_timesteps        | 86016       |
|    gen/train/approx_kl             | 0.060116597 |
|    gen/train/clip_fraction         | 0.64        |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -5.28       |
|    gen/train/explained_variance    | 0.332       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.401       |
|    gen/train/n_updates             | 205         |
|    gen/train/policy_gradient_loss  | -0.0913     |
|    gen/train/value_loss            | 1.55   

round:  88%|████████▊ | 42/48 [28:20<03:56, 39.43s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -7.42       |
|    gen/rollout/ep_rew_wrapped_mean | 2.49        |
|    gen/time/fps                    | 67          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 30          |
|    gen/time/total_timesteps        | 88064       |
|    gen/train/approx_kl             | 0.049678344 |
|    gen/train/clip_fraction         | 0.629       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -4.68       |
|    gen/train/explained_variance    | 0.408       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.552       |
|    gen/train/n_updates             | 210         |
|    gen/train/policy_gradient_loss  | -0.0893     |
|    gen/train/value_loss            | 1.38   

round:  90%|████████▉ | 43/48 [28:59<03:16, 39.36s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6          |
|    gen/rollout/ep_rew_mean         | -6.92      |
|    gen/rollout/ep_rew_wrapped_mean | 2.14       |
|    gen/time/fps                    | 64         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 31         |
|    gen/time/total_timesteps        | 90112      |
|    gen/train/approx_kl             | 0.05404295 |
|    gen/train/clip_fraction         | 0.553      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -3.88      |
|    gen/train/explained_variance    | 0.383      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | 0.304      |
|    gen/train/n_updates             | 215        |
|    gen/train/policy_gradient_loss  | -0.0757    |
|    gen/train/value_loss            | 1.01       |
------------

round:  92%|█████████▏| 44/48 [29:42<02:41, 40.32s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -7.2        |
|    gen/rollout/ep_rew_wrapped_mean | 2.51        |
|    gen/time/fps                    | 56          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 36          |
|    gen/time/total_timesteps        | 92160       |
|    gen/train/approx_kl             | 0.065875076 |
|    gen/train/clip_fraction         | 0.426       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -2.88       |
|    gen/train/explained_variance    | 0.358       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.249       |
|    gen/train/n_updates             | 220         |
|    gen/train/policy_gradient_loss  | -0.0621     |
|    gen/train/value_loss            | 0.788  

round:  94%|█████████▍| 45/48 [30:27<02:05, 41.88s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -7.78       |
|    gen/rollout/ep_rew_wrapped_mean | 2.1         |
|    gen/time/fps                    | 60          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 33          |
|    gen/time/total_timesteps        | 94208       |
|    gen/train/approx_kl             | 0.036248285 |
|    gen/train/clip_fraction         | 0.177       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -2.48       |
|    gen/train/explained_variance    | 0.442       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0815      |
|    gen/train/n_updates             | 225         |
|    gen/train/policy_gradient_loss  | -0.0351     |
|    gen/train/value_loss            | 0.464  

round:  96%|█████████▌| 46/48 [31:10<01:24, 42.27s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.96        |
|    gen/rollout/ep_rew_mean         | -7.07       |
|    gen/rollout/ep_rew_wrapped_mean | 1.83        |
|    gen/time/fps                    | 62          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 32          |
|    gen/time/total_timesteps        | 96256       |
|    gen/train/approx_kl             | 0.042109817 |
|    gen/train/clip_fraction         | 0.182       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -2.89       |
|    gen/train/explained_variance    | 0.133       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.626       |
|    gen/train/n_updates             | 230         |
|    gen/train/policy_gradient_loss  | -0.0322     |
|    gen/train/value_loss            | 0.977  

round:  98%|█████████▊| 47/48 [31:52<00:42, 42.17s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6           |
|    gen/rollout/ep_rew_mean         | -7.61       |
|    gen/rollout/ep_rew_wrapped_mean | 1.86        |
|    gen/time/fps                    | 65          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 31          |
|    gen/time/total_timesteps        | 98304       |
|    gen/train/approx_kl             | 0.096188486 |
|    gen/train/clip_fraction         | 0.333       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -3.5        |
|    gen/train/explained_variance    | 0.165       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.62        |
|    gen/train/n_updates             | 235         |
|    gen/train/policy_gradient_loss  | -0.0474     |
|    gen/train/value_loss            | 1.2    

round: 100%|██████████| 48/48 [32:32<00:00, 40.68s/it]


In [26]:
# Report the mean and spread of the learner's rewards on each side of training.
# The spread is np.std with its default arguments.
before_mean = np.mean(learner_rewards_before_training)
before_std = np.std(learner_rewards_before_training)
print("Rewards before training:", before_mean, "+/-", before_std)

after_mean = np.mean(learner_rewards_after_training)
after_std = np.std(learner_rewards_after_training)
print("Rewards after training:", after_mean, "+/-", after_std)

Rewards before training: -8.5 +/- 2.2561028345356955
Rewards after training: -6.0 +/- 0.0
