[download this notebook here](https://github.com/HumanCompatibleAI/imitation/blob/master/docs/tutorials/5_train_preference_comparisons.ipynb)
# Comparing pairwise comparison with pairwise group comparison

We use synthetic feedback derived from the true reward function of the Reacher environment to evaluate classical pairwise comparison against pairwise group comparison.

Before we can run the preference comparisons algorithm, we need to set up a number of its internal components:

In [12]:
import random
from imitation.algorithms import preference_comparisons
from imitation.rewards.reward_nets import BasicRewardNet, RewardEnsemble
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env
from imitation.policies.base import FeedForward32Policy, NormalizeFeaturesExtractor
from imitation.regularization.regularizers import LpRegularizer
from imitation.regularization.updaters import IntervalParamScaler
import gymnasium as gym
from stable_baselines3 import PPO
import numpy as np
from imitation.util import logger
import stable_baselines3.common.logger as sb_logger


# Module-level NumPy random generator seeded with 0. `intantiate_and_train`
# reads this global, so successive calls draw from the same random stream.
rng = np.random.default_rng(0)
def intantiate_and_train(pairwise, log_dir="/logs"):
    """Build a preference-comparisons pipeline on Reacher-v4 and train it.

    A fresh vectorised environment, reward-network ensemble, reward trainer,
    fragmenter, preference gatherer and PPO agent are created on every call.
    Randomness comes from the module-level ``rng``, so consecutive calls
    continue the same random stream rather than restarting from the seed.

    Args:
        pairwise: If truthy, fragments are chosen by a ``RandomFragmenter``
            wrapped in an ``ActiveSelectionFragmenter`` and labelled by a
            ``SyntheticGatherer`` (classical pairwise baseline). Otherwise an
            ``AbsoluteUncertaintyFragmenter`` is paired with a
            ``SyntheticGathererForGroupComparisons``.
        log_dir: Folder handed to the imitation logger. Defaults to
            ``"/logs"``, the location that used to be hard-coded; pass a
            different folder per run to keep the logs of two runs apart.

    Returns:
        Whatever ``PreferenceComparisons.train`` returns for a run of
        100_000 timesteps and 1000 comparisons.
    """
    venv = make_vec_env("Reacher-v4", rng=rng, render_mode="rgb_array", n_envs=8)
    try:
        # Ensemble of five reward networks; the ensemble is what the
        # fragmenters and the EnsembleTrainer below operate on.
        reward_net_members = [
            BasicRewardNet(
                venv.observation_space,
                venv.action_space,
                normalize_input_layer=RunningNorm,
            )
            for _ in range(5)
        ]
        reward_net = RewardEnsemble(
            venv.observation_space,
            venv.action_space,
            reward_net_members,
        )
        preference_model = preference_comparisons.PreferenceModel(reward_net)

        # Lp (p=2) regularisation whose lambda is rescaled by an
        # IntervalParamScaler, using a 10% validation split.
        scaling_factor = 0.1
        tolerable_interval = (0.9, 1.1)
        lambda_updater = IntervalParamScaler(scaling_factor, tolerable_interval)
        regularizer_factory = LpRegularizer.create(
            initial_lambda=0.1,
            lambda_updater=lambda_updater,
            p=2,
            val_split=0.1,
        )

        reward_trainer = preference_comparisons.EnsembleTrainer(
            preference_model,
            loss=preference_comparisons.CrossEntropyRewardLoss(),
            rng=rng,
            epochs=5,
            batch_size=4,
            minibatch_size=2,
            regularizer_factory=regularizer_factory,
        )

        if pairwise:
            base_fragmenter = preference_comparisons.RandomFragmenter(
                warning_threshold=0,
                rng=rng,
            )
            fragmenter = preference_comparisons.ActiveSelectionFragmenter(
                preference_model,
                base_fragmenter,
                2.0,  # presumably the fragment oversampling factor - confirm
            )
            gatherer = preference_comparisons.SyntheticGatherer(rng=rng)
        else:
            fragmenter = preference_comparisons.AbsoluteUncertaintyFragmenter(
                preference_model,
                2.0,  # NOTE(review): meaning of this positional arg not visible here
                rng=rng,
            )
            gatherer = preference_comparisons.SyntheticGathererForGroupComparisons(
                rng=rng,
            )

        # The PPO and PreferenceComparisons hyperparameters below were
        # approximately fine-tuned to reach a reasonable level of performance.
        agent = PPO(
            policy=FeedForward32Policy,
            policy_kwargs=dict(
                features_extractor_class=NormalizeFeaturesExtractor,
                features_extractor_kwargs=dict(normalize_class=RunningNorm),
            ),
            env=venv,
            seed=0,
            # Split a 2048-step rollout evenly across the parallel envs.
            n_steps=2048 // venv.num_envs,
            batch_size=64,
            ent_coef=0.01,
            learning_rate=2e-3,
            clip_range=0.1,
            gae_lambda=0.95,
            gamma=0.97,
            n_epochs=10,
            tensorboard_log="tensorboard_logs/",
        )

        trajectory_generator = preference_comparisons.AgentTrainer(
            algorithm=agent,
            reward_fn=reward_net,
            venv=venv,
            rng=rng,
            exploration_frac=0.05,
        )

        # `sb_logger.Logger` expects a list of KVWriter objects as
        # `output_formats`; the previous comma-separated string produced a
        # default logger that never wrote anything. `logger.configure` builds
        # the writers from format names and returns a HierarchicalLogger.
        custom_logger = logger.configure(
            folder=log_dir,
            format_strs=["stdout", "log", "csv", "tensorboard"],
        )

        pref_comparisons = preference_comparisons.PreferenceComparisons(
            trajectory_generator,
            reward_net,
            num_iterations=5,  # Set to 60 for better performance
            fragmenter=fragmenter,
            preference_gatherer=gatherer,
            reward_trainer=reward_trainer,
            fragment_length=50,
            transition_oversampling=1,
            initial_comparison_frac=0.1,
            allow_variable_horizon=False,
            initial_epoch_multiplier=4,
            query_schedule="hyperbolic",
            custom_logger=custom_logger,
        )

        return pref_comparisons.train(
            total_timesteps=100_000,
            total_comparisons=1000,
        )
    finally:
        # Release the 8 simulator instances even if training raises; the
        # notebook calls this function more than once.
        venv.close()

Classical pairwise comparison (baseline):

In [None]:

# Baseline: train with classical pairwise preference comparisons and show
# the value returned by the training run.
pairwise_comparison_result = intantiate_and_train(pairwise=True)
print(pairwise_comparison_result)

Pairwise Group comparison:

In [14]:
# Train with pairwise group comparisons (the non-pairwise branch) and show
# the value returned by the training run.
pairwise_group_comparison_result = intantiate_and_train(pairwise=False)
print(pairwise_group_comparison_result)

Query schedule: [50, 127, 102, 85, 73, 63]
Collecting 100 fragments (5000 transitions)
Requested 4750 transitions but only 0 in buffer. Sampling 4750 additional transitions.
Sampling 250 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 50 comparisons


Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 10000 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 50       |
|    agent/rollout/ep_rew_mean         | -60.2    |
|    agent/rollout/ep_rew_wrapped_mean | -26.4    |
|    agent/time/fps                    | 1081     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 1        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -60.8        |
|    agent/rollout/ep_rew_wrapped_mean | -17.2        |
|    agent/time/fps                    | 905          |
|    agent/time/iterations             | 2            |
|    agent/time/time_elapsed           | 4           

Collecting 254 fragments (12700 transitions)
Requested 12065 transitions but only 10200 in buffer. Sampling 1865 additional transitions.
Sampling 635 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 180 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -60.7        |
|    agent/rollout/ep_rew_wrapped_mean | -4.38        |
|    agent/time/fps                    | 1103         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0073185386 |
|    agent/train/clip_fraction         | 0.284        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.83        |
|    agent/train/explained_variance    | 0.694        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0284      |
|    agent/train/n_updates             | 50           |
|    agent/tr

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -60         |
|    agent/rollout/ep_rew_wrapped_mean | -3.77       |
|    agent/time/fps                    | 1102        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.008780796 |
|    agent/train/clip_fraction         | 0.305       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.71       |
|    agent/train/explained_variance    | 0.793       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0203     |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -58.8       |
|    agent/rollout/ep_rew_wrapped_mean | -3.75       |
|    agent/time/fps                    | 1098        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.009421687 |
|    agent/train/clip_fraction         | 0.33        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.52       |
|    agent/train/explained_variance    | 0.833       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0213     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -59.6       |
|    agent/rollout/ep_rew_wrapped_mean | -3.54       |
|    agent/time/fps                    | 1086        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.009126718 |
|    agent/train/clip_fraction         | 0.329       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.32       |
|    agent/train/explained_variance    | 0.936       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0375     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -59         |
|    agent/rollout/ep_rew_wrapped_mean | -3.39       |
|    agent/time/fps                    | 990         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.013037488 |
|    agent/train/clip_fraction         | 0.392       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.11       |
|    agent/train/explained_variance    | 0.962       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0275     |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 10000 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 50       |
|    agent/rollout/ep_rew_mean         | -60.2    |
|    agent/rollout/ep_rew_wrapped_mean | -28.7    |
|    agent/time/fps                    | 1062     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 1        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -61         |
|    agent/rollout/ep_rew_wrapped_mean | -18.4       |
|    agent/time/fps                    | 812         |
|    agent/time/iterations             | 2           |
|    agent/time/time_elapsed           | 5           |
|    

Collecting 254 fragments (12700 transitions)
Requested 12065 transitions but only 10200 in buffer. Sampling 1865 additional transitions.
Sampling 635 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 180 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -61.6        |
|    agent/rollout/ep_rew_wrapped_mean | -5.97        |
|    agent/time/fps                    | 902          |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 2            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0060605095 |
|    agent/train/clip_fraction         | 0.244        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.84        |
|    agent/train/explained_variance    | 0.259        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.03        |
|    agent/train/n_updates             | 50           |
|    agent/tr

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -61.8       |
|    agent/rollout/ep_rew_wrapped_mean | -6.06       |
|    agent/time/fps                    | 682         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 3           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.008941688 |
|    agent/train/clip_fraction         | 0.294       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.78       |
|    agent/train/explained_variance    | 0.76        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0301     |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 50         |
|    agent/rollout/ep_rew_mean         | -61.3      |
|    agent/rollout/ep_rew_wrapped_mean | -6.11      |
|    agent/time/fps                    | 993        |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 2          |
|    agent/time/total_timesteps        | 32768      |
|    agent/train/approx_kl             | 0.00873569 |
|    agent/train/clip_fraction         | 0.32       |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -2.69      |
|    agent/train/explained_variance    | 0.901      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0445    |
|    agent/train/n_updates             | 150        |
|    agent/train/policy_gradient_loss  | -0.012

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -58.2       |
|    agent/rollout/ep_rew_wrapped_mean | -6.28       |
|    agent/time/fps                    | 993         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.012673536 |
|    agent/train/clip_fraction         | 0.363       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.46       |
|    agent/train/explained_variance    | 0.898       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0577     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -57.1       |
|    agent/rollout/ep_rew_wrapped_mean | -6.48       |
|    agent/time/fps                    | 935         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.012068499 |
|    agent/train/clip_fraction         | 0.364       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.26       |
|    agent/train/explained_variance    | 0.928       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0361     |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 10000 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 50       |
|    agent/rollout/ep_rew_mean         | -60.2    |
|    agent/rollout/ep_rew_wrapped_mean | -28.7    |
|    agent/time/fps                    | 1066     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 1        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -60.7        |
|    agent/rollout/ep_rew_wrapped_mean | -18.2        |
|    agent/time/fps                    | 867          |
|    agent/time/iterations             | 2            |
|    agent/time/time_elapsed           | 4           

Collecting 254 fragments (12700 transitions)
Requested 12065 transitions but only 10200 in buffer. Sampling 1865 additional transitions.
Sampling 635 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 180 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -60.9        |
|    agent/rollout/ep_rew_wrapped_mean | -5.37        |
|    agent/time/fps                    | 1087         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0057599708 |
|    agent/train/clip_fraction         | 0.277        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.83        |
|    agent/train/explained_variance    | 0.574        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0313      |
|    agent/train/n_updates             | 50           |
|    agent/tr

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -61.7       |
|    agent/rollout/ep_rew_wrapped_mean | -5.97       |
|    agent/time/fps                    | 962         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.007324839 |
|    agent/train/clip_fraction         | 0.286       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.79       |
|    agent/train/explained_variance    | 0.83        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0342     |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -61.7       |
|    agent/rollout/ep_rew_wrapped_mean | -6.2        |
|    agent/time/fps                    | 1069        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.009024402 |
|    agent/train/clip_fraction         | 0.307       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.71       |
|    agent/train/explained_variance    | 0.873       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0274     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -60.6       |
|    agent/rollout/ep_rew_wrapped_mean | -6.21       |
|    agent/time/fps                    | 1064        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.010549548 |
|    agent/train/clip_fraction         | 0.362       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.53       |
|    agent/train/explained_variance    | 0.826       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0438     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -58.1       |
|    agent/rollout/ep_rew_wrapped_mean | -6.31       |
|    agent/time/fps                    | 959         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.011453337 |
|    agent/train/clip_fraction         | 0.344       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.33       |
|    agent/train/explained_variance    | 0.909       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0105     |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 10000 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 50       |
|    agent/rollout/ep_rew_mean         | -60.2    |
|    agent/rollout/ep_rew_wrapped_mean | -28.7    |
|    agent/time/fps                    | 1036     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 1        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -60.9        |
|    agent/rollout/ep_rew_wrapped_mean | -18.4        |
|    agent/time/fps                    | 790          |
|    agent/time/iterations             | 2            |
|    agent/time/time_elapsed           | 5           

Collecting 254 fragments (12700 transitions)
Requested 12065 transitions but only 10200 in buffer. Sampling 1865 additional transitions.
Sampling 635 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 180 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -61.6       |
|    agent/rollout/ep_rew_wrapped_mean | -5.74       |
|    agent/time/fps                    | 1061        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 12288       |
|    agent/train/approx_kl             | 0.008199505 |
|    agent/train/clip_fraction         | 0.268       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.82       |
|    agent/train/explained_variance    | 0.447       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0387     |
|    agent/train/n_updates             | 50          |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -61.3       |
|    agent/rollout/ep_rew_wrapped_mean | -7.27       |
|    agent/time/fps                    | 1032        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.009716862 |
|    agent/train/clip_fraction         | 0.313       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.76       |
|    agent/train/explained_variance    | 0.814       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0418     |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -58.7       |
|    agent/rollout/ep_rew_wrapped_mean | -7.84       |
|    agent/time/fps                    | 951         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.010233745 |
|    agent/train/clip_fraction         | 0.316       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.58       |
|    agent/train/explained_variance    | 0.941       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0417     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 50         |
|    agent/rollout/ep_rew_mean         | -53.6      |
|    agent/rollout/ep_rew_wrapped_mean | -8.03      |
|    agent/time/fps                    | 949        |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 2          |
|    agent/time/total_timesteps        | 43008      |
|    agent/train/approx_kl             | 0.01363267 |
|    agent/train/clip_fraction         | 0.359      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -2.31      |
|    agent/train/explained_variance    | 0.936      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0291    |
|    agent/train/n_updates             | 200        |
|    agent/train/policy_gradient_loss  | -0.012

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -49.4       |
|    agent/rollout/ep_rew_wrapped_mean | -7.95       |
|    agent/time/fps                    | 1025        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.013616441 |
|    agent/train/clip_fraction         | 0.401       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2          |
|    agent/train/explained_variance    | 0.965       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0238     |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 10000 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 50       |
|    agent/rollout/ep_rew_mean         | -60.2    |
|    agent/rollout/ep_rew_wrapped_mean | -28.7    |
|    agent/time/fps                    | 695      |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 2        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -60.8       |
|    agent/rollout/ep_rew_wrapped_mean | -18.2       |
|    agent/time/fps                    | 581         |
|    agent/time/iterations             | 2           |
|    agent/time/time_elapsed           | 7           |
|    

Collecting 254 fragments (12700 transitions)
Requested 12065 transitions but only 10200 in buffer. Sampling 1865 additional transitions.
Sampling 635 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 180 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -60.9       |
|    agent/rollout/ep_rew_wrapped_mean | -5.52       |
|    agent/time/fps                    | 846         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 12288       |
|    agent/train/approx_kl             | 0.005668029 |
|    agent/train/clip_fraction         | 0.252       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.83       |
|    agent/train/explained_variance    | 0.537       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0302     |
|    agent/train/n_updates             | 50          |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -60.2       |
|    agent/rollout/ep_rew_wrapped_mean | -6.2        |
|    agent/time/fps                    | 900         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.008439384 |
|    agent/train/clip_fraction         | 0.317       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.74       |
|    agent/train/explained_variance    | 0.788       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0108     |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 50         |
|    agent/rollout/ep_rew_mean         | -59.5      |
|    agent/rollout/ep_rew_wrapped_mean | -6         |
|    agent/time/fps                    | 914        |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 2          |
|    agent/time/total_timesteps        | 32768      |
|    agent/train/approx_kl             | 0.00960023 |
|    agent/train/clip_fraction         | 0.336      |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -2.58      |
|    agent/train/explained_variance    | 0.815      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0502    |
|    agent/train/n_updates             | 150        |
|    agent/train/policy_gradient_loss  | -0.015

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -58.8       |
|    agent/rollout/ep_rew_wrapped_mean | -6.75       |
|    agent/time/fps                    | 957         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.010389901 |
|    agent/train/clip_fraction         | 0.349       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.4        |
|    agent/train/explained_variance    | 0.962       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0479     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -56.6       |
|    agent/rollout/ep_rew_wrapped_mean | -5.8        |
|    agent/time/fps                    | 1044        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.008376967 |
|    agent/train/clip_fraction         | 0.303       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.19       |
|    agent/train/explained_variance    | 0.952       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00727    |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 10000 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 50       |
|    agent/rollout/ep_rew_mean         | -60.2    |
|    agent/rollout/ep_rew_wrapped_mean | -28.7    |
|    agent/time/fps                    | 998      |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 2        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -60.9        |
|    agent/rollout/ep_rew_wrapped_mean | -17.5        |
|    agent/time/fps                    | 823          |
|    agent/time/iterations             | 2            |
|    agent/time/time_elapsed           | 4           

Collecting 254 fragments (12700 transitions)
Requested 12065 transitions but only 10200 in buffer. Sampling 1865 additional transitions.
Sampling 635 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 180 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -62.9        |
|    agent/rollout/ep_rew_wrapped_mean | -2.09        |
|    agent/time/fps                    | 1111         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0064050416 |
|    agent/train/clip_fraction         | 0.307        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.86        |
|    agent/train/explained_variance    | 0.736        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0453      |
|    agent/train/n_updates             | 50           |
|    agent/tr

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -63.6       |
|    agent/rollout/ep_rew_wrapped_mean | -6.53       |
|    agent/time/fps                    | 1055        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.007966993 |
|    agent/train/clip_fraction         | 0.279       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.83       |
|    agent/train/explained_variance    | 0.57        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00956    |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -62.4        |
|    agent/rollout/ep_rew_wrapped_mean | -6.24        |
|    agent/time/fps                    | 1092         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 32768        |
|    agent/train/approx_kl             | 0.0072165076 |
|    agent/train/clip_fraction         | 0.294        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.76        |
|    agent/train/explained_variance    | 0.822        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0283      |
|    agent/train/n_updates             | 150          |
|    agent/tr

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -61.4       |
|    agent/rollout/ep_rew_wrapped_mean | -6.65       |
|    agent/time/fps                    | 1102        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.009471059 |
|    agent/train/clip_fraction         | 0.34        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.68       |
|    agent/train/explained_variance    | 0.885       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0479     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -60.2       |
|    agent/rollout/ep_rew_wrapped_mean | -6.36       |
|    agent/time/fps                    | 1027        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.013639806 |
|    agent/train/clip_fraction         | 0.376       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.46       |
|    agent/train/explained_variance    | 0.945       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0361     |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 10000 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 50       |
|    agent/rollout/ep_rew_mean         | -60.2    |
|    agent/rollout/ep_rew_wrapped_mean | -28.7    |
|    agent/time/fps                    | 1075     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 1        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -61          |
|    agent/rollout/ep_rew_wrapped_mean | -17.6        |
|    agent/time/fps                    | 863          |
|    agent/time/iterations             | 2            |
|    agent/time/time_elapsed           | 4           

Collecting 254 fragments (12700 transitions)
Requested 12065 transitions but only 10200 in buffer. Sampling 1865 additional transitions.
Sampling 635 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 180 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -62.8       |
|    agent/rollout/ep_rew_wrapped_mean | -2.97       |
|    agent/time/fps                    | 1103        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 12288       |
|    agent/train/approx_kl             | 0.006442847 |
|    agent/train/clip_fraction         | 0.252       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.87       |
|    agent/train/explained_variance    | 0.855       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0407     |
|    agent/train/n_updates             | 50          |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -62.9        |
|    agent/rollout/ep_rew_wrapped_mean | -5.38        |
|    agent/time/fps                    | 1107         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 22528        |
|    agent/train/approx_kl             | 0.0070858533 |
|    agent/train/clip_fraction         | 0.301        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.87        |
|    agent/train/explained_variance    | 0.7          |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0443      |
|    agent/train/n_updates             | 100          |
|    agent/tr

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -63.3       |
|    agent/rollout/ep_rew_wrapped_mean | -5.33       |
|    agent/time/fps                    | 1118        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.007856287 |
|    agent/train/clip_fraction         | 0.31        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.77       |
|    agent/train/explained_variance    | 0.895       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0245     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -61.2       |
|    agent/rollout/ep_rew_wrapped_mean | -5.39       |
|    agent/time/fps                    | 1074        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.009948848 |
|    agent/train/clip_fraction         | 0.329       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.58       |
|    agent/train/explained_variance    | 0.933       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0444     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -60.8       |
|    agent/rollout/ep_rew_wrapped_mean | -4.88       |
|    agent/time/fps                    | 1088        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.010434477 |
|    agent/train/clip_fraction         | 0.372       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.36       |
|    agent/train/explained_variance    | 0.94        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0413     |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 10000 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 50       |
|    agent/rollout/ep_rew_mean         | -60.2    |
|    agent/rollout/ep_rew_wrapped_mean | -28.7    |
|    agent/time/fps                    | 1108     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 1        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -60.9        |
|    agent/rollout/ep_rew_wrapped_mean | -18.7        |
|    agent/time/fps                    | 869          |
|    agent/time/iterations             | 2            |
|    agent/time/time_elapsed           | 4           

Collecting 254 fragments (12700 transitions)
Requested 12065 transitions but only 10200 in buffer. Sampling 1865 additional transitions.
Sampling 635 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 180 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -60.8       |
|    agent/rollout/ep_rew_wrapped_mean | -6.35       |
|    agent/time/fps                    | 1108        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 12288       |
|    agent/train/approx_kl             | 0.006401297 |
|    agent/train/clip_fraction         | 0.247       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.84       |
|    agent/train/explained_variance    | 0.718       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0252     |
|    agent/train/n_updates             | 50          |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -60.8       |
|    agent/rollout/ep_rew_wrapped_mean | -6.32       |
|    agent/time/fps                    | 1117        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.008305134 |
|    agent/train/clip_fraction         | 0.289       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.76       |
|    agent/train/explained_variance    | 0.88        |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0372     |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-----------------------------------------------------
| raw/                                 |            |
|    agent/rollout/ep_len_mean         | 50         |
|    agent/rollout/ep_rew_mean         | -59.1      |
|    agent/rollout/ep_rew_wrapped_mean | -6.1       |
|    agent/time/fps                    | 912        |
|    agent/time/iterations             | 1          |
|    agent/time/time_elapsed           | 2          |
|    agent/time/total_timesteps        | 32768      |
|    agent/train/approx_kl             | 0.01132974 |
|    agent/train/clip_fraction         | 0.33       |
|    agent/train/clip_range            | 0.1        |
|    agent/train/entropy_loss          | -2.6       |
|    agent/train/explained_variance    | 0.948      |
|    agent/train/learning_rate         | 0.002      |
|    agent/train/loss                  | -0.0259    |
|    agent/train/n_updates             | 150        |
|    agent/train/policy_gradient_loss  | -0.013

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -56.8       |
|    agent/rollout/ep_rew_wrapped_mean | -6.43       |
|    agent/time/fps                    | 977         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.010507662 |
|    agent/train/clip_fraction         | 0.334       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.41       |
|    agent/train/explained_variance    | 0.941       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.022      |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -52.7       |
|    agent/rollout/ep_rew_wrapped_mean | -6.59       |
|    agent/time/fps                    | 1062        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.011558592 |
|    agent/train/clip_fraction         | 0.366       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.19       |
|    agent/train/explained_variance    | 0.856       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0298     |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 10000 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 50       |
|    agent/rollout/ep_rew_mean         | -60.2    |
|    agent/rollout/ep_rew_wrapped_mean | -28.7    |
|    agent/time/fps                    | 1054     |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 1        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -60.9        |
|    agent/rollout/ep_rew_wrapped_mean | -18.1        |
|    agent/time/fps                    | 875          |
|    agent/time/iterations             | 2            |
|    agent/time/time_elapsed           | 4           

Collecting 254 fragments (12700 transitions)
Requested 12065 transitions but only 10200 in buffer. Sampling 1865 additional transitions.
Sampling 635 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 180 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -61.9        |
|    agent/rollout/ep_rew_wrapped_mean | -5.05        |
|    agent/time/fps                    | 1047         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0063374992 |
|    agent/train/clip_fraction         | 0.256        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.85        |
|    agent/train/explained_variance    | 0.761        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0251      |
|    agent/train/n_updates             | 50           |
|    agent/tr

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -63         |
|    agent/rollout/ep_rew_wrapped_mean | -6.61       |
|    agent/time/fps                    | 1013        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.007153519 |
|    agent/train/clip_fraction         | 0.275       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.78       |
|    agent/train/explained_variance    | 0.809       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.00651    |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -61.9       |
|    agent/rollout/ep_rew_wrapped_mean | -7.05       |
|    agent/time/fps                    | 1054        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.008388509 |
|    agent/train/clip_fraction         | 0.292       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.69       |
|    agent/train/explained_variance    | 0.936       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0384     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -58.5       |
|    agent/rollout/ep_rew_wrapped_mean | -7.19       |
|    agent/time/fps                    | 1078        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.009874223 |
|    agent/train/clip_fraction         | 0.306       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.57       |
|    agent/train/explained_variance    | 0.961       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0492     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -58         |
|    agent/rollout/ep_rew_wrapped_mean | -6.74       |
|    agent/time/fps                    | 1037        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.012332324 |
|    agent/train/clip_fraction         | 0.35        |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.37       |
|    agent/train/explained_variance    | 0.974       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0273     |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/12 [00:00<?, ?it/s]

Training agent for 10000 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_len_mean         | 50       |
|    agent/rollout/ep_rew_mean         | -60.2    |
|    agent/rollout/ep_rew_wrapped_mean | -28.7    |
|    agent/time/fps                    | 999      |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 2        |
|    agent/time/total_timesteps        | 2048     |
---------------------------------------------------
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -60.9        |
|    agent/rollout/ep_rew_wrapped_mean | -18.2        |
|    agent/time/fps                    | 847          |
|    agent/time/iterations             | 2            |
|    agent/time/time_elapsed           | 4           

Collecting 254 fragments (12700 transitions)
Requested 12065 transitions but only 10200 in buffer. Sampling 1865 additional transitions.
Sampling 635 exploratory transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 180 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
-------------------------------------------------------
| raw/                                 |              |
|    agent/rollout/ep_len_mean         | 50           |
|    agent/rollout/ep_rew_mean         | -62.4        |
|    agent/rollout/ep_rew_wrapped_mean | -5.3         |
|    agent/time/fps                    | 1025         |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 1            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0076466994 |
|    agent/train/clip_fraction         | 0.276        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.89        |
|    agent/train/explained_variance    | 0.656        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0341      |
|    agent/train/n_updates             | 50           |
|    agent/tr

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -62.4       |
|    agent/rollout/ep_rew_wrapped_mean | -6.09       |
|    agent/time/fps                    | 982         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 22528       |
|    agent/train/approx_kl             | 0.008117765 |
|    agent/train/clip_fraction         | 0.278       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.82       |
|    agent/train/explained_variance    | 0.859       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0329     |
|    agent/train/n_updates             | 100         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -60.8       |
|    agent/rollout/ep_rew_wrapped_mean | -6.76       |
|    agent/time/fps                    | 1045        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 32768       |
|    agent/train/approx_kl             | 0.009796307 |
|    agent/train/clip_fraction         | 0.323       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.72       |
|    agent/train/explained_variance    | 0.955       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0413     |
|    agent/train/n_updates             | 150         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -58.3       |
|    agent/rollout/ep_rew_wrapped_mean | -6.88       |
|    agent/time/fps                    | 1024        |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 1           |
|    agent/time/total_timesteps        | 43008       |
|    agent/train/approx_kl             | 0.010252158 |
|    agent/train/clip_fraction         | 0.343       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.6        |
|    agent/train/explained_variance    | 0.906       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0246     |
|    agent/train/n_updates             | 200         |
|    agent/train/policy_gradie

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 10000 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_len_mean         | 50          |
|    agent/rollout/ep_rew_mean         | -55.8       |
|    agent/rollout/ep_rew_wrapped_mean | -7.16       |
|    agent/time/fps                    | 974         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 53248       |
|    agent/train/approx_kl             | 0.011565978 |
|    agent/train/clip_fraction         | 0.365       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.37       |
|    agent/train/explained_variance    | 0.867       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | -0.0331     |
|    agent/train/n_updates             | 250         |
|    agent/train/policy_gradie

Active selection fragmenter based on random fragmenter:

In [None]:
# Run the preference-comparisons training loop for the third configuration.
# NOTE(review): `pref_comparisons_3` is built in an earlier cell; the budgets
# below presumably cap agent environment steps and gathered comparisons.
pref_comparisons_3.train(total_comparisons=500, total_timesteps=50_000)

After training the reward network with the preference comparisons algorithm, we can wrap our environment with the learned reward.

In [5]:
from imitation.rewards.reward_wrapper import RewardVecEnvWrapper

# Wrap the vectorized environment so that the reward it reports comes from the
# trained reward network (via its `predict_processed` method).
learned_reward_venv = RewardVecEnvWrapper(
    venv=venv,
    reward_fn=reward_net.predict_processed,
)

Next, we train an agent that sees only the shaped, learned reward.

In [6]:
# PPO agent trained on the reward-wrapped environment, so the only reward
# signal it receives during training is the learned one.
learner = PPO(
    policy=FeedForward32Policy,
    env=learned_reward_venv,
    seed=0,
    # Feature extractor configured to normalize its inputs with RunningNorm.
    policy_kwargs={
        "features_extractor_class": NormalizeFeaturesExtractor,
        "features_extractor_kwargs": {"normalize_class": RunningNorm},
    },
    # Rollout size: 2048 steps per update in total, divided across the envs.
    n_steps=2048 // learned_reward_venv.num_envs,
    batch_size=64,
    n_epochs=10,
    # Optimization hyperparameters.
    learning_rate=2e-3,
    clip_range=0.1,
    ent_coef=0.01,
    gamma=0.97,
    gae_lambda=0.95,
)
# 100_000 timesteps: the budget noted as sufficient to train a proficient expert.
learner.learn(total_timesteps=100_000)

<stable_baselines3.ppo.ppo.PPO at 0x7a1b8839c4c0>

Then we can evaluate it using the original reward.

In [7]:
from stable_baselines3.common.evaluation import evaluate_policy

# Score the learner's policy on `venv` (the environment that still reports the
# original reward) and summarize it as mean +/- standard error of the mean.
n_eval_episodes = 100
reward_mean, reward_std = evaluate_policy(
    model=learner.policy,
    env=venv,
    n_eval_episodes=n_eval_episodes,
)
# Standard error: per-episode standard deviation divided by sqrt(sample size).
reward_stderr = reward_std / np.sqrt(n_eval_episodes)
print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

Reward: -11 +/- 0


In [8]:
# Persist the trained agent so it can be reloaded without retraining.
learner.save("imitation_ppo")

In [9]:
from gymnasium.wrappers import RecordVideo

# Roll out the trained learner in a single (non-vectorized) Reacher environment
# and record every episode into ./evaluation_videos with the "reacher" prefix.
env = gym.make("Reacher-v4", render_mode="rgb_array")
env = RecordVideo(
    env,
    "./evaluation_videos",
    name_prefix="reacher",
    episode_trigger=lambda episode_id: True,  # record every episode
)

try:
    obs, info = env.reset()
    for _ in range(1000):
        action, _states = learner.predict(obs, deterministic=True)
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # The episode is over when EITHER flag is set; the previous code dropped
        # `terminated` and only reacted to truncation.
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        if done:
            obs, info = env.reset()
finally:
    # Close even if the rollout raises, so the recorder can finalize any video
    # that is still in progress.
    env.close()

  logger.warn(


Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-0.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-0.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-0.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-1.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-1.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-1.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-2.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-2.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-2.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-3.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-3.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-3.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-4.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-4.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-4.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-5.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-5.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-5.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-6.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-6.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-6.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-7.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-7.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-7.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-8.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-8.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-8.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-9.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-9.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-9.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-10.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-10.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-10.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-11.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-11.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-11.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-12.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-12.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-12.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-13.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-13.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-13.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-14.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-14.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-14.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-15.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-15.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-15.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-16.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-16.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-16.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-17.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-17.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-17.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-18.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-18.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-18.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-19.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-19.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-19.mp4
Moviepy - Building video /imitation/docs/tutorials/videos/training-episode-20.mp4.
Moviepy - Writing video /imitation/docs/tutorials/videos/training-episode-20.mp4



                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /imitation/docs/tutorials/videos/training-episode-20.mp4
