In [2]:
%load_ext autoreload
%autoreload 2

In [64]:
import numpy as np
import torch as th

from imitation.algorithms import preference_comparisons

from imitation_modules import DeterministicMDPTrajGenerator, NonImageCnnRewardNet, SyntheticValueGatherer
from stealing_gridworld import StealingGridworld

In [73]:
GRID_SIZE = 3
HORIZON = 20

rng = np.random.default_rng(0)

env = StealingGridworld(
    grid_size=GRID_SIZE,
    max_steps=HORIZON,
    reward_for_stealing=-200,
)

reward_net = NonImageCnnRewardNet(
    env.observation_space,
    env.action_space,
    hid_channels=(32,32),
    kernel_size=3,
)

fragmenter = preference_comparisons.RandomFragmenter(warning_threshold=0, rng=rng)
gatherer = SyntheticValueGatherer(
    env,
    # temperature=0,
    rng=rng,
    value_coeff=0.01,
)
preference_model = preference_comparisons.PreferenceModel(reward_net)
reward_trainer = preference_comparisons.BasicRewardTrainer(
    preference_model=preference_model,
    loss=preference_comparisons.CrossEntropyRewardLoss(),
    epochs=3,
    rng=rng,
)
trajectory_generator = DeterministicMDPTrajGenerator(
    reward_fn=reward_net,
    env=env,
    rng=None,
    epsilon=0.1,
    vi_gamma=0.5,
)
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    num_iterations=20,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    fragment_length=1,
    # initial_comparison_frac=0.01,
    initial_epoch_multiplier=10,
)

Enumerating states: 100%|██████████| 9/9 [00:00<00:00, 112.49it/s]
Constructing transition matrix: 100%|██████████| 3492/3492 [00:02<00:00, 1524.75it/s]
Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1157.21it/s]


In [74]:
result = pref_comparisons.train(
    total_timesteps=50000,
    total_comparisons=10000,
)
rlhf_policy = trajectory_generator.policy

Query schedule: [1000, 647, 614, 585, 558, 534, 512, 491, 472, 455, 439, 424, 409, 396, 384, 372, 361, 351, 341, 332, 323]
Collecting 2000 fragments (2000 transitions)
Creating fragment pairs
Gathering preferences
Dataset now contains 1000 comparisons


Training reward model: 100%|██████████| 30/30 [01:45<00:00,  3.51s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1198.68it/s]

------------------------------------------------------
| mean/                                   |          |
|    preferences/entropy                  | 0.692    |
|    reward/epoch-0/train/accuracy        | 0.514    |
|    reward/epoch-0/train/gt_reward_loss  | 0.692    |
|    reward/epoch-0/train/loss            | 0.69     |
|    reward/epoch-1/train/accuracy        | 0.56     |
|    reward/epoch-1/train/gt_reward_loss  | 0.692    |
|    reward/epoch-1/train/loss            | 0.679    |
|    reward/epoch-10/train/accuracy       | 0.641    |
|    reward/epoch-10/train/gt_reward_loss | 0.692    |
|    reward/epoch-10/train/loss           | 0.646    |
|    reward/epoch-11/train/accuracy       | 0.649    |
|    reward/epoch-11/train/gt_reward_loss | 0.692    |
|    reward/epoch-11/train/loss           | 0.639    |
|    reward/epoch-12/train/accuracy       | 0.661    |
|    reward/epoch-12/train/gt_reward_loss | 0.692    |
|    reward/epoch-12/train/loss           | 0.64     |
|    rewar




Creating fragment pairs
Gathering preferences
Dataset now contains 1647 comparisons


Training reward model: 100%|██████████| 3/3 [00:14<00:00,  4.99s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1233.58it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.69     |
|    reward/epoch-0/train/accuracy       | 0.614    |
|    reward/epoch-0/train/gt_reward_loss | 0.691    |
|    reward/epoch-0/train/loss           | 0.651    |
|    reward/epoch-1/train/accuracy       | 0.618    |
|    reward/epoch-1/train/gt_reward_loss | 0.691    |
|    reward/epoch-1/train/loss           | 0.643    |
|    reward/epoch-2/train/accuracy       | 0.634    |
|    reward/epoch-2/train/gt_reward_loss | 0.691    |
|    reward/epoch-2/train/loss           | 0.638    |
| reward/                                |          |
|    final/train/accuracy                | 0.634    |
|    final/train/gt_reward_loss          | 0.691    |
|    final/train/loss                    | 0.638    |
-----------------------------------------------------
Collecting 1228 fragments (1228 transitions)





Creating fragment pairs
Gathering preferences
Dataset now contains 2261 comparisons


Training reward model: 100%|██████████| 3/3 [00:21<00:00,  7.06s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1218.37it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.685    |
|    reward/epoch-0/train/accuracy       | 0.608    |
|    reward/epoch-0/train/gt_reward_loss | 0.689    |
|    reward/epoch-0/train/loss           | 0.651    |
|    reward/epoch-1/train/accuracy       | 0.629    |
|    reward/epoch-1/train/gt_reward_loss | 0.689    |
|    reward/epoch-1/train/loss           | 0.643    |
|    reward/epoch-2/train/accuracy       | 0.633    |
|    reward/epoch-2/train/gt_reward_loss | 0.689    |
|    reward/epoch-2/train/loss           | 0.639    |
| reward/                                |          |
|    final/train/accuracy                | 0.633    |
|    final/train/gt_reward_loss          | 0.689    |
|    final/train/loss                    | 0.639    |
-----------------------------------------------------
Collecting 1170 fragments (1170 transitions)





Creating fragment pairs
Gathering preferences
Dataset now contains 2846 comparisons


Training reward model: 100%|██████████| 3/3 [00:26<00:00,  8.78s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1229.03it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.672    |
|    reward/epoch-0/train/accuracy       | 0.61     |
|    reward/epoch-0/train/gt_reward_loss | 0.686    |
|    reward/epoch-0/train/loss           | 0.653    |
|    reward/epoch-1/train/accuracy       | 0.619    |
|    reward/epoch-1/train/gt_reward_loss | 0.686    |
|    reward/epoch-1/train/loss           | 0.646    |
|    reward/epoch-2/train/accuracy       | 0.627    |
|    reward/epoch-2/train/gt_reward_loss | 0.686    |
|    reward/epoch-2/train/loss           | 0.644    |
| reward/                                |          |
|    final/train/accuracy                | 0.627    |
|    final/train/gt_reward_loss          | 0.686    |
|    final/train/loss                    | 0.644    |
-----------------------------------------------------
Collecting 1116 fragments (1116 transitions)
Creating fragment pairs





Gathering preferences
Dataset now contains 3404 comparisons


Training reward model: 100%|██████████| 3/3 [00:30<00:00, 10.30s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1220.69it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.682    |
|    reward/epoch-0/train/accuracy       | 0.606    |
|    reward/epoch-0/train/gt_reward_loss | 0.685    |
|    reward/epoch-0/train/loss           | 0.654    |
|    reward/epoch-1/train/accuracy       | 0.617    |
|    reward/epoch-1/train/gt_reward_loss | 0.685    |
|    reward/epoch-1/train/loss           | 0.651    |
|    reward/epoch-2/train/accuracy       | 0.621    |
|    reward/epoch-2/train/gt_reward_loss | 0.685    |
|    reward/epoch-2/train/loss           | 0.647    |
| reward/                                |          |
|    final/train/accuracy                | 0.621    |
|    final/train/gt_reward_loss          | 0.685    |
|    final/train/loss                    | 0.647    |
-----------------------------------------------------
Collecting 1068 fragments (1068 transitions)
Creating fragment pairs
Gathering pre




Dataset now contains 3938 comparisons


Training reward model: 100%|██████████| 3/3 [00:34<00:00, 11.61s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1237.02it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.691    |
|    reward/epoch-0/train/accuracy       | 0.599    |
|    reward/epoch-0/train/gt_reward_loss | 0.686    |
|    reward/epoch-0/train/loss           | 0.657    |
|    reward/epoch-1/train/accuracy       | 0.613    |
|    reward/epoch-1/train/gt_reward_loss | 0.686    |
|    reward/epoch-1/train/loss           | 0.653    |
|    reward/epoch-2/train/accuracy       | 0.62     |
|    reward/epoch-2/train/gt_reward_loss | 0.686    |
|    reward/epoch-2/train/loss           | 0.647    |
| reward/                                |          |
|    final/train/accuracy                | 0.62     |
|    final/train/gt_reward_loss          | 0.686    |
|    final/train/loss                    | 0.647    |
-----------------------------------------------------
Collecting 1024 fragments (1024 transitions)
Creating fragment pairs
Gathering pre




Dataset now contains 4450 comparisons


Training reward model: 100%|██████████| 3/3 [00:40<00:00, 13.57s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1219.06it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.688    |
|    reward/epoch-0/train/accuracy       | 0.609    |
|    reward/epoch-0/train/gt_reward_loss | 0.686    |
|    reward/epoch-0/train/loss           | 0.655    |
|    reward/epoch-1/train/accuracy       | 0.612    |
|    reward/epoch-1/train/gt_reward_loss | 0.686    |
|    reward/epoch-1/train/loss           | 0.652    |
|    reward/epoch-2/train/accuracy       | 0.619    |
|    reward/epoch-2/train/gt_reward_loss | 0.686    |
|    reward/epoch-2/train/loss           | 0.65     |
| reward/                                |          |
|    final/train/accuracy                | 0.619    |
|    final/train/gt_reward_loss          | 0.686    |
|    final/train/loss                    | 0.65     |
-----------------------------------------------------
Collecting 982 fragments (982 transitions)
Creating fragment pairs
Gathering prefe




Dataset now contains 4941 comparisons


Training reward model: 100%|██████████| 3/3 [00:45<00:00, 15.16s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1222.19it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.678    |
|    reward/epoch-0/train/accuracy       | 0.606    |
|    reward/epoch-0/train/gt_reward_loss | 0.685    |
|    reward/epoch-0/train/loss           | 0.655    |
|    reward/epoch-1/train/accuracy       | 0.615    |
|    reward/epoch-1/train/gt_reward_loss | 0.685    |
|    reward/epoch-1/train/loss           | 0.651    |
|    reward/epoch-2/train/accuracy       | 0.619    |
|    reward/epoch-2/train/gt_reward_loss | 0.685    |
|    reward/epoch-2/train/loss           | 0.647    |
| reward/                                |          |
|    final/train/accuracy                | 0.619    |
|    final/train/gt_reward_loss          | 0.685    |
|    final/train/loss                    | 0.647    |
-----------------------------------------------------
Collecting 944 fragments (944 transitions)
Creating fragment pairs
Gathering prefe




Dataset now contains 5413 comparisons


Training reward model: 100%|██████████| 3/3 [00:51<00:00, 17.01s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1220.85it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.693    |
|    reward/epoch-0/train/accuracy       | 0.61     |
|    reward/epoch-0/train/gt_reward_loss | 0.686    |
|    reward/epoch-0/train/loss           | 0.653    |
|    reward/epoch-1/train/accuracy       | 0.62     |
|    reward/epoch-1/train/gt_reward_loss | 0.686    |
|    reward/epoch-1/train/loss           | 0.65     |
|    reward/epoch-2/train/accuracy       | 0.624    |
|    reward/epoch-2/train/gt_reward_loss | 0.685    |
|    reward/epoch-2/train/loss           | 0.646    |
| reward/                                |          |
|    final/train/accuracy                | 0.624    |
|    final/train/gt_reward_loss          | 0.685    |
|    final/train/loss                    | 0.646    |
-----------------------------------------------------
Collecting 910 fragments (910 transitions)
Creating fragment pairs
Gathering prefe




Dataset now contains 5868 comparisons


Training reward model: 100%|██████████| 3/3 [00:54<00:00, 18.33s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1170.97it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.686    |
|    reward/epoch-0/train/accuracy       | 0.615    |
|    reward/epoch-0/train/gt_reward_loss | 0.686    |
|    reward/epoch-0/train/loss           | 0.651    |
|    reward/epoch-1/train/accuracy       | 0.622    |
|    reward/epoch-1/train/gt_reward_loss | 0.686    |
|    reward/epoch-1/train/loss           | 0.647    |
|    reward/epoch-2/train/accuracy       | 0.621    |
|    reward/epoch-2/train/gt_reward_loss | 0.686    |
|    reward/epoch-2/train/loss           | 0.643    |
| reward/                                |          |
|    final/train/accuracy                | 0.621    |
|    final/train/gt_reward_loss          | 0.686    |
|    final/train/loss                    | 0.643    |
-----------------------------------------------------
Collecting 878 fragments (878 transitions)





Creating fragment pairs
Gathering preferences
Dataset now contains 6307 comparisons


Training reward model: 100%|██████████| 3/3 [01:00<00:00, 20.26s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1225.62it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.688    |
|    reward/epoch-0/train/accuracy       | 0.614    |
|    reward/epoch-0/train/gt_reward_loss | 0.686    |
|    reward/epoch-0/train/loss           | 0.647    |
|    reward/epoch-1/train/accuracy       | 0.619    |
|    reward/epoch-1/train/gt_reward_loss | 0.686    |
|    reward/epoch-1/train/loss           | 0.644    |
|    reward/epoch-2/train/accuracy       | 0.621    |
|    reward/epoch-2/train/gt_reward_loss | 0.686    |
|    reward/epoch-2/train/loss           | 0.643    |
| reward/                                |          |
|    final/train/accuracy                | 0.621    |
|    final/train/gt_reward_loss          | 0.686    |
|    final/train/loss                    | 0.643    |
-----------------------------------------------------
Collecting 848 fragments (848 transitions)
Creating fragment pairs
Gathering prefe




Dataset now contains 6731 comparisons


Training reward model: 100%|██████████| 3/3 [01:04<00:00, 21.35s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1188.78it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.693    |
|    reward/epoch-0/train/accuracy       | 0.62     |
|    reward/epoch-0/train/gt_reward_loss | 0.687    |
|    reward/epoch-0/train/loss           | 0.646    |
|    reward/epoch-1/train/accuracy       | 0.622    |
|    reward/epoch-1/train/gt_reward_loss | 0.687    |
|    reward/epoch-1/train/loss           | 0.644    |
|    reward/epoch-2/train/accuracy       | 0.623    |
|    reward/epoch-2/train/gt_reward_loss | 0.687    |
|    reward/epoch-2/train/loss           | 0.641    |
| reward/                                |          |
|    final/train/accuracy                | 0.623    |
|    final/train/gt_reward_loss          | 0.687    |
|    final/train/loss                    | 0.641    |
-----------------------------------------------------
Collecting 818 fragments (818 transitions)
Creating fragment pairs





Gathering preferences
Dataset now contains 7140 comparisons


Training reward model: 100%|██████████| 3/3 [01:14<00:00, 24.81s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1240.31it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.693    |
|    reward/epoch-0/train/accuracy       | 0.617    |
|    reward/epoch-0/train/gt_reward_loss | 0.687    |
|    reward/epoch-0/train/loss           | 0.645    |
|    reward/epoch-1/train/accuracy       | 0.617    |
|    reward/epoch-1/train/gt_reward_loss | 0.687    |
|    reward/epoch-1/train/loss           | 0.642    |
|    reward/epoch-2/train/accuracy       | 0.624    |
|    reward/epoch-2/train/gt_reward_loss | 0.687    |
|    reward/epoch-2/train/loss           | 0.64     |
| reward/                                |          |
|    final/train/accuracy                | 0.624    |
|    final/train/gt_reward_loss          | 0.687    |
|    final/train/loss                    | 0.64     |
-----------------------------------------------------
Collecting 792 fragments (792 transitions)
Creating fragment pairs
Gathering prefe




Dataset now contains 7536 comparisons


Training reward model: 100%|██████████| 3/3 [01:13<00:00, 24.44s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1185.90it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.693    |
|    reward/epoch-0/train/accuracy       | 0.624    |
|    reward/epoch-0/train/gt_reward_loss | 0.687    |
|    reward/epoch-0/train/loss           | 0.644    |
|    reward/epoch-1/train/accuracy       | 0.622    |
|    reward/epoch-1/train/gt_reward_loss | 0.687    |
|    reward/epoch-1/train/loss           | 0.642    |
|    reward/epoch-2/train/accuracy       | 0.62     |
|    reward/epoch-2/train/gt_reward_loss | 0.687    |
|    reward/epoch-2/train/loss           | 0.641    |
| reward/                                |          |
|    final/train/accuracy                | 0.62     |
|    final/train/gt_reward_loss          | 0.687    |
|    final/train/loss                    | 0.641    |
-----------------------------------------------------
Collecting 768 fragments (768 transitions)
Creating fragment pairs





Gathering preferences
Dataset now contains 7920 comparisons


Training reward model: 100%|██████████| 3/3 [01:18<00:00, 26.31s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1212.72it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.691    |
|    reward/epoch-0/train/accuracy       | 0.618    |
|    reward/epoch-0/train/gt_reward_loss | 0.687    |
|    reward/epoch-0/train/loss           | 0.644    |
|    reward/epoch-1/train/accuracy       | 0.619    |
|    reward/epoch-1/train/gt_reward_loss | 0.687    |
|    reward/epoch-1/train/loss           | 0.643    |
|    reward/epoch-2/train/accuracy       | 0.631    |
|    reward/epoch-2/train/gt_reward_loss | 0.687    |
|    reward/epoch-2/train/loss           | 0.641    |
| reward/                                |          |
|    final/train/accuracy                | 0.631    |
|    final/train/gt_reward_loss          | 0.687    |
|    final/train/loss                    | 0.641    |
-----------------------------------------------------
Collecting 744 fragments (744 transitions)
Creating fragment pairs
Gathering prefe




Dataset now contains 8292 comparisons


Training reward model: 100%|██████████| 3/3 [01:17<00:00, 25.99s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1218.11it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.693    |
|    reward/epoch-0/train/accuracy       | 0.618    |
|    reward/epoch-0/train/gt_reward_loss | 0.688    |
|    reward/epoch-0/train/loss           | 0.643    |
|    reward/epoch-1/train/accuracy       | 0.621    |
|    reward/epoch-1/train/gt_reward_loss | 0.688    |
|    reward/epoch-1/train/loss           | 0.642    |
|    reward/epoch-2/train/accuracy       | 0.621    |
|    reward/epoch-2/train/gt_reward_loss | 0.688    |
|    reward/epoch-2/train/loss           | 0.639    |
| reward/                                |          |
|    final/train/accuracy                | 0.621    |
|    final/train/gt_reward_loss          | 0.688    |
|    final/train/loss                    | 0.639    |
-----------------------------------------------------
Collecting 722 fragments (722 transitions)
Creating fragment pairs
Gathering prefe




Dataset now contains 8653 comparisons


Training reward model: 100%|██████████| 3/3 [01:27<00:00, 29.15s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1208.02it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.689    |
|    reward/epoch-0/train/accuracy       | 0.622    |
|    reward/epoch-0/train/gt_reward_loss | 0.688    |
|    reward/epoch-0/train/loss           | 0.644    |
|    reward/epoch-1/train/accuracy       | 0.624    |
|    reward/epoch-1/train/gt_reward_loss | 0.688    |
|    reward/epoch-1/train/loss           | 0.639    |
|    reward/epoch-2/train/accuracy       | 0.627    |
|    reward/epoch-2/train/gt_reward_loss | 0.688    |
|    reward/epoch-2/train/loss           | 0.639    |
| reward/                                |          |
|    final/train/accuracy                | 0.627    |
|    final/train/gt_reward_loss          | 0.688    |
|    final/train/loss                    | 0.639    |
-----------------------------------------------------
Collecting 702 fragments (702 transitions)
Creating fragment pairs
Gathering prefe


Training reward model: 100%|██████████| 3/3 [01:33<00:00, 31.22s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1215.88it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.691    |
|    reward/epoch-0/train/accuracy       | 0.624    |
|    reward/epoch-0/train/gt_reward_loss | 0.688    |
|    reward/epoch-0/train/loss           | 0.642    |
|    reward/epoch-1/train/accuracy       | 0.623    |
|    reward/epoch-1/train/gt_reward_loss | 0.688    |
|    reward/epoch-1/train/loss           | 0.641    |
|    reward/epoch-2/train/accuracy       | 0.62     |
|    reward/epoch-2/train/gt_reward_loss | 0.688    |
|    reward/epoch-2/train/loss           | 0.639    |
| reward/                                |          |
|    final/train/accuracy                | 0.62     |
|    final/train/gt_reward_loss          | 0.688    |
|    final/train/loss                    | 0.639    |
-----------------------------------------------------
Collecting 682 fragments (682 transitions)
Creating fragment pairs
Gathering prefe


Training reward model: 100%|██████████| 3/3 [01:26<00:00, 28.91s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1194.04it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.683    |
|    reward/epoch-0/train/accuracy       | 0.625    |
|    reward/epoch-0/train/gt_reward_loss | 0.688    |
|    reward/epoch-0/train/loss           | 0.639    |
|    reward/epoch-1/train/accuracy       | 0.626    |
|    reward/epoch-1/train/gt_reward_loss | 0.688    |
|    reward/epoch-1/train/loss           | 0.639    |
|    reward/epoch-2/train/accuracy       | 0.628    |
|    reward/epoch-2/train/gt_reward_loss | 0.688    |
|    reward/epoch-2/train/loss           | 0.637    |
| reward/                                |          |
|    final/train/accuracy                | 0.628    |
|    final/train/gt_reward_loss          | 0.688    |
|    final/train/loss                    | 0.637    |
-----------------------------------------------------
Collecting 664 fragments (664 transitions)
Creating fragment pairs
Gathering prefe


Training reward model: 100%|██████████| 3/3 [01:35<00:00, 31.81s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1170.89it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.691    |
|    reward/epoch-0/train/accuracy       | 0.624    |
|    reward/epoch-0/train/gt_reward_loss | 0.688    |
|    reward/epoch-0/train/loss           | 0.64     |
|    reward/epoch-1/train/accuracy       | 0.627    |
|    reward/epoch-1/train/gt_reward_loss | 0.688    |
|    reward/epoch-1/train/loss           | 0.636    |
|    reward/epoch-2/train/accuracy       | 0.625    |
|    reward/epoch-2/train/gt_reward_loss | 0.688    |
|    reward/epoch-2/train/loss           | 0.638    |
| reward/                                |          |
|    final/train/accuracy                | 0.625    |
|    final/train/gt_reward_loss          | 0.688    |
|    final/train/loss                    | 0.638    |
-----------------------------------------------------
Collecting 646 fragments (646 transitions)
Creating fragment pairs
Gathering prefe




Dataset now contains 10000 comparisons


Training reward model: 100%|██████████| 3/3 [01:44<00:00, 34.86s/it]

Training agent for 2500 timesteps



Value iteration: 100%|██████████| 20/20 [00:00<00:00, 1141.62it/s]

-----------------------------------------------------
| mean/                                  |          |
|    preferences/entropy                 | 0.687    |
|    reward/epoch-0/train/accuracy       | 0.622    |
|    reward/epoch-0/train/gt_reward_loss | 0.688    |
|    reward/epoch-0/train/loss           | 0.641    |
|    reward/epoch-1/train/accuracy       | 0.623    |
|    reward/epoch-1/train/gt_reward_loss | 0.688    |
|    reward/epoch-1/train/loss           | 0.637    |
|    reward/epoch-2/train/accuracy       | 0.62     |
|    reward/epoch-2/train/gt_reward_loss | 0.688    |
|    reward/epoch-2/train/loss           | 0.637    |
| reward/                                |          |
|    final/train/accuracy                | 0.62     |
|    final/train/gt_reward_loss          | 0.688    |
|    final/train/loss                    | 0.637    |
-----------------------------------------------------





In [75]:
# What does the reward net think is the best state?

_, reward_vector = env.get_sparse_transition_matrix_and_reward_vector(reward_net)

best_state_idx, best_action = np.unravel_index(
    reward_vector.argmax(),
    (len(env.states), len(env.actions)),
)
env.render(env.states[best_state_idx])
print(f"best action: {env._action_to_string(env.actions[best_action])}")

reward_net.predict(
    np.array([env.states[best_state_idx]] * 5),
    np.array(np.arange(5)),
    np.array(env.states[:5]),
    np.array([False] * 5),
)

+---+---+---+
|   |   |   |
+---+---+---+
|   |3H |   |
+---+---+---+
|   |   |   |
+---+---+---+
best action: RIGHT


array([ 0.79211015, -2.5380538 ,  0.60095835,  2.4222856 ,  0.69246906],
      dtype=float32)

In [76]:
# Rollouts of the RLHF policy

def log_reward_net_prediction(state, action, _):
    reward_prection = reward_net.predict(
        np.array([state]),
        np.array([action]),
        np.array([state]),
        np.array([False]),
    )[0]
    print(f"action: {env._action_to_string(action)}")
    print(f"Predicted reward: {reward_prection}")


_ = env.rollout_with_policy(rlhf_policy, render=True, logging_callback=log_reward_net_prediction)

+---+---+---+
| . |   |   |
+---+---+---+
|   |0H | . |
+---+---+---+
|   |   | x |
+---+---+---+
action: RIGHT
Predicted reward: 0.5517340898513794
+---+---+---+
| . |   |   |
+---+---+---+
|   | H |0. |
+---+---+---+
|   |   | x |
+---+---+---+
action: RIGHT
Predicted reward: 0.9585872292518616
Repeated final state with action 3
Total reward: 0


In [63]:
# Reward net evaluated at final state of above rollout

state = env._get_observation()

env.render(state)
reward_net.predict(
    np.array([state] * 5),
    np.array(np.arange(5)),
    np.array(env.states[:5]),
    np.array([False] * 5),
)

+---+---+---+
| . |2  |   |
+---+---+---+
|   | H |   |
+---+---+---+
|   |   |   |
+---+---+---+


array([27.317883, 31.444069, 25.671495, 24.866295, 30.610226],
      dtype=float32)

In [15]:
# Trying to inspect activations of reward net

import torch as th

home_location = np.array([GRID_SIZE // 2, GRID_SIZE // 2])
free_pellet_location = np.array([0, 0])
owned_pellet_location = np.array([0, 1])

agent_location = home_location

state = np.zeros((5, GRID_SIZE, GRID_SIZE), dtype=np.int16)
state[0, agent_location[0], agent_location[1]] = 1
state[1, free_pellet_location[0], free_pellet_location[1]] = 1
state[2, owned_pellet_location[0], owned_pellet_location[1]] = 1
state[3, GRID_SIZE // 2, GRID_SIZE // 2] = 1
state[4, :, :] = 1

env.render(state)

conv0_activations = reward_net.cnn[0](th.Tensor([state]))
relu0_activations = reward_net.cnn[1](conv0_activations)
conv1_activations = reward_net.cnn[2](relu0_activations)
relu1_activations = reward_net.cnn[3](conv1_activations)
pool_activations = reward_net.cnn[4](relu1_activations)
flat_activations = reward_net.cnn[5](pool_activations)
outputs = reward_net.cnn[-1](flat_activations)

print(f"layer 0 activations:\n{relu0_activations[0]}")
print(f"layer 1 activations:\n{relu1_activations[0]}")
print(f"flat_activations:\n{flat_activations[0]}")
print(f"outputs:\n{outputs[0]}")

+---+---+---+
| . | x |   |
+---+---+---+
|   |1H |   |
+---+---+---+
|   |   |   |
+---+---+---+


  conv0_activations = reward_net.cnn[0](th.Tensor([state]))


IndexError: index 5 is out of range

In [128]:
# Trying to inspect weights of reward net

for weight, bias in zip(reward_net.cnn[0].weight, reward_net.cnn[0].bias):
    print(f"weight: {weight.detach().numpy().flatten()}")
    print(f"bias: {bias.detach().numpy()}")

print()

# for weight, bias in zip(reward_net.cnn[2].weight, reward_net.cnn[2].bias):
#     print(f"weight: {weight.detach().numpy().flatten()}")
#     print(f"bias: {bias.detach().numpy()}")

print()

for weight, bias in zip(reward_net.cnn[-1].weight, reward_net.cnn[-1].bias):
    print(f"weight: {weight.detach().numpy().flatten()}")
    print(f"bias: {bias.detach().numpy()}")

weight: [-0.10977456 -0.38889107 -0.1206716  -0.38175228 -0.28921402]
bias: 0.34487465023994446
weight: [ 0.11769152  0.23302093 -0.40303385  0.11292398  0.17897412]
bias: 0.3653581142425537


weight: [-0.17857413 -0.36234197]
bias: -0.5028236508369446
weight: [0.5715578  0.04654616]
bias: -0.3627457618713379
weight: [-0.45337042  0.31298742]
bias: 0.548060417175293
weight: [-0.06371312 -0.561359  ]
bias: -0.6052233576774597
weight: [ 0.08518305 -0.09132026]
bias: -0.05851120129227638


In [129]:
# Trying to write down optimal weights for reward net

optimal_conv0_weights = np.array([
    [1, 0, 0, 1, 0.1],  # Is agent at home with pellets?
    [1, 0, 1, 0, 0],  # Is agent at owned pellet?
]).reshape(reward_net.cnn[0].weight.shape)
optimal_conv0_bias = np.array([-2, -1]).reshape(reward_net.cnn[0].bias.shape)
# optimal_conv1_weights = np.array([
#     [0, 1],  # Is agent at owned pellet?
#     [10, 0],  # Num pellets to deposit
# ]).reshape(reward_net.cnn[2].weight.shape)
# optimal_conv1_bias = np.array([0, 0]).reshape(reward_net.cnn[2].bias.shape)
optimal_fc_weights = np.array([
    [0, 0],  # UP
    [0, 0],  # DOWN
    [0, 0],  # LEFT
    [0, 0],  # RIGHT
    [10, -2],  # INTERACT
]).reshape(reward_net.cnn[-1].weight.shape)
optimal_fc_bias = np.array([0, 0, 0, 0, 0]).reshape(reward_net.cnn[-1].bias.shape)

def set_weights(cnn):
    cnn[0].weight.data = th.Tensor(optimal_conv0_weights)
    cnn[0].bias.data = th.Tensor(optimal_conv0_bias)
    # cnn[2].weight.data = th.Tensor(optimal_conv1_weights)
    # cnn[2].bias.data = th.Tensor(optimal_conv1_bias)
    cnn[-1].weight.data = th.Tensor(optimal_fc_weights)
    cnn[-1].bias.data = th.Tensor(optimal_fc_bias)

set_weights(reward_net.cnn)
    