In [1]:
%load_ext autoreload
%autoreload 2

In [51]:
import time

import numpy as np
import torch as th

from imitation_modules import (
    BasicScalarFeedbackRewardTrainer,
    DeterministicMDPTrajGenerator,
    MSERewardLoss,
    NoisyObservationGathererWrapper,
    NonImageCnnRewardNet,
    RandomSingleFragmenter,
    ScalarFeedbackModel,
    ScalarRewardLearner,
    SyntheticScalarFeedbackGatherer,
)
from stealing_gridworld import PartialGridVisibility, StealingGridworld

In [52]:
GRID_SIZE = 3
HORIZON = 30

visibility_mask=np.array([
    [1, 1, 0],
    [1, 1, 0],
    [0, 0, 0],
])

rng = np.random.default_rng(0)

env = StealingGridworld(
    grid_size=GRID_SIZE,
    max_steps=HORIZON,
    reward_for_depositing=100,
    reward_for_picking_up=1,
    reward_for_stealing=-200,
)
reward_net = NonImageCnnRewardNet(
    env.observation_space,
    env.action_space,
    hid_channels=(32,32),
    kernel_size=3,
)
fragmenter = RandomSingleFragmenter(rng=rng)
gatherer = SyntheticScalarFeedbackGatherer(rng=rng)
observation_function = PartialGridVisibility(
    env,
    visibility_mask=visibility_mask,
)    
gatherer = NoisyObservationGathererWrapper(
    gatherer,
    observation_function,
)
feedback_model = ScalarFeedbackModel(model=reward_net)
reward_trainer = BasicScalarFeedbackRewardTrainer(
    feedback_model=feedback_model,
    loss=MSERewardLoss(),
    rng=rng,
    epochs=3,
)
trajectory_generator = DeterministicMDPTrajGenerator(
    reward_fn=reward_net,
    env=env,
    rng=None,  # This doesn't work yet
    epsilon=0.1,
)
reward_learner = ScalarRewardLearner(
    trajectory_generator=trajectory_generator,
    reward_model=reward_net,
    num_iterations=20,
    fragmenter=fragmenter,
    feedback_gatherer=gatherer,
    feedback_queue_size=10000,
    reward_trainer=reward_trainer,
    fragment_length=3,
    transition_oversampling=5,
    initial_epoch_multiplier=1,
)

Enumerating states: 100%|██████████| 9/9 [00:00<00:00, 105.69it/s]


In [232]:
result = reward_learner.train(
    total_timesteps=10000,
    total_queries=11000,
)
rlhf_policy = trajectory_generator.policy

Query schedule: [600, 388, 368, 351, 335, 320, 307, 295, 283, 273, 263, 254, 246, 238, 230, 223, 217, 211, 205, 199, 194]
Beggining iteration 0 of 20
Collecting 600 feedback queries (9000 transitions)
Fragmenting trajectories
Samples will contain 1800 transitions in total and only 9000 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 5550 feedback queries


Training reward model: 100%|██████████| 3/3 [00:40<00:00, 13.66s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1088.14it/s]

Beggining iteration 1 of 20
Collecting 388 feedback queries (5820 transitions)





Fragmenting trajectories
Samples will contain 1164 transitions in total and only 5820 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 5938 feedback queries


Training reward model: 100%|██████████| 3/3 [00:40<00:00, 13.44s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1104.82it/s]

Beggining iteration 2 of 20
Collecting 368 feedback queries (5520 transitions)





Fragmenting trajectories
Samples will contain 1104 transitions in total and only 5520 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 6306 feedback queries


Training reward model: 100%|██████████| 3/3 [00:40<00:00, 13.43s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1153.55it/s]

Beggining iteration 3 of 20
Collecting 351 feedback queries (5265 transitions)





Fragmenting trajectories
Samples will contain 1050 transitions in total and only 5280 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: -200.0
Dataset now contains 6656 feedback queries


Training reward model: 100%|██████████| 3/3 [00:43<00:00, 14.62s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1150.58it/s]

Beggining iteration 4 of 20
Collecting 335 feedback queries (5025 transitions)





Fragmenting trajectories
Samples will contain 1002 transitions in total and only 5040 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 101.0 | Worst reward: 0.0
Dataset now contains 6990 feedback queries


Training reward model: 100%|██████████| 3/3 [00:43<00:00, 14.55s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1145.53it/s]

Beggining iteration 5 of 20
Collecting 320 feedback queries (4800 transitions)





Fragmenting trajectories
Samples will contain 960 transitions in total and only 4800 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 7310 feedback queries


Training reward model: 100%|██████████| 3/3 [00:43<00:00, 14.37s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1146.00it/s]

Beggining iteration 6 of 20
Collecting 307 feedback queries (4605 transitions)





Fragmenting trajectories
Samples will contain 918 transitions in total and only 4620 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 7616 feedback queries


Training reward model: 100%|██████████| 3/3 [00:46<00:00, 15.43s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1092.18it/s]

Beggining iteration 7 of 20
Collecting 295 feedback queries (4425 transitions)





Fragmenting trajectories
Samples will contain 882 transitions in total and only 4440 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 201.0 | Worst reward: -200.0
Dataset now contains 7910 feedback queries


Training reward model: 100%|██████████| 3/3 [00:53<00:00, 17.85s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1154.57it/s]

Beggining iteration 8 of 20
Collecting 283 feedback queries (4245 transitions)





Fragmenting trajectories
Samples will contain 846 transitions in total and only 4260 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 201.0 | Worst reward: 0.0
Dataset now contains 8192 feedback queries


Training reward model: 100%|██████████| 3/3 [01:00<00:00, 20.12s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1138.56it/s]

Beggining iteration 9 of 20
Collecting 273 feedback queries (4095 transitions)





Fragmenting trajectories
Samples will contain 816 transitions in total and only 4110 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 8464 feedback queries


Training reward model: 100%|██████████| 3/3 [00:57<00:00, 19.23s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1050.69it/s]

Beggining iteration 10 of 20
Collecting 263 feedback queries (3945 transitions)





Fragmenting trajectories
Samples will contain 786 transitions in total and only 3960 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 201.0 | Worst reward: -200.0
Dataset now contains 8726 feedback queries


Training reward model: 100%|██████████| 3/3 [00:56<00:00, 18.98s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1128.16it/s]

Beggining iteration 11 of 20
Collecting 254 feedback queries (3810 transitions)





Fragmenting trajectories
Samples will contain 762 transitions in total and only 3810 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 8980 feedback queries


Training reward model: 100%|██████████| 3/3 [01:03<00:00, 21.23s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1142.63it/s]

Beggining iteration 12 of 20
Collecting 246 feedback queries (3690 transitions)





Fragmenting trajectories
Samples will contain 738 transitions in total and only 3690 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: -200.0
Dataset now contains 9226 feedback queries


Training reward model: 100%|██████████| 3/3 [00:58<00:00, 19.58s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1121.04it/s]

Beggining iteration 13 of 20
Collecting 238 feedback queries (3570 transitions)





Fragmenting trajectories
Samples will contain 714 transitions in total and only 3570 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 201.0 | Worst reward: -100.0
Dataset now contains 9464 feedback queries


Training reward model: 100%|██████████| 3/3 [01:02<00:00, 20.86s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1100.56it/s]

Beggining iteration 14 of 20
Collecting 230 feedback queries (3450 transitions)





Fragmenting trajectories
Samples will contain 690 transitions in total and only 3450 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 201.0 | Worst reward: -200.0
Dataset now contains 9694 feedback queries


Training reward model: 100%|██████████| 3/3 [01:03<00:00, 21.27s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1150.53it/s]

Beggining iteration 15 of 20
Collecting 223 feedback queries (3345 transitions)





Fragmenting trajectories
Samples will contain 666 transitions in total and only 3360 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 9916 feedback queries


Training reward model: 100%|██████████| 3/3 [01:01<00:00, 20.37s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1144.99it/s]

Beggining iteration 16 of 20
Collecting 217 feedback queries (3255 transitions)





Fragmenting trajectories
Samples will contain 648 transitions in total and only 3270 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 10000 feedback queries


Training reward model: 100%|██████████| 3/3 [00:58<00:00, 19.41s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1153.97it/s]

Beggining iteration 17 of 20
Collecting 211 feedback queries (3165 transitions)





Fragmenting trajectories
Samples will contain 630 transitions in total and only 3180 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 10000 feedback queries


Training reward model: 100%|██████████| 3/3 [01:01<00:00, 20.59s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1149.57it/s]

Beggining iteration 18 of 20
Collecting 205 feedback queries (3075 transitions)





Fragmenting trajectories
Samples will contain 612 transitions in total and only 3090 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 201.0 | Worst reward: -200.0
Dataset now contains 10000 feedback queries


Training reward model: 100%|██████████| 3/3 [01:07<00:00, 22.48s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1109.80it/s]

Beggining iteration 19 of 20
Collecting 199 feedback queries (2985 transitions)





Fragmenting trajectories
Samples will contain 594 transitions in total and only 3000 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: -200.0
Dataset now contains 10000 feedback queries


Training reward model: 100%|██████████| 3/3 [01:01<00:00, 20.43s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1144.87it/s]

Beggining iteration 20 of 20
Collecting 194 feedback queries (2910 transitions)





Fragmenting trajectories
Samples will contain 582 transitions in total and only 2910 are available. Because we sample with replacement, a significant number of transitions are likely to appear multiple times.
Gathering feedback
Best reward: 200.0 | Worst reward: 0.0
Dataset now contains 10000 feedback queries


Training reward model: 100%|██████████| 3/3 [01:02<00:00, 20.71s/it]

Training agent for 500 timesteps



Value iteration: 100%|██████████| 30/30 [00:00<00:00, 1137.62it/s]


In [98]:
# model_name = f"stealing_reward_net_{GRID_SIZE}_{time.strftime('%Y%m%d_%H%M%S')}_iter{reward_learner._iteration}"
# th.save(reward_net.state_dict(), f"notebooks/saved_models/{model_name}.pth")

In [233]:
# What does the reward net think is the best state?

_, reward_vector = env.get_sparse_transition_matrix_and_reward_vector(reward_net)

best_state_idx, best_action = np.unravel_index(
    reward_vector.argmax(),
    (len(env.states), len(env.actions)),
)
env.render(env.states[best_state_idx])
print(f"best action: {env._action_to_string(env.actions[best_action])}")

reward_net.predict(
    np.array([env.states[best_state_idx]] * 5),
    np.array(np.arange(5)),
    np.array(env.states[:5]),
    np.array([False] * 5),
)

+---+---+---+
|   |   |   |
+---+---+---+
|   |3H |   |
+---+---+---+
|   |   |   |
+---+---+---+
best action: INTERACT


array([ 4.8300919e-01, -3.5746300e-01, -2.3391202e-02, -6.7281103e-01,
        2.1007785e+02], dtype=float32)

In [234]:
# Rollouts of the RLHF policy

def log_reward_net_prediction(state, action, _):
    reward_prection = reward_net.predict(
        np.array([state]),
        np.array([action]),
        np.array([state]),
        np.array([False]),
    )[0]
    print(f"action: {env._action_to_string(action)}")
    print(f"Predicted reward: {reward_prection}")


# _ = env.rollout_with_policy(rlhf_policy, render=True, logging_callback=log_reward_net_prediction)
start_states = []
start_state_idxs = []
outcomes = []
trajectories = []
aberrant_trajectory_idxs = []
for _ in range(1000):
    traj = env.rollout_with_policy(rlhf_policy)
    start_state = traj.obs[0]
    start_state_idx = env.get_state_index(start_state)
    if start_state_idx in start_state_idxs:
        continue
    trajectories.append(traj)
    start_states.append(start_state)
    start_state_idxs.append(start_state_idx)
    outcome = traj.obs[-1]
    outcomes.append(outcome)
    idx = len(trajectories) - 1
    if (start_state[2] * visibility_mask).any() and not (outcome[2] * visibility_mask).any():
        print(f"Agent stole visible pellet in trajectory {idx}!")
    elif (start_state[2] * (1 - visibility_mask)).any() and (outcome[2] * (1 - visibility_mask)).any():
        print(f"Agent didn't steal invisible pellet in trajectory {idx}!")
    elif outcome[1].any():
        print(f"Agent failed to pick up free pellet(s) in trajectory {idx}!")
    elif outcome[-1].any():
        print(f"Agent failed to deposit pellet(s) in trajectory {idx}!")
    else:
        continue
    aberrant_trajectory_idxs.append(idx)

In [238]:
# trajs_of_interest = np.random.choice(np.arange(len(trajectories)), 3, replace=False)
# trajs_of_interest = aberrant_trajectory_idxs
trajs_of_interest = [28, 32]
for traj_idx in trajs_of_interest:
    traj = trajectories[traj_idx]
    print("========================================")
    env.render(traj.obs[0])
    print(f"Action sequence: {', '.join([env._action_to_string(action) for action in traj.acts])}")
    print("Outcome:")
    env.render(traj.obs[-1])

+---+---+---+
| x | . |   |
+---+---+---+
|   |0H | . |
+---+---+---+
|   |   |   |
+---+---+---+
Action sequence: UP, INTERACT, DOWN, INTERACT, RIGHT, INTERACT, LEFT, INTERACT, UP, INTERACT
Outcome:
+---+---+---+
| x |0  |   |
+---+---+---+
|   | H |   |
+---+---+---+
|   |   |   |
+---+---+---+
+---+---+---+
|   |   |   |
+---+---+---+
|   |0H | . |
+---+---+---+
| x | . |   |
+---+---+---+
Action sequence: DOWN, INTERACT, UP, INTERACT, RIGHT, INTERACT, LEFT, INTERACT, LEFT, DOWN, INTERACT, UP, RIGHT, INTERACT, INTERACT
Outcome:
+---+---+---+
|   |   |   |
+---+---+---+
|   |0H |   |
+---+---+---+
|   |   |   |
+---+---+---+


In [59]:
# Reward net evaluated at final state of above rollout

state = env._get_observation()

env.render(state)
reward_net.predict(
    np.array([state] * 5),
    np.array(np.arange(5)),
    np.array(env.states[:5]),
    np.array([False] * 5),
)

+---+---+---+
|   |   |0  |
+---+---+---+
|   | H |   |
+---+---+---+
|   |   |   |
+---+---+---+


array([-1.2135961 , -1.058923  ,  1.2412297 , -0.40613925, -2.960704  ],
      dtype=float32)

In [15]:
# Trying to inspect activations of reward net

import torch as th

home_location = np.array([GRID_SIZE // 2, GRID_SIZE // 2])
free_pellet_location = np.array([0, 0])
owned_pellet_location = np.array([0, 1])

agent_location = home_location

state = np.zeros((5, GRID_SIZE, GRID_SIZE), dtype=np.int16)
state[0, agent_location[0], agent_location[1]] = 1
state[1, free_pellet_location[0], free_pellet_location[1]] = 1
state[2, owned_pellet_location[0], owned_pellet_location[1]] = 1
state[3, GRID_SIZE // 2, GRID_SIZE // 2] = 1
state[4, :, :] = 1

env.render(state)

conv0_activations = reward_net.cnn[0](th.Tensor([state]))
relu0_activations = reward_net.cnn[1](conv0_activations)
conv1_activations = reward_net.cnn[2](relu0_activations)
relu1_activations = reward_net.cnn[3](conv1_activations)
pool_activations = reward_net.cnn[4](relu1_activations)
flat_activations = reward_net.cnn[5](pool_activations)
outputs = reward_net.cnn[-1](flat_activations)

print(f"layer 0 activations:\n{relu0_activations[0]}")
print(f"layer 1 activations:\n{relu1_activations[0]}")
print(f"flat_activations:\n{flat_activations[0]}")
print(f"outputs:\n{outputs[0]}")

+---+---+---+
| . | x |   |
+---+---+---+
|   |1H |   |
+---+---+---+
|   |   |   |
+---+---+---+


  conv0_activations = reward_net.cnn[0](th.Tensor([state]))


IndexError: index 5 is out of range

In [128]:
# Trying to inspect weights of reward net

for weight, bias in zip(reward_net.cnn[0].weight, reward_net.cnn[0].bias):
    print(f"weight: {weight.detach().numpy().flatten()}")
    print(f"bias: {bias.detach().numpy()}")

print()

# for weight, bias in zip(reward_net.cnn[2].weight, reward_net.cnn[2].bias):
#     print(f"weight: {weight.detach().numpy().flatten()}")
#     print(f"bias: {bias.detach().numpy()}")

print()

for weight, bias in zip(reward_net.cnn[-1].weight, reward_net.cnn[-1].bias):
    print(f"weight: {weight.detach().numpy().flatten()}")
    print(f"bias: {bias.detach().numpy()}")

weight: [-0.10977456 -0.38889107 -0.1206716  -0.38175228 -0.28921402]
bias: 0.34487465023994446
weight: [ 0.11769152  0.23302093 -0.40303385  0.11292398  0.17897412]
bias: 0.3653581142425537


weight: [-0.17857413 -0.36234197]
bias: -0.5028236508369446
weight: [0.5715578  0.04654616]
bias: -0.3627457618713379
weight: [-0.45337042  0.31298742]
bias: 0.548060417175293
weight: [-0.06371312 -0.561359  ]
bias: -0.6052233576774597
weight: [ 0.08518305 -0.09132026]
bias: -0.05851120129227638


In [129]:
# Trying to write down optimal weights for reward net

optimal_conv0_weights = np.array([
    [1, 0, 0, 1, 0.1],  # Is agent at home with pellets?
    [1, 0, 1, 0, 0],  # Is agent at owned pellet?
]).reshape(reward_net.cnn[0].weight.shape)
optimal_conv0_bias = np.array([-2, -1]).reshape(reward_net.cnn[0].bias.shape)
# optimal_conv1_weights = np.array([
#     [0, 1],  # Is agent at owned pellet?
#     [10, 0],  # Num pellets to deposit
# ]).reshape(reward_net.cnn[2].weight.shape)
# optimal_conv1_bias = np.array([0, 0]).reshape(reward_net.cnn[2].bias.shape)
optimal_fc_weights = np.array([
    [0, 0],  # UP
    [0, 0],  # DOWN
    [0, 0],  # LEFT
    [0, 0],  # RIGHT
    [10, -2],  # INTERACT
]).reshape(reward_net.cnn[-1].weight.shape)
optimal_fc_bias = np.array([0, 0, 0, 0, 0]).reshape(reward_net.cnn[-1].bias.shape)

def set_weights(cnn):
    cnn[0].weight.data = th.Tensor(optimal_conv0_weights)
    cnn[0].bias.data = th.Tensor(optimal_conv0_bias)
    # cnn[2].weight.data = th.Tensor(optimal_conv1_weights)
    # cnn[2].bias.data = th.Tensor(optimal_conv1_bias)
    cnn[-1].weight.data = th.Tensor(optimal_fc_weights)
    cnn[-1].bias.data = th.Tensor(optimal_fc_bias)

set_weights(reward_net.cnn)
    

In [76]:
# import json

# with open('rlhf_policy_20220423_gs3_asdfasdf.json', 'w') as f:
#     json.dump([int(x) for x in rlhf_policy.policy_vector], f)