In [7]:
import numpy as np
import gymnasium as gym
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env
from imitation.data.wrappers import RolloutInfoWrapper

env = make_vec_env(
    "seals:seals/CartPole-v0",
    rng=np.random.default_rng(),
    post_wrappers=[
        lambda env, _: RolloutInfoWrapper(env)
    ],  # needed for computing rollouts later
)
expert = load_policy(
    "ppo-huggingface",
    organization="HumanCompatibleAI",
    env_name="seals/CartPole-v0",
    venv=env,
)

from stable_baselines3.common.evaluation import evaluate_policy

reward, _ = evaluate_policy(expert, env, 10)
print(reward)

from imitation.data import rollout

rng = np.random.default_rng()
rollouts = rollout.rollout(
    expert,
    env,
    rollout.make_sample_until(min_timesteps=None, min_episodes=50),
    rng=rng,
)
transitions = rollout.flatten_trajectories(rollouts)

print(
    f"""The `rollout` function generated a list of {len(rollouts)} {type(rollouts[0])}.
After flattening, this list is turned into a {type(transitions)} object containing {len(transitions)} transitions.
The transitions object contains arrays for: {', '.join(transitions.__dict__.keys())}."
"""
)

from imitation.algorithms import bc

bc_trainer = bc.BC(
    observation_space=env.observation_space,
    action_space=env.action_space,
    demonstrations=transitions,
    rng=rng,
)

reward_before_training, _ = evaluate_policy(bc_trainer.policy, env, 10)
print(f"Reward before training: {reward_before_training}")


rew: 10.0, state_ok: True, my_index: 559
rew: 10.0, state_ok: True, my_index: 629
rew: 10.0, state_ok: True, my_index: 192
rew: 10.0, state_ok: True, my_index: 835
rew: 10.0, state_ok: True, my_index: 763
rew: 10.0, state_ok: True, my_index: 707
rew: 10.0, state_ok: True, my_index: 359
rew: 10.0, state_ok: True, my_index: 9
rew: 10.0, state_ok: True, my_index: 559
rew: 10.0, state_ok: True, my_index: 629
rew: 10.0, state_ok: True, my_index: 192
rew: 10.0, state_ok: True, my_index: 835
rew: 10.0, state_ok: True, my_index: 763
rew: 10.0, state_ok: True, my_index: 707
rew: 10.0, state_ok: True, my_index: 359
rew: 10.0, state_ok: True, my_index: 9
rew: 10.0, state_ok: True, my_index: 559
rew: 10.0, state_ok: True, my_index: 629
rew: 10.0, state_ok: True, my_index: 192
rew: 10.0, state_ok: True, my_index: 835
rew: 10.0, state_ok: True, my_index: 763
rew: 10.0, state_ok: True, my_index: 707
rew: 10.0, state_ok: True, my_index: 359
rew: 10.0, state_ok: True, my_index: 9
rew: 10.0, state_ok: T

In [8]:
bc_trainer.train(n_epochs=1)
reward_after_training, _ = evaluate_policy(bc_trainer.policy, env, 10)
print(f"Reward after training: {reward_after_training}")

0batch [00:00, ?batch/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_gather)