### Getting Started

In [None]:
import os
import shutil

import numpy as np
import gymnasium as gym
from gymnasium.wrappers import RecordVideo, TimeLimit
from stable_baselines3 import PPO
from seals.util import AbsorbAfterDoneWrapper

In [None]:
# Single Acrobot environment used to train the PPO "expert".
# Wrapper stack, innermost first: base env -> AbsorbAfterDoneWrapper -> TimeLimit(512).
env_ppo = TimeLimit(
    AbsorbAfterDoneWrapper(
        gym.make("Acrobot-v1", render_mode="rgb_array"),
    ),
    max_episode_steps=512,
)

### Training "expert" policy

In [None]:
# Flip to True to record a video of every training episode of the expert.
make_videos = False
if make_videos:
    def wrap_around_video_recorder(env, folder):
        """Return ``env`` wrapped in a ``RecordVideo`` that writes into ``folder``.

        The episode trigger always returns True, so every episode is recorded.
        """
        def record_every_episode(episode_id):
            return True

        return RecordVideo(env, folder, episode_trigger=record_every_episode)

    # Recordings live under <cwd>/gym-results/oracle.
    output_folder = os.path.join(os.getcwd(), 'gym-results', 'oracle')
    os.makedirs(output_folder, exist_ok=True)

    # Start every run from an empty train/ sub-directory.
    train_output_folder = os.path.join(output_folder, 'train')
    shutil.rmtree(train_output_folder, ignore_errors=True)
    os.makedirs(train_output_folder, exist_ok=True)

    env_ppo = wrap_around_video_recorder(env_ppo, train_output_folder)

# Train the PPO expert, release the environment, then persist the model as
# 'my-model' (it is loaded back later in the notebook as "my-model.zip").
model = PPO("MlpPolicy", env_ppo, verbose=1, seed=0)
model.learn(total_timesteps=16384)
env_ppo.close()
model.save('my-model')

### Imitation Learning Using AIRL

In [None]:
# Seed reused throughout the AIRL cells: vec-env RNG, rollout sampling,
# learner initialisation and explicit `env_airl.seed(...)` calls.
SEED = 42

In [None]:
from imitation.util.util import make_vec_env
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.monitor import Monitor


# Directory shared by the Monitor CSVs, the learner's TensorBoard logs and AIRL's own logs.
airl_log_dir = './gym-results/airl/'


# Vectorised Acrobot environment (8 copies) used for expert rollouts and AIRL training.
# Each post-wrapper is called as wrapper(env, env_index) and they are applied in
# list order, so the final stack (innermost first) is:
#   base env -> AbsorbAfterDoneWrapper -> TimeLimit(512) -> Monitor -> RolloutInfoWrapper
env_airl = make_vec_env(
    'Acrobot-v1',
    rng=np.random.default_rng(SEED),
    n_envs=8,
    post_wrappers=[
        lambda env, _: AbsorbAfterDoneWrapper(env),
        lambda env, _: TimeLimit(env, max_episode_steps=512),
        # Give every sub-environment its own monitor file. Passing the bare
        # directory to all 8 copies made them resolve to the same CSV and
        # overwrite each other's episode statistics.
        lambda env, i: Monitor(
            env,
            filename=os.path.join(airl_log_dir, f'env{i:02d}'),
        ),
        lambda env, _: RolloutInfoWrapper(env),  # needed to compute rollouts
    ],
)
# Notebook display: show the list of wrapped sub-environments.
env_airl.envs

In [None]:
from imitation.policies.serialize import load_policy

# Reload the PPO expert saved earlier from disk and bind it to the AIRL vec-env.
# (Alternative kept from the original notebook: use the in-memory `model.policy`.)
expert = load_policy(
    "ppo",
    env_airl,
    path="my-model.zip",
)
# Notebook display of the loaded policy object.
expert

In [None]:
from imitation.data import rollout

# Expert demonstrations for AIRL: keep sampling from the vec-env until at
# least 60 complete episodes have been collected.
rollouts = rollout.rollout(
    policy=expert,
    venv=env_airl,
    sample_until=rollout.make_sample_until(min_episodes=60),
    rng=np.random.default_rng(SEED),
)

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

# Generator policy for AIRL: a PPO agent acting in the vectorised environment.
learner = PPO(
    policy=MlpPolicy,
    env=env_airl,
    # Optimisation hyperparameters.
    learning_rate=0.0005,
    batch_size=64,
    n_epochs=5,
    # Discounting and loss shaping.
    gamma=0.95,
    clip_range=0.1,
    ent_coef=0.0,
    vf_coef=0.1,
    # Reproducibility and logging.
    seed=SEED,
    tensorboard_log=airl_log_dir,
)
# NOTE (from the original author): this PPO is the same algorithm as "PPO2"
# in the older stable-baselines library.
# Background on vectorised environments / parallelism:
# https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html

In [None]:
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util.networks import RunningNorm

# Reward network trained by AIRL; spaces are taken from the vec-env and
# RunningNorm is used as the input normalisation layer.
reward_net = BasicShapedRewardNet(
    env_airl.observation_space,
    env_airl.action_space,
    normalize_input_layer=RunningNorm,
)

In [None]:
from imitation.algorithms.adversarial.airl import AIRL

# Wire the expert demonstrations, the PPO generator and the reward network
# into a single adversarial trainer.
airl_trainer = AIRL(
    # Environment and the two networks being trained against each other.
    venv=env_airl,
    gen_algo=learner,
    reward_net=reward_net,
    # Expert data and discriminator schedule.
    demonstrations=rollouts,
    demo_batch_size=2048,
    n_disc_updates_per_round=16,
    gen_replay_buffer_capacity=512,
    # Logging destination (shared with the monitor / TensorBoard output).
    log_dir=airl_log_dir,
)

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

# Baseline: collect at least 100 episodes from the untrained learner.
# The vec-env is re-seeded first so the before/after comparison uses the same seed.
env_airl.seed(SEED)
learner_rollouts_before_training = rollout.rollout(
    policy=learner,
    venv=env_airl,
    sample_until=rollout.make_sample_until(min_episodes=100),
    rng=np.random.default_rng(SEED),
)
# Alternative left disabled by the original author: per-episode rewards via
#   learner_rewards_before_training, _ = evaluate_policy(
#       learner, env_airl, n_eval_episodes=100,
#       return_episode_rewards=True, warn=True,
#   )

In [None]:
# Total number of AIRL training timesteps. Note that the PPO expert earlier in
# this notebook was trained for only 16384 timesteps.
N_STEPS = 2_000_000
airl_trainer.train(N_STEPS)

# Post-training evaluation: same seed and episode budget as the baseline
# collected before training.
env_airl.seed(SEED)
learner_rollouts_after_training = rollout.rollout(
    policy=learner,
    venv=env_airl,
    sample_until=rollout.make_sample_until(min_episodes=100),
    rng=np.random.default_rng(SEED),
)
# Alternative left disabled by the original author: per-episode rewards via
#   learner_rewards_after_training, _ = evaluate_policy(
#       learner, env_airl, n_eval_episodes=100,
#       return_episode_rewards=True,
#   )

In [None]:
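# NOTE: `learner_rewards_after_training` and `learner_rewards_before_training`
# are only defined if the commented-out `evaluate_policy(...)` calls in the
# two cells above are re-enabled; as written, un-commenting these prints
# alone raises NameError.
# print("mean reward after training:", np.mean(learner_rewards_after_training))
# print("mean reward before training:", np.mean(learner_rewards_before_training))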

### Imitation Learning Using Our Improvement