In [None]:
import os
import shutil
from functools import partial

import numpy as np
import gymnasium as gym
from gymnasium import Wrapper, Env
from gymnasium.wrappers import RecordVideo, TimeLimit
from stable_baselines3 import PPO
from seals.util import AbsorbAfterDoneWrapper

In [None]:
# Expert-training environment: Acrobot rendered to RGB arrays, wrapped by
# AbsorbAfterDoneWrapper and then by a 512-step TimeLimit (outermost).
envPPO = TimeLimit(
    AbsorbAfterDoneWrapper(
        gym.make("Acrobot-v1", render_mode="rgb_array")
    ),
    max_episode_steps=512,
)

In [None]:
# Set to True to record a video of every training episode.
make_vid = False


def wrap_around_video_recorder(env, folder):
    """Wrap ``env`` in a ``RecordVideo`` that records every episode.

    Args:
        env: The environment to record.
        folder: Directory passed to ``RecordVideo`` as the video output folder.

    Returns:
        The ``RecordVideo``-wrapped environment.
    """
    return RecordVideo(
        env, folder,
        episode_trigger=(lambda episode_id: True),
    )


if make_vid:
    # Initialize video recording output directory
    output_folder = os.path.join(os.getcwd(), 'gym-results', 'oracle')
    os.makedirs(output_folder, exist_ok=True)

    # Start from an empty 'train' directory so old videos do not linger.
    train_output_folder = os.path.join(output_folder, 'train')
    shutil.rmtree(train_output_folder, ignore_errors=True)
    os.makedirs(train_output_folder, exist_ok=True)

    # Wrap the training environment itself. The previous code wrapped an
    # undefined name `env` (NameError) and never rebound `envPPO`, so the
    # policy would have been trained on the unrecorded environment.
    envPPO = wrap_around_video_recorder(
        envPPO,
        train_output_folder,
    )

# Train a policy: build the PPO agent first, then run the optimisation.
model = PPO("MlpPolicy", envPPO, verbose=1, seed=0)
model.learn(total_timesteps=16384)

# Training is done: close the environment, then write the agent to disk
# (the AIRL section later loads it from "my-model.zip").
envPPO.close()
model.save('my-model')

We now generate an oracle dataset using the trained policy model.

In [None]:
# # Generate oracle data using the trained policy
# test_output_folder = os.path.join(output_folder, 'test')
# shutil.rmtree(test_output_folder, ignore_errors=True)
# os.makedirs(test_output_folder, exist_ok=True)

# env = wrap_around_video_recorder(
#     get_env(),
#     test_output_folder,
# )

# trajectories = []  # trajectories is a list of lists
# seed = 1
# trajectories.append([])
# obs, info = env.reset(seed=seed)
# T = 10000
# for i in range(T):
#     trajectories[-1].append(obs)
#     action, _state = model.predict(obs, deterministic=True)
#     trajectories[-1].append(action)
#     obs, reward, done, truncated, info = env.step(action)
#     trajectories[-1].append(reward)
#     if done or truncated:
#         if i + 1 == T:
#             # will close outside the loop, no need to reset
#             continue
#         trajectories[-1].append(obs)
#         seed += 1  # try a new seed
#         trajectories.append([])
#         obs, info = env.reset(seed=seed)
# trajectories[-1].append(obs)
# env.close()

In [None]:
# data_output_folder = os.path.join(output_folder, 'data')
# shutil.rmtree(data_output_folder, ignore_errors=True)
# os.makedirs(data_output_folder, exist_ok=True)

# for e_idx, trajectory in enumerate(trajectories):
#     assert len(trajectory) % 3 == 1
#     states, actions, rewards = [], [], []
#     for idx, elem in enumerate(trajectory):
#         if idx % 3 == 0:    # state
#             states.append(elem)
#         elif idx % 3 == 1:  # action
#             actions.append(elem)
#         else:               # reward
#             rewards.append(elem)
#     data = {
#         'states': np.stack(states),
#         'actions': np.stack(actions),
#         'rewards': np.array(rewards),
#     }
#     filename = f'episode-{e_idx}.npz'
#     filepath = os.path.join(data_output_folder, filename)
#     np.savez_compressed(filepath, **data)


### Imitation Learning Using AIRL

In [None]:
# model.save('my-model')

# from stable_baselines3.ppo import MlpPolicy
# newmodel = PPO.load('my-model.zip')
# type(newmodel)

In [None]:
#from stable_baselines3.common.evaluation import evaluate_policy

#from imitation.algorithms.adversarial.airl import AIRL
#from imitation.data import rollout
#from imitation.data.wrappers import RolloutInfoWrapper
#from imitation.policies.serialize import load_policy
#from imitation.rewards.reward_nets import BasicShapedRewardNet
#from imitation.util.networks import RunningNorm
#from imitation.util.util import make_vec_env

In [None]:
# Seed reused below for the vectorized environment's RNG, the rollout RNG
# and the learner PPO, so the imitation-learning run is repeatable.
SEED = 42
#numEnvs =8

In [None]:
from imitation.util.util import make_vec_env
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.monitor import Monitor

# Vectorized Acrobot environment (8 copies) used for the expert rollouts and
# for AIRL. Each post-wrapper receives the environment plus a second argument
# that is ignored here.
# NOTE(review): make_vec_env may already add its own Monitor; confirm the
# extra Monitor at the end of the list is intended.
envAIL = make_vec_env(
    'Acrobot-v1',
    n_envs=8,
    rng=np.random.default_rng(SEED),
    post_wrappers=[
        lambda env, _idx: AbsorbAfterDoneWrapper(env),
        lambda env, _idx: TimeLimit(env, max_episode_steps=512),
        lambda env, _idx: RolloutInfoWrapper(env),  # to compute rollouts
        lambda env, _idx: Monitor(env),
    ],
)

In [None]:
#import gym
#from stable_baselines3.common.env_util import make_vec_env
#from stable_baselines3.common.vec_env import VecNormalize

# SEED and n_envs are assumed to be predefined
#env_id = "Acrobot-v1"
#env = make_vec_env(env_id, n_envs=numEnvs, seed=SEED)
#env = VecNormalize(env)
#env = AbsorbAfterDoneWrapper(env)
#env = TimeLimit(env, max_episode_steps=512)

In [None]:
from imitation.policies.serialize import load_policy

# Reload the PPO agent saved earlier as the expert policy.
# (In-memory alternative: expert = model.policy)
expert = load_policy("ppo", venv=envAIL, path="my-model.zip")
expert

In [None]:
from imitation.data import rollout

# Collect expert demonstrations: keep sampling until at least 60 episodes.
sample_until = rollout.make_sample_until(min_episodes=60)
rollouts = rollout.rollout(
    expert,
    envAIL,
    sample_until,
    rng=np.random.default_rng(SEED),
)

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

# Generator policy for AIRL: a fresh PPO agent on the vectorized environment.
learner = PPO(
    policy=MlpPolicy,
    env=envAIL,
    seed=SEED,
    # optimisation settings
    learning_rate=0.0005,
    batch_size=64,
    n_epochs=5,
    # objective settings
    gamma=0.95,
    clip_range=0.1,
    ent_coef=0.0,
    vf_coef=0.1,
)
#same as PPO2
#multi agent learning, parallelism: https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html

In [None]:
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util.networks import RunningNorm

# Reward network trained by AIRL, sized from the environment's spaces and
# using RunningNorm as its input-normalisation layer.
obs_space = envAIL.observation_space
act_space = envAIL.action_space
reward_net = BasicShapedRewardNet(
    observation_space=obs_space,
    action_space=act_space,
    normalize_input_layer=RunningNorm,
)

In [None]:
from imitation.algorithms.adversarial.airl import AIRL

# Tie together the expert demonstrations, the generator (learner) and the
# reward network into a single AIRL trainer.
airl_trainer = AIRL(
    venv=envAIL,
    gen_algo=learner,
    reward_net=reward_net,
    demonstrations=rollouts,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
)

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

# Baseline: evaluate the learner over 100 episodes before AIRL training and
# keep the per-episode rewards; the second return value is discarded.
#envAIL.seed(SEED)
learner_rewards_before_training, _ = evaluate_policy(
    model=learner,
    env=envAIL,
    n_eval_episodes=100,
    return_episode_rewards=True,
    warn=True,
)


In [None]:
# 20000 steps here; train for 2_000_000 steps to match expert.
airl_trainer.train(20000)

# Re-evaluate the learner over 100 episodes after training.
#envAIL.seed(SEED)
learner_rewards_after_training, _ = evaluate_policy(
    learner,
    envAIL,
    n_eval_episodes=100,
    return_episode_rewards=True,
)
# TODO: check whether evaluate_policy has a deterministic=True option


In [None]:
# Compare the learner's mean episode reward after and before AIRL training.
mean_reward_after = np.mean(learner_rewards_after_training)
mean_reward_before = np.mean(learner_rewards_before_training)
print("mean reward after training:", mean_reward_after)
print("mean reward before training:", mean_reward_before)

In [None]:
#EvalCallback for time series of learning curve
#RL evaluation source: https://arxiv.org/abs/1709.06560