# Test MiniGrid sequential tasks with PPO + EWC

Default Jupyter settings

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
#%load_ext autoreload
#%autoreload 2
%reload_ext autoreload
%autoreload 2

Import libraries

In [2]:
import sys, os, time
import numpy as np
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
from datetime import date

import pickle
import torch
from arguments_rl import get_args

from collections import deque
from rl_module.a2c_ppo_acktr.envs import make_vec_envs
from rl_module.a2c_ppo_acktr.storage import RolloutStorage
from rl_module.train_ppo import train_ppo
from stable_baselines3.common.utils import set_random_seed
from torch_ac.utils import DictList

# Gym MiniGrid specific
import gym
from gym import spaces
from gym.wrappers import Monitor
import gym_minigrid
from gym_minigrid.wrappers import FlatObsWrapper, ImgObsWrapper, RGBImgPartialObsWrapper, RGBImgObsWrapper
from gym.wrappers import Monitor

  from .autonotebook import tqdm as notebook_tqdm


Define experiment arguments

In [3]:
# Experiment configuration. The values are wrapped in a DictList so that the
# rest of the notebook can read and write them as attributes (args.lr, ...).
# Trailing "alt:" notes list the alternative values the original author left
# next to a setting.
args = DictList({
    'algo': 'ppo',
    'experiment': 'minigrid-ewc-doorkey-wallgap-lavagap',
    'approach': 'ewc',          # alt: 'fine-tuning', 'blip'
    'optimizer': 'Adam',        # alt: 'RMSProp'
    'gail': False,
    'gail_experts_dir': './gail_experts',
    'gail_batch_size': 128,
    'gail_epoch': 5,
    'lr': 2.5e-4,               # alt: 7e-4, 1e-4
    'eps': 1e-8,                # alt: 1e-5
    'gamma': 0.99,
    'use_gae': True,
    'gae_lambda': 0.95,         # alt: 0.99
    'entropy_coef': 0.01,
    'value_loss_coef': 0.5,
    'max_grad_norm': 0.5,
    'seed': 1,
    'cuda_deterministic': False,
    'num_processes': 16,
    'num_steps': 128,           # alt: 5
    'ppo_epoch': 4,
    'num_mini_batch': 256,      # alt: 8, 32
    'clip_param': 0.2,          # alt: 0.1
    'log_interval': 10,
    'save_interval': 10,
    'eval_interval': 100,
    'num_env_steps': 5e5,
    'env_name': 'PongNoFrameskip-v4',
    'log_dir': './logs/',
    'save_dir': './trained_models/',
    'no_cuda': True,
    'use_proper_time_limits': False,
    'recurrent_policy': False,
    'use_linear_lr_decay': False,
    'ewc_lambda': 5000.0,
    'ewc_online': True,         # alt: False
    'ewc_epochs': 100,
    'num_ewc_steps': 20,
    'save_name': None,
    'date': date.today(),
    'task_id': None,
    'single_task': False,
    'F_prior': 1e-15,
    'input_padding': False,
    'sample': False,
    'samples': 1,
    # render arguments
    'render_ckpt_path': '',
    'render_task_idx': 0,
    'num_render_traj': 1000,
})

# CUDA is used only when it is not disabled by `no_cuda` and a device exists.
args.cuda = False if args.no_cuda else torch.cuda.is_available()

# Sanity checks on the chosen algorithm.
assert args.algo in ('a2c', 'ppo', 'acktr')
assert not args.recurrent_policy or args.algo in ('a2c', 'ppo'), \
    'Recurrent policy is not implemented for ACKTR'

# Experiments whose log name gets a '_conv' suffix further down.
conv_experiment = ['atari']

print(args)

{'algo': 'ppo', 'experiment': 'minigrid-ewc-doorkey-wallgap-lavagap', 'approach': 'ewc', 'optimizer': 'Adam', 'gail': False, 'gail_experts_dir': './gail_experts', 'gail_batch_size': 128, 'gail_epoch': 5, 'lr': 0.00025, 'eps': 1e-08, 'gamma': 0.99, 'use_gae': True, 'gae_lambda': 0.95, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'seed': 1, 'cuda_deterministic': False, 'num_processes': 16, 'num_steps': 128, 'ppo_epoch': 4, 'num_mini_batch': 256, 'clip_param': 0.2, 'log_interval': 10, 'save_interval': 10, 'eval_interval': 100, 'num_env_steps': 500000.0, 'env_name': 'PongNoFrameskip-v4', 'log_dir': './logs/', 'save_dir': './trained_models/', 'no_cuda': True, 'use_proper_time_limits': False, 'recurrent_policy': False, 'use_linear_lr_decay': False, 'ewc_lambda': 5000.0, 'ewc_online': True, 'ewc_epochs': 100, 'num_ewc_steps': 20, 'save_name': None, 'date': datetime.date(2023, 1, 15), 'task_id': None, 'single_task': False, 'F_prior': 1e-15, 'input_padding': False, 'sampl

In [40]:
# Build the run/log name. It encodes date, experiment, approach and seed, plus
# the approach-specific hyperparameter (EWC lambda or BLIP Fisher prior).
if args.approach in ('fine-tuning', 'ft-fix'):
    log_name = '{}_{}_{}_{}'.format(args.date, args.experiment, args.approach, args.seed)
elif args.approach == 'ewc':
    log_name = '{}_{}_{}_{}_lamb_{}'.format(args.date, args.experiment, args.approach, args.seed, args.ewc_lambda)
elif args.approach == 'blip':
    log_name = '{}_{}_{}_{}_F_prior_{}'.format(args.date, args.experiment, args.approach, args.seed, args.F_prior)
else:
    # Fail here with a clear message instead of a NameError on `log_name` or
    # `agent` in a later cell.
    raise ValueError('Unknown approach: {!r}'.format(args.approach))

if args.experiment in conv_experiment:
    log_name = log_name + '_conv'

# Seed
set_random_seed(args.seed)

# Inits
if args.cuda:
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    torch.set_default_tensor_type('torch.FloatTensor')

device = torch.device("cuda" if args.cuda else "cpu")

# Atari alternative, kept for reference:
#taskcla = [(0,14), (1,18), (2,18), (3,18), (4,18), (5,6)]
#task_sequences = [(0, 'KungFuMasterNoFrameskip-v4'), (1, 'BoxingNoFrameskip-v4'), (2, 'JamesbondNoFrameskip-v4'), (3, 'KrullNoFrameskip-v4'), (4, 'RiverraidNoFrameskip-v4'), (5, 'SpaceInvadersNoFrameskip-v4')]

# One (task index, n) pair per task -- presumably n is the size of that task's
# action head; confirm against rl_module.ppo_model.
taskcla = [(0,7), (1,7), (2,7)]
# Tasks are trained in this order: (task index, gym environment id).
task_sequences = [
    (0, 'MiniGrid-DoorKey-6x6-v0'),
    (1, 'MiniGrid-WallGapS6-v0'),
    (2, 'MiniGrid-LavaGapS6-v0')
    ]

# hard coded for atari environment
#obs_shape = (4,84,84)

# Hard coded for FlatObsWrapper MiniGrid environments.
# NOTE(review): assumed to equal the flattened observation size produced by
# FlatObsWrapper -- verify if the wrapper or environments change.
obs_shape = (2739,)

# Policy network: BLIP needs the quantised policy, everything else uses the
# plain one. Imports stay local so only the selected module is loaded.
if args.approach == 'blip':
    from rl_module.ppo_model import QPolicy
    print('using fisher prior of: ', args.F_prior)
    actor_critic = QPolicy(obs_shape,
        taskcla,
        base_kwargs={'F_prior': args.F_prior, 'recurrent': args.recurrent_policy}).to(device)
else:
    from rl_module.ppo_model import Policy
    actor_critic = Policy(obs_shape,
        taskcla,
        base_kwargs={'recurrent': args.recurrent_policy}).to(device)

# Args -- Approach
# Constructor arguments shared by every PPO variant below.
ppo_args = (
    actor_critic,
    args.clip_param,
    args.ppo_epoch,
    args.num_mini_batch,
    args.value_loss_coef,
    args.entropy_coef,
)
ppo_kwargs = dict(
    lr=args.lr,
    eps=args.eps,
    max_grad_norm=args.max_grad_norm,
    use_clipped_value_loss=True,
    optimizer=args.optimizer,
)

if args.approach in ('fine-tuning', 'ft-fix'):
    from rl_module.ppo import PPO as approach

    agent = approach(*ppo_args, **ppo_kwargs)
elif args.approach == 'ewc':
    from rl_module.ppo_ewc import PPO_EWC as approach

    # EWC additionally takes the penalty weight and the online/offline switch.
    agent = approach(*ppo_args,
                     ewc_lambda=args.ewc_lambda,
                     online=args.ewc_online,
                     **ppo_kwargs)
elif args.approach == 'blip':
    from rl_module.ppo_blip import PPO_BLIP as approach

    agent = approach(*ppo_args, **ppo_kwargs)

ewc_lambda :  5000.0


In [7]:
# Reward histories handed to train_ppo (presumably filled in there -- confirm
# in rl_module.train_ppo). The test history is keyed by statistic, then by
# 'task<idx>'.
tr_reward_arr = []
te_reward_arr = {
    _type: {'task' + str(idx): [] for idx in range(len(taskcla))}
    for _type in ('mean', 'max', 'min')
}

# Number of PPO updates per task; identical for every task, so compute it once.
num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

# Train the tasks one after another with the same actor_critic / agent.
for task_idx, env_name in task_sequences:
    print(env_name)
    # renew optimizer
    agent.renew_optimizer()

    # FlatObsWrapper for MiniGrid
    envs = make_vec_envs(env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, wrapper_class=FlatObsWrapper)
    try:
        obs = envs.reset()

        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  obs_shape, envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

        rollouts.obs[0].copy_(obs)
        rollouts.to(device)

        episode_rewards = deque(maxlen=10)

        train_ppo(actor_critic, agent, rollouts, task_idx, env_name, task_sequences, envs,  obs_shape, args, episode_rewards, tr_reward_arr, te_reward_arr, num_updates, log_name, device, wrapper_class=FlatObsWrapper)

        # Approach-specific post-processing after finishing a task.
        if args.approach == 'ft-fix':
            # fix the backbone
            for param in actor_critic.features.parameters():
                param.requires_grad = False
        elif args.approach == 'ewc':
            agent.update_fisher(rollouts, task_idx)
        elif args.approach == 'blip':
            agent.ng_post_processing(rollouts, task_idx)
            # save the model here so that bit allocation is saved
            save_path = os.path.join(args.save_dir, args.algo)
            # torch.save does not create missing directories.
            os.makedirs(save_path, exist_ok=True)
            torch.save(actor_critic.state_dict(),
                os.path.join(save_path, log_name + '_task_' + str(task_idx) + ".pth"))
    finally:
        # Always release the vectorized environments, even when training or
        # post-processing raises.
        envs.close()

    # single_task only applies to the fine-tuning variants: stop after the
    # first task.
    if args.single_task and args.approach in ('fine-tuning', 'ft-fix'):
        break

MiniGrid-DoorKey-6x6-v0
Task 0: Evaluation using 10 episodes: mean reward 0.00000 

len task_sequences :  3


  4%|▍         | 10/244 [00:34<13:40,  3.50s/it]

Updates 9, num timesteps 20480, FPS 585 
 Last 10 training episodes: mean/median reward 0.1/0.0, min/max reward 0.0/0.7



  8%|▊         | 20/244 [01:08<12:26,  3.33s/it]

Updates 19, num timesteps 40960, FPS 600 
 Last 10 training episodes: mean/median reward 0.0/0.0, min/max reward 0.0/0.0



 12%|█▏        | 30/244 [01:41<11:52,  3.33s/it]

Updates 29, num timesteps 61440, FPS 605 
 Last 10 training episodes: mean/median reward 0.3/0.2, min/max reward 0.0/0.9



 16%|█▋        | 40/244 [02:14<11:18,  3.33s/it]

Updates 39, num timesteps 81920, FPS 607 
 Last 10 training episodes: mean/median reward 0.7/0.7, min/max reward 0.4/0.9



 20%|██        | 50/244 [02:48<10:52,  3.36s/it]

Updates 49, num timesteps 102400, FPS 608 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.8/1.0



 25%|██▍       | 60/244 [03:21<10:19,  3.37s/it]

Updates 59, num timesteps 122880, FPS 608 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 29%|██▊       | 70/244 [03:55<09:50,  3.39s/it]

Updates 69, num timesteps 143360, FPS 607 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 33%|███▎      | 80/244 [04:29<09:12,  3.37s/it]

Updates 79, num timesteps 163840, FPS 607 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 37%|███▋      | 90/244 [05:03<08:36,  3.36s/it]

Updates 89, num timesteps 184320, FPS 608 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 41%|████      | 99/244 [05:33<08:29,  3.51s/it]

Updates 99, num timesteps 204800, FPS 606 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 41%|████      | 100/244 [05:43<12:34,  5.24s/it]

Task 0: Evaluation using 10 episodes: mean reward 0.96675 

len task_sequences :  3


 45%|████▌     | 110/244 [06:21<08:01,  3.59s/it]

Updates 109, num timesteps 225280, FPS 591 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 49%|████▉     | 120/244 [06:54<06:56,  3.36s/it]

Updates 119, num timesteps 245760, FPS 592 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 53%|█████▎    | 130/244 [07:31<07:19,  3.86s/it]

Updates 129, num timesteps 266240, FPS 589 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 57%|█████▋    | 140/244 [08:05<05:51,  3.38s/it]

Updates 139, num timesteps 286720, FPS 590 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 61%|██████▏   | 150/244 [08:39<05:15,  3.36s/it]

Updates 149, num timesteps 307200, FPS 591 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 66%|██████▌   | 160/244 [09:13<04:50,  3.46s/it]

Updates 159, num timesteps 327680, FPS 591 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 70%|██████▉   | 170/244 [09:50<04:48,  3.90s/it]

Updates 169, num timesteps 348160, FPS 589 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 74%|███████▍  | 180/244 [10:26<03:40,  3.44s/it]

Updates 179, num timesteps 368640, FPS 588 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 78%|███████▊  | 190/244 [11:00<03:04,  3.42s/it]

Updates 189, num timesteps 389120, FPS 588 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 82%|████████▏ | 199/244 [11:31<02:34,  3.42s/it]

Updates 199, num timesteps 409600, FPS 589 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 82%|████████▏ | 200/244 [11:40<03:36,  4.91s/it]

Task 0: Evaluation using 10 episodes: mean reward 0.96475 

len task_sequences :  3


 86%|████████▌ | 210/244 [12:14<01:57,  3.45s/it]

Updates 209, num timesteps 430080, FPS 585 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 90%|█████████ | 220/244 [12:50<01:28,  3.71s/it]

Updates 219, num timesteps 450560, FPS 585 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 94%|█████████▍| 230/244 [13:27<00:52,  3.75s/it]

Updates 229, num timesteps 471040, FPS 583 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 98%|█████████▊| 240/244 [14:04<00:14,  3.65s/it]

Updates 239, num timesteps 491520, FPS 582 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



100%|██████████| 244/244 [14:18<00:00,  3.52s/it]

MiniGrid-WallGapS6-v0





Task 0: Evaluation using 10 episodes: mean reward 0.96375 

Task 1: Evaluation using 10 episodes: mean reward 0.00000 

len task_sequences :  3


  4%|▍         | 10/244 [01:03<24:59,  6.41s/it]

Updates 9, num timesteps 20480, FPS 324 
 Last 10 training episodes: mean/median reward 0.3/0.2, min/max reward 0.0/0.9



  8%|▊         | 20/244 [02:05<23:08,  6.20s/it]

Updates 19, num timesteps 40960, FPS 326 
 Last 10 training episodes: mean/median reward 0.7/0.7, min/max reward 0.3/0.9



 12%|█▏        | 30/244 [03:06<21:24,  6.00s/it]

Updates 29, num timesteps 61440, FPS 329 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 16%|█▋        | 40/244 [04:10<21:35,  6.35s/it]

Updates 39, num timesteps 81920, FPS 327 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 20%|██        | 50/244 [05:09<19:26,  6.01s/it]

Updates 49, num timesteps 102400, FPS 330 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 25%|██▍       | 60/244 [06:09<18:31,  6.04s/it]

Updates 59, num timesteps 122880, FPS 332 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 29%|██▊       | 70/244 [07:12<17:36,  6.07s/it]

Updates 69, num timesteps 143360, FPS 331 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 33%|███▎      | 80/244 [08:11<16:19,  5.97s/it]

Updates 79, num timesteps 163840, FPS 333 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 37%|███▋      | 90/244 [09:15<16:06,  6.28s/it]

Updates 89, num timesteps 184320, FPS 332 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 41%|████      | 99/244 [10:20<17:49,  7.37s/it]

Updates 99, num timesteps 204800, FPS 326 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0

Task 0: Evaluation using 10 episodes: mean reward 0.96275 



 41%|████      | 100/244 [10:40<27:04, 11.28s/it]

Task 1: Evaluation using 10 episodes: mean reward 0.94250 

len task_sequences :  3


 45%|████▌     | 110/244 [11:53<16:13,  7.26s/it]

Updates 109, num timesteps 225280, FPS 315 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 49%|████▉     | 120/244 [13:03<14:10,  6.86s/it]

Updates 119, num timesteps 245760, FPS 313 
 Last 10 training episodes: mean/median reward 1.0/0.9, min/max reward 0.9/1.0



 53%|█████▎    | 130/244 [14:04<11:47,  6.21s/it]

Updates 129, num timesteps 266240, FPS 315 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/0.9



 57%|█████▋    | 140/244 [15:06<10:45,  6.21s/it]

Updates 139, num timesteps 286720, FPS 316 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 61%|██████▏   | 150/244 [16:07<09:21,  5.97s/it]

Updates 149, num timesteps 307200, FPS 317 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 66%|██████▌   | 160/244 [17:06<08:19,  5.94s/it]

Updates 159, num timesteps 327680, FPS 319 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 70%|██████▉   | 170/244 [18:21<09:07,  7.40s/it]

Updates 169, num timesteps 348160, FPS 315 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 74%|███████▍  | 180/244 [19:29<07:18,  6.85s/it]

Updates 179, num timesteps 368640, FPS 315 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 78%|███████▊  | 190/244 [20:37<06:24,  7.11s/it]

Updates 189, num timesteps 389120, FPS 314 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 82%|████████▏ | 199/244 [21:34<04:54,  6.54s/it]

Updates 199, num timesteps 409600, FPS 314 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.7/1.0

Task 0: Evaluation using 10 episodes: mean reward 0.96400 



 82%|████████▏ | 200/244 [21:53<07:23, 10.07s/it]

Task 1: Evaluation using 10 episodes: mean reward 0.90813 

len task_sequences :  3


 86%|████████▌ | 210/244 [22:58<03:39,  6.47s/it]

Updates 209, num timesteps 430080, FPS 312 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 90%|█████████ | 220/244 [24:05<02:34,  6.44s/it]

Updates 219, num timesteps 450560, FPS 311 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 94%|█████████▍| 230/244 [25:07<01:31,  6.52s/it]

Updates 229, num timesteps 471040, FPS 312 
 Last 10 training episodes: mean/median reward 1.0/0.9, min/max reward 0.9/1.0



 98%|█████████▊| 240/244 [26:14<00:27,  6.83s/it]

Updates 239, num timesteps 491520, FPS 312 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/0.9



100%|██████████| 244/244 [26:40<00:00,  6.56s/it]


MiniGrid-LavaGapS6-v0
Task 0: Evaluation using 10 episodes: mean reward 0.96650 

Task 1: Evaluation using 10 episodes: mean reward 0.95250 

Task 2: Evaluation using 10 episodes: mean reward 0.00000 

len task_sequences :  3


  4%|▍         | 10/244 [01:06<25:45,  6.60s/it]

Updates 9, num timesteps 20480, FPS 309 
 Last 10 training episodes: mean/median reward 0.0/0.0, min/max reward 0.0/0.0



  8%|▊         | 20/244 [02:13<24:44,  6.63s/it]

Updates 19, num timesteps 40960, FPS 307 
 Last 10 training episodes: mean/median reward 0.2/0.0, min/max reward 0.0/0.9



 12%|█▏        | 30/244 [03:22<24:46,  6.95s/it]

Updates 29, num timesteps 61440, FPS 304 
 Last 10 training episodes: mean/median reward 0.3/0.0, min/max reward 0.0/0.9



 16%|█▋        | 40/244 [04:29<22:45,  6.69s/it]

Updates 39, num timesteps 81920, FPS 303 
 Last 10 training episodes: mean/median reward 0.1/0.0, min/max reward 0.0/0.9



 20%|██        | 50/244 [05:37<22:33,  6.98s/it]

Updates 49, num timesteps 102400, FPS 303 
 Last 10 training episodes: mean/median reward 0.3/0.0, min/max reward 0.0/0.9



 25%|██▍       | 60/244 [06:45<20:04,  6.55s/it]

Updates 59, num timesteps 122880, FPS 303 
 Last 10 training episodes: mean/median reward 0.0/0.0, min/max reward 0.0/0.0



 29%|██▊       | 70/244 [07:49<18:00,  6.21s/it]

Updates 69, num timesteps 143360, FPS 305 
 Last 10 training episodes: mean/median reward 0.2/0.0, min/max reward 0.0/0.9



 33%|███▎      | 80/244 [08:50<16:34,  6.06s/it]

Updates 79, num timesteps 163840, FPS 308 
 Last 10 training episodes: mean/median reward 0.1/0.0, min/max reward 0.0/0.9



 37%|███▋      | 90/244 [09:51<15:36,  6.08s/it]

Updates 89, num timesteps 184320, FPS 311 
 Last 10 training episodes: mean/median reward 0.1/0.0, min/max reward 0.0/0.9



 41%|████      | 99/244 [10:45<14:39,  6.06s/it]

Updates 99, num timesteps 204800, FPS 314 
 Last 10 training episodes: mean/median reward 0.2/0.0, min/max reward 0.0/0.9

Task 0: Evaluation using 10 episodes: mean reward 0.96525 

Task 1: Evaluation using 10 episodes: mean reward 0.95062 



 41%|████      | 100/244 [11:07<26:05, 10.87s/it]

Task 2: Evaluation using 10 episodes: mean reward 0.19000 

len task_sequences :  3


 45%|████▌     | 110/244 [12:08<13:52,  6.22s/it]

Updates 109, num timesteps 225280, FPS 309 
 Last 10 training episodes: mean/median reward 0.3/0.0, min/max reward 0.0/0.9



 49%|████▉     | 120/244 [13:11<12:57,  6.27s/it]

Updates 119, num timesteps 245760, FPS 310 
 Last 10 training episodes: mean/median reward 0.3/0.0, min/max reward 0.0/0.9



 53%|█████▎    | 130/244 [14:15<12:07,  6.38s/it]

Updates 129, num timesteps 266240, FPS 311 
 Last 10 training episodes: mean/median reward 0.3/0.0, min/max reward 0.0/0.9



 57%|█████▋    | 140/244 [15:21<11:38,  6.72s/it]

Updates 139, num timesteps 286720, FPS 311 
 Last 10 training episodes: mean/median reward 0.4/0.3, min/max reward 0.0/0.9



 61%|██████▏   | 150/244 [16:22<09:39,  6.17s/it]

Updates 149, num timesteps 307200, FPS 312 
 Last 10 training episodes: mean/median reward 0.2/0.0, min/max reward 0.0/0.8



 66%|██████▌   | 160/244 [17:25<08:49,  6.31s/it]

Updates 159, num timesteps 327680, FPS 313 
 Last 10 training episodes: mean/median reward 0.3/0.0, min/max reward 0.0/0.9



 70%|██████▉   | 170/244 [18:31<08:00,  6.49s/it]

Updates 169, num timesteps 348160, FPS 313 
 Last 10 training episodes: mean/median reward 0.3/0.0, min/max reward 0.0/0.9



 74%|███████▍  | 180/244 [19:36<06:43,  6.30s/it]

Updates 179, num timesteps 368640, FPS 313 
 Last 10 training episodes: mean/median reward 0.4/0.4, min/max reward 0.0/0.9



 78%|███████▊  | 190/244 [20:40<05:37,  6.25s/it]

Updates 189, num timesteps 389120, FPS 313 
 Last 10 training episodes: mean/median reward 0.7/0.9, min/max reward 0.0/0.9



 82%|████████▏ | 199/244 [21:35<04:37,  6.16s/it]

Updates 199, num timesteps 409600, FPS 314 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/0.9

Task 0: Evaluation using 10 episodes: mean reward 0.96425 

Task 1: Evaluation using 10 episodes: mean reward 0.93813 



 82%|████████▏ | 200/244 [21:59<08:22, 11.43s/it]

Task 2: Evaluation using 10 episodes: mean reward 0.93312 

len task_sequences :  3


 86%|████████▌ | 210/244 [23:02<03:36,  6.36s/it]

Updates 209, num timesteps 430080, FPS 311 
 Last 10 training episodes: mean/median reward 0.8/0.9, min/max reward 0.0/1.0



 90%|█████████ | 220/244 [24:06<02:32,  6.34s/it]

Updates 219, num timesteps 450560, FPS 311 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/0.9



 94%|█████████▍| 230/244 [25:08<01:26,  6.16s/it]

Updates 229, num timesteps 471040, FPS 312 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 98%|█████████▊| 240/244 [26:11<00:25,  6.40s/it]

Updates 239, num timesteps 491520, FPS 312 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



100%|██████████| 244/244 [26:37<00:00,  6.55s/it]


Evaluate

In [11]:
from rl_module.evaluation import evaluate

# Evaluate with a dedicated seed. It is kept in a local variable so that
# args.seed (used for the log name and for training seeding) is not changed
# as a side effect of running this cell.
eval_seed = 2
print(task_sequences)
# Index of the last task in the sequence.
task_idx = task_sequences[-1][0]
# No value is supplied for evaluate's `ob_rms` argument
# (presumably observation-normalisation statistics -- confirm in
# rl_module.evaluation).
ob_rms = None

print('Evaluating tasks:')
eval_episode_mean_rewards = evaluate(actor_critic, ob_rms, task_sequences, eval_seed,
                            args.num_processes, args.log_dir, device, obs_shape, task_idx, args.gamma, wrapper_class=FlatObsWrapper, episodes=100)

[(0, 'MiniGrid-DoorKey-6x6-v0'), (1, 'MiniGrid-WallGapS6-v0'), (2, 'MiniGrid-LavaGapS6-v0')]
Evaluating tasks:
Task 0: Evaluation using 100 episodes: mean reward 0.96837 

Task 1: Evaluation using 100 episodes: mean reward 0.94512 

Task 2: Evaluation using 100 episodes: mean reward 0.92600 



Check MiniGrid vectorized environments

In [None]:
import gym
from gym import spaces
from gym.wrappers import Monitor
import gym_minigrid
from gym_minigrid.wrappers import FlatObsWrapper, ImgObsWrapper, RGBImgPartialObsWrapper, RGBImgObsWrapper
from gym.wrappers import Monitor

In [None]:
import matplotlib.pyplot as plt

# Visual sanity check of a vectorized MiniGrid environment.
# NOTE: this rebinds `env_name`, which the training loop also uses as its
# loop variable.
env_name = 'MiniGrid-DoorKey-6x6-v0'

# Create vectorized environment with wrapper class
vec_env = make_vec_envs(env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, wrapper_class=FlatObsWrapper)

# Plot snapshot of vectorized environment and check randomized init
vec_env.reset()
before_img = vec_env.render('rgb_array')

# NOTE(review): vec_env is not closed in the visible part of this cell --
# confirm it is closed once the check is finished.
plt.figure(figsize = (6.,6.))
plt.imshow(before_img);