Default Jupyter settings

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
#%load_ext autoreload
#%autoreload 2
%reload_ext autoreload
%autoreload 2

Import libraries

In [2]:
import sys, os, time
import numpy as np
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
from datetime import date

import pickle
import torch
from arguments_rl import get_args

from collections import deque
from rl_module.a2c_ppo_acktr.envs import make_vec_envs
from rl_module.a2c_ppo_acktr.storage import RolloutStorage
from rl_module.train_ppo import train_ppo
from rl_module.a2c_ppo_acktr.utils import seed
from torch_ac.utils import DictList

# Gym MiniGrid specific
import gym
from gym import spaces
from gym.wrappers import Monitor
import gym_minigrid
from gym_minigrid.wrappers import FlatObsWrapper, ImgObsWrapper, RGBImgPartialObsWrapper, RGBImgObsWrapper
from gym.wrappers import Monitor

  from .autonotebook import tqdm as notebook_tqdm


Load arguments

In [6]:
# Experiment configuration, held in a DictList so the values can be read with
# attribute syntax (args.lr, args.seed, ...) by the cells below.
# Trailing "alt:" notes record previously used alternative values.
args = DictList({
    # --- algorithm / continual-learning approach ---
    'algo': 'ppo',
    'experiment': 'minigrid',
    'approach': 'fine-tuning',   # alt: 'blip'
    'optimizer': 'RMSProp',      # alt: 'Adam'
    # --- GAIL ---
    'gail': False,
    'gail_experts_dir': './gail_experts',
    'gail_batch_size': 128,
    'gail_epoch': 5,
    # --- optimisation ---
    'lr': 7e-4,                  # alt: 2.5e-4
    'eps': 1e-8,                 # alt: 1e-5
    'gamma': 0.99,
    'use_gae': True,
    'gae_lambda': 0.99,          # alt: 0.95
    'entropy_coef': 0.01,
    'value_loss_coef': 0.5,
    'max_grad_norm': 0.5,
    'seed': 1,
    'cuda_deterministic': False,
    # --- rollout / PPO ---
    'num_processes': 16,
    'num_steps': 128,            # alt: 5
    'ppo_epoch': 4,
    'num_mini_batch': 256,       # alt: 8, 32
    'clip_param': 0.2,           # alt: 0.1
    # --- logging / checkpoints ---
    'log_interval': 1,
    'save_interval': 10,
    'eval_interval': 10,
    'num_env_steps': 1e5,
    'env_name': 'PongNoFrameskip-v4',
    'log_dir': './logs/',
    'save_dir': './trained_models/',
    # --- runtime ---
    'no_cuda': True,
    'use_proper_time_limits': False,
    'recurrent_policy': False,
    'use_linear_lr_decay': False,
    # --- EWC ---
    'ewc_lambda': 5000.0,
    'ewc_online': False,
    'ewc_epochs': 100,
    'num_ewc_steps': 20,
    # --- run bookkeeping ---
    'save_name': None,
    'date': date.today(),
    'task_id': None,
    'single_task': False,
    # --- BLIP ---
    'F_prior': 1e-15,
    'input_padding': False,
    'sample': False,
    'samples': 1,
    # --- render arguments ---
    'render_ckpt_path': '',
    'render_task_idx': 0,
    'num_render_traj': 1000,
})

# CUDA is used only when it is both allowed and actually available.
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Sanity checks on the algorithm choice.
assert args.algo in ['a2c', 'ppo', 'acktr']
assert not args.recurrent_policy or args.algo in ['a2c', 'ppo'], \
    'Recurrent policy is not implemented for ACKTR'

# Experiments whose log name receives a '_conv' suffix in the setup cell.
conv_experiment = ['atari']

print(args)

{'algo': 'ppo', 'experiment': 'minigrid', 'approach': 'fine-tuning', 'optimizer': 'RMSProp', 'gail': False, 'gail_experts_dir': './gail_experts', 'gail_batch_size': 128, 'gail_epoch': 5, 'lr': 0.0007, 'eps': 1e-08, 'gamma': 0.99, 'use_gae': True, 'gae_lambda': 0.99, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'seed': 1, 'cuda_deterministic': False, 'num_processes': 16, 'num_steps': 128, 'ppo_epoch': 4, 'num_mini_batch': 256, 'clip_param': 0.2, 'log_interval': 1, 'save_interval': 10, 'eval_interval': 10, 'num_env_steps': 100000.0, 'env_name': 'PongNoFrameskip-v4', 'log_dir': './logs/', 'save_dir': './trained_models/', 'no_cuda': True, 'use_proper_time_limits': False, 'recurrent_policy': False, 'use_linear_lr_decay': False, 'ewc_lambda': 5000.0, 'ewc_online': False, 'ewc_epochs': 100, 'num_ewc_steps': 20, 'save_name': None, 'date': datetime.date(2023, 1, 10), 'task_id': None, 'single_task': False, 'F_prior': 1e-15, 'input_padding': False, 'sample': False, 'samples

In [7]:
# Build the run name, seed everything, and construct the policy and agent for
# the continual-learning approach selected in `args.approach`.

# --- Run name ---------------------------------------------------------------
# An unsupported approach is rejected here with a clear message; previously it
# fell through every branch and surfaced later as a NameError on `log_name`.
if args.approach in ('fine-tuning', 'ft-fix'):
    log_name = '{}_{}_{}_{}'.format(args.date, args.experiment, args.approach, args.seed)
elif args.approach == 'ewc':
    log_name = '{}_{}_{}_{}_lamb_{}'.format(args.date, args.experiment, args.approach, args.seed, args.ewc_lambda)
elif args.approach == 'blip':
    log_name = '{}_{}_{}_{}_F_prior_{}'.format(args.date, args.experiment, args.approach, args.seed, args.F_prior)
else:
    raise ValueError(
        "Unsupported approach {!r}; expected one of "
        "'fine-tuning', 'ft-fix', 'ewc', 'blip'".format(args.approach))

if args.experiment in conv_experiment:
    log_name = log_name + '_conv'

# --- Seed -------------------------------------------------------------------
seed(args.seed)

# --- Device -----------------------------------------------------------------
if args.cuda:
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    torch.set_default_tensor_type('torch.FloatTensor')

device = torch.device("cuda" if args.cuda else "cpu")

# --- Tasks ------------------------------------------------------------------
# Atari configuration, kept for reference:
#taskcla = [(0,14), (1,18), (2,18), (3,18), (4,18), (5,6)]
#task_sequences = [(0, 'KungFuMasterNoFrameskip-v4'), (1, 'BoxingNoFrameskip-v4'), (2, 'JamesbondNoFrameskip-v4'), (3, 'KrullNoFrameskip-v4'), (4, 'RiverraidNoFrameskip-v4'), (5, 'SpaceInvadersNoFrameskip-v4')]

# (task index, N) pairs -- N is presumably the size of the action space for
# that task's policy head; TODO confirm against rl_module.ppo_model.
taskcla = [(0,7), (1,7)]
task_sequences = [(0, 'MiniGrid-Empty-5x5-v0'), (1, 'MiniGrid-Empty-Random-6x6-v0')]

# hard coded for atari environment
#obs_shape = (4,84,84)

# Hard coded for the FlatObsWrapper MiniGrid environment.
# NOTE(review): 2739 presumably is the flattened 7x7x3 image (147) plus the
# one-hot mission string (96 x 27) -- verify against the installed gym_minigrid.
obs_shape = (2739,)

# --- Policy network ---------------------------------------------------------
if args.approach == 'blip':
    from rl_module.ppo_model import QPolicy
    print('using fisher prior of: ', args.F_prior)
    actor_critic = QPolicy(obs_shape,
        taskcla,
        base_kwargs={'F_prior': args.F_prior, 'recurrent': args.recurrent_policy}).to(device)
else:
    from rl_module.ppo_model import Policy
    actor_critic = Policy(obs_shape,
        taskcla,
        base_kwargs={'recurrent': args.recurrent_policy}).to(device)

# --- Agent ------------------------------------------------------------------
# Arguments shared by every PPO variant; each branch adds only its own extras.
ppo_args = (
    actor_critic,
    args.clip_param,
    args.ppo_epoch,
    args.num_mini_batch,
    args.value_loss_coef,
    args.entropy_coef,
)
ppo_kwargs = dict(
    lr=args.lr,
    eps=args.eps,
    max_grad_norm=args.max_grad_norm,
    use_clipped_value_loss=True,
)

if args.approach in ('fine-tuning', 'ft-fix'):
    from rl_module.ppo import PPO as approach

    agent = approach(*ppo_args, optimizer=args.optimizer, **ppo_kwargs)
elif args.approach == 'ewc':
    from rl_module.ppo_ewc import PPO_EWC as approach

    agent = approach(*ppo_args,
                     ewc_lambda=args.ewc_lambda,
                     online=args.ewc_online,
                     **ppo_kwargs)
else:
    # Only 'blip' can reach this branch: the approach was validated above.
    from rl_module.ppo_blip import PPO_BLIP as approach

    agent = approach(*ppo_args, **ppo_kwargs)

In [8]:
# Train the agent on each task in `task_sequences` in order, applying the
# approach-specific post-processing after every task.

# Reward containers handed to train_ppo. The test container holds one list per
# statistic and task: te_reward_arr[stat]['task<i>'].
tr_reward_arr = []
te_reward_arr = {
    stat: {'task' + str(idx): [] for idx in range(len(taskcla))}
    for stat in ('mean', 'max', 'min')
}

# Number of updates per task; identical for every task, so computed once.
num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

for task_idx, env_name in task_sequences:
    print(env_name)
    # renew optimizer
    agent.renew_optimizer()

    #envs = make_vec_envs(env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False)

    # FlatObsWrapper for MiniGrid
    envs = make_vec_envs(env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, wrapper_class=FlatObsWrapper)

    # try/finally guarantees the vectorised environments are closed even when
    # training or post-processing raises (or the loop exits early).
    try:
        obs = envs.reset()

        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  obs_shape, envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

        rollouts.obs[0].copy_(obs)
        rollouts.to(device)

        episode_rewards = deque(maxlen=10)

        train_ppo(actor_critic, agent, rollouts, task_idx, env_name, task_sequences, envs,  obs_shape, args, episode_rewards, tr_reward_arr, te_reward_arr, num_updates, log_name, device, wrapper_class=FlatObsWrapper)

        # post-processing
        if args.approach == 'ft-fix':
            # fix the backbone
            for param in actor_critic.features.parameters():
                param.requires_grad = False
        elif args.approach == 'ewc':
            agent.update_fisher(rollouts, task_idx)
        elif args.approach == 'blip':
            agent.ng_post_processing(rollouts, task_idx)
            # save the model here so that bit allocation is saved
            save_path = os.path.join(args.save_dir, args.algo)
            # Create the checkpoint directory first; torch.save does not, and
            # would otherwise fail on a fresh checkout after a full task run.
            os.makedirs(save_path, exist_ok=True)
            torch.save(actor_critic.state_dict(),
                os.path.join(save_path, log_name + '_task_' + str(task_idx) + ".pth"))
    finally:
        envs.close()

    # Single-task mode applies to the fine-tuning variants only: stop after
    # the first task.
    if args.approach in ('fine-tuning', 'ft-fix') and args.single_task:
        break

MiniGrid-Empty-5x5-v0
 Evaluation using 10 episodes: mean reward 0.00000 

len task_sequences :  2


  2%|▏         | 1/48 [00:03<02:31,  3.23s/it]

Updates 0, num timesteps 2048, FPS 633 
 Last 10 training episodes: mean/median reward 0.1/0.0, min/max reward 0.0/0.6



  4%|▍         | 2/48 [00:06<02:27,  3.21s/it]

Updates 1, num timesteps 4096, FPS 637 
 Last 10 training episodes: mean/median reward 0.1/0.0, min/max reward 0.0/0.7



  6%|▋         | 3/48 [00:09<02:23,  3.19s/it]

Updates 2, num timesteps 6144, FPS 641 
 Last 10 training episodes: mean/median reward 0.3/0.2, min/max reward 0.0/0.8



  8%|▊         | 4/48 [00:12<02:20,  3.20s/it]

Updates 3, num timesteps 8192, FPS 640 
 Last 10 training episodes: mean/median reward 0.5/0.5, min/max reward 0.0/0.9



 10%|█         | 5/48 [00:15<02:16,  3.18s/it]

Updates 4, num timesteps 10240, FPS 642 
 Last 10 training episodes: mean/median reward 0.7/0.7, min/max reward 0.0/0.9



 12%|█▎        | 6/48 [00:19<02:12,  3.16s/it]

Updates 5, num timesteps 12288, FPS 644 
 Last 10 training episodes: mean/median reward 0.8/0.9, min/max reward 0.3/0.9



 15%|█▍        | 7/48 [00:22<02:09,  3.16s/it]

Updates 6, num timesteps 14336, FPS 645 
 Last 10 training episodes: mean/median reward 0.8/0.8, min/max reward 0.5/0.9



 17%|█▋        | 8/48 [00:25<02:06,  3.17s/it]

Updates 7, num timesteps 16384, FPS 644 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/0.9



 19%|█▉        | 9/48 [00:28<02:03,  3.16s/it]

Updates 8, num timesteps 18432, FPS 645 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/0.9

Updates 9, num timesteps 20480, FPS 644 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 21%|██        | 10/48 [00:37<03:07,  4.92s/it]

 Evaluation using 10 episodes: mean reward 0.00000 

len task_sequences :  2


 23%|██▎       | 11/48 [00:40<02:45,  4.46s/it]

Updates 10, num timesteps 22528, FPS 551 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 25%|██▌       | 12/48 [00:44<02:26,  4.07s/it]

Updates 11, num timesteps 24576, FPS 558 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 27%|██▋       | 13/48 [00:47<02:13,  3.81s/it]

Updates 12, num timesteps 26624, FPS 563 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 29%|██▉       | 14/48 [00:50<02:03,  3.63s/it]

Updates 13, num timesteps 28672, FPS 568 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 31%|███▏      | 15/48 [00:53<01:55,  3.50s/it]

Updates 14, num timesteps 30720, FPS 572 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 33%|███▎      | 16/48 [00:56<01:49,  3.41s/it]

Updates 15, num timesteps 32768, FPS 576 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 35%|███▌      | 17/48 [01:00<01:44,  3.38s/it]

Updates 16, num timesteps 34816, FPS 578 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 38%|███▊      | 18/48 [01:03<01:39,  3.31s/it]

Updates 17, num timesteps 36864, FPS 582 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 40%|███▉      | 19/48 [01:06<01:34,  3.27s/it]

Updates 18, num timesteps 38912, FPS 585 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0

Updates 19, num timesteps 40960, FPS 587 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 42%|████▏     | 20/48 [01:15<02:19,  5.00s/it]

 Evaluation using 10 episodes: mean reward 0.00000 

len task_sequences :  2


 44%|████▍     | 21/48 [01:18<02:01,  4.50s/it]

Updates 20, num timesteps 43008, FPS 545 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 46%|████▌     | 22/48 [01:21<01:46,  4.10s/it]

Updates 21, num timesteps 45056, FPS 549 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 48%|████▊     | 23/48 [01:25<01:35,  3.83s/it]

Updates 22, num timesteps 47104, FPS 552 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 50%|█████     | 24/48 [01:28<01:27,  3.63s/it]

Updates 23, num timesteps 49152, FPS 556 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 52%|█████▏    | 25/48 [01:31<01:20,  3.49s/it]

Updates 24, num timesteps 51200, FPS 559 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 54%|█████▍    | 26/48 [01:34<01:14,  3.39s/it]

Updates 25, num timesteps 53248, FPS 562 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 56%|█████▋    | 27/48 [01:37<01:10,  3.34s/it]

Updates 26, num timesteps 55296, FPS 564 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 58%|█████▊    | 28/48 [01:41<01:05,  3.29s/it]

Updates 27, num timesteps 57344, FPS 567 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 60%|██████    | 29/48 [01:44<01:01,  3.26s/it]

Updates 28, num timesteps 59392, FPS 569 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0

Updates 29, num timesteps 61440, FPS 571 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 62%|██████▎   | 30/48 [01:52<01:26,  4.83s/it]

 Evaluation using 10 episodes: mean reward 0.95500 

len task_sequences :  2


 65%|██████▍   | 31/48 [01:56<01:14,  4.41s/it]

Updates 30, num timesteps 63488, FPS 546 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 67%|██████▋   | 32/48 [01:59<01:04,  4.03s/it]

Updates 31, num timesteps 65536, FPS 549 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 69%|██████▉   | 33/48 [02:02<00:56,  3.77s/it]

Updates 32, num timesteps 67584, FPS 551 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 71%|███████   | 34/48 [02:05<00:50,  3.59s/it]

Updates 33, num timesteps 69632, FPS 554 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 73%|███████▎  | 35/48 [02:08<00:45,  3.47s/it]

Updates 34, num timesteps 71680, FPS 556 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 75%|███████▌  | 36/48 [02:12<00:40,  3.40s/it]

Updates 35, num timesteps 73728, FPS 558 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 77%|███████▋  | 37/48 [02:15<00:36,  3.34s/it]

Updates 36, num timesteps 75776, FPS 560 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 79%|███████▉  | 38/48 [02:18<00:32,  3.29s/it]

Updates 37, num timesteps 77824, FPS 562 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 81%|████████▏ | 39/48 [02:21<00:29,  3.25s/it]

Updates 38, num timesteps 79872, FPS 563 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0

Updates 39, num timesteps 81920, FPS 565 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 83%|████████▎ | 40/48 [02:29<00:38,  4.78s/it]

 Evaluation using 10 episodes: mean reward 0.95500 

len task_sequences :  2


 85%|████████▌ | 41/48 [02:33<00:30,  4.36s/it]

Updates 40, num timesteps 83968, FPS 547 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 88%|████████▊ | 42/48 [02:36<00:24,  4.01s/it]

Updates 41, num timesteps 86016, FPS 549 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 90%|████████▉ | 43/48 [02:39<00:18,  3.76s/it]

Updates 42, num timesteps 88064, FPS 551 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 92%|█████████▏| 44/48 [02:42<00:14,  3.59s/it]

Updates 43, num timesteps 90112, FPS 553 
 Last 10 training episodes: mean/median reward 0.9/1.0, min/max reward 0.9/1.0



 94%|█████████▍| 45/48 [02:46<00:10,  3.48s/it]

Updates 44, num timesteps 92160, FPS 554 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 96%|█████████▌| 46/48 [02:49<00:06,  3.40s/it]

Updates 45, num timesteps 94208, FPS 556 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 98%|█████████▊| 47/48 [02:52<00:03,  3.32s/it]

Updates 46, num timesteps 96256, FPS 558 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.7/1.0



100%|██████████| 48/48 [02:55<00:00,  3.66s/it]

Updates 47, num timesteps 98304, FPS 559 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0

MiniGrid-Empty-Random-6x6-v0





 Evaluation using 10 episodes: mean reward 0.95500 

 Evaluation using 10 episodes: mean reward 0.00000 

len task_sequences :  2


  2%|▏         | 1/48 [00:03<02:37,  3.34s/it]

Updates 0, num timesteps 2048, FPS 612 
 Last 7 training episodes: mean/median reward 0.8/0.8, min/max reward 0.5/1.0



  4%|▍         | 2/48 [00:06<02:30,  3.28s/it]

Updates 1, num timesteps 4096, FPS 622 
 Last 10 training episodes: mean/median reward 0.7/0.7, min/max reward 0.0/1.0



  6%|▋         | 3/48 [00:09<02:27,  3.28s/it]

Updates 2, num timesteps 6144, FPS 622 
 Last 10 training episodes: mean/median reward 0.6/0.7, min/max reward 0.0/0.9



  8%|▊         | 4/48 [00:13<02:23,  3.27s/it]

Updates 3, num timesteps 8192, FPS 625 
 Last 10 training episodes: mean/median reward 0.6/0.7, min/max reward 0.1/1.0



 10%|█         | 5/48 [00:16<02:20,  3.26s/it]

Updates 4, num timesteps 10240, FPS 626 
 Last 10 training episodes: mean/median reward 0.7/0.8, min/max reward 0.3/1.0



 12%|█▎        | 6/48 [00:19<02:17,  3.27s/it]

Updates 5, num timesteps 12288, FPS 625 
 Last 10 training episodes: mean/median reward 0.7/0.8, min/max reward 0.0/1.0



 15%|█▍        | 7/48 [00:22<02:15,  3.30s/it]

Updates 6, num timesteps 14336, FPS 623 
 Last 10 training episodes: mean/median reward 0.8/0.9, min/max reward 0.6/1.0



 17%|█▋        | 8/48 [00:26<02:11,  3.29s/it]

Updates 7, num timesteps 16384, FPS 623 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.7/1.0



 19%|█▉        | 9/48 [00:29<02:08,  3.30s/it]

Updates 8, num timesteps 18432, FPS 622 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.6/1.0

Updates 9, num timesteps 20480, FPS 621 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0

 Evaluation using 10 episodes: mean reward 0.95500 



 21%|██        | 10/48 [00:44<04:18,  6.79s/it]

 Evaluation using 10 episodes: mean reward 0.39062 

len task_sequences :  2


 23%|██▎       | 11/48 [00:47<03:33,  5.77s/it]

Updates 10, num timesteps 22528, FPS 472 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 25%|██▌       | 12/48 [00:51<03:02,  5.08s/it]

Updates 11, num timesteps 24576, FPS 480 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 27%|██▋       | 13/48 [00:54<02:39,  4.55s/it]

Updates 12, num timesteps 26624, FPS 488 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.8/1.0



 29%|██▉       | 14/48 [00:57<02:21,  4.17s/it]

Updates 13, num timesteps 28672, FPS 496 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 31%|███▏      | 15/48 [01:01<02:09,  3.93s/it]

Updates 14, num timesteps 30720, FPS 502 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 33%|███▎      | 16/48 [01:04<01:59,  3.75s/it]

Updates 15, num timesteps 32768, FPS 508 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 35%|███▌      | 17/48 [01:07<01:52,  3.62s/it]

Updates 16, num timesteps 34816, FPS 513 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 38%|███▊      | 18/48 [01:11<01:46,  3.54s/it]

Updates 17, num timesteps 36864, FPS 518 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 40%|███▉      | 19/48 [01:14<01:40,  3.48s/it]

Updates 18, num timesteps 38912, FPS 522 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0

Updates 19, num timesteps 40960, FPS 526 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0

 Evaluation using 10 episodes: mean reward 0.95500 



 42%|████▏     | 20/48 [01:28<03:06,  6.64s/it]

 Evaluation using 10 episodes: mean reward 0.97062 

len task_sequences :  2


 44%|████▍     | 21/48 [01:32<02:34,  5.70s/it]

Updates 20, num timesteps 43008, FPS 467 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 46%|████▌     | 22/48 [01:35<02:10,  5.00s/it]

Updates 21, num timesteps 45056, FPS 472 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 48%|████▊     | 23/48 [01:38<01:52,  4.51s/it]

Updates 22, num timesteps 47104, FPS 477 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 50%|█████     | 24/48 [01:42<01:39,  4.16s/it]

Updates 23, num timesteps 49152, FPS 481 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 52%|█████▏    | 25/48 [01:45<01:30,  3.92s/it]

Updates 24, num timesteps 51200, FPS 485 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 54%|█████▍    | 26/48 [01:48<01:22,  3.75s/it]

Updates 25, num timesteps 53248, FPS 489 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 56%|█████▋    | 27/48 [01:52<01:16,  3.63s/it]

Updates 26, num timesteps 55296, FPS 493 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 58%|█████▊    | 28/48 [01:55<01:12,  3.62s/it]

Updates 27, num timesteps 57344, FPS 495 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 60%|██████    | 29/48 [01:59<01:07,  3.53s/it]

Updates 28, num timesteps 59392, FPS 498 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0

Updates 29, num timesteps 61440, FPS 501 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0

 Evaluation using 10 episodes: mean reward 0.95500 



 62%|██████▎   | 30/48 [02:13<02:00,  6.69s/it]

 Evaluation using 10 episodes: mean reward 0.97313 

len task_sequences :  2


 65%|██████▍   | 31/48 [02:16<01:37,  5.72s/it]

Updates 30, num timesteps 63488, FPS 464 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 67%|██████▋   | 32/48 [02:19<01:20,  5.01s/it]

Updates 31, num timesteps 65536, FPS 468 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 69%|██████▉   | 33/48 [02:23<01:07,  4.51s/it]

Updates 32, num timesteps 67584, FPS 471 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 71%|███████   | 34/48 [02:26<00:58,  4.16s/it]

Updates 33, num timesteps 69632, FPS 474 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 73%|███████▎  | 35/48 [02:29<00:50,  3.91s/it]

Updates 34, num timesteps 71680, FPS 477 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 75%|███████▌  | 36/48 [02:33<00:44,  3.74s/it]

Updates 35, num timesteps 73728, FPS 480 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 77%|███████▋  | 37/48 [02:36<00:39,  3.62s/it]

Updates 36, num timesteps 75776, FPS 483 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 79%|███████▉  | 38/48 [02:40<00:35,  3.55s/it]

Updates 37, num timesteps 77824, FPS 486 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 81%|████████▏ | 39/48 [02:43<00:31,  3.50s/it]

Updates 38, num timesteps 79872, FPS 488 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0

Updates 39, num timesteps 81920, FPS 491 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0

 Evaluation using 10 episodes: mean reward 0.95500 



 83%|████████▎ | 40/48 [02:57<00:53,  6.68s/it]

 Evaluation using 10 episodes: mean reward 0.97437 

len task_sequences :  2


 85%|████████▌ | 41/48 [03:01<00:40,  5.72s/it]

Updates 40, num timesteps 83968, FPS 463 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 88%|████████▊ | 42/48 [03:04<00:29,  5.00s/it]

Updates 41, num timesteps 86016, FPS 466 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 90%|████████▉ | 43/48 [03:07<00:22,  4.50s/it]

Updates 42, num timesteps 88064, FPS 469 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 92%|█████████▏| 44/48 [03:11<00:16,  4.17s/it]

Updates 43, num timesteps 90112, FPS 471 
 Last 10 training episodes: mean/median reward 0.9/0.9, min/max reward 0.9/1.0



 94%|█████████▍| 45/48 [03:14<00:11,  3.91s/it]

Updates 44, num timesteps 92160, FPS 474 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 1.0/1.0



 96%|█████████▌| 46/48 [03:17<00:07,  3.73s/it]

Updates 45, num timesteps 94208, FPS 476 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



 98%|█████████▊| 47/48 [03:20<00:03,  3.61s/it]

Updates 46, num timesteps 96256, FPS 478 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0



100%|██████████| 48/48 [03:24<00:00,  4.26s/it]

Updates 47, num timesteps 98304, FPS 480 
 Last 10 training episodes: mean/median reward 1.0/1.0, min/max reward 0.9/1.0






Check MiniGrid vectorized environments

In [None]:
import gym
from gym import spaces
from gym.wrappers import Monitor
import gym_minigrid
from gym_minigrid.wrappers import FlatObsWrapper, ImgObsWrapper, RGBImgPartialObsWrapper, RGBImgObsWrapper
from gym.wrappers import Monitor

In [None]:
import matplotlib.pyplot as plt

env_name = 'MiniGrid-DoorKey-6x6-v0'

# Create vectorized environment with wrapper class
vec_env = make_vec_envs(env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, wrapper_class=FlatObsWrapper)

# Take a snapshot of the freshly reset vectorized environment to check the
# randomized init. The environments are closed afterwards, even if reset or
# render fails, so the args.num_processes environments are not left open.
try:
    vec_env.reset()
    before_img = vec_env.render('rgb_array')
finally:
    vec_env.close()

fig, ax = plt.subplots(figsize=(6., 6.))
ax.imshow(before_img)
ax.set_title('{} after reset'.format(env_name));

In [None]:
import os

from scipy.io import savemat
import numpy as np

# Minimal example of exporting results to a MATLAB .mat file.
a = np.arange(20)
mdic = {"a": a, "label": "experiment"}
print(mdic)

# savemat does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs("./res", exist_ok=True)
savemat("./res/matlab_matrix.mat", mdic)