# Meta-RIMs learning

In [1]:
# JUPYTER SETTINGS

import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
#%load_ext autoreload
#%autoreload 2
%reload_ext autoreload
%autoreload 2

# IMPORT LIBRARIES

import time
import io
import os
import glob
import base64
import datetime
import torch
import torch_ac
import tensorboardX
import sys
import utils
from model import ACModel
from torch_ac.utils import DictList, ParallelEnv
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import numpy as np
import random

import gym
from gym import spaces
from gym.wrappers import Monitor
import gym_minigrid
from gym_minigrid.wrappers import FlatObsWrapper, ImgObsWrapper, RGBImgPartialObsWrapper

# HELPER FUNCTIONS

def make_envs(env_id, procs, seed=None):
    envs = []
    for i in range(procs):
        if seed:
            e = utils.make_env(env_id, seed + 10000 * i)
        else:
            e = utils.make_env(env_id)
        envs.append(e)
    env = ParallelEnv(envs)
    return env

def sample_tasks(n_tasks):
    tasks_list = []
    for i in range(n_tasks):
        random_data = os.urandom(4)
        seed = int.from_bytes(random_data, byteorder="big")
        tasks_list.append(seed)
    return tasks_list

def env_snapshot(env:ParallelEnv):
    im_list = []
    for e in env.envs:
        #print(type(e.render('rgb_array')))
        #e.reset()
        im_list.append(e.render('rgb_array'))

    fig = plt.figure(figsize=(8., 8.))
    grid = ImageGrid(fig, 111,  # similar to subplot(111)
                    nrows_ncols=(4, 4),  # creates 2x2 grid of axes
                    axes_pad=0.1,  # pad between axes in inch.
                    )

    for ax, im in zip(grid, im_list):
        # Iterating over the grid returns the Axes.
        ax.imshow(im)

def set_freeze_status(model, params, freeze=True):
    for name, param in model.named_parameters():
        if any(ext in name for ext in params):
            param.requires_grad = False if freeze else True
            #param.grad = None if freeze else param.grad

# Function to concatenate two tasks rollout exps dictionaries
def cat_exps(exps_1, exps_2):
    exp_out = {}
    for (k,v), (k2,v2) in zip(exps_1.items(), exps_2.items()):
        if k == 'obs':
            obs = k
            exp_out[obs] = {}
            for (k,v), (k2,v2) in zip(exps_1.obs.items(), exps_2.obs.items()):
                exp_out[obs][k] = torch.cat((v, v2), 0)
        else:
            exp_out[k] = torch.cat((v, v2), 0)
    return exp_out

# Function to concatenate two tasks rollout logs dictionaries
def cat_logs(logs_1, logs_2):
    logs_out = {}
    for (k,v), (k2,v2) in zip(logs_1.items(), logs_2.items()):
        logs_out[k] = v + v2
    return logs_out

def change_multienv_seed(env, seed):
    for i, e in enumerate(env.envs):
        e.seed(seed + 10000 * i)
        e.reset()
    return env
        
def sample_tasks_experiences(agent, tasks):
    seed_list = tasks['seed_list']
    exps_batch = []
    logs1_batch = []
    for seed in seed_list:
        agent.env = change_multienv_seed(agent.env, seed)
        #agent.env = make_envs(env_id, procs, seed)
        exps, logs1 = agent.collect_experiences() 
        exps_batch.append(exps)
        logs1_batch.append(logs1)
    return exps_batch, logs1_batch

# Function to collect and concatenate all tasks rollout exps dictionaries
def collect_tasks_meta_experiences(agent, tasks):
    seed_list = tasks['seed_list']
    for i, seed in enumerate(seed_list):
        #agent.env = make_envs(env_id, procs, seed)
        agent.env = change_multienv_seed(agent.env, seed)
        exps, logs1 = agent.collect_experiences()
        concat_exps = exps if i==0 else cat_exps(concat_exps, exps)
        concat_exps = DictList(concat_exps)
        concat_exps.obs = DictList(concat_exps.obs)
        concat_logs1 = logs1 if i==0 else cat_logs(concat_logs1, logs1)
    return concat_exps, concat_logs1

## Training

In [5]:
# TRAINING

# List of tasks

sequence = 2

if sequence == 0:
    ## Experiment 0
    taskcla = [(0,7), (1,7), (2,7), (3,7)]
    tasks_sequence = [
        (0, 'MiniGrid-RedBlueDoors-6x6-v0'), 
        (1, 'MiniGrid-DoorKey-6x6-v0'),
        (2, 'MiniGrid-WallGapS6-v0'),
        (3, 'MiniGrid-LavaGapS6-v0')      
        ]
elif sequence == 1:
    ## Experiment 1
    taskcla = [(0,7), (1,7), (2,7), (3,7)]
    tasks_sequence = [
        (0, 'MiniGrid-DoorKey-6x6-v0'),
        (1, 'MiniGrid-RedBlueDoors-6x6-v0'), 
        (2, 'MiniGrid-WallGapS6-v0'),
        (3, 'MiniGrid-LavaGapS6-v0')
        ]
elif sequence == 2:
    ## Experiment 3
    taskcla = [(0,7), (1,7), (2,7), (3,7)]
    tasks_sequence = [
        (0, 'MiniGrid-WallGapS6-v0'),
        (1, 'MiniGrid-DoorKey-6x6-v0'),
        (2, 'MiniGrid-RedBlueDoors-6x6-v0'), 
        (3, 'MiniGrid-SimpleCrossingS9N1-v0')   
        ]
elif sequence == 3:
    ## Experiment 4
    taskcla = [(0,7), (1,7), (2,7), (3,7)]
    tasks_sequence = [
        (0, 'MiniGrid-WallGapS6-v0'),
        (1, 'MiniGrid-DoorKey-6x6-v0'),
        (2, 'MiniGrid-SimpleCrossingS9N1-v0'),
        (3, 'MiniGrid-UnlockPickup-v0'), 
        ]
elif sequence == 4:
    ## Experiment 4
    taskcla = [(0,7), (1,7), (2,7), (3,7)]
    tasks_sequence = [
        (0, 'MiniGrid-UnlockPickup-v0'),
        (1, 'MiniGrid-DoorKey-6x6-v0'),
        (2, 'MiniGrid-WallGapS6-v0'),
        (3, 'MiniGrid-SimpleCrossingS9N1-v0'), 
        ]
elif sequence == 5:
    ## Experiment 3
    taskcla = [(0,7), (1,7), (2,7), (3,7)]
    tasks_sequence = [
        (0, 'MiniGrid-LavaGapS5-v0'),
        (1, 'MiniGrid-DoorKey-6x6-v0'),
        (2, 'MiniGrid-RedBlueDoors-6x6-v0'), 
        (3, 'MiniGrid-WallGapS6-v0')   
        ]

seed_list = [123456, 789012, 345678]

model = "wallgap-doorkey-redblue-crossing"
frames_per_proc = 128
processes = 16
reshape = False

for seed in seed_list:
    
    # START TRAINING 1st ENVIRONMENT ----------------------

    # LOAD PARAMETERS
    index = 0
    env_id = tasks_sequence[index][1]

    frames = 5e5
    add_frames = 5e5

    ## Hyper-parameters
    args = {
    # General parameters
    'algo':"ppo",
    'env':env_id,
    'model':model,
    'early_stop':False,
    'seed':seed,
    'log_interval':1,
    'save_interval':10,
    'procs':processes,
    'frames':int(frames), # default 1e7
    # Parameters for main algorithm
    'epochs':4,
    'batch_size':256,
    'frames_per_proc':frames_per_proc, # 128 for PPO and 5 per A2C
    'discount':0.99,
    #'lr':2.5e-4,#0.0001, # for Adam
    'lr':0.0007, # for RMSProp
    #'gae_lambda':0.95, # 1 means no gae, for Adam
    'gae_lambda':0.99, # 1 means no gae, for RMSProp
    'entropy_coef': 0.01,
    'value_loss_coef':0.5,
    'max_grad_norm':0.5,
    'optim_eps':1e-8,
    'optim_alpha':0.99,
    'clip_eps':0.2,
    'recurrence':32, # if > 1, a LSTM is added
    'text':False, # add a GRU for text input
    # Model Parameters
    'use_rim':True, # action = 'store_true'
    'meta_learn':True,
    'reshape_reward':reshape,
    'date':datetime.date.today()
    }

    #args = utils.dotdict(args)
    args = DictList(args)

    args.mem = args.recurrence > 1

    # RIM specific hyperparameters
    if args.use_rim:
        args.num_units = 6
        args.k = 4
        args.input_heads = 1

    if args.meta_learn:
        args.lr_alpha= args.lr
        args.lr_beta= args.lr
        args.inner_recurrence= 8
        args.outer_recurrence= 32 # 4x inner_recurrence
        args.num_tasks = 2
        args.inner_params= ['image_conv', 'i2h', 'h2h', 'actor'] # params to be updated in inner loop
        args.outer_params = ['query', 'key', 'value', 'comm', 'critic'] # params to be updated in outer loop


    # INITIAL SETTINGS

    # Set run dir

    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"
    #model_name = args.model or default_model_name
    model_name = '{}_{}_{}_{}_{}_frames_{}_cascade_{}_reshape_{}'.format(args.date, args.model, args.algo, args.seed, args.frames, args.frames_per_proc, args.cascade_depth, args.reshape_reward) or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer

    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments

    #txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources

    utils.seed(args.seed)

    # Set device

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")


    # LOAD ENVIRONMENTS AND INITIALIZE MODELS

    # Load environments

    envs = []
    for i in range(args.procs):
        envs.append(utils.make_env(args.env, args.seed + 10000 * i))
    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Reshape reward function
    if args.reshape_reward:
        def reshape_reward(obs, action, reward, done):
            if not done:
                reward = -1
            else:
                reward = 1
            return reward
    else:
        reshape_reward = None

    # Load model

    acmodel = ACModel(obs_space=obs_space, action_space=envs[0].action_space, use_memory=args.mem, use_text=args.text, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))


    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps, preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss, reshape_reward)                     
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # change to RMSProp optimizer
    algo.optimizer = torch.optim.RMSprop(algo.acmodel.parameters(), args.lr, eps=args.optim_eps)

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # meta-learn initializacion

    # delete param_groups after it has been created
    for i in range(len(algo.optimizer.param_groups)):
        del algo.optimizer.param_groups[0]

    # Re-create separate param_groups for different inner and outer loop optimizer lr

    # inner loop param group
    algo.optimizer.add_param_group({'params': [
        *acmodel.image_conv.parameters(), 
        *acmodel.memory_rnn.rnn.parameters(), 
        *acmodel.actor.parameters()], 'lr': args.lr_alpha})

    # outer loop param group
    algo.optimizer.add_param_group({'params': [
        *acmodel.critic.parameters(), 
        *acmodel.memory_rnn.key.parameters(),
        *acmodel.memory_rnn.key_.parameters(),
        *acmodel.memory_rnn.query.parameters(),
        *acmodel.memory_rnn.query_.parameters(),
        *acmodel.memory_rnn.value.parameters(),
        *acmodel.memory_rnn.value_.parameters(),
        *acmodel.memory_rnn.comm_attention_output.parameters()
        ], 'lr': args.lr_beta})


    # TRAINING LOOP

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()
    #nupdates = args.frames // (args.procs * args.frames_per_proc)

    # Moving average parameters
    threshold = 0.90
    window = 10
    rreturn_total = 0
    i = 0

    # run just once to have initial grads in all parameters and avoid backward error on first pass
    exps, _ = algo.collect_experiences()
    algo.update_parameters(exps)


    while num_frames < args.frames: # STEP 2

        update_start_time = time.time()

        # Sample batch of tasks: STEP 3
        tasks_batch = sample_tasks(n_tasks=args.num_tasks)

        for n, task in enumerate(tasks_batch):

            algo.env = change_multienv_seed(algo.env, seed=task)
            # Sample pre-trajectories from each task: STEP 4
            pre_exps, pre_logs1 = algo.collect_experiences()
            # Unfreeze inner loop params,so grads can get updated in the inner loop
            set_freeze_status(algo.acmodel, args.inner_params, freeze=False)
            # Freeze outer loop parameters, so grads do not get updated in the inner loop
            set_freeze_status(algo.acmodel, args.outer_params, freeze=True)
            # set inner RIM recurence
            algo.recurrence = args.inner_recurrence
            # Update parameters: STEP 6
            algo.update_parameters(pre_exps)
            # Sample post-trajectories t_i from tasks T_i: STEP 7
            post_exps, post_logs1 = algo.collect_experiences()
            # Concatenate to get D_meta: STEP 8
            meta_exps = post_exps if n==0 else cat_exps(meta_exps, post_exps)
            meta_exps = DictList(meta_exps)
            meta_exps.obs = DictList(meta_exps.obs)
            meta_logs1 = post_logs1 if n==0 else cat_logs(meta_logs1, post_logs1)

        # Unfreeze outer loop params, so so grads can get updated in the outer loop
        set_freeze_status(algo.acmodel, args.outer_params, freeze=False)
        # Freeze inner loop params, so grads do not get updated in the outer loop
        set_freeze_status(algo.acmodel, args.inner_params, freeze=True)   
        
        # set outer RIM recurence
        algo.recurrence = args.outer_recurrence

        # Update parameters while keeping inner parametes (module and policy) frozen: STEP 9
        meta_logs2 = algo.update_parameters(meta_exps)

        meta_logs = {**meta_logs1, **meta_logs2}
        update_end_time = time.time()

        num_frames += meta_logs["num_frames"]
        update += 1    
        
        # Print logs

        if update % args.log_interval == 0:

            fps = meta_logs["num_frames"]/(update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(meta_logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(meta_logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(meta_logs["num_frames_per_episode"])
            # Moving average to break loop if mean reward threshold reached
            if args.early_stop:
                #rreturn_total +=rreturn_per_episode['mean']
                rreturn_total +=return_per_episode['mean']
                i+=1
                if i >= window:
                    rreturn_mavg = rreturn_total / i
                    if rreturn_mavg >= threshold:
                        break
                    else:
                        i = 0
                        rreturn_total = 0

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            #header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            #data += rreturn_per_episode.values()
            header += ["rreturn_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()       
            header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss", "grad_norm"]
            data += [meta_logs["entropy"], meta_logs["value"], meta_logs["policy_loss"], meta_logs["value_loss"], meta_logs["grad_norm"]]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_logger.writerow(header)
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)       

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {"num_frames": num_frames, "update": update,
                    "model_state": acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict()}
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
    # STEP 10

    print("Number of frames: ", num_frames)

    # CONTINUE TRAINING 2nd ENVIRONMENT ----------------------

    index = 1
    env_id = tasks_sequence[index][1]
    frames = args.frames + add_frames

    ## Hyper-parameters
    args.env = env_id
    args.frames = int(frames)

    # Load environments

    envs = []
    for i in range(args.procs):
        envs.append(utils.make_env(args.env, args.seed + 10000 * i))
    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Reshape reward function
    if args.reshape_reward:
        def reshape_reward(obs, action, reward, done):
            if not done:
                reward = -1
            else:
                reward = 1
            return reward
    else:
        reshape_reward = None

    # Load model

    acmodel = ACModel(obs_space=obs_space, action_space=envs[0].action_space, use_memory=args.mem, use_text=args.text, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))


    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps, preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss, reshape_reward)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # change to RMSProp optimizer
    algo.optimizer = torch.optim.RMSprop(algo.acmodel.parameters(), args.lr, eps=args.optim_eps)

    # delete param_groups after it has been created
    for i in range(len(algo.optimizer.param_groups)):
        del algo.optimizer.param_groups[0]

    # Re-create separate param_groups for different inner and outer loop optimizer lr

    # inner loop param group
    algo.optimizer.add_param_group({'params': [
        *acmodel.image_conv.parameters(), 
        *acmodel.memory_rnn.rnn.parameters(), 
        *acmodel.actor.parameters()], 'lr': args.lr_alpha})

    # outer loop param group
    algo.optimizer.add_param_group({'params': [
        *acmodel.critic.parameters(), 
        *acmodel.memory_rnn.key.parameters(),
        *acmodel.memory_rnn.key_.parameters(),
        *acmodel.memory_rnn.query.parameters(),
        *acmodel.memory_rnn.query_.parameters(),
        *acmodel.memory_rnn.value.parameters(),
        *acmodel.memory_rnn.value_.parameters(),
        *acmodel.memory_rnn.comm_attention_output.parameters()
        ], 'lr': args.lr_beta})

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    # Moving average parameters
    threshold = 0.90
    window = 10
    rreturn_total = 0
    i = 0

    # run just once to have initial grads in all parameters and avoid backward error on first pass
    exps, _ = algo.collect_experiences()
    algo.update_parameters(exps)

    while num_frames < args.frames: # STEP 2

        update_start_time = time.time()

        # Sample batch of tasks: STEP 3
        tasks_batch = sample_tasks(n_tasks=args.num_tasks)

        for n, task in enumerate(tasks_batch):

            algo.env = change_multienv_seed(algo.env, seed=task)
            # Sample pre-trajectories from each task: STEP 4
            pre_exps, pre_logs1 = algo.collect_experiences()
            # Unfreeze inner loop params,so grads can get updated in the inner loop
            set_freeze_status(algo.acmodel, args.inner_params, freeze=False)
            # Freeze outer loop parameters, so grads do not get updated in the inner loop
            set_freeze_status(algo.acmodel, args.outer_params, freeze=True)
            # set inner RIM recurence
            algo.recurrence = args.inner_recurrence
            # Update parameters: STEP 6
            algo.update_parameters(pre_exps)
            # Sample post-trajectories t_i from tasks T_i: STEP 7
            post_exps, post_logs1 = algo.collect_experiences()
            # Concatenate to get D_meta: STEP 8
            meta_exps = post_exps if n==0 else cat_exps(meta_exps, post_exps)
            meta_exps = DictList(meta_exps)
            meta_exps.obs = DictList(meta_exps.obs)
            meta_logs1 = post_logs1 if n==0 else cat_logs(meta_logs1, post_logs1)

        # Unfreeze outer loop params, so so grads can get updated in the outer loop
        set_freeze_status(algo.acmodel, args.outer_params, freeze=False)
        # Freeze inner loop params, so grads do not get updated in the outer loop
        set_freeze_status(algo.acmodel, args.inner_params, freeze=True)   
        
        # set outer RIM recurence
        algo.recurrence = args.outer_recurrence

        # Update parameters while keeping inner parametes (module and policy) frozen: STEP 9
        meta_logs2 = algo.update_parameters(meta_exps)

        meta_logs = {**meta_logs1, **meta_logs2}
        update_end_time = time.time()

        num_frames += meta_logs["num_frames"]
        update += 1    
        
        # Print logs

        if update % args.log_interval == 0:
            
            fps = meta_logs["num_frames"]/(update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(meta_logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(meta_logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(meta_logs["num_frames_per_episode"])
            # Moving average to break loop if mean reward threshold reached
            if args.early_stop:
                #rreturn_total +=rreturn_per_episode['mean']
                rreturn_total +=return_per_episode['mean']
                i+=1
                if i >= window:
                    rreturn_mavg = rreturn_total / i
                    if rreturn_mavg >= threshold:
                        break
                    else:
                        i = 0
                        rreturn_total = 0

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            #header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            #data += rreturn_per_episode.values()
            header += ["rreturn_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()       
            header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss", "grad_norm"]
            data += [meta_logs["entropy"], meta_logs["value"], meta_logs["policy_loss"], meta_logs["value_loss"], meta_logs["grad_norm"]]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_logger.writerow(header)
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)       

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {"num_frames": num_frames, "update": update,
                    "model_state": acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict()}
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
    # STEP 10

    print("Number of frames: ", num_frames)


    # CONTINUE TRAINING 3rd ENVIRONMENT ----------------------

    index = 2
    env_id = tasks_sequence[index][1]
    frames = args.frames + add_frames

    ## Hyper-parameters
    args.env = env_id
    args.frames = int(frames)

    # Load environments

    envs = []
    for i in range(args.procs):
        envs.append(utils.make_env(args.env, args.seed + 10000 * i))
    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Reshape reward function
    if args.reshape_reward:
        def reshape_reward(obs, action, reward, done):
            if not done:
                reward = -1
            else:
                reward = 1
            return reward
    else:
        reshape_reward = None

    # Load model

    acmodel = ACModel(obs_space=obs_space, action_space=envs[0].action_space, use_memory=args.mem, use_text=args.text, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))


    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps, preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss, reshape_reward)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # change to RMSProp optimizer
    algo.optimizer = torch.optim.RMSprop(algo.acmodel.parameters(), args.lr, eps=args.optim_eps)

    # delete param_groups after it has been created
    for i in range(len(algo.optimizer.param_groups)):
        del algo.optimizer.param_groups[0]

    # Re-create separate param_groups for different inner and outer loop optimizer lr

    # inner loop param group
    algo.optimizer.add_param_group({'params': [
        *acmodel.image_conv.parameters(), 
        *acmodel.memory_rnn.rnn.parameters(), 
        *acmodel.actor.parameters()], 'lr': args.lr_alpha})

    # outer loop param group
    algo.optimizer.add_param_group({'params': [
        *acmodel.critic.parameters(), 
        *acmodel.memory_rnn.key.parameters(),
        *acmodel.memory_rnn.key_.parameters(),
        *acmodel.memory_rnn.query.parameters(),
        *acmodel.memory_rnn.query_.parameters(),
        *acmodel.memory_rnn.value.parameters(),
        *acmodel.memory_rnn.value_.parameters(),
        *acmodel.memory_rnn.comm_attention_output.parameters()
        ], 'lr': args.lr_beta})

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    # Moving average parameters
    threshold = 0.90
    window = 10
    rreturn_total = 0
    i = 0

    # run just once to have initial grads in all parameters and avoid backward error on first pass
    exps, _ = algo.collect_experiences()
    algo.update_parameters(exps)

    while num_frames < args.frames: # STEP 2

        update_start_time = time.time()

        # Sample batch of tasks: STEP 3
        tasks_batch = sample_tasks(n_tasks=args.num_tasks)

        for n, task in enumerate(tasks_batch):

            algo.env = change_multienv_seed(algo.env, seed=task)
            # Sample pre-trajectories from each task: STEP 4
            pre_exps, pre_logs1 = algo.collect_experiences()
            # Unfreeze inner loop params,so grads can get updated in the inner loop
            set_freeze_status(algo.acmodel, args.inner_params, freeze=False)
            # Freeze outer loop parameters, so grads do not get updated in the inner loop
            set_freeze_status(algo.acmodel, args.outer_params, freeze=True)
            # set inner RIM recurence
            algo.recurrence = args.inner_recurrence
            # Update parameters: STEP 6
            algo.update_parameters(pre_exps)
            # Sample post-trajectories t_i from tasks T_i: STEP 7
            post_exps, post_logs1 = algo.collect_experiences()
            # Concatenate to get D_meta: STEP 8
            meta_exps = post_exps if n==0 else cat_exps(meta_exps, post_exps)
            meta_exps = DictList(meta_exps)
            meta_exps.obs = DictList(meta_exps.obs)
            meta_logs1 = post_logs1 if n==0 else cat_logs(meta_logs1, post_logs1)

        # Unfreeze outer loop params, so so grads can get updated in the outer loop
        set_freeze_status(algo.acmodel, args.outer_params, freeze=False)
        # Freeze inner loop params, so grads do not get updated in the outer loop
        set_freeze_status(algo.acmodel, args.inner_params, freeze=True)   
        
        # set outer RIM recurence
        algo.recurrence = args.outer_recurrence

        # Update parameters while keeping inner parametes (module and policy) frozen: STEP 9
        meta_logs2 = algo.update_parameters(meta_exps)

        meta_logs = {**meta_logs1, **meta_logs2}
        update_end_time = time.time()

        num_frames += meta_logs["num_frames"]
        update += 1    
        
        # Print logs

        if update % args.log_interval == 0:
            
            fps = meta_logs["num_frames"]/(update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(meta_logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(meta_logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(meta_logs["num_frames_per_episode"])
            # Moving average to break loop if mean reward threshold reached
            if args.early_stop:
                #rreturn_total +=rreturn_per_episode['mean']
                rreturn_total +=return_per_episode['mean']
                i+=1
                if i >= window:
                    rreturn_mavg = rreturn_total / i
                    if rreturn_mavg >= threshold:
                        break
                    else:
                        i = 0
                        rreturn_total = 0

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            #header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            #data += rreturn_per_episode.values()
            header += ["rreturn_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()       
            header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss", "grad_norm"]
            data += [meta_logs["entropy"], meta_logs["value"], meta_logs["policy_loss"], meta_logs["value_loss"], meta_logs["grad_norm"]]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_logger.writerow(header)
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)       

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {"num_frames": num_frames, "update": update,
                    "model_state": acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict()}
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
    # STEP 10

    print("Number of frames: ", num_frames)


# CONTINUE TRAINING 4th ENVIRONMENT ----------------------

    index = 3
    env_id = tasks_sequence[index][1]
    frames = args.frames + add_frames

    ## Hyper-parameters
    args.env = env_id
    args.frames = int(frames)

    # Load environments

    envs = []
    for i in range(args.procs):
        envs.append(utils.make_env(args.env, args.seed + 10000 * i))
    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Reshape reward function
    if args.reshape_reward:
        def reshape_reward(obs, action, reward, done):
            if not done:
                reward = -1
            else:
                reward = 1
            return reward
    else:
        reshape_reward = None

    # Load model

    acmodel = ACModel(obs_space=obs_space, action_space=envs[0].action_space, use_memory=args.mem, use_text=args.text, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))


    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps, preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss, reshape_reward)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # change to RMSProp optimizer
    algo.optimizer = torch.optim.RMSprop(algo.acmodel.parameters(), args.lr, eps=args.optim_eps)

    # delete param_groups after it has been created
    for i in range(len(algo.optimizer.param_groups)):
        del algo.optimizer.param_groups[0]

    # Re-create separate param_groups for different inner and outer loop optimizer lr

    # inner loop param group
    algo.optimizer.add_param_group({'params': [
        *acmodel.image_conv.parameters(), 
        *acmodel.memory_rnn.rnn.parameters(), 
        *acmodel.actor.parameters()], 'lr': args.lr_alpha})

    # outer loop param group
    algo.optimizer.add_param_group({'params': [
        *acmodel.critic.parameters(), 
        *acmodel.memory_rnn.key.parameters(),
        *acmodel.memory_rnn.key_.parameters(),
        *acmodel.memory_rnn.query.parameters(),
        *acmodel.memory_rnn.query_.parameters(),
        *acmodel.memory_rnn.value.parameters(),
        *acmodel.memory_rnn.value_.parameters(),
        *acmodel.memory_rnn.comm_attention_output.parameters()
        ], 'lr': args.lr_beta})

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    # Moving average parameters
    threshold = 0.90
    window = 10
    rreturn_total = 0
    i = 0

    # run just once to have initial grads in all parameters and avoid backward error on first pass
    exps, _ = algo.collect_experiences()
    algo.update_parameters(exps)

    while num_frames < args.frames: # STEP 2

        update_start_time = time.time()

        # Sample batch of tasks: STEP 3
        tasks_batch = sample_tasks(n_tasks=args.num_tasks)

        for n, task in enumerate(tasks_batch):

            algo.env = change_multienv_seed(algo.env, seed=task)
            # Sample pre-trajectories from each task: STEP 4
            pre_exps, pre_logs1 = algo.collect_experiences()
            # Unfreeze inner loop params,so grads can get updated in the inner loop
            set_freeze_status(algo.acmodel, args.inner_params, freeze=False)
            # Freeze outer loop parameters, so grads do not get updated in the inner loop
            set_freeze_status(algo.acmodel, args.outer_params, freeze=True)
            # set inner RIM recurence
            algo.recurrence = args.inner_recurrence
            # Update parameters: STEP 6
            algo.update_parameters(pre_exps)
            # Sample post-trajectories t_i from tasks T_i: STEP 7
            post_exps, post_logs1 = algo.collect_experiences()
            # Concatenate to get D_meta: STEP 8
            meta_exps = post_exps if n==0 else cat_exps(meta_exps, post_exps)
            meta_exps = DictList(meta_exps)
            meta_exps.obs = DictList(meta_exps.obs)
            meta_logs1 = post_logs1 if n==0 else cat_logs(meta_logs1, post_logs1)

        # Unfreeze outer loop params, so so grads can get updated in the outer loop
        set_freeze_status(algo.acmodel, args.outer_params, freeze=False)
        # Freeze inner loop params, so grads do not get updated in the outer loop
        set_freeze_status(algo.acmodel, args.inner_params, freeze=True)   
        
        # set outer RIM recurence
        algo.recurrence = args.outer_recurrence

        # Update parameters while keeping inner parametes (module and policy) frozen: STEP 9
        meta_logs2 = algo.update_parameters(meta_exps)

        meta_logs = {**meta_logs1, **meta_logs2}
        update_end_time = time.time()

        num_frames += meta_logs["num_frames"]
        update += 1    
        
        # Print logs

        if update % args.log_interval == 0:
            
            fps = meta_logs["num_frames"]/(update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(meta_logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(meta_logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(meta_logs["num_frames_per_episode"])
            # Moving average to break loop if mean reward threshold reached
            if args.early_stop:
                #rreturn_total +=rreturn_per_episode['mean']
                rreturn_total +=return_per_episode['mean']
                i+=1
                if i >= window:
                    rreturn_mavg = rreturn_total / i
                    if rreturn_mavg >= threshold:
                        break
                    else:
                        i = 0
                        rreturn_total = 0

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            #header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            #data += rreturn_per_episode.values()
            header += ["rreturn_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()       
            header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss", "grad_norm"]
            data += [meta_logs["entropy"], meta_logs["value"], meta_logs["policy_loss"], meta_logs["value_loss"], meta_logs["grad_norm"]]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_logger.writerow(header)
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)       

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {"num_frames": num_frames, "update": update,
                    "model_state": acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict()}
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
    # STEP 10

    print("Number of frames: ", num_frames)

### Initial settings

## Evaluation

### Load parameters

In [10]:
#env_id = 'MiniGrid-Empty-Random-6x6-v0'
#env_id = 'MiniGrid-DoorKey-6x6-v0'
#env_id = 'MiniGrid-Empty-8x8-v0'
env_id = 'MiniGrid-WallGapS6-v0'

## Hyper-parameters
args.env = env_id
args.episodes = 100
args.seed = 2
args.argmax = False
args.worst_episodes_to_show = None
print(args)

# Set seed for all randomness sources
utils.seed(args.seed)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")

{'algo': 'ppo', 'env': 'MiniGrid-WallGapS6-v0', 'model': 'test_metarims_6_4_wallgap_doorkey', 'early_stop': True, 'seed': 2, 'log_interval': 1, 'save_interval': 10, 'procs': 16, 'frames': 1000000.0, 'epochs': 4, 'batch_size': 256, 'frames_per_proc': 128, 'discount': 0.99, 'lr': 0.0007, 'gae_lambda': 0.99, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'optim_eps': 1e-08, 'optim_alpha': 0.99, 'clip_eps': 0.2, 'recurrence': 32, 'text': False, 'use_rim': True, 'meta_learn': True, 'reshape_reward': False, 'mem': True, 'num_units': 6, 'k': 4, 'input_heads': 1, 'lr_alpha': 0.0007, 'lr_beta': 0.0007, 'inner_recurrence': 8, 'outer_recurrence': 32, 'num_tasks': 2, 'inner_params': ['image_conv', 'i2h', 'h2h', 'actor'], 'outer_params': ['query', 'key', 'value', 'comm', 'critic'], 'episodes': 100, 'argmax': False, 'worst_episodes_to_show': None}
Device: cpu



### Set environments, agent and logs, Run agent and print results

In [11]:
num_frames_list = []
fps_list = []
duration_list = []
return_per_episode_list = []
num_frames_per_episode_list = []
seed_list = [10, 20, 30]

print("Env:", args.env, "\n")

for n, seed in enumerate(seed_list):

    # Load environments

    envs = []
    for i in range(args.procs):
        env = utils.make_env(args.env, seed + 10000 * i)
        envs.append(env)
    env = ParallelEnv(envs)
    print("Environments loaded")

    # Load agent

    model_dir = utils.get_model_dir(args.model)
    agent = utils.Agent(obs_space=env.observation_space, action_space=env.action_space, model_dir=model_dir, device=device, argmax=args.argmax, num_envs=args.procs, use_memory=args.mem, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
    print("Agent loaded")

    # Initialize logs

    logs = {"num_frames_per_episode": [], "return_per_episode": []}

    # Run agent

    start_time = time.time()

    obss = env.reset()

    log_done_counter = 0
    log_episode_return = torch.zeros(args.procs, device=device)
    log_episode_num_frames = torch.zeros(args.procs, device=device)

    while log_done_counter < args.episodes:
        actions = agent.get_actions(obss)
        obss, rewards, dones, _ = env.step(actions)
        agent.analyze_feedbacks(rewards, dones)

        log_episode_return += torch.tensor(rewards, device=device, dtype=torch.float)
        log_episode_num_frames += torch.ones(args.procs, device=device)

        for i, done in enumerate(dones):
            if done:
                log_done_counter += 1
                logs["return_per_episode"].append(log_episode_return[i].item())
                logs["num_frames_per_episode"].append(log_episode_num_frames[i].item())

        mask = 1 - torch.tensor(dones, device=device, dtype=torch.float)
        log_episode_return *= mask
        log_episode_num_frames *= mask

    end_time = time.time()
    print("Agent run_{} completed\n" .format(n+1))

    num_frames = sum(logs["num_frames_per_episode"])
    fps = num_frames/(end_time - start_time)
    duration = int(end_time - start_time)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    # Acumulate logs per agent

    num_frames_list.append(num_frames)
    fps_list.append(fps)
    duration_list.append(duration)
    return_per_episode_list.append(np.fromiter(return_per_episode.values(), float))
    num_frames_per_episode_list.append(np.fromiter(num_frames_per_episode.values(), float))

# Convert lists to numpy arrays
num_frames_tot = np.array(num_frames_list, ndmin=2)
fps_tot = np.array(fps_list, ndmin=2)
duration_tot = np.array(duration_list, ndmin=2)
return_per_episode_tot = np.array(return_per_episode_list, ndmin=2)
num_frames_per_episode_tot = np.array(num_frames_per_episode_list, ndmin=2)

# Print logs

# print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
#       .format(num_frames, fps, duration,
#               *return_per_episode.values(),
#               *num_frames_per_episode.values()))

print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
      .format(np.median(num_frames_tot, axis=0)[0], np.median(fps_tot, axis=0)[0], np.median(duration_tot, axis=0)[0], *np.median(return_per_episode_tot, axis=0), *np.median(num_frames_per_episode_tot, axis=0)))

#return_per_episode_tot = np.array(return_per_episode_tot, ndim=2)

# Print worst episodes
if args.worst_episodes_to_show:
    n = args.worst_episodes_to_show
    if n > 0:
        print("\n{} worst episodes:".format(n))

        indexes = sorted(range(len(logs["return_per_episode"])), key=lambda k: logs["return_per_episode"][k])
        for i in indexes[:n]:
            print("- episode {}: R={}, F={}".format(i, logs["return_per_episode"][i], logs["num_frames_per_episode"][i]))

Env: MiniGrid-WallGapS6-v0 

Environments loaded
Agent loaded
Agent run_1 completed

Environments loaded
Agent loaded
Agent run_2 completed

Environments loaded
Agent loaded
Agent run_3 completed

F 1408.0 | FPS 2974 | D 0.0 | R:μσmM 0.91 0.04 0.79 0.98 | F:μσmM 14.1 6.0 3.0 33.0


## Visualization

### Load parameters

In [None]:
import array2gif
from pathlib import Path
import numpy

## Hyper-parameters
args = {
# General parameters
'env':args.env,
'model':args.model,
'seed':15,
'shift':0,
'argmax':False,
'pause':0.1,
'gif':args.model,
'episodes':5,
# Model Parameters
'use_rim':args.use_rim,
'num_units':args.num_units,
'k':args.k
}

args = DictList(args)
print(args)

### Set environment, agent and logs

In [None]:
# Set seed for all randomness sources

utils.seed(args.seed)

# Set device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")

# Load environment

env = utils.make_env(args.env, args.seed)
for _ in range(args.shift):
    env.reset()
print("Environment loaded\n")

# Load agent

model_dir = utils.get_model_dir(args.model)
agent = utils.Agent(env.observation_space, env.action_space, model_dir, device, args.argmax, use_rim = args.use_rim, num_units = args.num_units, k = args.k)

print("Agent loaded\n")

### Make animation

In [None]:
%%capture
# Run the agent

if args.gif:
   from array2gif import write_gif
   frames = []

# Create a window to view the environment
env.render('human')

for episode in range(args.episodes):
    obs = env.reset()
    done2 = False
    while True:
        env.render('human')
        if args.gif:
            frames.append(numpy.moveaxis(env.render("rgb_array"), 2, 0))
            

        action = agent.get_action(obs)
        obs, reward, done, _ = env.step(action)
        agent.analyze_feedback(reward, done)
        
        if done or env.window.closed:
            if episode == 4:
                done2 = True
            break
    if done2 == True:
        env.close()
        break
    #if env.window.closed:
    #    break
print('doneeee')
if args.gif:
    print("Saving gif... ", end="")
    utils.create_folders_if_necessary("./animation")
    #Path("./animation").mkdir(parents=True, exist_ok=True)
    write_gif(numpy.array(frames), "./animation/"+args.gif+".gif")
    print("Done.")

In [None]:
show_animation(args.model)

### Make video

In [None]:
test_env = wrap_env(env)
observation = test_env.reset()

done = False
episode_reward = 0
episode_length = 0

while not done:
    action = agent.get_action(observation)
    observation, reward, done, info = test_env.step(action)
    episode_reward += reward
    episode_length += 1

print('Total reward:', episode_reward)
print('Total length:', episode_length)

test_env.close()
show_video()

# Continue learning on 2nd environment

## Configuration

### Set general parameters

In [12]:
#env_id = 'MiniGrid-Empty-8x8-v0'
#env_id = 'MiniGrid-LavaGapS5-v0'
#env_id = 'MiniGrid-DoorKey-5x5-v0'
#env_id = 'MiniGrid-DoorKey-5x5-v0'
env_id = 'MiniGrid-DoorKey-6x6-v0'
#env_id_1 = 'MiniGrid-WallGapS6-v0'
#env_id = 'MiniGrid-Empty-Random-6x6-v0'
#env_id = 'MiniGrid-Empty-5x5-v0'
#env_id = 'MiniGrid-RedBlueDoors-6x6-v0'

#model = 'MiniGrid-WallGapS6-v0_alt_MiniGrid-DoorKey-6x6-v0_meta_RIMs_6_4_proc_16_RMSProp_nxt_MiniGrid-RedBlueDoors-6x6-v0_end'

add_frames = 2e6
frames = frames + add_frames

## Hyper-parameters
args = {
# General parameters
'algo':'ppo',
'env':env_id,
'model':model,
'early_stop':True,
'seed':1,
'log_interval':1,
'save_interval':10,
'procs':processes,
'frames':frames, # default 1e7
# Parameters for main algorithm
'epochs':4,
'batch_size':256,
'frames_per_proc':128, # 128 for PPO and 5 per A2C
'discount':0.99,
#'lr':0.0001, # for Adam
'lr':0.0007, # for RMSProp
#'gae_lambda':0.95, # 1 means no gae, for Adam
'gae_lambda':0.99, # 1 means no gae, for RMSProp
'entropy_coef':0.01,
'value_loss_coef':0.5,
'max_grad_norm':0.5,
'optim_eps':1e-8,
'optim_alpha':0.99,
'clip_eps':0.2,
'recurrence':32, # if > 1, a LSTM is added
'text':False, # add a GRU for text input
# Model Parameters
'use_rim':True, # action = 'store_true'
'meta_learn':True,
'reshape_reward':False
}

#args = utils.dotdict(args)
args = DictList(args)

args.mem = args.recurrence > 1

# RIM specific hyperparameters
if args.use_rim:
    args.num_units = 6
    args.k = 4
    args.input_heads = 1

if args.meta_learn:
    args.lr_alpha= args.lr
    args.lr_beta= args.lr
    args.inner_recurrence= 8
    args.outer_recurrence= 32 # 4x inner_recurrence
    args.num_tasks = 2
    args.inner_params= ['image_conv', 'i2h', 'h2h', 'actor'] # params to be updated in inner loop
    args.outer_params = ['query', 'key', 'value', 'comm', 'critic'] # params to be updated in outer loop

### Load previous loggers and settings

In [14]:
# Set run dir

date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"

model_name = args.model or default_model_name
model_dir = utils.get_model_dir(model_name)

# Load loggers and Tensorboard writer

txt_logger = utils.get_txt_logger(model_dir)
csv_file, csv_logger = utils.get_csv_logger(model_dir)
tb_writer = tensorboardX.SummaryWriter(model_dir)

# Log command and all script arguments

#txt_logger.info("{}\n".format(" ".join(sys.argv)))
txt_logger.info("{}\n".format(args))

# Set seed for all randomness sources

utils.seed(args.seed)

# Set device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
txt_logger.info(f"Device: {device}\n")

{'algo': 'ppo', 'env': 'MiniGrid-DoorKey-6x6-v0', 'model': 'test_metarims_6_4_wallgap_doorkey', 'early_stop': True, 'seed': 1, 'log_interval': 1, 'save_interval': 10, 'procs': 16, 'frames': 3000000.0, 'epochs': 4, 'batch_size': 256, 'frames_per_proc': 128, 'discount': 0.99, 'lr': 0.0007, 'gae_lambda': 0.99, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'optim_eps': 1e-08, 'optim_alpha': 0.99, 'clip_eps': 0.2, 'recurrence': 32, 'text': False, 'use_rim': True, 'meta_learn': True, 'reshape_reward': False, 'mem': True, 'num_units': 6, 'k': 4, 'input_heads': 1, 'lr_alpha': 0.0007, 'lr_beta': 0.0007, 'inner_recurrence': 8, 'outer_recurrence': 32, 'num_tasks': 2, 'inner_params': ['image_conv', 'i2h', 'h2h', 'actor'], 'outer_params': ['query', 'key', 'value', 'comm', 'critic']}

Device: cpu



### Load existing environments, model and training status

In [15]:
# Load environments

envs = []
for i in range(args.procs):
    envs.append(utils.make_env(args.env, args.seed + 10000 * i))
txt_logger.info("Environments loaded\n")

# Load training status

try:
    status = utils.get_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}
txt_logger.info("Training status loaded\n")

# Load observations preprocessor

obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
if "vocab" in status:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded")

# Reshape reward function
if args.reshape_reward:
    def reshape_reward(obs, action, reward, done):
        if not done:
            reward = -1
        else:
            reward = 1
        return reward
else:
    reshape_reward = None

# Load model

acmodel = ACModel(obs_space=obs_space, action_space=envs[0].action_space, use_memory=args.mem, use_text=args.text, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo

if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
    algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                            args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss, reshape_reward)
else:
    raise ValueError("Incorrect algorithm name: {}".format(args.algo))

# change to RMSProp optimizer
algo.optimizer = torch.optim.RMSprop(algo.acmodel.parameters(), args.lr, eps=args.optim_eps)

# delete param_groups after it has been created
for i in range(len(algo.optimizer.param_groups)):
    del algo.optimizer.param_groups[0]

# Re-create separate param_groups for different inner and outer loop optimizer lr

# inner loop param group
algo.optimizer.add_param_group({'params': [
    *acmodel.image_conv.parameters(), 
    *acmodel.memory_rnn.rnn.parameters(), 
    *acmodel.actor.parameters()], 'lr': args.lr_alpha})

# outer loop param group
algo.optimizer.add_param_group({'params': [
    *acmodel.critic.parameters(), 
    *acmodel.memory_rnn.key.parameters(),
    *acmodel.memory_rnn.key_.parameters(),
    *acmodel.memory_rnn.query.parameters(),
    *acmodel.memory_rnn.query_.parameters(),
    *acmodel.memory_rnn.value.parameters(),
    *acmodel.memory_rnn.value_.parameters(),
    *acmodel.memory_rnn.comm_attention_output.parameters()
    ], 'lr': args.lr_beta})

if "optimizer_state" in status:
    algo.optimizer.load_state_dict(status["optimizer_state"])
txt_logger.info("Optimizer loaded\n")

Environments loaded

Training status loaded

Observations preprocessor loaded
Model loaded

ACModel(
  (image_conv): Sequential(
    (0): Conv2d(3, 16, kernel_size=(2, 2), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1))
    (4): ReLU()
    (5): Conv2d(32, 64, kernel_size=(2, 2), stride=(1, 1))
    (6): ReLU()
  )
  (memory_rnn): RIMCell(
    (key): Linear(in_features=64, out_features=64, bias=True)
    (value): Linear(in_features=64, out_features=64, bias=True)
    (rnn): GroupLSTMCell(
      (i2h): GroupLinearLayer()
      (h2h): GroupLinearLayer()
    )
    (query): GroupLinearLayer()
    (query_): GroupLinearLayer()
    (key_): GroupLinearLayer()
    (value_): GroupLinearLayer()
    (comm_attention_output): GroupLinearLayer()
    (comm_dropout): Dropout(p=0.1, inplace=False)
    (input_dropout): Dropout(p=0.1, inplace=False)
  )
  (actor): Sequentia

## Continue training

In [16]:
## Training

num_frames = status["num_frames"]
update = status["update"]
start_time = time.time()

# Moving average settings for early stop
threshold = 0.9
window = 10
rreturn_total = 0
i = 0

# run just once to have initial grads in all parameters and avoid backward error on first pass
exps, _ = algo.collect_experiences()
algo.update_parameters(exps)

while num_frames < args.frames: # STEP 2

    update_start_time = time.time()

    # Sample batch of tasks: STEP 3
    tasks_batch = sample_tasks(n_tasks=args.num_tasks)

    for n, task in enumerate(tasks_batch):

        algo.env = change_multienv_seed(algo.env, seed=task)
        # Sample pre-trajectories from each task: STEP 4
        pre_exps, pre_logs1 = algo.collect_experiences()
         # Unfreeze inner loop params,so grads can get updated in the inner loop
        set_freeze_status(algo.acmodel, args.inner_params, freeze=False)
        # Freeze outer loop parameters, so grads do not get updated in the inner loop
        set_freeze_status(algo.acmodel, args.outer_params, freeze=True)
        # set inner RIM recurence
        algo.recurrence = args.inner_recurrence
        # Update parameters: STEP 6
        algo.update_parameters(pre_exps)
        # Sample post-trajectories t_i from tasks T_i: STEP 7
        post_exps, post_logs1 = algo.collect_experiences()
        # Concatenate to get D_meta: STEP 8
        meta_exps = post_exps if n==0 else cat_exps(meta_exps, post_exps)
        meta_exps = DictList(meta_exps)
        meta_exps.obs = DictList(meta_exps.obs)
        meta_logs1 = post_logs1 if n==0 else cat_logs(meta_logs1, post_logs1)

    # Unfreeze outer loop params, so so grads can get updated in the outer loop
    set_freeze_status(algo.acmodel, args.outer_params, freeze=False)
    # Freeze inner loop params, so grads do not get updated in the outer loop
    set_freeze_status(algo.acmodel, args.inner_params, freeze=True)   
    
    # set outer RIM recurence
    algo.recurrence = args.outer_recurrence

    # Update parameters while keeping inner parametes (module and policy) frozen: STEP 9
    meta_logs2 = algo.update_parameters(meta_exps)

    meta_logs = {**meta_logs1, **meta_logs2}
    update_end_time = time.time()

    num_frames += meta_logs["num_frames"]
    update += 1    
    
    # Print logs

    if update % args.log_interval == 0:
        
        fps = meta_logs["num_frames"]/(update_end_time - update_start_time)
        duration = int(time.time() - start_time)
        return_per_episode = utils.synthesize(meta_logs["return_per_episode"])
        rreturn_per_episode = utils.synthesize(meta_logs["reshaped_return_per_episode"])
        num_frames_per_episode = utils.synthesize(meta_logs["num_frames_per_episode"])
        # Moving average to break loop if mean reward threshold reached
        if args.early_stop:
            #rreturn_total +=rreturn_per_episode['mean']
            rreturn_total +=return_per_episode['mean']
            i+=1
            if i >= window:
                rreturn_mavg = rreturn_total / i
                if rreturn_mavg >= threshold:
                    break
                else:
                    i = 0
                    rreturn_total = 0

        header = ["update", "frames", "FPS", "duration"]
        data = [update, num_frames, fps, duration]
        #header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
        #data += rreturn_per_episode.values()
        header += ["rreturn_" + key for key in return_per_episode.keys()]
        data += return_per_episode.values()       
        header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
        data += num_frames_per_episode.values()
        header += ["entropy", "value", "policy_loss", "value_loss", "grad_norm"]
        data += [meta_logs["entropy"], meta_logs["value"], meta_logs["policy_loss"], meta_logs["value_loss"], meta_logs["grad_norm"]]

        txt_logger.info(
            "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
            .format(*data))

        header += ["return_" + key for key in return_per_episode.keys()]
        data += return_per_episode.values()

        if status["num_frames"] == 0:
            csv_logger.writerow(header)
        csv_logger.writerow(data)
        csv_file.flush()

        for field, value in zip(header, data):
            tb_writer.add_scalar(field, value, num_frames)       

    # Save status

    if args.save_interval > 0 and update % args.save_interval == 0:
        status = {"num_frames": num_frames, "update": update,
                  "model_state": acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict()}
        if hasattr(preprocess_obss, "vocab"):
            status["vocab"] = preprocess_obss.vocab.vocab
        utils.save_status(status, model_dir)
        txt_logger.info("Status saved")
# STEP 10

print("Number of frames: ", num_frames)

U 41 | F 167936 | FPS 0445 | D 14 | rR:μσmM 0.04 0.13 0.00 0.56 | F:μσmM 335.9 73.0 0.0 360.0 | H 1.656 | V 0.207 | pL 0.272 | vL 0.016 | ∇ 0.090
U 42 | F 172032 | FPS 0441 | D 23 | rR:μσmM 0.03 0.11 0.00 0.61 | F:μσmM 379.4 121.6 303.0 1054.0 | H 1.848 | V 0.078 | pL 0.039 | vL 0.001 | ∇ 0.039
U 43 | F 176128 | FPS 0470 | D 32 | rR:μσmM 0.10 0.24 0.00 0.89 | F:μσmM 364.0 147.7 43.0 1054.0 | H 1.919 | V 0.025 | pL 0.018 | vL 0.001 | ∇ 0.044
U 44 | F 180224 | FPS 0451 | D 41 | rR:μσmM 0.12 0.22 0.00 0.89 | F:μσmM 345.3 57.1 144.0 529.0 | H 1.926 | V 0.033 | pL -0.002 | vL 0.001 | ∇ 0.064
U 45 | F 184320 | FPS 0461 | D 50 | rR:μσmM 0.10 0.21 0.00 0.72 | F:μσmM 329.4 71.3 112.0 360.0 | H 1.929 | V 0.025 | pL 0.026 | vL 0.000 | ∇ 0.027
U 46 | F 188416 | FPS 0445 | D 59 | rR:μσmM 0.12 0.26 0.00 0.92 | F:μσmM 318.8 91.0 31.0 360.0 | H 1.919 | V 0.005 | pL -0.010 | vL 0.002 | ∇ 0.068
U 47 | F 192512 | FPS 0453 | D 68 | rR:μσmM 0.06 0.16 0.00 0.55 | F:μσmM 341.1 49.8 182.0 360.0 | H 1.922 | V 

## Evaluate environment

### Load parameters

In [19]:
#env_id = 'MiniGrid-Empty-8x8-v0'
#env_id = 'MiniGrid-LavaGapS5-v0'
#env_id = 'MiniGrid-DoorKey-5x5-v0'
#env_id = 'MiniGrid-DoorKey-5x5-v0'
env_id = 'MiniGrid-DoorKey-6x6-v0'
#env_id_1 = 'MiniGrid-WallGapS6-v0'
#env_id = 'MiniGrid-Empty-Random-6x6-v0'
#env_id = 'MiniGrid-Empty-5x5-v0'
#env_id = 'MiniGrid-RedBlueDoors-6x6-v0'

args.model = 'test_metarims_6_4_wallgap_doorkey'
## Hyper-parameters
args.env = env_id
args.episodes = 100
args.seed = 2
args.argmax = False
args.worst_episodes_to_show = None
print(args)

# Set seed for all randomness sources
utils.seed(args.seed)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")

{'algo': 'ppo', 'env': 'MiniGrid-DoorKey-6x6-v0', 'model': 'test_metarims_6_4_wallgap_doorkey', 'early_stop': True, 'seed': 2, 'log_interval': 1, 'save_interval': 10, 'procs': 16, 'frames': 3000000.0, 'epochs': 4, 'batch_size': 256, 'frames_per_proc': 128, 'discount': 0.99, 'lr': 0.0007, 'gae_lambda': 0.99, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'optim_eps': 1e-08, 'optim_alpha': 0.99, 'clip_eps': 0.2, 'recurrence': 32, 'text': False, 'use_rim': True, 'meta_learn': True, 'reshape_reward': False, 'mem': True, 'num_units': 6, 'k': 4, 'input_heads': 1, 'lr_alpha': 0.0007, 'lr_beta': 0.0007, 'inner_recurrence': 8, 'outer_recurrence': 32, 'num_tasks': 2, 'inner_params': ['image_conv', 'i2h', 'h2h', 'actor'], 'outer_params': ['query', 'key', 'value', 'comm', 'critic'], 'episodes': 100, 'argmax': False, 'worst_episodes_to_show': None}
Device: cpu



### Set environments, agent and logs, Run agent and print results

In [20]:
num_frames_list = []
fps_list = []
duration_list = []
return_per_episode_list = []
num_frames_per_episode_list = []
seed_list = [10, 20, 30]

print("Env:", args.env, "\n")

for n, seed in enumerate(seed_list):

    # Load environments

    envs = []
    for i in range(args.procs):
        env = utils.make_env(args.env, seed + 10000 * i)
        envs.append(env)
    env = ParallelEnv(envs)
    print("Environments loaded")

    # Load agent

    model_dir = utils.get_model_dir(args.model)
    agent = utils.Agent(obs_space=env.observation_space, action_space=env.action_space, model_dir=model_dir, device=device, argmax=args.argmax, num_envs=args.procs, use_memory=args.mem, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
    print("Agent loaded")

    # Initialize logs

    logs = {"num_frames_per_episode": [], "return_per_episode": []}

    # Run agent

    start_time = time.time()

    obss = env.reset()

    log_done_counter = 0
    log_episode_return = torch.zeros(args.procs, device=device)
    log_episode_num_frames = torch.zeros(args.procs, device=device)

    while log_done_counter < args.episodes:
        actions = agent.get_actions(obss)
        obss, rewards, dones, _ = env.step(actions)
        agent.analyze_feedbacks(rewards, dones)

        log_episode_return += torch.tensor(rewards, device=device, dtype=torch.float)
        log_episode_num_frames += torch.ones(args.procs, device=device)

        for i, done in enumerate(dones):
            if done:
                log_done_counter += 1
                logs["return_per_episode"].append(log_episode_return[i].item())
                logs["num_frames_per_episode"].append(log_episode_num_frames[i].item())

        mask = 1 - torch.tensor(dones, device=device, dtype=torch.float)
        log_episode_return *= mask
        log_episode_num_frames *= mask

    end_time = time.time()
    print("Agent run_{} completed\n" .format(n+1))

    num_frames = sum(logs["num_frames_per_episode"])
    fps = num_frames/(end_time - start_time)
    duration = int(end_time - start_time)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    # Acumulate logs per agent

    num_frames_list.append(num_frames)
    fps_list.append(fps)
    duration_list.append(duration)
    return_per_episode_list.append(np.fromiter(return_per_episode.values(), float))
    num_frames_per_episode_list.append(np.fromiter(num_frames_per_episode.values(), float))

# Convert lists to numpy arrays
num_frames_tot = np.array(num_frames_list, ndmin=2)
fps_tot = np.array(fps_list, ndmin=2)
duration_tot = np.array(duration_list, ndmin=2)
return_per_episode_tot = np.array(return_per_episode_list, ndmin=2)
num_frames_per_episode_tot = np.array(num_frames_per_episode_list, ndmin=2)

# Print logs

# print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
#       .format(num_frames, fps, duration,
#               *return_per_episode.values(),
#               *num_frames_per_episode.values()))

print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
      .format(np.median(num_frames_tot, axis=0)[0], np.median(fps_tot, axis=0)[0], np.median(duration_tot, axis=0)[0], *np.median(return_per_episode_tot, axis=0), *np.median(num_frames_per_episode_tot, axis=0)))

#return_per_episode_tot = np.array(return_per_episode_tot, ndim=2)

# Print worst episodes
if args.worst_episodes_to_show:
    n = args.worst_episodes_to_show
    if n > 0:
        print("\n{} worst episodes:".format(n))

        indexes = sorted(range(len(logs["return_per_episode"])), key=lambda k: logs["return_per_episode"][k])
        for i in indexes[:n]:
            print("- episode {}: R={}, F={}".format(i, logs["return_per_episode"][i], logs["num_frames_per_episode"][i]))

Env: MiniGrid-DoorKey-6x6-v0 

Environments loaded
Agent loaded
Agent run_1 completed

Environments loaded
Agent loaded
Agent run_2 completed

Environments loaded
Agent loaded
Agent run_3 completed

F 3741.0 | FPS 2909 | D 1.0 | R:μσmM 0.90 0.06 0.63 0.98 | F:μσmM 39.8 23.7 10.0 146.0


# Re-evaluate previous environments and test CF

### Load parameters env 1

In [21]:
#env_id = 'MiniGrid-Empty-8x8-v0'
#env_id = 'MiniGrid-LavaGapS5-v0'
#env_id = 'MiniGrid-DoorKey-5x5-v0'
#env_id = 'MiniGrid-DoorKey-5x5-v0'
#env_id = 'MiniGrid-DoorKey-6x6-v0'
env_id = 'MiniGrid-WallGapS6-v0'
#env_id = 'MiniGrid-Empty-Random-6x6-v0'
#env_id = 'MiniGrid-Empty-5x5-v0'
#env_id = 'MiniGrid-RedBlueDoors-6x6-v0'

args.model = 'test_metarims_6_4_wallgap_doorkey'
## Hyper-parameters
args.env = env_id
args.model = model
args.episodes = 100
args.seed = 3
args.argmax = False
args.worst_episodes_to_show = None
print(args)

# Set seed for all randomness sources
utils.seed(args.seed)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")

{'algo': 'ppo', 'env': 'MiniGrid-WallGapS6-v0', 'model': 'test_metarims_6_4_wallgap_doorkey', 'early_stop': True, 'seed': 3, 'log_interval': 1, 'save_interval': 10, 'procs': 16, 'frames': 3000000.0, 'epochs': 4, 'batch_size': 256, 'frames_per_proc': 128, 'discount': 0.99, 'lr': 0.0007, 'gae_lambda': 0.99, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'optim_eps': 1e-08, 'optim_alpha': 0.99, 'clip_eps': 0.2, 'recurrence': 32, 'text': False, 'use_rim': True, 'meta_learn': True, 'reshape_reward': False, 'mem': True, 'num_units': 6, 'k': 4, 'input_heads': 1, 'lr_alpha': 0.0007, 'lr_beta': 0.0007, 'inner_recurrence': 8, 'outer_recurrence': 32, 'num_tasks': 2, 'inner_params': ['image_conv', 'i2h', 'h2h', 'actor'], 'outer_params': ['query', 'key', 'value', 'comm', 'critic'], 'episodes': 100, 'argmax': False, 'worst_episodes_to_show': None}
Device: cpu



### Set environments, agent and logs, Run agent and print results for env 1

In [22]:
num_frames_list = []
fps_list = []
duration_list = []
return_per_episode_list = []
num_frames_per_episode_list = []
seed_list = [10, 20, 30]

print("Env:", args.env, "\n")

for n, seed in enumerate(seed_list):

    # Load environments

    envs = []
    for i in range(args.procs):
        env = utils.make_env(args.env, seed + 10000 * i)
        envs.append(env)
    env = ParallelEnv(envs)
    print("Environments loaded")

    # Load agent

    model_dir = utils.get_model_dir(args.model)
    agent = utils.Agent(obs_space=env.observation_space, action_space=env.action_space, model_dir=model_dir, device=device, argmax=args.argmax, num_envs=args.procs, use_memory=args.mem, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
    print("Agent loaded")

    # Initialize logs

    logs = {"num_frames_per_episode": [], "return_per_episode": []}

    # Run agent

    start_time = time.time()

    obss = env.reset()

    log_done_counter = 0
    log_episode_return = torch.zeros(args.procs, device=device)
    log_episode_num_frames = torch.zeros(args.procs, device=device)

    while log_done_counter < args.episodes:
        actions = agent.get_actions(obss)
        obss, rewards, dones, _ = env.step(actions)
        agent.analyze_feedbacks(rewards, dones)

        log_episode_return += torch.tensor(rewards, device=device, dtype=torch.float)
        log_episode_num_frames += torch.ones(args.procs, device=device)

        for i, done in enumerate(dones):
            if done:
                log_done_counter += 1
                logs["return_per_episode"].append(log_episode_return[i].item())
                logs["num_frames_per_episode"].append(log_episode_num_frames[i].item())

        mask = 1 - torch.tensor(dones, device=device, dtype=torch.float)
        log_episode_return *= mask
        log_episode_num_frames *= mask

    end_time = time.time()
    print("Agent run_{} completed\n" .format(n+1))

    num_frames = sum(logs["num_frames_per_episode"])
    fps = num_frames/(end_time - start_time)
    duration = int(end_time - start_time)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    # Acumulate logs per agent

    num_frames_list.append(num_frames)
    fps_list.append(fps)
    duration_list.append(duration)
    return_per_episode_list.append(np.fromiter(return_per_episode.values(), float))
    num_frames_per_episode_list.append(np.fromiter(num_frames_per_episode.values(), float))

# Convert lists to numpy arrays
num_frames_tot = np.array(num_frames_list, ndmin=2)
fps_tot = np.array(fps_list, ndmin=2)
duration_tot = np.array(duration_list, ndmin=2)
return_per_episode_tot = np.array(return_per_episode_list, ndmin=2)
num_frames_per_episode_tot = np.array(num_frames_per_episode_list, ndmin=2)

# Print logs

# print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
#       .format(num_frames, fps, duration,
#               *return_per_episode.values(),
#               *num_frames_per_episode.values()))

print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
      .format(np.median(num_frames_tot, axis=0)[0], np.median(fps_tot, axis=0)[0], np.median(duration_tot, axis=0)[0], *np.median(return_per_episode_tot, axis=0), *np.median(num_frames_per_episode_tot, axis=0)))

#return_per_episode_tot = np.array(return_per_episode_tot, ndim=2)

# Print worst episodes
if args.worst_episodes_to_show:
    n = args.worst_episodes_to_show
    if n > 0:
        print("\n{} worst episodes:".format(n))

        indexes = sorted(range(len(logs["return_per_episode"])), key=lambda k: logs["return_per_episode"][k])
        for i in indexes[:n]:
            print("- episode {}: R={}, F={}".format(i, logs["return_per_episode"][i], logs["num_frames_per_episode"][i]))

Env: MiniGrid-WallGapS6-v0 

Environments loaded
Agent loaded
Agent run_1 completed

Environments loaded
Agent loaded
Agent run_2 completed

Environments loaded
Agent loaded
Agent run_3 completed

F 7167.0 | FPS 3032 | D 2.0 | R:μσmM 0.52 0.40 0.00 0.98 | F:μσmM 71.7 57.3 4.0 144.0


# Continue learning on 3rd environment

In [23]:
#env_id = 'MiniGrid-Empty-8x8-v0'
#env_id = 'MiniGrid-LavaGapS5-v0'
#env_id = 'MiniGrid-DoorKey-5x5-v0'
#env_id = 'MiniGrid-DoorKey-5x5-v0'
#env_id = 'MiniGrid-DoorKey-6x6-v0'
#env_id_1 = 'MiniGrid-WallGapS6-v0'
#env_id = 'MiniGrid-Empty-Random-6x6-v0'
#env_id = 'MiniGrid-Empty-5x5-v0'
#env_id = 'MiniGrid-RedBlueDoors-6x6-v0'
env_id = 'MiniGrid-SimpleCrossingS9N2-v0'

#model = 'MiniGrid-WallGapS6-v0_alt_MiniGrid-DoorKey-6x6-v0_meta_RIMs_6_4_proc_16_RMSProp_nxt_MiniGrid-RedBlueDoors-6x6-v0_end'
model = 'test_metarims_6_4_wallgap_doorkey_crossing'

add_frames = 2e6
frames = frames + add_frames

## Hyper-parameters
args = {
# General parameters
'algo':'ppo',
'env':env_id,
'model':model,
'early_stop':True,
'seed':1,
'log_interval':1,
'save_interval':10,
'procs':processes,
'frames':frames, # default 1e7
# Parameters for main algorithm
'epochs':4,
'batch_size':256,
'frames_per_proc':128, # 128 for PPO and 5 per A2C
'discount':0.99,
#'lr':0.0001, # for Adam
'lr':0.0007, # for RMSProp
#'gae_lambda':0.95, # 1 means no gae, for Adam
'gae_lambda':0.99, # 1 means no gae, for RMSProp
'entropy_coef':0.01,
'value_loss_coef':0.5,
'max_grad_norm':0.5,
'optim_eps':1e-8,
'optim_alpha':0.99,
'clip_eps':0.2,
'recurrence':32, # if > 1, a LSTM is added
'text':False, # add a GRU for text input
# Model Parameters
'use_rim':True, # action = 'store_true'
'meta_learn':True,
'reshape_reward':False
}

#args = utils.dotdict(args)
args = DictList(args)

args.mem = args.recurrence > 1

# RIM specific hyperparameters
if args.use_rim:
    args.num_units = 6
    args.k = 4
    args.input_heads = 1

if args.meta_learn:
    args.lr_alpha= args.lr
    args.lr_beta= args.lr
    args.inner_recurrence= 8
    args.outer_recurrence= 32 # 4x inner_recurrence
    args.num_tasks = 2
    args.inner_params= ['image_conv', 'i2h', 'h2h', 'actor'] # params to be updated in inner loop
    args.outer_params = ['query', 'key', 'value', 'comm', 'critic'] # params to be updated in outer loop

In [24]:
# Set run dir

date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"

model_name = args.model or default_model_name
model_dir = utils.get_model_dir(model_name)

# Load loggers and Tensorboard writer

txt_logger = utils.get_txt_logger(model_dir)
csv_file, csv_logger = utils.get_csv_logger(model_dir)
tb_writer = tensorboardX.SummaryWriter(model_dir)

# Log command and all script arguments

#txt_logger.info("{}\n".format(" ".join(sys.argv)))
txt_logger.info("{}\n".format(args))

# Set seed for all randomness sources

utils.seed(args.seed)

# Set device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
txt_logger.info(f"Device: {device}\n")

{'algo': 'ppo', 'env': 'MiniGrid-SimpleCrossingS9N2-v0', 'model': 'test_metarims_6_4_wallgap_doorkey_crossing', 'early_stop': True, 'seed': 1, 'log_interval': 1, 'save_interval': 10, 'procs': 16, 'frames': 5000000.0, 'epochs': 4, 'batch_size': 256, 'frames_per_proc': 128, 'discount': 0.99, 'lr': 0.0007, 'gae_lambda': 0.99, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'optim_eps': 1e-08, 'optim_alpha': 0.99, 'clip_eps': 0.2, 'recurrence': 32, 'text': False, 'use_rim': True, 'meta_learn': True, 'reshape_reward': False, 'mem': True, 'num_units': 6, 'k': 4, 'input_heads': 1, 'lr_alpha': 0.0007, 'lr_beta': 0.0007, 'inner_recurrence': 8, 'outer_recurrence': 32, 'num_tasks': 2, 'inner_params': ['image_conv', 'i2h', 'h2h', 'actor'], 'outer_params': ['query', 'key', 'value', 'comm', 'critic']}

Device: cpu



In [25]:
# Load environments

envs = []
for i in range(args.procs):
    envs.append(utils.make_env(args.env, args.seed + 10000 * i))
txt_logger.info("Environments loaded\n")

# Load training status

try:
    status = utils.get_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}
txt_logger.info("Training status loaded\n")

# Load observations preprocessor

obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
if "vocab" in status:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded")

# Reshape reward function
if args.reshape_reward:
    def reshape_reward(obs, action, reward, done):
        if not done:
            reward = -1
        else:
            reward = 1
        return reward
else:
    reshape_reward = None

# Load model

acmodel = ACModel(obs_space=obs_space, action_space=envs[0].action_space, use_memory=args.mem, use_text=args.text, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo

if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
    algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                            args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss, reshape_reward)
else:
    raise ValueError("Incorrect algorithm name: {}".format(args.algo))

# change to RMSProp optimizer
algo.optimizer = torch.optim.RMSprop(algo.acmodel.parameters(), args.lr, eps=args.optim_eps)

# delete param_groups after it has been created
for i in range(len(algo.optimizer.param_groups)):
    del algo.optimizer.param_groups[0]

# Re-create separate param_groups for different inner and outer loop optimizer lr

# inner loop param group
algo.optimizer.add_param_group({'params': [
    *acmodel.image_conv.parameters(), 
    *acmodel.memory_rnn.rnn.parameters(), 
    *acmodel.actor.parameters()], 'lr': args.lr_alpha})

# outer loop param group
algo.optimizer.add_param_group({'params': [
    *acmodel.critic.parameters(), 
    *acmodel.memory_rnn.key.parameters(),
    *acmodel.memory_rnn.key_.parameters(),
    *acmodel.memory_rnn.query.parameters(),
    *acmodel.memory_rnn.query_.parameters(),
    *acmodel.memory_rnn.value.parameters(),
    *acmodel.memory_rnn.value_.parameters(),
    *acmodel.memory_rnn.comm_attention_output.parameters()
    ], 'lr': args.lr_beta})

if "optimizer_state" in status:
    algo.optimizer.load_state_dict(status["optimizer_state"])
txt_logger.info("Optimizer loaded\n")

Environments loaded

Training status loaded

Observations preprocessor loaded
Model loaded

ACModel(
  (image_conv): Sequential(
    (0): Conv2d(3, 16, kernel_size=(2, 2), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1))
    (4): ReLU()
    (5): Conv2d(32, 64, kernel_size=(2, 2), stride=(1, 1))
    (6): ReLU()
  )
  (memory_rnn): RIMCell(
    (key): Linear(in_features=64, out_features=64, bias=True)
    (value): Linear(in_features=64, out_features=64, bias=True)
    (rnn): GroupLSTMCell(
      (i2h): GroupLinearLayer()
      (h2h): GroupLinearLayer()
    )
    (query): GroupLinearLayer()
    (query_): GroupLinearLayer()
    (key_): GroupLinearLayer()
    (value_): GroupLinearLayer()
    (comm_attention_output): GroupLinearLayer()
    (comm_dropout): Dropout(p=0.1, inplace=False)
    (input_dropout): Dropout(p=0.1, inplace=False)
  )
  (actor): Sequentia

In [26]:
## Training

num_frames = status["num_frames"]
update = status["update"]
start_time = time.time()

# Moving average settings for early stop
threshold = 0.9
window = 10
rreturn_total = 0
i = 0

# run just once to have initial grads in all parameters and avoid backward error on first pass
exps, _ = algo.collect_experiences()
algo.update_parameters(exps)

while num_frames < args.frames: # STEP 2

    update_start_time = time.time()

    # Sample batch of tasks: STEP 3
    tasks_batch = sample_tasks(n_tasks=args.num_tasks)

    for n, task in enumerate(tasks_batch):

        algo.env = change_multienv_seed(algo.env, seed=task)
        # Sample pre-trajectories from each task: STEP 4
        pre_exps, pre_logs1 = algo.collect_experiences()
         # Unfreeze inner loop params,so grads can get updated in the inner loop
        set_freeze_status(algo.acmodel, args.inner_params, freeze=False)
        # Freeze outer loop parameters, so grads do not get updated in the inner loop
        set_freeze_status(algo.acmodel, args.outer_params, freeze=True)
        # set inner RIM recurence
        algo.recurrence = args.inner_recurrence
        # Update parameters: STEP 6
        algo.update_parameters(pre_exps)
        # Sample post-trajectories t_i from tasks T_i: STEP 7
        post_exps, post_logs1 = algo.collect_experiences()
        # Concatenate to get D_meta: STEP 8
        meta_exps = post_exps if n==0 else cat_exps(meta_exps, post_exps)
        meta_exps = DictList(meta_exps)
        meta_exps.obs = DictList(meta_exps.obs)
        meta_logs1 = post_logs1 if n==0 else cat_logs(meta_logs1, post_logs1)

    # Unfreeze outer loop params, so so grads can get updated in the outer loop
    set_freeze_status(algo.acmodel, args.outer_params, freeze=False)
    # Freeze inner loop params, so grads do not get updated in the outer loop
    set_freeze_status(algo.acmodel, args.inner_params, freeze=True)   
    
    # set outer RIM recurence
    algo.recurrence = args.outer_recurrence

    # Update parameters while keeping inner parametes (module and policy) frozen: STEP 9
    meta_logs2 = algo.update_parameters(meta_exps)

    meta_logs = {**meta_logs1, **meta_logs2}
    update_end_time = time.time()

    num_frames += meta_logs["num_frames"]
    update += 1    
    
    # Print logs

    if update % args.log_interval == 0:
        
        fps = meta_logs["num_frames"]/(update_end_time - update_start_time)
        duration = int(time.time() - start_time)
        return_per_episode = utils.synthesize(meta_logs["return_per_episode"])
        rreturn_per_episode = utils.synthesize(meta_logs["reshaped_return_per_episode"])
        num_frames_per_episode = utils.synthesize(meta_logs["num_frames_per_episode"])
        # Moving average to break loop if mean reward threshold reached
        if args.early_stop:
            #rreturn_total +=rreturn_per_episode['mean']
            rreturn_total +=return_per_episode['mean']
            i+=1
            if i >= window:
                rreturn_mavg = rreturn_total / i
                if rreturn_mavg >= threshold:
                    break
                else:
                    i = 0
                    rreturn_total = 0

        header = ["update", "frames", "FPS", "duration"]
        data = [update, num_frames, fps, duration]
        #header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
        #data += rreturn_per_episode.values()
        header += ["rreturn_" + key for key in return_per_episode.keys()]
        data += return_per_episode.values()       
        header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
        data += num_frames_per_episode.values()
        header += ["entropy", "value", "policy_loss", "value_loss", "grad_norm"]
        data += [meta_logs["entropy"], meta_logs["value"], meta_logs["policy_loss"], meta_logs["value_loss"], meta_logs["grad_norm"]]

        txt_logger.info(
            "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
            .format(*data))

        header += ["return_" + key for key in return_per_episode.keys()]
        data += return_per_episode.values()

        if status["num_frames"] == 0:
            csv_logger.writerow(header)
        csv_logger.writerow(data)
        csv_file.flush()

        for field, value in zip(header, data):
            tb_writer.add_scalar(field, value, num_frames)       

    # Save status

    if args.save_interval > 0 and update % args.save_interval == 0:
        status = {"num_frames": num_frames, "update": update,
                  "model_state": acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict()}
        if hasattr(preprocess_obss, "vocab"):
            status["vocab"] = preprocess_obss.vocab.vocab
        utils.save_status(status, model_dir)
        txt_logger.info("Status saved")
# STEP 10

print("Number of frames: ", num_frames)

U 71 | F 290816 | FPS 0469 | D 14 | rR:μσmM 0.30 0.35 0.00 0.91 | F:μσmM 237.3 106.6 34.0 324.0 | H 1.798 | V 0.189 | pL 0.117 | vL 0.014 | ∇ 0.114
U 72 | F 294912 | FPS 0341 | D 26 | rR:μσmM 0.22 0.32 0.00 0.84 | F:μσmM 257.0 98.5 59.0 324.0 | H 1.853 | V 0.109 | pL 0.030 | vL 0.009 | ∇ 0.080
U 73 | F 299008 | FPS 0456 | D 35 | rR:μσmM 0.32 0.32 0.00 0.83 | F:μσmM 230.5 100.4 60.0 324.0 | H 1.891 | V 0.092 | pL -0.018 | vL 0.014 | ∇ 0.095
U 74 | F 303104 | FPS 0460 | D 44 | rR:μσmM 0.29 0.33 0.00 0.83 | F:μσmM 236.2 100.6 61.0 324.0 | H 1.890 | V 0.052 | pL -0.003 | vL 0.006 | ∇ 0.074
U 75 | F 307200 | FPS 0463 | D 53 | rR:μσmM 0.19 0.31 0.00 0.87 | F:μσmM 287.7 138.9 48.0 851.0 | H 1.895 | V 0.060 | pL -0.016 | vL 0.015 | ∇ 0.115
U 76 | F 311296 | FPS 0453 | D 62 | rR:μσmM 0.20 0.26 0.00 0.71 | F:μσmM 278.5 79.0 104.0 433.0 | H 1.895 | V 0.053 | pL -0.008 | vL 0.007 | ∇ 0.071
U 77 | F 315392 | FPS 0434 | D 71 | rR:μσmM 0.21 0.28 0.00 0.87 | F:μσmM 277.7 101.3 46.0 577.0 | H 1.904 | V

## Evaluate 3rd environment

In [28]:
#env_id = 'MiniGrid-Empty-8x8-v0'
#env_id = 'MiniGrid-LavaGapS5-v0'
#env_id = 'MiniGrid-DoorKey-5x5-v0'
#env_id = 'MiniGrid-DoorKey-5x5-v0'
#env_id = 'MiniGrid-DoorKey-6x6-v0'
#env_id_1 = 'MiniGrid-WallGapS6-v0'
#env_id = 'MiniGrid-Empty-Random-6x6-v0'
#env_id = 'MiniGrid-Empty-5x5-v0'
#env_id = 'MiniGrid-RedBlueDoors-6x6-v0'
env_id = 'MiniGrid-SimpleCrossingS9N2-v0'

args.model = 'test_metarims_6_4_wallgap_doorkey_crossing'
## Hyper-parameters
args.env = env_id
args.episodes = 100
args.seed = 2
args.argmax = False
args.worst_episodes_to_show = None
print(args)

# Set seed for all randomness sources
utils.seed(args.seed)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")

{'algo': 'ppo', 'env': 'MiniGrid-SimpleCrossingS9N2-v0', 'model': 'test_metarims_6_4_wallgap_doorkey_crossing', 'early_stop': True, 'seed': 2, 'log_interval': 1, 'save_interval': 10, 'procs': 16, 'frames': 5000000.0, 'epochs': 4, 'batch_size': 256, 'frames_per_proc': 128, 'discount': 0.99, 'lr': 0.0007, 'gae_lambda': 0.99, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'optim_eps': 1e-08, 'optim_alpha': 0.99, 'clip_eps': 0.2, 'recurrence': 32, 'text': False, 'use_rim': True, 'meta_learn': True, 'reshape_reward': False, 'mem': True, 'num_units': 6, 'k': 4, 'input_heads': 1, 'lr_alpha': 0.0007, 'lr_beta': 0.0007, 'inner_recurrence': 8, 'outer_recurrence': 32, 'num_tasks': 2, 'inner_params': ['image_conv', 'i2h', 'h2h', 'actor'], 'outer_params': ['query', 'key', 'value', 'comm', 'critic'], 'episodes': 100, 'argmax': False, 'worst_episodes_to_show': None}
Device: cpu



In [29]:
num_frames_list = []
fps_list = []
duration_list = []
return_per_episode_list = []
num_frames_per_episode_list = []
seed_list = [10, 20, 30]

print("Env:", args.env, "\n")

for n, seed in enumerate(seed_list):

    # Load environments

    envs = []
    for i in range(args.procs):
        env = utils.make_env(args.env, seed + 10000 * i)
        envs.append(env)
    env = ParallelEnv(envs)
    print("Environments loaded")

    # Load agent

    model_dir = utils.get_model_dir(args.model)
    agent = utils.Agent(obs_space=env.observation_space, action_space=env.action_space, model_dir=model_dir, device=device, argmax=args.argmax, num_envs=args.procs, use_memory=args.mem, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
    print("Agent loaded")

    # Initialize logs

    logs = {"num_frames_per_episode": [], "return_per_episode": []}

    # Run agent

    start_time = time.time()

    obss = env.reset()

    log_done_counter = 0
    log_episode_return = torch.zeros(args.procs, device=device)
    log_episode_num_frames = torch.zeros(args.procs, device=device)

    while log_done_counter < args.episodes:
        actions = agent.get_actions(obss)
        obss, rewards, dones, _ = env.step(actions)
        agent.analyze_feedbacks(rewards, dones)

        log_episode_return += torch.tensor(rewards, device=device, dtype=torch.float)
        log_episode_num_frames += torch.ones(args.procs, device=device)

        for i, done in enumerate(dones):
            if done:
                log_done_counter += 1
                logs["return_per_episode"].append(log_episode_return[i].item())
                logs["num_frames_per_episode"].append(log_episode_num_frames[i].item())

        mask = 1 - torch.tensor(dones, device=device, dtype=torch.float)
        log_episode_return *= mask
        log_episode_num_frames *= mask

    end_time = time.time()
    print("Agent run_{} completed\n" .format(n+1))

    num_frames = sum(logs["num_frames_per_episode"])
    fps = num_frames/(end_time - start_time)
    duration = int(end_time - start_time)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    # Acumulate logs per agent

    num_frames_list.append(num_frames)
    fps_list.append(fps)
    duration_list.append(duration)
    return_per_episode_list.append(np.fromiter(return_per_episode.values(), float))
    num_frames_per_episode_list.append(np.fromiter(num_frames_per_episode.values(), float))

# Convert lists to numpy arrays
num_frames_tot = np.array(num_frames_list, ndmin=2)
fps_tot = np.array(fps_list, ndmin=2)
duration_tot = np.array(duration_list, ndmin=2)
return_per_episode_tot = np.array(return_per_episode_list, ndmin=2)
num_frames_per_episode_tot = np.array(num_frames_per_episode_list, ndmin=2)

# Print logs

# print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
#       .format(num_frames, fps, duration,
#               *return_per_episode.values(),
#               *num_frames_per_episode.values()))

print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
      .format(np.median(num_frames_tot, axis=0)[0], np.median(fps_tot, axis=0)[0], np.median(duration_tot, axis=0)[0], *np.median(return_per_episode_tot, axis=0), *np.median(num_frames_per_episode_tot, axis=0)))

#return_per_episode_tot = np.array(return_per_episode_tot, ndim=2)

# Print worst episodes
if args.worst_episodes_to_show:
    n = args.worst_episodes_to_show
    if n > 0:
        print("\n{} worst episodes:".format(n))

        indexes = sorted(range(len(logs["return_per_episode"])), key=lambda k: logs["return_per_episode"][k])
        for i in indexes[:n]:
            print("- episode {}: R={}, F={}".format(i, logs["return_per_episode"][i], logs["num_frames_per_episode"][i]))

Env: MiniGrid-SimpleCrossingS9N2-v0 

Environments loaded
Agent loaded
Agent run_1 completed

Environments loaded
Agent loaded
Agent run_2 completed

Environments loaded
Agent loaded
Agent run_3 completed

F 3442.0 | FPS 2955 | D 1.0 | R:μσmM 0.89 0.07 0.57 0.95 | F:μσmM 40.3 26.6 17.0 155.0


# Re-evaluate 1st environment and test CF

In [30]:
#env_id = 'MiniGrid-Empty-Random-6x6-v0'
#env_id = 'MiniGrid-DoorKey-6x6-v0'
#env_id = 'MiniGrid-Empty-8x8-v0'
#env_id = 'MiniGrid-LavaGapS5-v0'
env_id = 'MiniGrid-WallGapS6-v0'

args.model = 'test_metarims_6_4_wallgap_doorkey_crossing'
## Hyper-parameters
args.env = env_id
args.episodes = 100
args.seed = 3
args.argmax = False
args.worst_episodes_to_show = None
print(args)


# Set seed for all randomness sources
utils.seed(args.seed)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")

{'algo': 'ppo', 'env': 'MiniGrid-WallGapS6-v0', 'model': 'test_metarims_6_4_wallgap_doorkey_crossing', 'early_stop': True, 'seed': 3, 'log_interval': 1, 'save_interval': 10, 'procs': 16, 'frames': 5000000.0, 'epochs': 4, 'batch_size': 256, 'frames_per_proc': 128, 'discount': 0.99, 'lr': 0.0007, 'gae_lambda': 0.99, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'optim_eps': 1e-08, 'optim_alpha': 0.99, 'clip_eps': 0.2, 'recurrence': 32, 'text': False, 'use_rim': True, 'meta_learn': True, 'reshape_reward': False, 'mem': True, 'num_units': 6, 'k': 4, 'input_heads': 1, 'lr_alpha': 0.0007, 'lr_beta': 0.0007, 'inner_recurrence': 8, 'outer_recurrence': 32, 'num_tasks': 2, 'inner_params': ['image_conv', 'i2h', 'h2h', 'actor'], 'outer_params': ['query', 'key', 'value', 'comm', 'critic'], 'episodes': 100, 'argmax': False, 'worst_episodes_to_show': None}
Device: cpu



In [31]:
num_frames_list = []
fps_list = []
duration_list = []
return_per_episode_list = []
num_frames_per_episode_list = []
seed_list = [10, 20, 30]

print("Env:", args.env, "\n")

for n, seed in enumerate(seed_list):

    # Load environments

    envs = []
    for i in range(args.procs):
        env = utils.make_env(args.env, seed + 10000 * i)
        envs.append(env)
    env = ParallelEnv(envs)
    print("Environments loaded")

    # Load agent

    model_dir = utils.get_model_dir(args.model)
    agent = utils.Agent(obs_space=env.observation_space, action_space=env.action_space, model_dir=model_dir, device=device, argmax=args.argmax, num_envs=args.procs, use_memory=args.mem, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
    print("Agent loaded")

    # Initialize logs

    logs = {"num_frames_per_episode": [], "return_per_episode": []}

    # Run agent

    start_time = time.time()

    obss = env.reset()

    log_done_counter = 0
    log_episode_return = torch.zeros(args.procs, device=device)
    log_episode_num_frames = torch.zeros(args.procs, device=device)

    while log_done_counter < args.episodes:
        actions = agent.get_actions(obss)
        obss, rewards, dones, _ = env.step(actions)
        agent.analyze_feedbacks(rewards, dones)

        log_episode_return += torch.tensor(rewards, device=device, dtype=torch.float)
        log_episode_num_frames += torch.ones(args.procs, device=device)

        for i, done in enumerate(dones):
            if done:
                log_done_counter += 1
                logs["return_per_episode"].append(log_episode_return[i].item())
                logs["num_frames_per_episode"].append(log_episode_num_frames[i].item())

        mask = 1 - torch.tensor(dones, device=device, dtype=torch.float)
        log_episode_return *= mask
        log_episode_num_frames *= mask

    end_time = time.time()
    print("Agent run_{} completed\n" .format(n+1))

    num_frames = sum(logs["num_frames_per_episode"])
    fps = num_frames/(end_time - start_time)
    duration = int(end_time - start_time)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    # Acumulate logs per agent

    num_frames_list.append(num_frames)
    fps_list.append(fps)
    duration_list.append(duration)
    return_per_episode_list.append(np.fromiter(return_per_episode.values(), float))
    num_frames_per_episode_list.append(np.fromiter(num_frames_per_episode.values(), float))

# Convert lists to numpy arrays
num_frames_tot = np.array(num_frames_list, ndmin=2)
fps_tot = np.array(fps_list, ndmin=2)
duration_tot = np.array(duration_list, ndmin=2)
return_per_episode_tot = np.array(return_per_episode_list, ndmin=2)
num_frames_per_episode_tot = np.array(num_frames_per_episode_list, ndmin=2)

# Print logs

# print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
#       .format(num_frames, fps, duration,
#               *return_per_episode.values(),
#               *num_frames_per_episode.values()))

print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
      .format(np.median(num_frames_tot, axis=0)[0], np.median(fps_tot, axis=0)[0], np.median(duration_tot, axis=0)[0], *np.median(return_per_episode_tot, axis=0), *np.median(num_frames_per_episode_tot, axis=0)))

#return_per_episode_tot = np.array(return_per_episode_tot, ndim=2)

# Print worst episodes
if args.worst_episodes_to_show:
    n = args.worst_episodes_to_show
    if n > 0:
        print("\n{} worst episodes:".format(n))

        indexes = sorted(range(len(logs["return_per_episode"])), key=lambda k: logs["return_per_episode"][k])
        for i in indexes[:n]:
            print("- episode {}: R={}, F={}".format(i, logs["return_per_episode"][i], logs["num_frames_per_episode"][i]))

Env: MiniGrid-WallGapS6-v0 

Environments loaded
Agent loaded
Agent run_1 completed

Environments loaded
Agent loaded
Agent run_2 completed

Environments loaded
Agent loaded
Agent run_3 completed

F 1770.0 | FPS 2856 | D 0.0 | R:μσmM 0.89 0.06 0.56 0.99 | F:μσmM 17.7 9.9 2.0 71.0


# Re-evaluate 2nd environment and test CF

In [32]:
#env_id = 'MiniGrid-Empty-Random-6x6-v0'
env_id = 'MiniGrid-DoorKey-6x6-v0'
#env_id = 'MiniGrid-DoorKey-8x8-v0'
#env_id = 'MiniGrid-Empty-8x8-v0'
#env_id = 'MiniGrid-LavaGapS5-v0'
#env_id = 'MiniGrid-WallGapS6-v0'

args.model = 'test_metarims_6_4_wallgap_doorkey_crossing'
## Hyper-parameters
args.env = env_id
args.episodes = 100
args.seed = 3
args.argmax = False
args.worst_episodes_to_show = None
print(args)


# Set seed for all randomness sources
utils.seed(args.seed)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")

{'algo': 'ppo', 'env': 'MiniGrid-DoorKey-6x6-v0', 'model': 'test_metarims_6_4_wallgap_doorkey_crossing', 'early_stop': True, 'seed': 3, 'log_interval': 1, 'save_interval': 10, 'procs': 16, 'frames': 5000000.0, 'epochs': 4, 'batch_size': 256, 'frames_per_proc': 128, 'discount': 0.99, 'lr': 0.0007, 'gae_lambda': 0.99, 'entropy_coef': 0.01, 'value_loss_coef': 0.5, 'max_grad_norm': 0.5, 'optim_eps': 1e-08, 'optim_alpha': 0.99, 'clip_eps': 0.2, 'recurrence': 32, 'text': False, 'use_rim': True, 'meta_learn': True, 'reshape_reward': False, 'mem': True, 'num_units': 6, 'k': 4, 'input_heads': 1, 'lr_alpha': 0.0007, 'lr_beta': 0.0007, 'inner_recurrence': 8, 'outer_recurrence': 32, 'num_tasks': 2, 'inner_params': ['image_conv', 'i2h', 'h2h', 'actor'], 'outer_params': ['query', 'key', 'value', 'comm', 'critic'], 'episodes': 100, 'argmax': False, 'worst_episodes_to_show': None}
Device: cpu



In [33]:
num_frames_list = []
fps_list = []
duration_list = []
return_per_episode_list = []
num_frames_per_episode_list = []
seed_list = [10, 20, 30]

print("Env:", args.env, "\n")

for n, seed in enumerate(seed_list):

    # Load environments

    envs = []
    for i in range(args.procs):
        env = utils.make_env(args.env, seed + 10000 * i)
        envs.append(env)
    env = ParallelEnv(envs)
    print("Environments loaded")

    # Load agent

    model_dir = utils.get_model_dir(args.model)
    agent = utils.Agent(obs_space=env.observation_space, action_space=env.action_space, model_dir=model_dir, device=device, argmax=args.argmax, num_envs=args.procs, use_memory=args.mem, use_rim=args.use_rim, num_units=args.num_units, k=args.k, input_heads=args.input_heads)
    print("Agent loaded")

    # Initialize logs

    logs = {"num_frames_per_episode": [], "return_per_episode": []}

    # Run agent

    start_time = time.time()

    obss = env.reset()

    log_done_counter = 0
    log_episode_return = torch.zeros(args.procs, device=device)
    log_episode_num_frames = torch.zeros(args.procs, device=device)

    while log_done_counter < args.episodes:
        actions = agent.get_actions(obss)
        obss, rewards, dones, _ = env.step(actions)
        agent.analyze_feedbacks(rewards, dones)

        log_episode_return += torch.tensor(rewards, device=device, dtype=torch.float)
        log_episode_num_frames += torch.ones(args.procs, device=device)

        for i, done in enumerate(dones):
            if done:
                log_done_counter += 1
                logs["return_per_episode"].append(log_episode_return[i].item())
                logs["num_frames_per_episode"].append(log_episode_num_frames[i].item())

        mask = 1 - torch.tensor(dones, device=device, dtype=torch.float)
        log_episode_return *= mask
        log_episode_num_frames *= mask

    end_time = time.time()
    print("Agent run_{} completed\n" .format(n+1))

    num_frames = sum(logs["num_frames_per_episode"])
    fps = num_frames/(end_time - start_time)
    duration = int(end_time - start_time)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    # Acumulate logs per agent

    num_frames_list.append(num_frames)
    fps_list.append(fps)
    duration_list.append(duration)
    return_per_episode_list.append(np.fromiter(return_per_episode.values(), float))
    num_frames_per_episode_list.append(np.fromiter(num_frames_per_episode.values(), float))

# Convert lists to numpy arrays
num_frames_tot = np.array(num_frames_list, ndmin=2)
fps_tot = np.array(fps_list, ndmin=2)
duration_tot = np.array(duration_list, ndmin=2)
return_per_episode_tot = np.array(return_per_episode_list, ndmin=2)
num_frames_per_episode_tot = np.array(num_frames_per_episode_list, ndmin=2)

# Print logs

# print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
#       .format(num_frames, fps, duration,
#               *return_per_episode.values(),
#               *num_frames_per_episode.values()))

print("F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
      .format(np.median(num_frames_tot, axis=0)[0], np.median(fps_tot, axis=0)[0], np.median(duration_tot, axis=0)[0], *np.median(return_per_episode_tot, axis=0), *np.median(num_frames_per_episode_tot, axis=0)))

#return_per_episode_tot = np.array(return_per_episode_tot, ndim=2)

# Print worst episodes
if args.worst_episodes_to_show:
    n = args.worst_episodes_to_show
    if n > 0:
        print("\n{} worst episodes:".format(n))

        indexes = sorted(range(len(logs["return_per_episode"])), key=lambda k: logs["return_per_episode"][k])
        for i in indexes[:n]:
            print("- episode {}: R={}, F={}".format(i, logs["return_per_episode"][i], logs["num_frames_per_episode"][i]))

Env: MiniGrid-DoorKey-6x6-v0 

Environments loaded
Agent loaded
Agent run_1 completed

Environments loaded
Agent loaded
Agent run_2 completed

Environments loaded
Agent loaded
Agent run_3 completed

F 21860.0 | FPS 3266 | D 6.0 | R:μσmM 0.42 0.35 0.00 0.95 | F:μσmM 215.8 124.2 18.0 360.0
