In [1]:
import copy
import glob
import os
import time
from collections import deque

import gym
import gym_nav
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from a2c_ppo_acktr import algo, utils
from a2c_ppo_acktr.algo import gail
from a2c_ppo_acktr.arguments import get_args
from a2c_ppo_acktr.envs import make_vec_envs
from a2c_ppo_acktr.model import Policy
from a2c_ppo_acktr.storage import RolloutStorage
from evaluation import evaluate


In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration: every tunable value used by this cell.
# ---------------------------------------------------------------------------

# Environment, logging and device.
env_name = 'Gridworld-v0'
log_dir = '/tmp/gym'
device = torch.device("cpu")

# Which agent to build further down: 'a2c' or 'ppo'.
alg = 'ppo'

# Training statistics are printed every `log_interval` updates.
log_interval = 10

# Rollout sizing: total environment steps, steps collected per update,
# and the number of parallel environment workers.
num_env_steps, num_steps, num_processes = 10000, 5, 1

# Return computation (all forwarded to rollouts.compute_returns).
gamma = 0.99
use_gae, gae_lambda = False, 0.95
use_proper_time_limits = False

# Loss weighting and optimiser settings handed to both agent constructors.
value_loss_coef, entropy_coef = 0.5, 0.01
lr, eps = 7e-4, 1e-5
max_grad_norm = 0.5

# Passed to algo.A2C_ACKTR only.
alpha = 0.99

# Passed to algo.PPO only.
clip_param, ppo_epoch, num_mini_batch = 0.2, 4, 1

# Stand-alone (non-vectorised) environment instance.
# NOTE(review): nothing in this cell reads `env`; kept in case later cells
# inspect it -- confirm and drop if unused.
env = gym.make(env_name)

# Vectorised environments used for training. The worker count is taken from
# `num_processes` (previously a literal 1) so it always matches the size of
# the RolloutStorage buffer built from the same setting.
# Positional arguments: env name, seed (0), worker count, gamma, log
# directory, device, then a flag passed as False.
# NOTE(review): the flag is presumably `allow_early_resets` -- confirm against
# the make_vec_envs signature.
envs = make_vec_envs(env_name, 0, num_processes, gamma, log_dir, device, False,
                     capture_video=1, env_kwargs={})

# Policy network sized from the vectorised env's observation/action spaces,
# with a recurrent base so hidden state is carried between steps.
actor_critic = Policy(
    envs.observation_space.shape,
    envs.action_space,
    base_kwargs={'recurrent': True})
actor_critic.to(device)

# Build the learning agent selected by `alg`. Both constructors receive the
# shared policy plus the loss coefficients and optimiser settings from the
# configuration above; PPO additionally takes its clipping/epoch/minibatch
# parameters, A2C additionally takes `alpha`.
if alg == 'a2c':
    agent = algo.A2C_ACKTR(
        actor_critic,
        value_loss_coef,
        entropy_coef,
        lr=lr,
        eps=eps,
        alpha=alpha,
        max_grad_norm=max_grad_norm)
elif alg == 'ppo':
    agent = algo.PPO(
        actor_critic,
        clip_param,
        ppo_epoch,
        num_mini_batch,
        value_loss_coef,
        entropy_coef,
        lr=lr,
        eps=eps,
        max_grad_norm=max_grad_norm)
else:
    # Fail immediately with a clear message instead of hitting a NameError on
    # `agent` later inside the training loop.
    raise ValueError(
        "Unsupported alg {!r}; expected 'a2c' or 'ppo'".format(alg))

# Buffer holding `num_steps` transitions for each of the `num_processes`
# workers, including the policy's recurrent hidden state.
rollouts = RolloutStorage(
    num_steps,
    num_processes,
    envs.observation_space.shape,
    envs.action_space,
    actor_critic.recurrent_hidden_state_size,
)

# Seed slot 0 of the buffer with the initial observation, then move the
# buffer to the training device.
obs = envs.reset()
rollouts.obs[0].copy_(obs)
rollouts.to(device)

# Bookkeeping: running count of environment steps taken, and a sliding
# window over the returns of the 10 most recently finished episodes.
global_step = 0
episode_rewards = deque(maxlen=10)

# Wall-clock reference for the FPS figure in the training log.
start = time.time()

# Number of policy updates that fit in the environment-step budget.
num_updates = int(num_env_steps) // num_steps // num_processes
# Main training loop: each iteration collects `num_steps` transitions per
# worker, computes returns, runs one agent update and periodically logs.
for j in range(num_updates):

    # ---- Rollout collection ------------------------------------------------
    for step in range(num_steps):
        # Andy: add global step
        global_step += 1 * num_processes
        # Sample actions (no gradients are needed while acting).
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                rollouts.masks[step])

        # Observe reward and next obs
        obs, reward, done, infos = envs.step(action)

        # An 'episode' entry in an info dict carries the finished episode's
        # return under key 'r'; collect it for the log statistics.
        for info in infos:
            if 'episode' in info.keys():
                episode_rewards.append(info['episode']['r'])

        # If done then clean the history of observations.
        # masks: 0.0 where the episode just ended, 1.0 otherwise.
        masks = torch.FloatTensor(
            [[0.0] if done_ else [1.0] for done_ in done])
        # bad_masks: 0.0 where the info dict flags a 'bad_transition'.
        bad_masks = torch.FloatTensor(
            [[0.0] if 'bad_transition' in info.keys() else [1.0]
             for info in infos])
        rollouts.insert(obs, recurrent_hidden_states, action,
                        action_log_prob, value, reward, masks, bad_masks)

    # ---- Return computation ------------------------------------------------
    # Value estimate for the last stored observation, used to bootstrap the
    # returns of the collected rollout.
    with torch.no_grad():
        next_value = actor_critic.get_value(
            rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
            rollouts.masks[-1]).detach()

    rollouts.compute_returns(next_value, use_gae, gamma,
                             gae_lambda, use_proper_time_limits)

    # ---- Policy update -----------------------------------------------------
    # The PPO agent returns two extra diagnostics (approx_kl, clipfracs).
    if alg == 'ppo':
        value_loss, action_loss, dist_entropy, approx_kl, clipfracs = \
            agent.update(rollouts)
    else:
        value_loss, action_loss, dist_entropy = agent.update(rollouts)

    rollouts.after_update()

    # ---- Logging -----------------------------------------------------------
    # Only log once at least two episodes have finished, so the reward
    # statistics are meaningful.
    if j % log_interval == 0 and len(episode_rewards) > 1:
        total_num_steps = (j + 1) * num_processes * num_steps
        end = time.time()
        # The original format string had 8 placeholders for 11 arguments, so
        # dist_entropy / value_loss / action_loss were silently discarded;
        # they are now part of the message.
        # NOTE(review): assumes the three loss values are scalars that accept
        # a float format spec -- confirm against agent.update.
        print(
            "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n entropy {:.3f}, value loss {:.3f}, action loss {:.3f}\n"
            .format(j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards), dist_entropy, value_loss,
                    action_loss))



  f"Overwriting existing videos at {self.video_folder} folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)"


Updates 80, num timesteps 405, FPS 71 
 Last 2 training episodes: mean/median reward -200.0/-200.0, min/max reward -200.0/-200.0

Updates 90, num timesteps 455, FPS 73 
 Last 2 training episodes: mean/median reward -200.0/-200.0, min/max reward -200.0/-200.0

Updates 100, num timesteps 505, FPS 74 
 Last 2 training episodes: mean/median reward -200.0/-200.0, min/max reward -200.0/-200.0

Updates 110, num timesteps 555, FPS 74 
 Last 2 training episodes: mean/median reward -200.0/-200.0, min/max reward -200.0/-200.0

Updates 120, num timesteps 605, FPS 72 
 Last 3 training episodes: mean/median reward -200.0/-200.0, min/max reward -200.0/-200.0

Updates 130, num timesteps 655, FPS 72 
 Last 3 training episodes: mean/median reward -200.0/-200.0, min/max reward -200.0/-200.0

Updates 140, num timesteps 705, FPS 73 
 Last 3 training episodes: mean/median reward -200.0/-200.0, min/max reward -200.0/-200.0



KeyboardInterrupt: 