In [1]:
import torch
import numpy as np
import gym
import gym_nav
import sys
sys.path.append('../')
from ppo.model import *
from ppo.utils import init
from evaluation import *

from torch import nn

In [2]:
class DelayedRNNPPO(NNBase):
    '''
    Quick and simple static RNN network with a FC followed by RNN followed by
    2 layers of actor critic split
    '''
    def __init__(self, num_inputs, hidden_size=64,
                auxiliary_heads=[]):
        super(DelayedRNNPPO, self).__init__(True, hidden_size, hidden_size)
        # parameters create self.GRU with hidden_size as recurrent_input_size and
        #  hidden_size as recurrent_hidden_size
        
        self.auxiliary_heads = auxiliary_heads
        self.has_auxiliary = True

        init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.
                               constant_(x, 0), np.sqrt(2))
                
        self.shared_layers = []
        self.critic_layers = []
        self.actor_layers = []
        self.conv1d_layers = []
        
        # generate all the shared layers        
        self.shared0 = nn.Sequential(init_(nn.Linear(num_inputs, hidden_size)),
                                nn.Tanh())
        self.critic0 = nn.Sequential(init_(nn.Linear(hidden_size, hidden_size)),
                                nn.Tanh())
        self.critic1 = nn.Sequential(init_(nn.Linear(hidden_size, hidden_size)),
                                nn.Tanh())
        self.actor0 = nn.Sequential(init_(nn.Linear(hidden_size, hidden_size)),
                                nn.Tanh())
        self.actor1 = nn.Sequential(init_(nn.Linear(hidden_size, hidden_size)),
                                nn.Tanh())        
        self.critic_head = init_(nn.Linear(hidden_size, 1))
        
            
        self.auxiliary_layers = []
        self.auxiliary_output_idxs = [] # indexes for generating auxiliary outputs
        self.auxiliary_layer_types = [] # 0 linear, 1 distribution
        self.auxiliary_output_sizes = []
        # generate auxiliary outputs
        current_auxiliary_output_idx = 0

        self.has_auxiliary = False
        self.train()
        
        
    def forward(self, inputs, rnn_hxs, masks, deterministic=False, with_activations=False):
        """Same as forward function but this will pass back all intermediate values

            _type_: _description_
        """
        auxiliary_preds = [None for i in range(len(self.auxiliary_output_sizes))]
        x = inputs

        shared_activations = []
        actor_activations = []
        critic_activations = []

        x = self.shared0(x)
        x = x.unsqueeze(0)
        shared_activations.append(x)
        x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks)
        shared_activations.append(x)
        
        actor_x = self.actor0(x)
        actor_activations.append(actor_x)
        actor_x = self.actor1(actor_x)
        actor_activations.append(actor_x)
        
        critic_x = self.critic0(x)
        critic_activations.append(critic_x)
        critic_x = self.critic1(x)
        critic_activations.append(critic_x)
                    
        # Finally get critic value estimation
        critic_val = self.critic_head(critic_x)

        outputs = {
            'value': critic_val,
            'actor_features': actor_x,
            'rnn_hxs': rnn_hxs,
        }
        
        if self.has_auxiliary:
            outputs['auxiliary_preds'] = auxiliary_preds
        if with_activations:
            outputs['activations'] = {
                'shared_activations': shared_activations,
                'actor_activations': actor_activations,
                'critic_activations': critic_activations
            }        
        return outputs

In [3]:
# env = gym.make('NavEnv-v0')
envs = simple_vec_envs()
env = envs.envs[0]
actor_critic = Policy(env.observation_space.shape, env.action_space, base='DelayedRNNPPO')
base = actor_critic.base

In [4]:
rnn_hxs = torch.zeros(
    1, actor_critic.recurrent_hidden_state_size)
masks = torch.zeros(1, 1)
obs = torch.Tensor(env.reset())

In [5]:


output = actor_critic.act(obs, rnn_hxs, masks)
# output = base(obs, rnn_hxs, masks)

In [6]:
output

{'value': tensor([[0.2708]], grad_fn=<AddmmBackward0>),
 'actor_features': tensor([[-0.1533,  0.1356, -0.2292,  0.0867, -0.0468, -0.0857,  0.1130,  0.2400,
           0.0661, -0.2064,  0.0300, -0.1709, -0.0264, -0.1074,  0.1676, -0.0953,
           0.0723,  0.0176, -0.1280,  0.0971,  0.1476, -0.0412, -0.2048,  0.0342,
           0.0953, -0.1104, -0.2634, -0.0140, -0.0317,  0.2504, -0.1643,  0.3934,
          -0.1132, -0.0138,  0.0387,  0.2093,  0.1442, -0.0193, -0.2719, -0.0531,
           0.0040, -0.0480,  0.3208, -0.0987,  0.1015, -0.0185, -0.2250, -0.1349,
           0.0475,  0.0249,  0.0211, -0.1089,  0.2073, -0.0637, -0.0624,  0.0318,
           0.1073,  0.1470, -0.2576,  0.0703, -0.0240, -0.1081, -0.1692, -0.2053]],
        grad_fn=<TanhBackward0>),
 'rnn_hxs': tensor([[-4.5651e-02,  3.0627e-02, -3.9439e-02, -1.4857e-01,  9.6118e-02,
          -1.7480e-01,  2.7043e-02, -4.2615e-02,  1.5409e-02, -1.8125e-02,
          -1.2344e-02, -9.6874e-02,  1.9311e-01,  2.9294e-02,  9.0790e-02

In [69]:
actor_critic.dist(output['actor_features'])

FixedCategorical(logits: torch.Size([1, 4]))

In [54]:
model = actor_critic.base

In [56]:
model.shared0(obs).unsqueeze(0)

tensor([[-0.3107, -0.2540,  0.1618, -0.1383,  0.0137,  0.0759,  0.1995,  0.0686,
          0.2912,  0.2471, -0.0391, -0.1496,  0.1112,  0.2538,  0.3188,  0.1102,
          0.5746, -0.0592,  0.4906,  0.0756, -0.4205,  0.2638, -0.1815, -0.4008,
         -0.1849,  0.3262, -0.0193, -0.0456, -0.1655, -0.0774, -0.5269,  0.2804,
          0.1704, -0.1933, -0.2009, -0.2227, -0.2322,  0.1751, -0.0233, -0.2919,
         -0.4070,  0.0553,  0.0229, -0.0602,  0.0235, -0.0424, -0.1237,  0.0351,
          0.2804, -0.5164,  0.1617, -0.3315,  0.0872,  0.3726, -0.2472,  0.1194,
         -0.0350,  0.2464, -0.4908,  0.4867,  0.5268,  0.1418, -0.2745, -0.2767]],
       grad_fn=<UnsqueezeBackward0>)

In [37]:
obs.shape

torch.Size([60])

In [44]:
# env = gym.make('NavEnv-v0')
envs = simple_vec_envs()
actor_critic = Policy(env.observation_space.shape, env.action_space, base='FlexBaseAux')

In [47]:
obs = envs.reset()

In [41]:
rnn_hxs = torch.zeros(
    1, actor_critic.recurrent_hidden_state_size)
masks = torch.zeros(1, 1)
obs = torch.Tensor(env.reset())

In [48]:
output = actor_critic.act(torch.Tensor(obs), rnn_hxs, masks)

In [1]:
from gradients import *



def initialize_ppo_training(model=None, obs_rms=None, env_name='NavEnv-v0', env_kwargs={}, make_env=True,
                            agent_base='LoudPPO', nn_base_kwargs={}, recurrent=True,
                            num_steps=10, num_processes=1, seed=0, ppo_epoch=4, clip_param=0.5,
                            num_mini_batch=1, value_loss_coef=0.5, entropy_coef=0.01, 
                            auxiliary_loss_coef=0.3, gamma=0.99, lr=7e-4, eps=1e-5, max_grad_norm=0.5,
                            log_dir='/tmp/gym/', device=torch.device('cpu'), 
                            capture_video=False, take_optimizer_step=True,
                            normalize=True, obs=None, aux_wrapper_kwargs={},
                            auxiliary_truth_sizes=[]):
    """Generate training objects, specifically setting up everything to generate gradients
        Important parameters:
            model, obs_rms, env_kwargs, num_steps (batch_size), num_processes, seed, 
            ppo_epoch (usually set=1), take_optimizer_step (usually set=False)

    Args:
        model (Policy, optional): Policy object (e.g. from load_model_and_env). If not provided
            generate a fresh model with nn_base and nn_base_kwargs
        obs_rms (RunningMeanStd, optional): obs_rms object for vectorized envs. Defaults to None.
        env_name (str, optional): Defaults to 'NavEnv-v0'.
        env_kwargs (dict, optional): Defaults to {}.
        nn_base (str, optional): Used to create model if model is not provided. 
            Defaults to 'FlexBase'.
        agent_base (str, optional): Used to create trainer object. Defaults to 'LoudPPO',
            can also use 'PPO' and 'DecomposeGradPPO'.
        nn_base_kwargs (dict, optional): Used to create model if model is not provided. 
            Defaults to {}.
        recurrent (bool, optional): Used if model==None. Defaults to True.
        num_steps (int, optional): Batch size to use. Defaults to 10.
        num_processes (int, optional): Number of concurrent processes. Defaults to 1.
        seed (int, optional): Randomizer seed. Defaults to 0.
        ppo_epoch (int, optional): Number of epochs to run for PPO. Defaults to 4. Usually
            we will want to set this to 1 to collect grads with
        clip_param (float, optional): PPO clip param. Defaults to 0.5.
        num_mini_batch (int, optional): Number of minibatches to split training rollouts into. 
            Defaults to 1.
        value_loss_coef (float, optional): Value loss weighting. Defaults to 0.5.
        entropy_coef (float, optional): Entropy loss weighting. Defaults to 0.01.
        auxiliary_loss_coef (float, optional): Auxiliary loss weighting. Defaults to 0.3.
        gamma (float, optional): Discount factor. Defaults to 0.99.
        lr (_type_, optional): Learning rate. Defaults to 7e-4.
        eps (_type_, optional): _description_. Defaults to 1e-5.
        max_grad_norm (float, optional): Cap on gradient steps. Defaults to 0.5.
        log_dir (str, optional): Logging directory. Defaults to '/tmp/gym/'.
        device (_type_, optional): Device to run on. Defaults to torch.device('cpu').
        capture_video (bool, optional): Whether to capture video on episodes. Defaults to False.
        take_optimizer_step (bool, optional): Whether to actually take gradient update
            step. Defaults to True.
        normalize (bool, optional): Whether to normalize vectorized environment observations. 
            Defaults to True.
        obs (torch.Tensor, optional): Need to pass the first observation if not making new environments

    Returns:
        agent, envs, rollouts
    """
    
    #Initialize vectorized environments
    # envs = make_vec_envs(env_name, seed, num_processes, gamma, log_dir, device, False,
    #                      capture_video=capture_video, env_kwargs=env_kwargs)
    if make_env:
        envs = make_vec_envs(env_name, seed, num_processes, gamma, log_dir, device, False,
                            capture_video=capture_video, env_kwargs=env_kwargs, normalize=normalize,
                            **aux_wrapper_kwargs)
    else:
        envs = None

    env = gym.make(env_name, **env_kwargs)

    if model is None:
        nn_base = 'DelayedRNNPPO'
        model = Policy(env.observation_space.shape,
                       env.action_space,
                       base=nn_base,
                       base_kwargs={'recurrent': recurrent,
                           **nn_base_kwargs})
        model.to(device)
    
    #Wrap model with an agent algorithm object
    # agent = algo.PPO(model, clip_param, ppo_epoch, num_mini_batch,
    try:
        # if new_aux:
        #     agent = PPOAux(model, clip_param, ppo_epoch, num_mini_batch,
        #             value_loss_coef, entropy_coef, auxiliary_loss_coef, lr=lr,
        #             eps=eps, max_grad_norm=max_grad_norm)
        # else:
        base = globals()[agent_base]
        agent = base(model, clip_param, ppo_epoch, num_mini_batch,
                        value_loss_coef, entropy_coef, auxiliary_loss_coef, lr=lr,
                        eps=eps, max_grad_norm=max_grad_norm,
                        take_optimizer_step=take_optimizer_step)
    except:
        print('Model type not found')
        return False


    #Initialize storage
    rollouts = RolloutStorageAux(num_steps, num_processes, env.observation_space.shape, env.action_space,
                        model.recurrent_hidden_state_size, model.auxiliary_output_sizes,
                        auxiliary_truth_sizes)
    #Storage objects initializes a bunch of empty tensors to store information, e.g.
    #obs has shape (num_steps+1, num_processes, obs_shape)
    #rewards has shape (num_steps, num_processes, 1)
    

    #If loading a previously trained model, pass an obs_rms object to set the vec envs to use
    
    if normalize and obs_rms != None:
        vec_norm = utils.get_vec_normalize(envs)
        if vec_norm is not None and obs_rms is not None:
            vec_norm.obs_rms = obs_rms

        
    #obs, recurrent_hidden_states, value_preds, returns all have batch size num_steps+1
    #rewards, action_log_probs, actions, masks, auxiliary_preds, auxiliary_truths all have batch size num_steps
    if make_env:
        obs = envs.reset()
    elif obs == None:
        raise Exception('No obs passed and no env created, storage cannot be initialized')
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    
    return agent, envs, rollouts

In [5]:
agent, envs, rollouts = initialize_ppo_training(env_kwargs={'rew_structure': 'goal'})

In [6]:
populate_rollouts(agent.actor_critic, envs, rollouts)

({},
 [[{'auxiliary': array([], dtype=float64)}],
  [{'auxiliary': array([], dtype=float64)}],
  [{'auxiliary': array([], dtype=float64)}],
  [{'auxiliary': array([], dtype=float64)}],
  [{'auxiliary': array([], dtype=float64)}],
  [{'auxiliary': array([], dtype=float64)}],
  [{'auxiliary': array([], dtype=float64)}],
  [{'auxiliary': array([], dtype=float64)}],
  [{'auxiliary': array([], dtype=float64)}],
  [{'auxiliary': array([], dtype=float64)}]])

In [7]:
agent.optimizer.zero_grad()
next_value = agent.actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                             rollouts.masks[-1]).detach()
rollouts.compute_returns(next_value, False, 0.99, 0.95)


In [8]:
value_loss, action_loss, dist_entropy, approx_kl, clipfracs, auxiliary_loss, \
    grads = agent.update(rollouts)


AttributeError: 'str' object has no attribute 'shape'

In [10]:
agent.actor_critic.is_recurrent

True