In [2]:
# List the contents of ~/tmp/exception.
# NOTE(review): the next cell reads /home/ubuntu/tmp/exception via an absolute
# path; that is the same directory only when $HOME is /home/ubuntu.
!ls ~/tmp/exception

ckpt.pth  local.pkl


In [6]:
from rlkits.policies import PolicyWithValue
import torch
import gym
import pickle

# Directory that holds both artifacts read by this cell (the checkpoint and
# local.pkl). Defined once so the two paths below cannot drift apart.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
CKPT_DIR = '/home/ubuntu/tmp/exception'

env = gym.make('Pendulum-v0')

ob_space = env.observation_space
ac_space = env.action_space

# Build the policy from the environment's spaces, then restore its weights
# from the checkpoint directory.
pi = PolicyWithValue(
    ob_space=ob_space,
    ac_space=ac_space,
    hidden_layers=[1024],
    activation=torch.nn.ReLU,
)

pi.load_ckpt(CKPT_DIR)

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that you (or your own training run) produced.
with open(CKPT_DIR + '/local.pkl', 'rb') as f:
    lossargs = pickle.load(f)

In [7]:
# Show which fields were stored in the pickled object loaded from local.pkl.
lossargs.keys()

dict_keys(['trajectory', 'midx', 'eps'])

In [10]:
# Print every parameter tensor of the restored policy network so that
# non-finite entries (NaN) in the checkpointed weights are visible.
for p in pi.policy_net.parameters():
    print(p.data)

tensor([[    nan,     nan,     nan],
        [-0.0865, -0.0426,  0.1264],
        [ 0.0225,  0.0171,  0.0205],
        ...,
        [    nan,     nan,     nan],
        [    nan,     nan,     nan],
        [    nan,     nan,     nan]])
tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]])


In [2]:
from rlkits.policies import PolicyWithValue
from rlkits.env_batch import ParallelEnvBatch
from rlkits.sampler import ParallelEnvTrajectorySampler
from rlkits.sampler import estimate_Q
import numpy as np
import gym

def make_env():
    """Create and return one new 'Pendulum-v0' gym environment.

    Takes no arguments, so it can be handed to code that needs an
    environment factory (it is passed to ParallelEnvBatch in this cell).
    """
    pendulum_env = gym.make('Pendulum-v0')
    return pendulum_env

def reward_transform(rew):
    """Affinely rescale a reward: add 8, then divide by 16.

    Under this mapping -16 -> -0.5, -8 -> 0.0 and 0 -> 0.5.

    Args:
        rew: anything that supports `+` and `/` with a float (a Python
            number; presumably also a numpy array of rewards -- confirm
            against how the sampler calls it).

    Returns:
        The rescaled reward, same shape as the input.
    """
    shifted = rew + 8.0
    return shifted / 16.0

# Batch of two environments, each created by make_env.
env = ParallelEnvBatch(make_env, nenvs=2)

# Single-environment alternative, kept disabled:
#env = make_env()
ob_space = env.observation_space
ac_space = env.action_space

# Freshly constructed policy (no checkpoint is loaded in this cell).
pi = PolicyWithValue(
    ob_space=ob_space, ac_space=ac_space, ckpt_dir='/tmp', 
    hidden_layers=[1024])


# The third positional argument is 3, and every buffer shape printed below
# starts with 3 -- presumably the number of steps collected per environment
# (TODO confirm against rlkits.sampler).
samp = ParallelEnvTrajectorySampler(env, pi, 3, 
                                    reward_transform=reward_transform)

# Print the shape of each buffer the sampler exposes as an attribute.
for attr in ['obs', 'rews', 'vpreds', 'dones', 'actions', 'log_prob']:
    print(attr, getattr(samp, attr).shape)
# Collect one trajectory with estimate_Q as the callback. The returned dict
# also carries a 'Q' array -- presumably added by estimate_Q (verify).
traj = samp(callback=estimate_Q)
# Disabled debug dump (`pp` is not imported anywhere in the visible cells):
#pp.pprint(traj)
#pp.pprint(traj)

Type of the environment <class 'rlkits.env_batch.ParallelEnvBatch'>
obs (3, 2, 3)
rews (3, 2)
vpreds (3, 2)
dones (3, 2)
actions (3, 2, 1)
log_prob (3, 2, 1)


In [4]:

# List the shape of every array-valued entry in the sampled trajectory;
# entries that are not numpy arrays are skipped.
for k, v in traj.items():
    if isinstance(v, np.ndarray):
        print(k, v.shape)

obs (3, 2, 3)
actions (3, 2, 1)
log_prob (3, 2, 1)
rews (3, 2)
vpreds (3, 2)
dones (3, 2)
Q (3, 2)


In [12]:
# Nuts and Bolts

# Aggregate experiences from all envs.
# Each experience from one env can be used for one update.
# I want experiences from the same env to stick together.
# This means I need to transpose the array to
# (nenvs, nsteps, ...)
# so that when I reshape (C style) the array to merge the first two axes,
# the experiences from the same env are contiguous.


def sf01(arr):
    """Swap axes 0 and 1, then flatten them into a single leading axis.

    The input is laid out as (nsteps, nenvs, ...). Swapping first gives
    (nenvs, nsteps, ...), so the C-order reshape that merges the two
    leading axes keeps all experiences from one environment contiguous:
    rows 0..nsteps-1 come from env 0, the next nsteps rows from env 1,
    and so on.

    Args:
        arr: array with at least two dimensions.

    Returns:
        Array of shape (nsteps * nenvs, ...) with the trailing axes
        unchanged.
    """
    dims = arr.shape
    env_major = arr.swapaxes(0, 1)
    merged_len = dims[0] * dims[1]
    trailing = dims[2:]
    return env_major.reshape(merged_len, *trailing)

# Same listing as before, but after sf01: the two leading axes of every
# array-valued trajectory entry are merged into one.
for k, v in traj.items():
    if isinstance(v, np.ndarray):
        print(k, sf01(v).shape)

obs (6, 3)
actions (6, 1)
log_prob (6, 1)
rews (6,)
vpreds (6,)
dones (6,)
Q (6,)


In [5]:
# Pull the observation array out of the trajectory to walk through the
# axis swap by hand.
obs = traj['obs']

In [6]:
# Display the observations before the swap; shape (3, 2, 3) per the listing above.
obs

array([[[-0.9999945 ,  0.00330988,  0.8918152 ],
        [ 0.5849865 , -0.811043  , -0.05165382]],

       [[-0.99921906, -0.03951333,  0.8566701 ],
        [ 0.5702964 , -0.82143897, -0.3599361 ]],

       [[-0.9974357 , -0.07156818,  0.6421159 ],
        [ 0.52808064, -0.8491942 , -1.0105578 ]]], dtype=float32)

In [7]:
# Exchange axes 0 and 1 of the observations. Derived from traj['obs'] rather
# than from `obs` itself so that re-running this cell is idempotent (the old
# `obs = obs.swapaxes(0, 1)` swapped the axes back on every second run).
obs = traj['obs'].swapaxes(0, 1)

In [9]:
# Axes 0 and 1 are now exchanged: (3, 2, 3) -> (2, 3, 3).
obs.shape

(2, 3, 3)

In [10]:
# Look up reshape's `order` parameter: sf01 relies on the default C order
# (last axis index changing fastest) when it merges the two leading axes.
help(np.reshape)

Help on function reshape in module numpy:

reshape(a, newshape, order='C')
    Gives a new shape to an array without changing its data.
    
    Parameters
    ----------
    a : array_like
        Array to be reshaped.
    newshape : int or tuple of ints
        The new shape should be compatible with the original shape. If
        an integer, then the result will be a 1-D array of that length.
        One shape dimension can be -1. In this case, the value is
        inferred from the length of the array and remaining dimensions.
    order : {'C', 'F', 'A'}, optional
        Read the elements of `a` using this index order, and place the
        elements into the reshaped array using this index order.  'C'
        means to read / write the elements using C-like index order,
        with the last axis index changing fastest, back to the first
        axis index changing slowest. 'F' means to read / write the
        elements using Fortran-like index order, with the first index
        c

In [5]:
from torch.distributions import Categorical

# Check what Categorical.entropy returns (per the help text: a tensor of
# shape batch_shape).
help(Categorical.entropy)

Help on function entropy in module torch.distributions.categorical:

entropy(self)
    Returns entropy of distribution, batched over batch_shape.
    
    Returns:
        Tensor of shape batch_shape.

