In [22]:
import gym, pybulletgym
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

In [5]:
# Parameters
# Gym environment id used when the environment is created
env_name = 'InvertedPendulumMuJoCoEnv-v0'

In [12]:
# Build the environment once; later code reuses it by calling reset() on it
env = gym.make(env_name)
# Cache the action-space and observation-space shapes reported by the env
a_shape, s_shape = env.action_space.shape, env.observation_space.shape

In [19]:
# FIXME - this should be broken out into a file
def sample_trajectory(env, policy, max_steps=200):
    """Roll out a single episode of `policy` in `env`.

    Args:
        env: Environment exposing `reset()` -> state and
            `step(action)` -> (next_state, reward, done, info).
        policy: Object exposing `sample(state)` -> action.
        max_steps: Upper bound on the number of environment steps taken.

    Returns:
        Tuple `(s_hist, a_hist, r_hist)` of equal-length lists: the state
        each action was taken from, the action itself, and the reward
        returned by that step. The successor state of the last step is
        not recorded.
    """
    # Trajectory
    s_hist, a_hist, r_hist = [], [], []

    s = env.reset()
    for _ in range(max_steps):
        a = policy.sample(s)
        s_next, r, done, _info = env.step(a)

        s_hist.append(s)
        a_hist.append(a)
        r_hist.append(r)

        s = s_next

        # `done` may be a plain bool or a 1-element sequence/array depending
        # on the environment; indexing a scalar directly would raise, so
        # flatten first and read the leading flag.
        if np.ravel(done)[0]:
            break

    return s_hist, a_hist, r_hist

In [66]:
class Agent(nn.Module):
    """Gaussian policy network with one hidden layer.

    The network maps a (flattened) observation to the parameters of an
    independent Gaussian per action element: a mean and a strictly
    non-negative standard deviation (obtained through a softplus).
    """

    def __init__(self, obs_space_shape, action_space_shape, hidden_layer_size=16, seed=123):
        """Build the network.

        Args:
            obs_space_shape: Shape tuple of a single observation.
            action_space_shape: Shape tuple of a single action.
            hidden_layer_size: Width of the hidden layer.
            seed: Seed for the NumPy RNG used by `sample`.
        """
        super(Agent, self).__init__()
        # np.prod returns a NumPy scalar (a float for an empty shape);
        # nn.Linear needs plain Python ints.
        self._obs_space_flat_size = int(np.prod(obs_space_shape))
        self._action_space_size = int(np.prod(action_space_shape))
        self._action_space_shape = tuple(action_space_shape)

        self.fc1 = nn.Linear(self._obs_space_flat_size, hidden_layer_size)
        # Two outputs per action element: one mean and one raw stddev
        self.fc2 = nn.Linear(hidden_layer_size, 2 * self._action_space_size)

        self._rng = np.random.RandomState(seed)

    def forward(self, x):
        """Compute Gaussian parameters for a batch of observations.

        Args:
            x: Tensor whose elements can be reshaped to
                (batch, flattened observation size).

        Returns:
            Tensor of shape (batch, 2) + action_space_shape where
            [:, 0, ...] holds the means and [:, 1, ...] holds the
            standard deviations.
        """
        x = x.reshape(-1, self._obs_space_flat_size)
        hidden = nn.functional.relu(self.fc1(x))
        # Reshape BEFORE splitting so index 1 on dim 1 really is the stddev
        # block for every action size, not just flat column 1.
        params = self.fc2(hidden).view((-1, 2) + self._action_space_shape)
        means = params[:, 0, ...]
        # Out-of-place softplus keeps autograd intact (an in-place write
        # into `params` would invalidate tensors saved for backward).
        stddevs = nn.functional.softplus(params[:, 1, ...])
        return torch.stack((means, stddevs), dim=1)

    def sample(self, s):
        """Draw one action for a single observation `s`.

        Args:
            s: A single observation (array-like), without a batch dimension.

        Returns:
            NumPy array with shape action_space_shape sampled from the
            Gaussian predicted by the network.
        """
        # Samples will be done one at a time - add fake batch dim
        with torch.no_grad():
            s = torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)
            gaussian_params = self(s).numpy()
        means = gaussian_params[0, 0, ...]
        stddevs = gaussian_params[0, 1, ...]
        return self._rng.normal(loc=means, scale=stddevs)

In [67]:
# Policy for a 4-element observation and a single-element action
agent = Agent(obs_space_shape=(4,), action_space_shape=(1,))

In [72]:
# Smoke test: draw one action for an all-zero 4-element observation
agent.sample(np.zeros(4))

array([-0.3009472])

In [48]:
# Scratch check: `[0, ...]` on a (2, 2, 5, 5) array selects index 0 of the first axis
np.array(np.arange(2* 2* 5* 5).reshape((2, 2, 5, 5)))[0, ...]

array([[[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24]],

       [[25, 26, 27, 28, 29],
        [30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44],
        [45, 46, 47, 48, 49]]])

In [42]:
# Scratch check: view(-1, 4) flattens a 2x2 tensor into a single row of 4
T = torch.tensor([[1.0, 4.5], [-0.5, 9.5]]).view(-1, 4)

In [43]:
# Scratch check: convert the tensor to a NumPy array
T.numpy()

array([[ 1. ,  4.5, -0.5,  9.5]], dtype=float32)

In [37]:
# Scratch check: display the functional ReLU to see its signature
nn.functional.relu

<function torch.nn.functional.relu(input, inplace=False)>

In [41]:
# Scratch check: view accepts a shape built by concatenating tuples
torch.tensor([[1.0, 4.5], [-0.5, 9.5]]).view((-1,) + (2,))

tensor([[ 1.0000,  4.5000],
        [-0.5000,  9.5000]])