In [1]:
import torch
import torch.nn as nn
from torch.distributions.normal import Normal
from torch.optim import Adam
import numpy as np
import gym

In [2]:
env_name='Pendulum-v0'
hidden_sizes=[32]
lr=1e-2
epochs=2
batch_size=5000
render=True

In [3]:
def mlp(sizes, activation=nn.Tanh, output_activation=nn.ReLU, threshold=nn.Threshold(0, .001)):
    # Build a feedforward neural network.
    layers = []
    for j in range(len(sizes)-1):
        act = activation if j < len(sizes)-2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
    return nn.Sequential(*layers + [threshold])

In [4]:
env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]
params_net = mlp(sizes=[obs_dim]+hidden_sizes+[2])
optimizer = Adam(params_net.parameters(), lr=lr)

In [5]:
params_net

Sequential(
  (0): Linear(in_features=3, out_features=32, bias=True)
  (1): Tanh()
  (2): Linear(in_features=32, out_features=2, bias=True)
  (3): ReLU()
  (4): Threshold(threshold=0, value=0.001)
)

In [13]:
def get_policy(obs):
    params = params_net(obs).tolist()
    mean = nn.Parameter(torch.tensor(params[0]))
    std = nn.Parameter(torch.tensor(params[1]))
    return Normal(mean,std)

def get_action(obs):
    return get_policy(obs).rsample().reshape(1,)

# def compute_loss(obs, act, weights):
#     logp = get_policy(obs).log_prob(act)
#     return -(logp * weights).mean()

def compute_loss(obs, act, weights):
    loss = 0
    for _ in range(len(obs)):
        logp = get_policy(obs[0]).log_prob(act[0])
        loss -= (logp * weights[0]).mean()
    return loss

def reward_to_go(rews):
    n = len(rews)
    rtgs = np.zeros_like(rews)
    for i in reversed(range(n)):
        rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0)
    return rtgs

In [7]:
def train_one_epoch():
        # make some empty lists for logging.
        batch_obs = []          # for observations
        batch_acts = []         # for actions
        batch_weights = []      # for R(tau) weighting in policy gradient
        batch_rets = []         # for measuring episode returns
        batch_lens = []         # for measuring episode lengths

        # reset episode-specific variables
        obs = env.reset()       # first obs comes from starting distribution
        done = False            # signal from environment that episode is over
        ep_rews = []            # list for rewards accrued throughout ep

        # render first episode of each epoch
        finished_rendering_this_epoch = False

        # collect experience by acting in the environment with current policy
        while True:

            # rendering
            if (not finished_rendering_this_epoch) and render:
                env.render()

            # save obs
            batch_obs.append(obs.copy())

            # act in the environment
            act = get_action(torch.as_tensor(obs, dtype=torch.float32))
            obs, rew, done, _ = env.step(act.data)
    
            # save action, reward
            batch_acts.append(act)
            ep_rews.append(rew)

            if done:
                # if episode is over, record info about episode
                ep_ret, ep_len = sum(ep_rews), len(ep_rews)
                batch_rets.append(ep_ret)
                batch_lens.append(ep_len)

                # the weight for each logprob(a|s) is R(tau)
                batch_weights += list(reward_to_go(ep_rews))

                # reset episode-specific variables
                obs, done, ep_rews = env.reset(), False, []

                # won't render again this epoch
                finished_rendering_this_epoch = True

                # end experience loop if we have enough of it
                if len(batch_obs) > batch_size:
                    break

        # take a single policy gradient update step
        optimizer.zero_grad()
        batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32),
                                  act=torch.as_tensor(batch_acts, dtype=torch.float32),
                                  weights=torch.as_tensor(batch_weights, dtype=torch.float32)
                                  )
        batch_loss.backward()
        optimizer.step()
        return batch_loss, batch_rets, batch_lens

In [8]:
# training loop
for i in range(epochs):
    batch_loss, batch_rets, batch_lens = train_one_epoch()
    print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
            (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
env.close()

epoch:   0 	 loss: 20474150.000 	 return: -1580.412 	 ep_len: 200.000
epoch:   1 	 loss: 48983560.000 	 return: -1487.811 	 ep_len: 200.000
