In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torch.optim as optim
from torch.distributions.normal import Normal

In [2]:
class Buffer():
    """
    Fixed-capacity rollout buffer for a policy-gradient agent.

    Stores observations, actions, rewards and action log-probabilities step
    by step, and fills in the discounted reward-to-go for every step once a
    trajectory is closed with `end_trajectory`.

    Args:
        size: maximum number of steps that can be stored between two `get` calls.
        obs_dim: length of one observation vector.
        act_dim: length of one action vector.
        gamma: discount factor used for the reward-to-go.
        lam: accepted for signature compatibility; not used by this buffer.
    """

    def __init__(self, size, obs_dim, act_dim, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma = gamma
        self.max_size = size
        self.ptr = 0                # index of the next free slot
        self.traj_start_idx = 0     # first slot of the trajectory in progress

    def _discount_cumsum(self, x, discount):
        """
        Return the discounted cumulative sum of `x`, i.e.
        out[i] = x[i] + discount * x[i+1] + discount**2 * x[i+2] + ...

        A more efficient, but less readable, way of computing it is:
            scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

        An empty input yields an empty float32 array.
        """
        cumsum = np.zeros(len(x), dtype=np.float32)
        if len(x) == 0:
            return cumsum
        cumsum[-1] = x[-1]
        # Walk backwards so every entry can reuse the sum to its right.
        for i in range(len(x) - 2, -1, -1):
            cumsum[i] = x[i] + discount * cumsum[i+1]
        return cumsum

    def store(self, obs, act, rew, logp):
        """Append one environment step; fails if the buffer is already full."""
        assert self.ptr < self.max_size # there must be space in the buffer to store

        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def end_trajectory(self, last_val):
        """
        Close the trajectory in progress and compute its reward-to-go.

        `last_val` is appended after the last stored reward as a bootstrap
        value before discounting and is dropped from the result again, so it
        only affects the returns of the steps that precede it.
        """
        traj_slice = slice(self.traj_start_idx, self.ptr)
        rews = np.append(self.rew_buf[traj_slice], last_val)

        # Reward-to-go for every step of the trajectory (bootstrap entry removed).
        self.ret_buf[traj_slice] = self._discount_cumsum(rews, self.gamma)[:-1]

        self.traj_start_idx = self.ptr

    def get(self):
        """
        Return everything stored so far as a dict of float32 tensors with keys
        'obs', 'act', 'ret' and 'logp', then reset the buffer.

        The tensors are copies: the buffer is reused after the reset, so
        handing out views of the internal arrays would let later `store`
        calls silently overwrite data the caller still holds.

        Returns are only written by `end_trajectory`, so every trajectory
        should be closed before calling this method.
        """
        n = self.ptr
        fields = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf, logp=self.logp_buf)

        # torch.tensor always copies, unlike torch.as_tensor which would alias
        # the float32 numpy storage.
        data = {k: torch.tensor(v[:n], dtype=torch.float32) for k, v in fields.items()}

        self.ptr, self.traj_start_idx = 0, 0

        return data

In [3]:
class Network(nn.Module):
    """
    Diagonal Gaussian policy network.

    A small MLP (obs_dim -> 100 -> 50 -> act_dim, ReLU activations) produces
    the mean of the action distribution; the log standard deviation is a
    separate learnable vector that does not depend on the observation.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()

        # One log-std entry per action dimension, all starting at -0.5.
        initial_log_std = np.full(act_dim, -0.5, dtype=np.float32)
        self.log_std = torch.nn.Parameter(torch.as_tensor(initial_log_std))

        self.mu_net = nn.Sequential(
            nn.Linear(obs_dim, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, act_dim),
        )

    def _distribution(self, obs):
        """Build the Normal action distribution for the given observations."""
        return Normal(self.mu_net(obs), torch.exp(self.log_std))

    def _log_prob_from_distribution(self, pi, act):
        """Log-likelihood of `act` under `pi`, summed over the action dimensions."""
        # Normal.log_prob is element-wise, so the last axis has to be summed
        # to get one value per action vector.
        per_dim = pi.log_prob(act)
        return per_dim.sum(dim=-1)

    def forward(self, obs, act=None):
        """
        Return `(pi, logp_a)`: the action distribution for `obs` and, when
        `act` is given, the log-likelihood of those actions under it
        (otherwise None).
        """
        dist = self._distribution(obs)
        if act is None:
            return dist, None
        return dist, self._log_prob_from_distribution(dist, act)

In [4]:
class Policy(nn.Module):
    """Thin wrapper that samples actions from a `Network` without tracking gradients."""

    def __init__(self, obs_dim, act_dim):
        super().__init__()

        self.pi = Network(obs_dim, act_dim)

    @torch.no_grad()
    def step(self, obs):
        """
        Sample an action for `obs` and return `(action, log_prob)` as numpy
        arrays. Runs entirely with gradient tracking disabled.
        """
        dist = self.pi._distribution(obs)
        action = dist.sample()
        log_prob = self.pi._log_prob_from_distribution(dist, action)
        return action.numpy(), log_prob.numpy()

    def act(self, obs):
        """Sample an action for `obs` and return only the action array."""
        action, _ = self.step(obs)
        return action

In [8]:
class Agent():
    """
    Policy-gradient agent: a rollout buffer, a Gaussian policy and an Adam
    optimizer over the policy parameters.

    Args:
        obs_dim: length of one observation vector.
        act_dim: length of one action vector.
        gamma: discount factor handed to the buffer.
        pi_lr: learning rate of the policy optimizer.
        steps_per_epoch: capacity of the rollout buffer.
    """

    def __init__(self, obs_dim, act_dim, gamma=0.99, pi_lr=5e-3, steps_per_epoch=4000):

        self.buf = Buffer(steps_per_epoch, obs_dim, act_dim, gamma)

        self.policy = Policy(obs_dim, act_dim)

        # Only the policy is optimized; this agent has no value function.
        self.pi_optimizer = optim.Adam(self.policy.pi.parameters(), lr=pi_lr)

        self.steps_per_epoch = steps_per_epoch

    def _compute_loss_pi(self, data):
        """
        Policy-gradient loss: the negative mean of log-probability times
        reward-to-go over the batch.

        Reads the 'obs', 'act' and 'ret' entries of `data`.
        """
        obs, act, ret = data['obs'], data['act'], data['ret']

        # The distribution itself is not needed, only the log-likelihoods.
        _, logp = self.policy.pi(obs, act)
        return -(logp * ret).mean()

    def update(self):
        """
        Drain the buffer and take a single gradient step on the policy.

        Returns:
            The loss value as a float, or None when the buffer was empty and
            no step was taken (the mean over zero samples would be NaN).
        """
        data = self.buf.get()
        if data['obs'].shape[0] == 0:
            return None

        # Train policy with a single step of gradient descent
        self.pi_optimizer.zero_grad()
        loss_pi = self._compute_loss_pi(data)
        loss_pi.backward()
        self.pi_optimizer.step()
        return loss_pi.item()

In [15]:
def train(agent, env_name, epochs):
    """
    Train `agent` on the Gym environment `env_name`.

    Each epoch collects exactly one episode with the current policy, closes
    the trajectory with a bootstrap value of 0 and performs one policy update.
    Progress is printed every 50 epochs.

    The environment is created once and closed when training ends, even if
    training is interrupted. The code expects the classic Gym interface:
    `reset()` returns the observation and `step()` returns a 4-tuple.

    Args:
        agent: object exposing `policy.step`, `buf.store`,
            `buf.end_trajectory` and `update`.
        env_name: id passed to `gym.make`.
        epochs: number of episodes / policy updates to run.

    Returns:
        List with the undiscounted return of every episode, in order.
    """
    ep_rets = []
    env = gym.make(env_name)
    try:
        for epoch in range(epochs):
            o = env.reset()
            ep_ret = 0
            done = False
            while not done:
                obs_t = torch.as_tensor(np.ascontiguousarray(o), dtype=torch.float32).unsqueeze(0)
                a, logp = agent.policy.step(obs_t)
                # Drop the batch dimension that was added for the network.
                a = a.squeeze(0)
                # The buffer keeps one scalar log-probability per step.
                logp = logp.item()
                next_o, r, done, _ = env.step(a)
                ep_ret += r

                agent.buf.store(o, a, r, logp)

                o = next_o

            agent.buf.end_trajectory(0)
            ep_rets.append(ep_ret)

            agent.update()
            if epoch % 50 == 0:
                print('Epoch: ', epoch,'avg ep_ret: ', np.mean(ep_rets[-10:]), "total num ep: ", len(ep_rets))
    finally:
        env.close()

    return ep_rets

In [16]:
# Experiment setup. `gym` is already imported in the first cell, so it is not
# imported again here. The agent is built for 8-dimensional observations and
# 2-dimensional actions, which must match the environment named below.
env_name = "LunarLanderContinuous-v2"
agent = Agent(obs_dim=8, act_dim=2)

In [18]:
# Run 1000 training epochs and keep the list of per-episode returns.
rets = train(agent, env_name, epochs=1000)

Epoch:  0 avg ep_ret:  -1.3674316240498854 total num ep:  1
Epoch:  50 avg ep_ret:  -11.949712030928556 total num ep:  51
Epoch:  100 avg ep_ret:  2.379106555220438 total num ep:  101
Epoch:  150 avg ep_ret:  -8.426894928930427 total num ep:  151
Epoch:  200 avg ep_ret:  -52.840132316678876 total num ep:  201
Epoch:  250 avg ep_ret:  2.507996970778314 total num ep:  251
Epoch:  300 avg ep_ret:  23.214165605765366 total num ep:  301
Epoch:  350 avg ep_ret:  55.583959407265866 total num ep:  351
Epoch:  400 avg ep_ret:  18.187047357344095 total num ep:  401
Epoch:  450 avg ep_ret:  10.305653231404147 total num ep:  451
Epoch:  500 avg ep_ret:  98.65401960782928 total num ep:  501
Epoch:  550 avg ep_ret:  81.44497222452702 total num ep:  551
Epoch:  600 avg ep_ret:  87.77311846778218 total num ep:  601
Epoch:  650 avg ep_ret:  127.43209930649076 total num ep:  651
Epoch:  700 avg ep_ret:  94.95957573989838 total num ep:  701
Epoch:  750 avg ep_ret:  146.3161439425989 total num ep:  751
Ep

In [21]:
def test(agent, env_name, num_games=5):
    """
    Play `num_games` rendered episodes with the agent's current policy.

    The environment is created once, reused for every game and closed only
    after the last game (or when playback is interrupted). Observations are
    converted to float32 tensors the same way as during training. The code
    expects the classic Gym interface: `reset()` returns the observation and
    `step()` returns a 4-tuple.

    Args:
        agent: object exposing `policy.act`.
        env_name: id passed to `gym.make`.
        num_games: number of episodes to play.
    """
    env = gym.make(env_name)

    def step(env, act):
        # Advance the environment by one step and draw the resulting frame.
        obs, rew, done, info = env.step(act)
        env.render()
        return obs, rew, done, info

    try:
        for game in range(num_games):

            obs = env.reset()

            done = False
            while not done:
                obs_t = torch.as_tensor(np.ascontiguousarray(obs), dtype=torch.float32).unsqueeze(0)
                # Drop the batch dimension that was added for the network.
                act = agent.policy.act(obs_t).squeeze(0)
                obs, rew, done, _ = step(env, act)
    finally:
        # Closing inside the game loop would shut the environment down after
        # the first game; close it exactly once, after all games.
        env.close()

In [22]:
# Watch the trained agent play the default number of rendered games.
test(agent, env_name, num_games=5)

KeyboardInterrupt: 