In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical

class Network(nn.Module):
    """Network definition to be used for actor and critic networks"""
    def __init__(self, in_dim, out_dim):
        super().__init__()
        # NOTE: feel free to experiment with this network
        self.linin = nn.Linear(in_dim, 200)
        self.linout = nn.Linear(200, out_dim)

        # initialize weights and bias to 0 in the last layer.
        # this ensures the actors starts out completely random in the beginning, and that the value function starts at 0
        # this can help training.  you can experiment with turning it off.
        self.linout.bias.data.fill_(0.0)
        self.linout.weight.data.fill_(0.0)

    def forward(self, inputs):
        """
        Args:
            inputs (torch.Tensor):  (BS, in_dim)
        Returns:
            torch.Tensor:  (BS, out_dim)
        """
        x = self.linin(inputs)
        x = torch.relu(x)
        x = self.linout(x)
        return x


# NOTE: polcy gradient methods can handle discrete or continuous actions. 
# we include definitions for both cases below.

class DiscreteActor(nn.Module):
    """Actor network that chooses 1 discrete action by sampling from a Categorical distribution of N actions"""
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.logits_net = Network(obs_dim, act_dim)

    def forward(self, obs, taken_act=None):
        logits = self.logits_net(obs)
        pi = Categorical(logits=logits)
        logp_a = None
        if taken_act is not None:
            logp_a = pi.log_prob(taken_act)
        return pi, logp_a

class GaussianActor(nn.Module):
    """Actor network that chooses N continuous actions by sampling from N parameterized independent Normal distributions"""
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.mu_net = Network(obs_dim, act_dim)
        # make the std learnable, but not dependent on the current observation
        log_std = -0.5 * np.ones(act_dim, dtype=np.float32)
        self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))

    def forward(self, obs, taken_act=None):
        mu = self.mu_net(obs)
        std = torch.exp(self.log_std)
        pi = Normal(mu, std)
        logp_a = None
        if taken_act is not None:
            logp_a = pi.log_prob(taken_act).sum(axis=-1)
        return pi, logp_a

class ActorCritic(nn.Module):
    """
    Object to hold Actor and Critic network objects

    See Sutton book (http://www.incompleteideas.net/book/RLbook2018.pdf) Chapter 13 for discussion of Actor Critic methods.
    Basically they are just policy gradients methods where you also learn a value function and use that to aid in learning.
    Not all options in this class use a critic, for example psi_mode='future_return' just uses the rewards in a REINFORCE fashion.
    """
    def __init__(self, obs_dim, act_dim, discrete):
        super().__init__()
        # bulid actor networt
        self.discrete = discrete
        if self.discrete:
            self.pi = DiscreteActor(obs_dim, act_dim)
        else:
            self.pi = GaussianActor(obs_dim, act_dim)
        # build value function
        self.v  = Network(obs_dim, 1)

    def step(self, obs):
        """Run a single forward step of the ActorCritic networks.  Used during rollouts, but not during optimization"""
        # no_grad, since we don't need to do any backprop while we collect data.
        # this means we will have to recompute forward passes later. (this is standard)
        with torch.no_grad():  
            pi, _ = self.pi(obs)
            a = pi.sample()
            logp_a = pi.log_prob(a) if self.discrete else pi.log_prob(a).sum(axis=-1)
            v = self.v(obs)
        return a.cpu().numpy(), v.cpu().numpy(), logp_a.cpu().numpy()

    def act(self, obs):
        return self.step(obs)[0]

In [4]:
import time

import numpy as np
import gym
from gym.spaces import Box, Discrete

import torch
import torch.nn as nn
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from torch.optim import Adam

from utils import count_vars, discount_cumsum, args_to_str
from models import ActorCritic
from pg_buffer import PGBuffer

from collections import defaultdict

from torch.utils.tensorboard import SummaryWriter
#import PIL

def main(args):
    # create environment 
    env = gym.make(args.env)
    env.seed(args.seed)
    obs_dim = env.observation_space.shape[0]
    if isinstance(env.action_space, Discrete):
        discrete = True
        act_dim = env.action_space.n
    else:
        discrete = False
        act_dim = env.action_space.shape[0]

    # actor critic 
    ac = ActorCritic(obs_dim, act_dim, discrete).to(args.device)
    print('Number of parameters', count_vars(ac))

    # Set up experience buffer
    steps_per_epoch = int(args.steps_per_epoch)
    buf = PGBuffer(obs_dim, act_dim, discrete, steps_per_epoch, args)
    logs = defaultdict(lambda: [])
    writer = SummaryWriter(args_to_str(args))
    gif_frames = []

    # Set up function for computing policy loss
    def compute_loss_pi(batch):
        obs, act, psi, logp_old = batch['obs'], batch['act'], batch['psi'], batch['logp']
        pi, logp = ac.pi(obs, act)

        # Policy loss
        if args.loss_mode == 'vpg':
            loss_pi= logp*psi
            ipdb.set_trace()
        else:
            raise Exception('Invalid loss_mode option', args.loss_mode)

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        pi_info = dict(kl=approx_kl, ent=ent)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(batch):
        obs, ret = batch['obs'], batch['ret']
        v = ac.v(obs)
        loss_v=(obs-ret)**2
        return loss_v

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=args.pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=args.v_lr)

    # Set up update function
    def update():
        batch = buf.get()

        # Get loss and info values before update
        pi_l_old, pi_info_old = compute_loss_pi(batch)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(batch).item()

        # Policy learning
        for i in range(args.train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(batch)
            loss_pi.backward()
            pi_optimizer.step()

        # Value function learning
        for i in range(args.train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(batch)
            loss_v.backward()
            vf_optimizer.step()

        # Log changes from update
        kl, ent = pi_info['kl'], pi_info_old['ent']
        logs['kl'] += [kl]
        logs['ent'] += [ent]
        logs['loss_v'] += [loss_v.item()]
        logs['loss_pi'] += [loss_pi.item()]

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    ep_count = 0  # just for logging purpose, number of episodes run
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(args.epochs):
        for t in range(steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32).to(args.device))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            if ep_count % 100 == 0:
                frame = env.render(mode='rgb_array')
                # uncomment this line if you want to log to tensorboard (can be memory intensive)
                #gif_frames.append(frame)
                #gif_frames.append(PIL.Image.fromarray(frame).resize([64,64]))  # you can try this downsize version if you are resource constrained
                time.sleep(0.01)
            
            # Update obs (critical!)
            o = next_o

            timeout = ep_len == args.max_ep_len
            terminal = d or timeout
            epoch_ended = t==steps_per_epoch-1

            if terminal or epoch_ended:
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32).to(args.device))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logs['ep_ret'] += [ep_ret]
                    logs['ep_len'] += [ep_len]
                    ep_count += 1

                o, ep_ret, ep_len = env.reset(), 0, 0

                # save a video to tensorboard so you can view later
                if len(gif_frames) != 0:
                    vid = np.stack(gif_frames)
                    vid_tensor = vid.transpose(0,3,1,2)[None]
                    writer.add_video('rollout', vid_tensor, epoch, fps=50)
                    gif_frames = []
                    writer.flush()
                    print('wrote video')

        # Perform VPG update!
        update()

        if epoch % 10 == 0:
            vals = {key: np.mean(val) for key, val in logs.items()}
            for key in vals:
                writer.add_scalar(key, vals[key], epoch)
            writer.flush()
            print('Epoch', epoch, vals)
            logs = defaultdict(lambda: [])



if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument('--env', type=str, default='CartPole-v0', help='[CartPole-v0, LunarLander-v2, LunarLanderContinuous-v2, others]')

    parser.add_argument('--epochs', type=int, default=1000, help='Number of epochs to run')
    parser.add_argument('--gamma', type=float, default=0.99, help='discount factor')
    parser.add_argument('--lam', type=float, default=0.97, help='GAE-lambda factor')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--steps_per_epoch', type=int, default=1000, help='Number of env steps to run during optimizations')
    parser.add_argument('--max_ep_len', type=int, default=1000)

    parser.add_argument('--train_pi_iters', type=int, default=4)
    parser.add_argument('--train_v_iters', type=int, default=40)
    parser.add_argument('--pi_lr', type=float, default=1e-3, help='Policy learning rate')
    parser.add_argument('--v_lr', type=float, default=3e-4, help='Value learning rate')

    parser.add_argument('--psi_mode', type=str, default='gae', help='value to modulate logp gradient with [future_return, gae]')
    parser.add_argument('--loss_mode', type=str, default='vpg', help='Loss mode [vpg, ppo]')
    parser.add_argument('--clip_ratio', type=float, default=0.1, help='PPO clipping ratio')

    parser.add_argument('--render_interval', type=int, default=100, help='render every N')
    parser.add_argument('--log_interval', type=int, default=100, help='log every N')

    parser.add_argument('--device', type=str, default='cpu', help='you can set this to cuda if you have a GPU')

    parser.add_argument('--suffix', type=str, default='', help='Just for experiment logging (see utils)')
    parser.add_argument('--prefix', type=str, default='logs', help='Just for experiment logging (see utils)')
    
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    main(args)

usage: ipykernel_launcher.py [-h] [--env ENV] [--epochs EPOCHS]
                             [--gamma GAMMA] [--lam LAM] [--seed SEED]
                             [--steps_per_epoch STEPS_PER_EPOCH]
                             [--max_ep_len MAX_EP_LEN]
                             [--train_pi_iters TRAIN_PI_ITERS]
                             [--train_v_iters TRAIN_V_ITERS] [--pi_lr PI_LR]
                             [--v_lr V_LR] [--psi_mode PSI_MODE]
                             [--loss_mode LOSS_MODE] [--clip_ratio CLIP_RATIO]
                             [--render_interval RENDER_INTERVAL]
                             [--log_interval LOG_INTERVAL] [--device DEVICE]
                             [--suffix SUFFIX] [--prefix PREFIX]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\kehan\AppData\Roaming\jupyter\runtime\kernel-e84e53ca-88f8-43f8-bcf7-617e47ac6408.json


SystemExit: 2

In [5]:
pip install 'gym[box2d]'

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: "'gym[box2d]'"
