<a href="https://colab.research.google.com/github/kmoy14-stanford/AA203-Homework/blob/master/HW4/AA_203_HW_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Getting OpenAI Gym to work on Colab:


In [None]:
%%bash 
# Install the required system packages.
# -y answers apt's confirmation prompt so the cell cannot stall waiting
# for input that a notebook cell has no way to supply.
apt-get install -y xvfb x11-utils
apt-get install -y swig
# Install the required Python packages.
# The requirement specifiers are quoted so the shell passes them to pip
# verbatim instead of treating "[box2d]" and "*" as glob patterns.
pip install 'gym[box2d]==0.17.*' \
            'pyvirtualdisplay==0.2.*' \
            'PyOpenGL==3.1.*' \
            'PyOpenGL-accelerate==3.1.*' \
            pyglet

Reading package lists...
Building dependency tree...
Reading state information...
x11-utils is already the newest version (7.7+3build1).
xvfb is already the newest version (2:1.19.6-1ubuntu4.9).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Reading package lists...
Building dependency tree...
Reading state information...
swig is already the newest version (3.0.12-1).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [None]:
import pyvirtualdisplay
# Start a virtual display of 1400x900; visible=False is the setting to use
# with Xvfb.
_display = pyvirtualdisplay.Display(visible=False, size=(1400, 900))
_ = _display.start()

In [None]:
#%%
import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

import matplotlib.pyplot as plt

def _str_to_bool(text):
    """Convert a command-line string such as 'true' or '0' into a bool.

    Without an explicit converter argparse stores the raw string, and any
    non-empty string (including 'False') is truthy.

    Raises:
        argparse.ArgumentTypeError: if *text* is not a recognised spelling.
    """
    lowered = text.strip().lower()
    if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if lowered in ('0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(text))


# Command-line options for the actor-critic training run.
parser = argparse.ArgumentParser(description='PyTorch actor-critic example')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor (default: 0.99)')
parser.add_argument('--seed', type=int, default=203, metavar='N',
                    help='random seed (default: 203)')
# The default stays True; `--render false` now really disables rendering.
parser.add_argument('--render', type=_str_to_bool, default=True,
                    help='render the environment')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='interval between training status logs (default: 10)')
# NOTE(review): presumably this absorbs the `-f <connection file>` argument
# that a Jupyter kernel is launched with; nothing visible here reads it.
parser.add_argument("-f", "--file", required=False)

# based on:
# https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py

args = parser.parse_args()

# Continuous-action Lunar Lander; show one frame to confirm rendering works.
env = gym.make('LunarLanderContinuous-v2')
env.reset()
img = plt.imshow(env.render('rgb_array'))

# Seed the environment and PyTorch from the same command-line seed.
env.seed(args.seed)
torch.manual_seed(args.seed)

# One record per time step: log-probability of the sampled action and the
# critic's value estimate for the state it was sampled in.
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

class Policy(nn.Module):
    """
    Actor and critic sharing one two-layer trunk.

    The actor is two linear heads (action mean and action spread); the
    critic is a single linear head producing a scalar state value.
    """

    def __init__(self):
        super().__init__()
        hidden_units = 128

        # shared trunk
        self.affine1 = nn.Linear(state_dim, hidden_units)
        self.affine2 = nn.Linear(hidden_units, hidden_units)

        # actor heads: one output per action dimension
        self.action_mean = nn.Linear(hidden_units, action_dim)
        self.action_var = nn.Linear(hidden_units, action_dim)

        # critic head
        self.value_head = nn.Linear(hidden_units, 1)

        # per-episode buffers, filled during a rollout
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """
        Map a state to (action mean, action spread, state value).

        Both actor outputs are scaled by 0.5; the spread goes through
        exp() first, so it is always strictly positive.
        """
        features = F.relu(self.affine1(x))
        features = F.relu(self.affine2(features))

        mean = self.action_mean(features)
        spread = self.action_var(features).exp()
        value = self.value_head(features)

        return 0.5 * mean, 0.5 * spread, value
    
# Build the actor-critic network with float32 parameters and train it
# with Adam.
model = Policy()
model = model.float()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# float32 machine epsilon as a plain Python float.
eps = float(np.finfo(np.float32).eps)

def select_action(state):
    """
    Sample a continuous action for `state` from the current policy.

    `state` is a NumPy array; it is converted to a float tensor and passed
    through `model`. The model's first two outputs are used as the `loc`
    and `scale` of a Normal distribution from which the action is drawn.
    The log-probability of the drawn action and the model's value estimate
    are appended to `model.saved_actions` for the later training step.

    Returns the sampled action as a NumPy array.
    """
    observation = torch.from_numpy(state).float()
    mean, spread, value_estimate = model(observation)

    # Gaussian policy over the continuous action space.
    policy_dist = Normal(loc=mean, scale=spread)
    sampled = policy_dist.sample()

    # Keep what finish_episode() needs to compute the losses.
    record = SavedAction(log_prob=policy_dist.log_prob(sampled),
                         value=value_estimate)
    model.saved_actions.append(record)

    return sampled.detach().numpy()

def finish_episode():
    """
    Training step for one finished episode.

    Reads the rewards and saved (log_prob, value) pairs buffered on `model`,
    builds the discounted returns with `args.gamma`, whitens them, and takes
    one optimizer step on the summed actor and critic losses. Both buffers
    are emptied before returning.

    If no actions were saved there is nothing to learn from; the buffers are
    cleared and no optimizer step is taken.
    """
    if not model.saved_actions:
        del model.rewards[:]
        del model.saved_actions[:]
        return

    # Discounted return-to-go for every step, accumulated from the end of
    # the episode backwards. Appending and reversing once is linear, unlike
    # inserting at the front of the list on every step.
    returns = []
    discounted = 0
    for reward in reversed(model.rewards):
        discounted = reward + args.gamma * discounted
        returns.append(discounted)
    returns.reverse()

    # Whiten the returns. The sample standard deviation of a single value
    # is NaN, which would turn the loss and then every weight into NaN, so
    # a one-step episode keeps its raw return.
    returns = torch.tensor(returns).float()
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_losses = []  # actor (policy) loss per step
    value_losses = []   # critic (value) loss per step
    for (log_prob, value), step_return in zip(model.saved_actions, returns):
        # .item() detaches the value, so the actor loss does not
        # back-propagate into the critic head through the advantage.
        advantage = step_return - value.item()
        policy_losses.append(-log_prob * advantage)

        # The critic regresses onto the (whitened) return.
        value_losses.append(F.mse_loss(value, step_return.reshape(1)))

    # reset gradients
    optimizer.zero_grad()

    # total loss is the sum of all actor and critic terms
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

    # perform backprop
    loss.backward()
    optimizer.step()

    # reset rewards and action buffer
    del model.rewards[:]
    del model.saved_actions[:]

def _show_frame():
    """Draw the current environment frame inline, replacing the last one."""
    # Imported locally: nothing at module level binds `ipythondisplay`, so
    # the name must be brought into scope here before it is used.
    from IPython import display as ipythondisplay

    screen = env.render(mode='rgb_array')
    plt.imshow(screen)
    ipythondisplay.clear_output(wait=True)
    ipythondisplay.display(plt.gcf())


def _plot_episodic_rewards(episodic_rewards):
    """Plot reward per episode and save the figure to ep_rewards.png."""
    fig, ax1 = plt.subplots(1, 1, figsize=(8, 6))
    ax1.set_xlabel("Episode")
    ax1.set_ylabel("Reward")
    ax1.set_title("Episodic Rewards")
    ax1.plot(episodic_rewards)
    ax1.grid()
    fig.savefig("ep_rewards.png", dpi=400)


def main():
    """
    Train the policy until the running reward exceeds 200.

    Each episode runs for at most 2499 steps, is followed by one call to
    finish_episode(), and updates an exponential moving average of the
    episode reward (weight 0.05 on the newest episode). When `args.render`
    is truthy, every step of every 100th episode is drawn inline. Once the
    running reward passes 200 the per-episode rewards are plotted to
    ep_rewards.png and training stops.
    """
    running_reward = -100

    # run infinitely many episodes, until performance criteria met
    episodic_rewards = []

    for i_episode in count(1):
        # reset environment and episode reward
        state = env.reset()
        ep_reward = 0

        for t in range(1, 2500):
            # select action from policy
            action = select_action(state)

            # take the action
            state, reward, done, _ = env.step(action)

            if args.render and i_episode % 100 == 0:
                _show_frame()

            model.rewards.append(reward)
            ep_reward += reward
            if done:
                break

        # Record every episode, including one that hit the step cap without
        # the environment reporting `done`, so the plot has no gaps.
        episodic_rewards.append(ep_reward)

        # update cumulative reward
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # perform backprop
        finish_episode()

        # log results
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))

        # check if we have "solved" the problem
        if running_reward > 200:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            _plot_episodic_rewards(episodic_rewards)
            break
            
# Start training only when this file is the program entry point.
if __name__ == "__main__":
    main()

NameError: ignored