# RDPG with OpenAI gym - PyTorch



## Setup virtual display 

In [1]:
# Refresh the apt package index, then install the X virtual framebuffer (xvfb) and X11 utilities.
!apt-get update
!apt-get install -y xvfb x11-utils
# Install the Python wrapper around the virtual display plus the OpenGL bindings,
# each pinned to a minor release series.
!pip install pyvirtualdisplay==0.2.* \
             PyOpenGL==3.1.* \
             PyOpenGL-accelerate==3.1.*

## First we initialize a virtual display
import pyvirtualdisplay

# Create an off-screen 1400x900 display and start it; the start() return value is discarded.
# Presumably this is what lets env.render() work on a headless machine later in the notebook.
_display = pyvirtualdisplay.Display(visible=False, size=(1400, 900)) #Use false with xvfb
_ = _display.start() 

# Show the DISPLAY environment variable as seen by the shell, as a quick sanity check.
!echo $DISPLAY

0% [Working]            Get:1 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [Connecting to cloud.r-project.org] [1 InRelease 15.9                                                                                Get:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:5 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:6 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease [15.9 kB]
Get:7 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:8 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:9 https://cloud.r-project.org/bin/linux/ubuntu bionic-c

## Import Libraries

In [2]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from copy import deepcopy

%matplotlib inline

# Matplotlib setup: IPython's display helpers are only imported when the
# active backend is an inline (notebook) backend.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch pyplot to interactive mode.
plt.ion()

# Pick the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## Clone repo and set path

In [3]:
# Switch to /content and clone the pytorch-rdpg repository that the next cell
# adds to sys.path and imports from.
# NOTE(review): re-running this cell in the same runtime will likely make
# `git clone` report that the target directory already exists.
%cd /content/
!git clone https://github.com/ARG-NCTU/pytorch-rdpg.git

/content
Cloning into 'pytorch-rdpg'...
remote: Enumerating objects: 95, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 95 (delta 10), reused 10 (delta 10), pack-reused 80[K
Unpacking objects: 100% (95/95), done.


In [4]:
import os
import sys

sys.path.append('/content/pytorch-rdpg')

from rdpg import RDPG
from model import Actor, Critic
from agent import Agent
from memory import EpisodicMemory
from torch.optim import Adam
from util import *
from normalized_env import *
from evaluator import *



# Smoke test for the imports above: build one actor and one critic with
# 300 inputs and 2 actions, then print both module summaries.
actor_net = Actor(300, 2)
critic_net = Critic(300, 2)

print(actor_net, critic_net, sep="\n")

Actor(
  (fc1): Linear(in_features=300, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=50, bias=True)
  (lstm): LSTMCell(50, 50)
  (fc3): Linear(in_features=50, out_features=2, bias=True)
  (relu): ReLU()
  (tanh): Tanh()
)
Critic(
  (fc1): Linear(in_features=300, out_features=20, bias=True)
  (fc2): Linear(in_features=22, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=1, bias=True)
  (relu): ReLU()
)


## Setup OpenAI Gym Environment

The `Box` space represents an n-dimensional box, so a valid observation (or action) is an array of n numbers, where n is the dimension of the state (or action) space.

We can also check the Box’s bounds (high and low).

In [5]:
# Create the Pendulum task.
# Alternative (disabled): env = NormalizedEnv(gym.make('Pendulum-v0'))
env = gym.make('Pendulum-v0')

# Show the observation and action spaces.
print(env.observation_space, env.action_space, sep="\n")

# Sizes of the state and action vectors, read from the space shapes.
nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]

# State dimension followed by its upper and lower bounds.
print(nb_states, env.observation_space.high, env.observation_space.low, sep="\n")

# Action dimension followed by its upper and lower bounds.
print(nb_actions, env.action_space.high, env.action_space.low, sep="\n")

Box(-8.0, 8.0, (3,), float32)
Box(-2.0, 2.0, (1,), float32)
3
[1. 1. 8.]
[-1. -1. -8.]
1
[2.]
[-2.]


## Training

Here we port the training code from `main.py` into this Colab notebook.

### Prepare the required parameters and the RDPG agent

In [6]:
class Args:
    """Plain namespace of hyper-parameters and run settings for this notebook.

    All values are class attributes; an instance is handed to ``Agent`` and read
    by the training/testing cells. Attributes that are not read by any cell in
    this notebook are presumably consumed inside the imported project modules.
    """

    # --- run mode / bookkeeping ---
    mode = 'train'
    debug = 'debug'                       # non-empty string, used as a truthy debug flag
    seed = -1
    checkpoint = "checkpoints"            # path handed to train() / test()
    checkpoint_path = "checkpoints_path"

    # --- optimisation ---
    rate = 0.001                          # learning rate of the critic optimizer
    prate = 0.0001                        # learning rate of the actor (policy) optimizer
    discount = 0.99                       # reward discount factor in the TD target
    tau = 0.001                           # soft-update coefficient for the target networks
    init_w = 0.003
    bsize = 64                            # minibatch size

    # --- replay memory ---
    rmsize = 6_000_000                    # memory capacity
    window_length = 1
    warmup = 10_000                       # steps of random actions that only fill the memory
    trajectory_length = 5                 # steps per stored trajectory / between updates

    # --- exploration noise (Ornstein-Uhlenbeck parameters) ---
    ou_theta = 0.15                       # noise theta
    ou_sigma = 0.2                        # noise sigma
    ou_mu = 0.0                           # noise mu
    epsilon = 50_000                      # linear decay of exploration policy

    # --- schedule ---
    train_iter = 2_000_000                # total training steps (default: 20000000)
    max_episode_length = 500
    validate_episodes = 20
    validate_steps = 2000                 # evaluate when the step count is a multiple of this

# Instantiate the hyper-parameter container.
args = Args()

# Evaluation helper, configured from the validation settings.
evaluate = Evaluator(
    args.validate_episodes,
    args.validate_steps,
    args.max_episode_length,
)

# The agent (actor/critic plus their target copies) for this environment.
agent = Agent(nb_states, nb_actions, args)

# Episodic replay memory; stored episodes are capped at the trajectory length.
memory = EpisodicMemory(
    capacity=args.rmsize,
    max_episode_length=args.trajectory_length,
    window_length=args.window_length,
)

# One Adam optimizer per network, each with its own learning rate.
critic_optim = Adam(agent.critic.parameters(), lr=args.rate)
actor_optim = Adam(agent.actor.parameters(), lr=args.prate)

In [7]:
# TensorBoard: create the summary writer used by train() to log episode rewards.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/exp1")
print(writer.get_logdir())

runs/exp1


In [8]:
def update_policy():
    """Run one actor/critic update pass over a batch of sampled trajectories.

    Reads the notebook-level ``memory``, ``args``, ``agent``, ``critic_optim``
    and ``actor_optim``. Returns ``None``; if the memory yields no samples the
    function returns without touching the networks.

    ``experiences`` is indexed by time step: ``experiences[t]`` is iterated to
    collect ``state0`` / ``action`` / ``reward`` for the batch, and the
    ``state0`` entries of ``experiences[t + 1]`` serve as the next states.
    For every consecutive pair of steps the critic is regressed onto the
    one-step TD target and the actor is moved along the critic's estimate.
    Both target networks are soft-updated once at the end.
    """
    # Sample batch
    experiences = memory.sample(args.bsize)
    if len(experiences) == 0:  # not enough samples
        return

    # Both losses are scaled by this, as in the original "divide by trajectory length".
    trajectory_length = len(experiences)

    for t in range(trajectory_length - 1):  # iterate over time steps
        # Fresh zero LSTM states (hidden size 50 matches the actor's LSTMCell(50, 50)).
        # NOTE(review): the states are re-created at every time step, so no recurrent
        # state is carried from step t to t + 1 during the update -- confirm this is intended.
        target_cx = Variable(torch.zeros(args.bsize, 50)).type(FLOAT)
        target_hx = Variable(torch.zeros(args.bsize, 50)).type(FLOAT)

        cx = Variable(torch.zeros(args.bsize, 50)).type(FLOAT)
        hx = Variable(torch.zeros(args.bsize, 50)).type(FLOAT)

        # Pull the batch for this time step out of the sampled experience.
        # np.stack needs a real sequence; feeding it a generator is deprecated
        # and rejected by newer NumPy releases.
        state0 = np.stack([trajectory.state0 for trajectory in experiences[t]])
        action = np.stack([trajectory.action for trajectory in experiences[t]])
        reward = np.expand_dims(
            np.stack([trajectory.reward for trajectory in experiences[t]]), axis=1)
        state1 = np.stack([trajectory.state0 for trajectory in experiences[t + 1]])

        # TD target from the target networks. `volatile` has had no effect since
        # PyTorch 0.4, so no_grad() is what actually keeps the target out of the graph.
        with torch.no_grad():
            next_state = to_tensor(state1)
            target_action, (target_hx, target_cx) = agent.actor_target(
                next_state, (target_hx, target_cx))
            next_q_value = agent.critic_target([next_state, target_action])
            target_q = to_tensor(reward) + args.discount * next_q_value

        # Critic loss on the stored actions.
        current_q = agent.critic([to_tensor(state0), to_tensor(action)])
        value_loss = F.smooth_l1_loss(current_q, target_q) / trajectory_length

        # Actor loss: negative critic value of the actor's own action.
        predicted_action, (hx, cx) = agent.actor(to_tensor(state0), (hx, cx))
        policy_loss = -agent.critic([to_tensor(state0), predicted_action])
        policy_loss = (policy_loss / trajectory_length).mean()

        # Actor step first. Its backward pass also leaves gradients on the critic's
        # parameters; they are cleared below so that the critic is only ever
        # updated from the TD loss.
        agent.actor.zero_grad()
        policy_loss.backward()
        actor_optim.step()

        # Critic step. value_loss has its own graph (a separate critic forward
        # pass), so no retain_graph is needed.
        agent.critic.zero_grad()
        value_loss.backward()
        critic_optim.step()

    # Target update
    soft_update(agent.actor_target, agent.actor, args.tau)
    soft_update(agent.critic_target, agent.critic, args.tau)

In [9]:
def train(num_iterations, checkpoint_path, debug):
    """Interact with ``env`` and train the agent until ``num_iterations`` steps are done.

    Uses the notebook-level ``agent``, ``env``, ``memory``, ``args``,
    ``writer`` and ``evaluate`` objects.

    Args:
        num_iterations: total number of environment steps; it is only checked
            between episodes, so the final episode may overshoot it.
        checkpoint_path: path handed to ``agent.save_model`` for the
            intermediate checkpoints (saved every ``num_iterations / 3`` steps).
        debug: truthy to print per-episode and evaluation summaries.

    Side effects: appends transitions to ``memory``, calls ``update_policy()``
    every ``args.trajectory_length`` steps once past ``args.warmup``, logs the
    episode reward to ``writer``, renders the environment on every step, and
    closes ``writer`` on exit (also when training is interrupted).
    """
    agent.is_training = True
    step = episode = episode_steps = trajectory_steps = 0
    episode_reward = 0.
    state0 = None

    # Checkpoint interval; clamped to at least 1 so that a tiny num_iterations
    # cannot turn the modulo below into a division by zero.
    save_interval = max(1, int(num_iterations / 3))

    try:
        while step < num_iterations:
            episode_steps = 0
            while episode_steps < args.max_episode_length:
                # reset if it is the start of episode
                if state0 is None:
                    state0 = deepcopy(env.reset())
                    agent.reset()

                # agent picks an action: random during warm-up, policy afterwards
                if step <= args.warmup:
                    action = agent.random_action()
                else:
                    action = agent.select_action(state0)

                # env responds with next_observation, reward, terminate_info
                state, reward, done, info = env.step(action)
                state = deepcopy(state)

                env.render()

                # store the transition that started from state0
                memory.append(state0, action, reward, done)

                # update counters
                step += 1
                episode_steps += 1
                trajectory_steps += 1
                episode_reward += reward
                state0 = deepcopy(state)

                # end of a trajectory: reset the recurrent state and, once the
                # warm-up is over, run a learning update
                if trajectory_steps >= args.trajectory_length:
                    agent.reset_lstm_hidden_state(done=False)
                    trajectory_steps = 0
                    if step > args.warmup:
                        update_policy()

                # [optional] save intermediate model
                if step % save_interval == 0:
                    agent.save_model(checkpoint_path)

                if done:  # end of episode
                    if debug: prGreen('#{}: episode_reward:{} steps:{}'.format(episode,episode_reward,step))

                    writer.add_scalar('reward', episode_reward, step)

                    # reset
                    state0 = None
                    episode_reward = 0.
                    episode += 1
                    agent.reset_lstm_hidden_state(done=True)
                    break

            # Push pending scalars to disk after each episode without closing the
            # writer; closing here would start a new event file on the next add_scalar.
            writer.flush()

            # [optional] evaluate
            if evaluate is not None and args.validate_steps > 0 and step % args.validate_steps == 0:
                policy = lambda x: agent.select_action(x, decay_epsilon=False)
                validate_reward = evaluate(env, policy, debug=False, visualize=False)
                if debug: prYellow('[Evaluate] Step_{:07d}: mean_reward:{}'.format(step, validate_reward))
    finally:
        # Close exactly once, whether training finished or was interrupted.
        writer.close()


### Training

In [None]:
# Launch training with the step budget, checkpoint path and debug flag from Args.
train(
    num_iterations=args.train_iter,
    checkpoint_path=args.checkpoint,
    debug=args.debug,
)

[92m #0: episode_reward:-751.1821816693467 steps:200[00m
[92m #1: episode_reward:-1069.0359171986572 steps:400[00m
[92m #2: episode_reward:-1356.8948132942107 steps:600[00m
[92m #3: episode_reward:-768.0366393723481 steps:800[00m
[92m #4: episode_reward:-754.1075630269876 steps:1000[00m
[92m #5: episode_reward:-1146.6488594084703 steps:1200[00m
[92m #6: episode_reward:-1509.1340118251692 steps:1400[00m
[92m #7: episode_reward:-1375.9579935184952 steps:1600[00m
[92m #8: episode_reward:-1170.2891982141132 steps:1800[00m
[92m #9: episode_reward:-946.7260754495705 steps:2000[00m
[93m [Evaluate] Step_0002000: mean_reward:-1134.301802490691[00m
[92m #10: episode_reward:-1577.5789908599033 steps:2200[00m
[92m #11: episode_reward:-762.3456996643465 steps:2400[00m
[92m #12: episode_reward:-980.4689121596575 steps:2600[00m
[92m #13: episode_reward:-1142.7666573218878 steps:2800[00m
[92m #14: episode_reward:-1590.9640896788364 steps:3000[00m
[92m #15: episode_rewa

  """Entry point for launching an IPython kernel.
  torch.from_numpy(ndarray), volatile=volatile, requires_grad=requires_grad


[92m #50: episode_reward:-1540.7055579532068 steps:10200[00m
[92m #51: episode_reward:-1157.2631457146163 steps:10400[00m
[92m #52: episode_reward:-1069.6598523322198 steps:10600[00m
[92m #53: episode_reward:-1729.7060968635906 steps:10800[00m
[92m #54: episode_reward:-1325.6697544339186 steps:11000[00m
[92m #55: episode_reward:-1335.4803897535414 steps:11200[00m
[92m #56: episode_reward:-1361.5915430645912 steps:11400[00m
[92m #57: episode_reward:-1510.2930495984328 steps:11600[00m
[92m #58: episode_reward:-1166.2455592268482 steps:11800[00m
[92m #59: episode_reward:-1364.2680219719175 steps:12000[00m
[93m [Evaluate] Step_0012000: mean_reward:-1390.202188356669[00m
[92m #60: episode_reward:-1644.4888980855815 steps:12200[00m
[92m #61: episode_reward:-861.4611418409073 steps:12400[00m
[92m #62: episode_reward:-1290.141663540618 steps:12600[00m
[92m #63: episode_reward:-1733.9154336641627 steps:12800[00m
[92m #64: episode_reward:-1177.0646692603389 steps:1

## Testing

In [None]:
def test(num_episodes, model_path, visualize=True, debug=False):
    """Load saved weights and run the evaluator ``num_episodes`` times.

    Args:
        num_episodes: how many times ``evaluate`` is invoked.
        model_path: path passed to ``agent.load_weights``.
        visualize: forwarded to ``evaluate``.
        debug: forwarded to ``evaluate``; when truthy, each result is also printed.

    Returns ``None``. If ``agent.load_weights`` reports ``False`` an error is
    printed and nothing is evaluated.
    """
    weights_loaded = agent.load_weights(model_path)
    if weights_loaded == False:  # noqa: E712 -- same equality check as before
        prRed("model path not found")
        return

    # Put the agent into evaluation mode.
    agent.is_training = False
    agent.eval()

    def policy(x):
        # Action selection with noise and epsilon decay both switched off.
        return agent.select_action(x, noise_enable=False, decay_epsilon=False)

    for episode_idx in range(num_episodes):
        validate_reward = evaluate(env, policy, debug=debug, visualize=visualize, save=False)
        if debug:
            prYellow('[Evaluate] #{}: mean_reward:{}'.format(episode_idx, validate_reward))

In [None]:
# Evaluate the saved checkpoint with rendering enabled.
test(
    num_episodes=args.validate_episodes,
    model_path=args.checkpoint,
    visualize=True,
    debug=args.debug,
)

## Tensorboard

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic becomes available.
%load_ext tensorboard

In [None]:
# Open TensorBoard inline on the "runs" directory, where the SummaryWriter above logs (runs/exp1).
%tensorboard --logdir runs