# Deep Deterministic Policy Gradient (DDPG)

This notebook implements the pseudocode from [CONTINUOUS CONTROL WITH DEEP REINFORCEMENT
LEARNING](https://arxiv.org/pdf/1509.02971.pdf) by DeepMind in PyTorch.  
<img src="ddpg_pseudocode.png" width="500"/>

One change from the paper: in the code here the exploration noise $\mathcal{N}$ is just a uniform random number in $[-0.5, 0.5)$ whose amplitude decays with the number of iterations, $\mathcal{N} = (\mathrm{rand}() - 0.5) / (1 + 10^{-3} \cdot \mathrm{iteration})$. The paper uses an Ornstein-Uhlenbeck process for $\mathcal{N}$, but for a simple task like the pendulum, uniform random noise seems to work just fine.

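A couple of things are worth mentioning when implementing DDPG.
- When updating the Q network, it uses the target networks Q' and P' to calculate the Q target, i.e. $Q\_target=reward + \gamma * Q'(state\_next, P'(state\_next))$. In the code the second term is additionally multiplied by $(1 - ended)$, so it is dropped for transitions that end an episode.
- When calculating the Q loss, it uses $Q\_target - Q(state, action)$, where action is taken from the replay memory instead of P(state).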
- The policy gradient  
<img src="ddpg_policy_grad.png" width="350"/>  
Actually means the gradient of the mean of $Q(s,P(s))$ with respect to the actor parameters $\theta^\mu$. PyTorch optimizers perform gradient descent (they minimize the loss), so the policy loss is $-\bar{Q}(s,P(s))$.

In [2]:
%matplotlib inline
from IPython import display
from IPython.display import HTML
import matplotlib.pyplot as plt
import matplotlib.animation as animation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Normal, Categorical

import numpy as np
import random
import os
import gym

In [3]:
# Choose the tensor constructors once: the torch.cuda variants when CUDA is
# available, the plain torch (CPU) variants otherwise.
use_cuda = torch.cuda.is_available()
_tensor_home = torch.cuda if use_cuda else torch
FloatTensor = _tensor_home.FloatTensor
LongTensor = _tensor_home.LongTensor
ByteTensor = _tensor_home.ByteTensor

In [4]:
def save_torch_model(model, filename):
  """Save ``model.state_dict()`` to ``filename``, creating the parent directory if needed.

  Args:
    model: object exposing ``state_dict()`` (e.g. an ``nn.Module``).
    filename: destination path. It may be a bare file name with no directory
      part, in which case the file is written to the current directory.
  """
  directory = os.path.dirname(filename)
  # A bare file name has an empty dirname and os.makedirs('') raises, so only
  # create the directory when the path actually names one.
  if directory and not os.path.exists(directory):
    os.makedirs(directory)
  torch.save(model.state_dict(), filename)

def load_torch_model(model, filename):
  """Load the state dict stored at ``filename`` into ``model`` in place."""
  saved_state = torch.load(filename)
  model.load_state_dict(saved_state)

In [5]:
def copy_grad(source, target):
  """Copy the gradients of ``source``'s parameters onto ``target``'s parameters.

  Parameters are paired by their order in ``parameters()``. Every gradient is
  cloned, so the two models never share a gradient tensor. A source parameter
  that has no gradient yet (``grad is None``) sets the matching target
  gradient to ``None`` instead of raising.

  Args:
    source: model whose ``parameters()`` provide the gradients.
    target: model whose ``parameters()`` receive them.

  Raises:
    IndexError: if ``target`` has more parameters than ``source``.
  """
  # Snapshot all gradients before assigning any, so the result is well defined
  # even when both models share parameters.
  grads = [None if param.grad is None else param.grad.clone()
           for param in source.parameters()]
  for index, param in enumerate(target.parameters()):
    param.grad = grads[index]

def zero_grad(model):
  """Zero, in place, every gradient that already exists on ``model``'s parameters.

  Parameters whose ``grad`` is still ``None`` are skipped.
  """
  for weight in model.parameters():
    gradient = weight.grad
    if gradient is None:
      continue
    gradient.data.zero_()
      
def update_target(target_net, eval_net, tau):
  """Blend ``eval_net``'s state into ``target_net``: new = (1 - tau) * target + tau * eval.

  Every entry of the target's state dict is mixed with the entry of the same
  key from ``eval_net``; the result is loaded back into ``target_net``.
  ``eval_net`` is left untouched.
  """
  online_state = eval_net.state_dict()
  blended = target_net.state_dict()
  for key in list(blended):
    blended[key] = blended[key] * (1. - tau) + online_state[key] * tau

  target_net.load_state_dict(blended)

In [6]:
class ReplayMemory():
  """Fixed-capacity ring buffer of transitions with uniform random sampling.

  Public fields: ``transitions`` (the stored items), ``memory_size`` (the
  capacity) and ``loc_pointer`` (index of the slot the next ``add`` writes).
  """

  def __init__(self, memory_size = 100000):
    self.memory_size = memory_size
    self.loc_pointer = 0
    self.transitions = []

  def clear(self):
    """Drop every stored transition and rewind the write position."""
    self.loc_pointer = 0
    self.transitions = []

  def add(self, step_tuple):
    """Store one transition, overwriting the oldest slot once the buffer is full.

    ``step_tuple`` is expected to hold
    (state, action, reward, next_state, ended).
    """
    buffer = self.transitions
    cursor = self.loc_pointer
    if cursor >= len(buffer):
      # Still growing: open one new slot, then fill it below.
      buffer.append(None)
    buffer[cursor] = step_tuple

    following = cursor + 1
    if following >= self.memory_size:
      # Wrap around so the next write replaces the oldest entry.
      following %= self.memory_size
    self.loc_pointer = following

  def get_sample(self, batch_size):
    """Return ``batch_size`` distinct stored transitions chosen at random."""
    pool = self.transitions
    return random.sample(pool, batch_size)

  def usage(self):
    """Return the number of stored transitions divided by the capacity."""
    stored = len(self.transitions)
    return stored / self.memory_size
  

In [7]:
class PolicyNet_continuous(nn.Module):
  """Deterministic policy network: state vector -> action(s) squashed into [-1, 1].

  Three fully connected layers (input_size -> 128 -> 64 -> output_size) with
  ReLU activations and a tanh on the output.

  Args:
    input_size: number of features in the state vector.
    output_size: number of action components produced.
  """

  def __init__(self, input_size, output_size):
    super().__init__()
    self.l1_linear = nn.Linear(input_size, 128)
    self.l2_linear = nn.Linear(128, 64)
    self.l3_linear = nn.Linear(64, output_size)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    nn.init.kaiming_normal_(self.l2_linear.weight)
    # Zero output weights: a freshly built network returns tanh(bias) for
    # every input, i.e. the same action regardless of state.
    self.l3_linear.weight.data.zero_()

  def forward(self, x):
    """Return the action for state(s) ``x``; each component lies in [-1, 1]."""
    out = F.relu(self.l1_linear(x))
    out = F.relu(self.l2_linear(out))
    # torch.tanh replaces F.tanh, which is deprecated (it emits a warning on
    # every call in the PyTorch versions that deprecate it).
    out = torch.tanh(self.l3_linear(out))
    return out

In [8]:
class QNet(nn.Module):
  """Critic network: scores a (state, action) pair with a single Q value.

  ``input_size`` is the width of the concatenated state and action vector.
  Layers: input_size -> 128 -> 64 -> 1, with ReLU on the two hidden layers.
  """

  def __init__(self, input_size):
    super().__init__()
    self.l1_linear = nn.Linear(input_size, 128)
    self.l2_linear = nn.Linear(128, 64)
    self.l3_linear = nn.Linear(64, 1)
    for hidden_layer in (self.l1_linear, self.l2_linear):
      nn.init.kaiming_normal_(hidden_layer.weight)
    # Output weights start at zero, so a fresh critic returns its bias only.
    self.l3_linear.weight.data.zero_()

  def forward(self, state, action):
    """Return Q(state, action); the two inputs are concatenated along dim 1."""
    joint = torch.cat([state, action], dim=1)
    hidden = F.relu(self.l1_linear(joint))
    hidden = F.relu(self.l2_linear(hidden))
    return self.l3_linear(hidden)

In [20]:
class DDPG():
  """DDPG agent for an environment with a single continuous action.

  Holds an actor and a critic, their slowly updated target copies
  (``actor_prime`` / ``critic_prime``) and a replay memory of transitions.
  The actor outputs actions in [-1, 1]; ``train`` multiplies them by
  ``range_scale`` before handing them to the environment.
  """

  def __init__(self, env, steps_in_state = 1):
    """Build the networks for ``env``.

    Args:
      env: environment exposing ``observation_space.shape`` and
        ``action_space.shape`` / ``.high`` / ``.low``.
      steps_in_state: number of consecutive observations that are flattened
        together into one state vector.
    """
    self.is_training = True
    # One independent dict per observation dimension (``[d] * n`` would put
    # the very same dict object in every slot).
    self.state_value_range = [{'max':None, 'min':None} for _ in range(env.observation_space.shape[0])]
    self.replay_memory = ReplayMemory(memory_size=100000)
    self.steps_in_state = steps_in_state
    self.actor = PolicyNet_continuous(env.observation_space.shape[0] * steps_in_state, env.action_space.shape[0])
    self.actor_prime = PolicyNet_continuous(env.observation_space.shape[0] * steps_in_state, env.action_space.shape[0])
    self.critic = QNet(env.observation_space.shape[0] * steps_in_state + env.action_space.shape[0])
    self.critic_prime = QNet(env.observation_space.shape[0] * steps_in_state + env.action_space.shape[0])
    if use_cuda:
      self.actor.cuda()
      self.actor_prime.cuda()
      self.critic.cuda()
      self.critic_prime.cuda()
    # Start the target nets as exact copies of the nets being trained.
    self.actor_prime.load_state_dict(self.actor.state_dict())
    self.critic_prime.load_state_dict(self.critic.state_dict())
    self.env = env
    # Half-width of the action range, taken from the first action dimension.
    self.range_scale = (env.action_space.high[0] - env.action_space.low[0]) / 2.0
    self._gamma = 0.96  # discount factor used in the critic target
    self._tau = 0.1     # blend factor of the soft target-network update
    # Update counter that decays the exploration noise. Defined here so that
    # pick_action also works in training mode before train() has been called.
    self.iteration = 0

  def pick_action(self, state):
    """Return the actor's action for ``state``, clipped to [-1, 1].

    While ``is_training`` is true, uniform noise in [-0.5, 0.5) scaled by
    ``1 / (1 + 1e-3 * iteration)`` is added before clipping.

    Args:
      state: flat state tensor for which the actor yields exactly one value
        (``.item()`` is used, so multi-dimensional actions are unsupported).
    """
    # Only the value is needed here, so skip building an autograd graph.
    with torch.no_grad():
      action = self.actor(state).item()
    # add noise
    if self.is_training:
      action += (np.random.rand() - 0.5) / (1 + 1e-3 * self.iteration)
    return np.clip(action, -1.0, 1.0)

  def update_ddpg(self, batch):
    """Run one critic update, one actor update and a soft target update.

    Requires the optimizers that ``train`` creates.

    Args:
      batch: sequence of (state, action, reward, next_state, ended) tuples,
        where the states are tensors and the rest are scalars.
    """
    (states, actions, rewards, next_states, ended) = zip(*batch)
    states_tensor = torch.stack(states)
    actions_tensor = FloatTensor(actions).view(-1,1)
    rewards_tensor = FloatTensor(rewards).view(-1,1)
    next_states_tensor = torch.stack(next_states)
    ended_tensor = FloatTensor(ended).view(-1,1)

    # The target is a constant for the critic loss: computing it without
    # gradients avoids backpropagating through the target networks and piling
    # up gradients on them that are never cleared.
    with torch.no_grad():
      next_q = self.critic_prime(next_states_tensor, self.actor_prime(next_states_tensor))
      # (1 - ended) removes the bootstrap term for transitions that ended an episode.
      critic_target = rewards_tensor + self._gamma * (1 - ended_tensor) * next_q
    critic_loss = F.mse_loss(self.critic(states_tensor, actions_tensor), critic_target)

    self.critic_coptimizer.zero_grad()
    critic_loss.backward()
    self.critic_coptimizer.step()

    # Gradient descent on -mean(Q(s, actor(s))) is gradient ascent on Q.
    actor_loss = -self.critic(states_tensor, self.actor(states_tensor))
    actor_loss = actor_loss.mean()

    self.actor_optimizer.zero_grad()
    actor_loss.backward()
    self.actor_optimizer.step()

    update_target(self.critic_prime, self.critic, self._tau)
    update_target(self.actor_prime, self.actor, self._tau)

  def train(self, env, update_limit=1000, batch_size=200, lr=1e-3, lr_actor=None, lr_critic=None, checkpoint=100):
    """Interact with ``env`` and train until at least ``update_limit`` updates ran.

    Every environment step is stored in the replay memory. Once the memory is
    more than 80% full, each step also triggers one ``update_ddpg`` call on a
    random batch. The limit is only checked between episodes, so the final
    episode may run a few updates past it.

    Args:
      env: environment to step; old gym API (``reset()`` returns the
        observation, ``step()`` returns a 4-tuple).
      update_limit: number of updates after which no new episode is started.
      batch_size: number of transitions sampled per update.
      lr: learning rate used for whichever of the two below is ``None``.
      lr_actor: actor learning rate.
      lr_critic: critic learning rate.
      checkpoint: the actor is saved to ``model/ddpg_actor_iter_<n>.pth`` and
        the running score printed whenever the update number is a multiple
        of this value.
    """
    lr_actor = lr if lr_actor is None else lr_actor
    lr_critic = lr if lr_critic is None else lr_critic

    self.actor_optimizer = torch.optim.SGD(self.actor.parameters(), lr=lr_actor, weight_decay=1e-3)
    self.critic_coptimizer = torch.optim.SGD(self.critic.parameters(), lr=lr_critic, weight_decay=1e-3)

    running_score = None
    update_count = 0
    self.iteration = 0
    while update_count < update_limit:
      s0 = env.reset()
      # The first state repeats the initial observation steps_in_state times.
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode_ended = False
      score = 0
      while not episode_ended:
        action =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step([action * self.range_scale])
        # Slide the observation window forward by one step.
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        ended = 1 if episode_ended else 0
        self.replay_memory.add((state, action, reward, next_state, ended))
        if self.replay_memory.usage() > 0.8:
          if (update_count + 1) % checkpoint == 0:
            save_torch_model(self.actor,'model/ddpg_actor_iter_%d.pth' %(update_count+1))
            # No episode may have finished yet (small replay memories), in
            # which case there is no running score to format.
            score_text = 'n/a' if running_score is None else '%.2f' % running_score
            print('%d: running_score:%s, ' %(update_count+1, score_text))
          self.update_ddpg(self.replay_memory.get_sample(batch_size))
          update_count += 1
          self.iteration += 1

        state = next_state
        score += reward
      # Exponential moving average of the episode scores.
      if running_score is None:
        running_score = score
      else:
        running_score = running_score * 0.9 + score * 0.1


In [33]:
# Create the Pendulum environment and a fresh DDPG agent sized from its
# observation and action spaces.
env = gym.make('Pendulum-v0')
agent = DDPG(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [34]:
# agent.train(env, update_limit=1, batch_size=2, update_per_episode=1, lr=1e-5, checkpoint=1)

In [35]:
# pick_action only adds exploration noise while is_training is True.
agent.is_training = True
# Train with separate actor/critic learning rates. The actor is saved under
# model/ every 1000 updates; updates only begin once the replay memory is
# more than 80% full.
agent.train(env, update_limit=20000, batch_size=64, lr_actor=1e-4, lr_critic=1e-3, checkpoint=1000)

1000: running_score:-1179.21, 
2000: running_score:-1095.16, 
3000: running_score:-1036.01, 
4000: running_score:-1022.48, 
5000: running_score:-986.71, 
6000: running_score:-955.99, 
7000: running_score:-923.19, 
8000: running_score:-886.69, 
9000: running_score:-825.40, 
10000: running_score:-701.01, 
11000: running_score:-547.35, 
12000: running_score:-421.68, 
13000: running_score:-328.89, 
14000: running_score:-292.08, 
15000: running_score:-268.45, 
16000: running_score:-212.12, 
17000: running_score:-215.08, 
18000: running_score:-157.03, 
19000: running_score:-174.16, 
20000: running_score:-208.39, 


In [36]:
# Run a sample episode with a trained agent, collecting one rendered frame per
# step for the animation.
agent.is_training = False  # no exploration noise during evaluation
load_torch_model(agent.actor,'model/ddpg_actor_iter_18000.pth')
state = env.reset()
frames = [env.render(mode='rgb_array')]
ended = False
score = 0
while not ended:
  # A single observation is used as the state, which matches an agent built
  # with the default steps_in_state=1.
  action = agent.pick_action(FloatTensor([state]).view(-1))
  # Scale by the agent's own range_scale, the same factor train() applies,
  # instead of a hard-coded 2 that only fits an action range of [-2, 2].
  (state, reward, ended, info) = env.step([action * agent.range_scale])
  score += reward
  frames.append(env.render(mode='rgb_array'))
print(score)

-124.88584956807608


In [37]:
%%capture
def animate(frames):
  """Build an ``ArtistAnimation`` that shows ``frames`` one after another.

  Args:
    frames: sequence of images accepted by ``imshow``.

  Returns:
    The animation object: 20 ms between frames, 1000 ms pause before repeating.
  """
  fig, ax = plt.subplots()
  # grid() expects a boolean; the string 'off' is truthy and does not
  # reliably mean "disabled".
  ax.grid(False)
  ax.axis('off')
  # ArtistAnimation takes one list of artists per frame.
  ims = [[ax.imshow(frame, animated=True)] for frame in frames]
  ani = animation.ArtistAnimation(fig, ims, interval=20, blit=True, repeat_delay=1000)
  return ani

# Turn the frames collected during the sample episode into an animation and
# write it to an MP4 file.
ani = animate(frames)
ani.save('pendulum_ddpg.mp4')

In [38]:
%%HTML
<video width="400" controls loop>
  <source src="pendulum_ddpg.mp4" type="video/mp4">
</video>