# DDPG

In [1]:
%matplotlib inline
from IPython import display
from IPython.display import HTML
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Normal, Categorical

import numpy as np
import random
import os
import gym

In [2]:
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor

In [3]:
def save_torch_model(model, filename):
  if not os.path.exists(os.path.dirname(filename)):
    os.makedirs(os.path.dirname(filename))
  torch.save(model.state_dict(), filename)

def load_torch_model(model, filename):
  model.load_state_dict(torch.load(filename))

In [4]:
def copy_grad(source, target):
  grads = []
  for param in source.parameters():
    grads.append(param.grad.clone())
  grads.reverse()
  for param in target.parameters():
    param.grad = grads.pop()

def zero_grad(model):
  for param in model.parameters():
    if type(param.grad) != type(None):
      param.grad.data.zero_()
      
def update_target(target_net, eval_net, tau):
  fast = eval_net.state_dict()
  slow = target_net.state_dict()
  for t in slow:
    slow[t] = slow[t] * (1. - tau) + fast[t] * tau

  target_net.load_state_dict(slow)

In [5]:
class ReplayMemory():
  def __init__(self, memory_size = 100000):
    self.transitions = []
    self.memory_size = memory_size
    self.loc_pointer = 0
  
  def clear(self):
    self.transitions = []
    self.loc_pointer = 0
  
  def add(self, step_tuple):
    # expect a tuple of transition contain:
    # state, action, reward, next_state, ended
    if len(self.transitions) <= self.loc_pointer:
      self.transitions.append(None)
    self.transitions[self.loc_pointer] = step_tuple
    self.loc_pointer += 1
    if self.loc_pointer >= self.memory_size:
      self.loc_pointer %= self.memory_size
  
  def get_sample(self, batch_size):
    return random.sample(self.transitions, batch_size)
  
  def usage(self):
    return len(self.transitions) / self.memory_size
  

In [18]:
class PolicyNet_continuous(nn.Module):
  def __init__(self, input_size, output_size):
    super(PolicyNet_continuous,self).__init__()
    self.l1_linear = nn.Linear(input_size, 64)
    self.l2_linear = nn.Linear(64, 32)
    self.l3_linear = nn.Linear(32, output_size)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    nn.init.kaiming_normal_(self.l2_linear.weight)
    self.l3_linear.weight.data.zero_()
    
  def forward(self,x):
    out = F.relu(self.l1_linear(x))
    out = F.relu(self.l2_linear(out))
    out = F.tanh(self.l3_linear(out))
    return out

In [20]:
class QNet(nn.Module):
  def __init__(self, input_size):
    super(QNet,self).__init__()
    self.l1_linear = nn.Linear(input_size, 64)
    self.l2_linear = nn.Linear(64, 32)
    self.l3_linear = nn.Linear(32, 1)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    nn.init.kaiming_normal_(self.l2_linear.weight)
    self.l3_linear.weight.data.zero_()

  def forward(self, state, action):
    out = F.relu(self.l1_linear(torch.cat([state,action],dim=1)))
    out = F.relu(self.l2_linear(out))
    out = self.l3_linear(out)
    return out    

In [21]:
class DDPG():
  def __init__(self, env, steps_in_state = 1):
    self.state_value_range = [{'max':None, 'min':None}] * env.observation_space.shape[0]
    self.replay_memory = ReplayMemory(memory_size=100000)
    self.steps_in_state = steps_in_state
    self.policy_target = PolicyNet_continuous(env.observation_space.shape[0] * steps_in_state, env.action_space.shape[0])
    self.policy_eval = PolicyNet_continuous(env.observation_space.shape[0] * steps_in_state, env.action_space.shape[0])
    self.Q_target = QNet(env.observation_space.shape[0] * steps_in_state + env.action_space.shape[0])
    self.Q_eval = QNet(env.observation_space.shape[0] * steps_in_state + env.action_space.shape[0])
    if use_cuda:
      self.policy_target.cuda()
      self.policy_eval.cuda()
      self.Q_target.cuda()
      self.Q_eval.cuda()
    # copy the weights from target to eval net
    self.policy_eval.load_state_dict(self.policy_target.state_dict())
    self.Q_eval.load_state_dict(self.Q_target.state_dict())
    self.env = env
    self.range_scale = (env.action_space.high[0] - env.action_space.low[0]) / 2.0
    self._gamma = 0.96
    self._epsilon = 0.2
    
  def pick_action(self, state):
    if np.random.rand() < self._epsilon:
      action = (np.random.rand() * 2 - 1.0)
    else:
      probs = self.policy_target(state)
      action_dist = Normal(probs, 0.01)
      action = action_dist.rsample()
      action = action.item()
    return np.clip(action, -1.0, 1.0)
  
  def norm_state(self, state):
    eps = np.finfo.eps
    for i in range(len(self.state_value_range)):
      value_range = state_range[i]
      if state[i] > value_range.max or value_range.max == None:
        value_range.max = state[i]
      if state[i] < value_range.min or value_range.min == None:
        value_range.min = state[i]
      r = value_range.max - value_range.min
      scale = 1 / (r + eps)
      state[i] = state[i] - value_range.min
  
  def update_ddpg(self, batch):
    (states, actions, rewards, next_states, ended) = zip(*batch)
    states_tensor = torch.stack(states)
    actions_tensor = FloatTensor(actions).view(-1,1)
    rewards_tensor = FloatTensor(rewards).view(-1,1)
    next_states_tensor = torch.stack(next_states)
    ended_tensor = FloatTensor(ended).view(-1,1)

    # Q loss = reward + gamma * Q_Net(next_states + policy_net(next_states)) - Q_Net(state + action)
    # Q loss = F.mse(Q loss) 
    Q_target = rewards_tensor + self._gamma * (1 - ended_tensor) * self.Q_target(next_states_tensor, self.policy_target(next_states_tensor) * 2.0)
    Q_loss = F.mse_loss(self.Q_target(states_tensor, actions_tensor), Q_target)
    
    # policy loss = 
    policy_loss = -self.Q_target(states_tensor, self.policy_target(states_tensor) * 2.0)
    policy_loss = policy_loss.mean()
    
    self.Q_optimizer.zero_grad()
    zero_grad(self.Q_target)
    Q_loss.backward()
    copy_grad(self.Q_target, self.Q_eval)
    self.Q_optimizer.step()
    
    self.policy_optimizer.zero_grad()
    zero_grad(self.policy_target)
    policy_loss.backward()
    copy_grad(self.policy_target, self.policy_eval)
    self.policy_optimizer.step()
    
  def train(self, env, update_limit=1000, batch_size=200, update_per_episode=10, lr=1e-3, lr_policy=None, lr_q=None, checkpoint=100):
    lr_policy = lr if lr_policy == None else lr_policy
    lr_q = lr if lr_q == None else lr_q
    self.policy_optimizer = torch.optim.SGD(self.policy_eval.parameters(), lr=lr_policy, weight_decay=1e-3)
    self.Q_optimizer = torch.optim.SGD(self.Q_eval.parameters(), lr=lr_q, weight_decay=1e-3)
    best_score = -99999
    running_score = None
    update_count = 0
    while update_count < update_limit:
      s0 = env.reset()
      s0[2] /= 8
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode = []
      episode_ended = False
      score = 0
      while not episode_ended:
        action =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step([action * 2.0])
        s1[2] /= 8
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        if episode_ended:
          ended = 1
        else:
          ended = 0
        self.replay_memory.add((state, action, reward, next_state, ended))

        s0 = s1
        state = next_state
        score += reward
        
      if running_score == None:
        running_score = score
      else:
        running_score = running_score * 0.9 + score * 0.1
        
      if self.replay_memory.usage() > 0.3:
        if (update_count + 1) % checkpoint == 0:
          is_best = False
          if running_score > best_score:
            is_best = True
            save_torch_model(self.policy_target, 'model/ddpg_policy_best.pth')
            best_score = running_score
          save_torch_model(self.policy_target,'model/ddpg_policy_iter_%d.pth' %(update_count+1))
          print('%d: running_score:%.2f, is_best:%s' %(update_count+1, running_score, is_best))
        update_count += 1
        for i in range(update_per_episode):
          self.update_ddpg(self.replay_memory.get_sample(batch_size))
        
        if (update_count + 1) % 20 == 0:
          update_target(self.policy_target, self.policy_eval, 0.1)
          update_target(self.Q_target, self.Q_eval, 0.1)

In [22]:
env = gym.make('Pendulum-v0')
agent = DDPG(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [23]:
# agent.train(env, update_limit=1, batch_size=2, update_per_episode=1, lr=1e-5, checkpoint=1)

In [24]:
agent.train(env, update_limit=50000, batch_size=64, update_per_episode=5, lr_policy=1e-4, lr_q=1e-3, checkpoint=100)

100: running_score:-1216.66, is_best:True
200: running_score:-1396.58, is_best:False
300: running_score:-1443.62, is_best:False
400: running_score:-1442.63, is_best:False
500: running_score:-1453.49, is_best:False
600: running_score:-1395.50, is_best:False
700: running_score:-1375.38, is_best:False
800: running_score:-1437.00, is_best:False
900: running_score:-1438.07, is_best:False
1000: running_score:-1373.70, is_best:False
1100: running_score:-1460.23, is_best:False
1200: running_score:-1473.71, is_best:False
1300: running_score:-1419.84, is_best:False
1400: running_score:-1323.61, is_best:False
1500: running_score:-1452.72, is_best:False
1600: running_score:-1444.70, is_best:False
1700: running_score:-1417.82, is_best:False
1800: running_score:-1416.24, is_best:False
1900: running_score:-1425.59, is_best:False
2000: running_score:-1441.20, is_best:False
2100: running_score:-1418.61, is_best:False
2200: running_score:-1423.54, is_best:False
2300: running_score:-1356.16, is_best:Fals

KeyboardInterrupt: 