# Actor Critic Pendulum

In [None]:
%matplotlib inline
from IPython import display
from IPython.display import HTML
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Normal, Categorical

import numpy as np
import random
import os
import gym

In [None]:
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor

In [None]:
def save_torch_model(model, filename):
  if not os.path.exists(os.path.dirname(filename)):
    os.makedirs(os.path.dirname(filename))
  torch.save(model.state_dict(), filename)

def load_torch_model(model, filename):
  model.load_state_dict(torch.load(filename))


In [None]:
def copy_grad(source, target):
  grads = []
  for param in source.parameters():
    grads.append(param.grad.clone())
  grads.reverse()
  for param in target.parameters():
    param.grad = grads.pop()

def zero_grad(model):
  for param in model.parameters():
    if type(param.grad) != type(None):
      param.grad.data.zero_()
      
def update_target(target_net, eval_net, tau):
  fast = eval_net.state_dict()
  slow = target_net.state_dict()
  for t in slow:
    slow[t] = slow[t] * (1. - tau) + fast[t] * tau

  target_net.load_state_dict(slow)

In [None]:
class PolicyNet_continuous(nn.Module):
  def __init__(self, input_size, output_size):
    super(PolicyNet_continuous,self).__init__()
    self.l1_linear = nn.Linear(input_size,256)
    self.l2_linear = nn.Linear(256,128)
    self.l3_linear = nn.Linear(128,output_size)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    nn.init.kaiming_normal_(self.l2_linear.weight)
    self.l3_linear.weight.data.zero_()
    
  def forward(self,x):
    out = F.relu(self.l1_linear(x))
    out = F.relu(self.l2_linear(out))
    out = F.tanh(self.l3_linear(out))
    return out

In [None]:
class ValueNet(nn.Module):
  def __init__(self, input_size):
    super(ValueNet,self).__init__()
    self.l1_linear = nn.Linear(input_size, 256)
    self.l2_linear = nn.Linear(256,128)
    self.l3_linear = nn.Linear(128, 1)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    nn.init.kaiming_normal_(self.l2_linear.weight)
    self.l3_linear.weight.data.zero_()

  def forward(self, x):
    out = F.relu(self.l1_linear(x))
    out = F.relu(self.l2_linear(out))
    out = self.l3_linear(out)
    return out    

In [74]:
class ActorCritic_continuous():
  def __init__(self, env, steps_in_state = 2):
    self.steps_in_state = steps_in_state
    self.policy_target = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state, env.action_space.shape[0])
    self.value_target = ValueNet(env.observation_space.shape[0] * steps_in_state)
    if use_cuda:
      self.policy_target.cuda()
      self.value_target.cuda()
    self.env = env
    self.range_scale = (env.action_space.high[0] - env.action_space.low[0]) / 2.0

    self._gamma = 0.96
    
  def pick_action(self, state):
    probs = self.policy_target(state) * self.range_scale
    action_dist = Normal(probs, 0.01)
    action = action_dist.rsample()
    action = action.item()
    action = np.clip(action,-1.0,1.0)
    return (action, action_dist.log_prob(FloatTensor([action])))
  
  def update_actor_critic(self, episode):
    (states, actions, rewards, next_states, log_probs, ended) = zip(*episode)
    
    rewards = FloatTensor(rewards)
    ended = FloatTensor(ended)
    state_value = self.value_target(torch.stack(states))
    next_state_value = self.value_target(torch.stack(next_states))
    target_value = rewards + (1 - ended) * self._gamma * next_state_value
    
    delta = target_value - state_value

    value_loss = F.mse_loss(state_value, target_value)
    
    policy_loss = []
    for log_prob, d in zip(log_probs, delta):
      policy_loss.append(-log_prob * d)
    policy_loss = torch.stack(policy_loss).sum()
    
    self.value_optimizer.zero_grad()
    value_loss.backward(retain_graph=True)
    self.value_optimizer.step()
    
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()
    
  def train(self, env, update_limit=1000, samples_per_update=200, lr=1e-3, lr_policy=None, lr_value=None, checkpoint=100):
    lr_policy = lr if lr_policy == None else lr_policy
    lr_value = lr if lr_value == None else lr_value
    self.policy_optimizer = torch.optim.SGD(self.policy_target.parameters(), lr=lr_policy, weight_decay=1e-3)
    self.value_optimizer = torch.optim.SGD(self.value_target.parameters(), lr=lr_value, weight_decay=1e-3)
    best_score = -99999
    running_score = None
    samples = []
    samples_per_update = samples_per_update
    update_count = 0
    while update_count < update_limit:
      s0 = env.reset()
      s0[2] /= 8.
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode = []
      episode_ended = False
      score = 0
      while not episode_ended:
        (action, log_prob) =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step([action * 2.0])
        s1[2] /= 8.
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        if episode_ended:
          ended = 1
        else:
          ended = 0
        samples.append((state, action, reward, next_state, log_prob, ended))

        s0 = s1
        state = next_state
        score += reward
        
      if running_score == None:
        running_score = score
      else:
        running_score = running_score * 0.9 + score * 0.1
        
      if len(samples) > samples_per_update:
        if (update_count + 1) % checkpoint == 0:
          is_best = False
          if running_score > best_score:
            is_best = True
            save_torch_model(self.policy_target, 'model/actor_critic_cartpole_policy_best.pth')
            best_score = running_score
          save_torch_model(self.policy_target,'model/actor_critic_cartpole_policy_iter_%d.pth' %(update_count+1))
          print('%d: running_score:%.2f, is_best:%s' %(update_count+1, running_score, is_best))
        update_count += 1
        self.update_actor_critic(samples)
        samples = []

In [75]:
class ActorCritic_target_eval_continuous():
  def __init__(self, env, steps_in_state = 2):
    self.steps_in_state = steps_in_state
    self.policy_target = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state, env.action_space.shape[0])
    self.policy_eval = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state, env.action_space.shape[0])
    self.value_target = ValueNet(env.observation_space.shape[0] * steps_in_state)
    self.value_eval = ValueNet(env.observation_space.shape[0] * steps_in_state)
    if use_cuda:
      self.policy_target.cuda()
      self.policy_eval.cuda()
      self.value_target.cuda()
      self.value_eval.cuda()
    self.policy_eval.load_state_dict(self.policy_target.state_dict())
    self.value_eval.load_state_dict(self.value_target.state_dict())
    self.env = env
    self.range_scale = (env.action_space.high[0] - env.action_space.low[0]) / 2.0

    self._gamma = 0.96
    
  def pick_action(self, state):
    probs = self.policy_target(state) 
    action_dist = Normal(probs, 0.01)
    action = action_dist.rsample()
    action = action.item()
    action = np.clip(action, -1.0, 1.0)
    return (action, action_dist.log_prob(FloatTensor([action])))
  
  def update_actor_critic(self, episode):
    (states, actions, rewards, next_states, log_probs, ended) = zip(*episode)
    
    rewards = FloatTensor(rewards)
    ended = FloatTensor(ended)
    state_value = self.value_target(torch.stack(states))
    next_state_value = self.value_target(torch.stack(next_states))
    target_value = rewards + (1 - ended) * self._gamma * next_state_value
    
    delta = target_value - state_value

    value_loss = F.mse_loss(state_value, target_value)
    
    policy_loss = []
    for log_prob, d in zip(log_probs, delta):
      policy_loss.append(-log_prob * d)
    policy_loss = torch.stack(policy_loss).sum()
    
    self.value_optimizer.zero_grad()
    zero_grad(self.value_target)
    value_loss.backward(retain_graph=True)
    copy_grad(self.value_target, self.value_eval)
    self.value_optimizer.step()
    
    self.policy_optimizer.zero_grad()
    zero_grad(self.policy_target)
    policy_loss.backward()
    copy_grad(self.policy_target, self.policy_eval)
    self.policy_optimizer.step()
    
  def train(self, env, update_limit=1000, samples_per_update=200, lr=1e-3, lr_policy=None, lr_value=None, checkpoint=100):
    lr_policy = lr if lr_policy == None else lr_policy
    lr_value = lr if lr_value == None else lr_value
    self.policy_optimizer = torch.optim.SGD(self.policy_eval.parameters(), lr=lr_policy, weight_decay=1e-3)
    self.value_optimizer = torch.optim.SGD(self.value_eval.parameters(), lr=lr_value, weight_decay=1e-3)
    best_score = -99999
    running_score = None
    samples = []
    samples_per_update = samples_per_update
    update_count = 0
    while update_count < update_limit:
      s0 = env.reset()
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode = []
      episode_ended = False
      score = 0
      while not episode_ended:
        (action, log_prob) =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step([action * 2.0])
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        if episode_ended:
          ended = 1
        else:
          ended = 0
        samples.append((state, action, reward, next_state, log_prob, ended))

        s0 = s1
        state = next_state
        score += reward
        
      if running_score == None:
        running_score = score
      else:
        running_score = running_score * 0.9 + score * 0.1
        
      if len(samples) > samples_per_update:
        if (update_count + 1) % checkpoint == 0:
          is_best = False
          if running_score > best_score:
            is_best = True
            save_torch_model(self.policy_target, 'model/actor_critic_cartpole_policy_best.pth')
            best_score = running_score
          save_torch_model(self.policy_target,'model/actor_critic_cartpole_policy_iter_%d.pth' %(update_count+1))
          print('%d: running_score:%.2f, is_best:%s' %(update_count+1, running_score, is_best))
        update_count += 1
        self.update_actor_critic(samples)
        samples = []
        
        if (update_count + 1) % 20 == 0:
          update_target(self.policy_target, self.policy_eval, 0.1)
          update_target(self.value_target, self.value_eval, 0.1)

In [76]:
env = gym.make('Pendulum-v0')
agent = ActorCritic_continuous(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [77]:
agent.train(env, update_limit=50000, samples_per_update=100, lr_policy=1e-4, lr_value=1e-3, checkpoint=100)

100: running_score:-1398.54, is_best:True
200: running_score:-1450.90, is_best:False
300: running_score:-1499.19, is_best:False
400: running_score:-1422.71, is_best:False
500: running_score:-1446.01, is_best:False
600: running_score:-1331.19, is_best:True
700: running_score:-1313.73, is_best:True
800: running_score:-1436.07, is_best:False
900: running_score:-1476.28, is_best:False
1000: running_score:-1495.54, is_best:False
1100: running_score:-1516.20, is_best:False
1200: running_score:-1395.03, is_best:False
1300: running_score:-1520.37, is_best:False
1400: running_score:-1486.66, is_best:False
1500: running_score:-1350.49, is_best:False
1600: running_score:-1399.30, is_best:False
1700: running_score:-1429.65, is_best:False
1800: running_score:-1420.74, is_best:False
1900: running_score:-1449.31, is_best:False
2000: running_score:-1362.80, is_best:False
2100: running_score:-1494.58, is_best:False
2200: running_score:-1462.82, is_best:False
2300: running_score:-1426.00, is_best:False


In [57]:
env = gym.make('Pendulum-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [59]:
env.observation_space.high

array([1., 1., 8.], dtype=float32)

In [60]:
env.observation_space.low

array([-1., -1., -8.], dtype=float32)