In [6]:
%matplotlib inline
from IPython import display
from IPython.display import HTML
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Normal, Categorical

import numpy as np
import random
import os
import gym

In [7]:
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor

In [8]:
def save_torch_model(model, filename):
  if not os.path.exists(os.path.dirname(filename)):
    os.makedirs(os.path.dirname(filename))
  torch.save(model.state_dict(), filename)

def load_torch_model(model, filename):
  model.load_state_dict(torch.load(filename))


In [9]:
def copy_grad(source, target):
  grads = []
  for param in source.parameters():
    grads.append(param.grad.clone())
  grads.reverse()
  for param in target.parameters():
    param.grad = grads.pop()

def zero_grad(model):
  for param in model.parameters():
    if type(param.grad) != type(None):
      param.grad.data.zero_()
      
def update_target(target_net, eval_net, tau):
  fast = eval_net.state_dict()
  slow = target_net.state_dict()
  for t in slow:
    slow[t] = slow[t] * (1. - tau) + fast[t] * tau

  target_net.load_state_dict(slow)

In [10]:
class PolicyNet_discret(nn.Module):
  def __init__(self, input_size, output_size):
    super(PolicyNet_discret,self).__init__()
    self.l1_linear = nn.Linear(input_size, 512)
    self.l2_linear = nn.Linear(512, 256)
    self.l3_linear = nn.Linear(256, output_size)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    nn.init.kaiming_normal_(self.l2_linear.weight)
    self.l3_linear.weight.data.zero_()
    
  def forward(self,x):
    out = F.relu(self.l1_linear(x))
    out = F.relu(self.l2_linear(out))
    out = F.softmax(self.l3_linear(out),dim=0)
    return out

In [11]:
class PolicyNet_continuous(nn.Module):
  def __init__(self, input_size, output_size):
    super(PolicyNet_continuous,self).__init__()
    self.l1_linear = nn.Linear(input_size,2048)
    self.l2_linear = nn.Linear(2048,512)
    self.l3_linear = nn.Linear(512,output_size)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    nn.init.kaiming_normal_(self.l2_linear.weight)
    self.l3_linear.weight.data.zero_()
    
  def forward(self,x):
    out = F.relu(self.l1_linear(x))
    out = F.relu(self.l2_linear(out))
    out = F.tanh(self.l3_linear(out))
    return out

In [12]:
class ValueNet(nn.Module):
  def __init__(self, input_size):
    super(ValueNet,self).__init__()
    self.l1_linear = nn.Linear(input_size, 512)
    self.l2_linear = nn.Linear(512,256)
    self.l3_linear = nn.Linear(256, 1)
    nn.init.kaiming_normal_(self.l1_linear.weight)
    nn.init.kaiming_normal_(self.l2_linear.weight)
    self.l3_linear.weight.data.zero_()

  def forward(self, x):
    out = F.relu(self.l1_linear(x))
    out = F.relu(self.l2_linear(out))
    out = self.l3_linear(out)
    return out    

In [13]:
class ActorCritic():
  def __init__(self, env, steps_in_state = 2):
    self.steps_in_state = steps_in_state
    self.policy = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state,env.action_space.n)
    self.value = ValueNet(env.observation_space.shape[0] * steps_in_state)
    if use_cuda:
      self.policy.cuda()
      self.value.cuda()
    self.env = env
    self._gamma = 0.96
    
  def pick_action(self, state):
    probs = self.policy(state)
    action_dist = Categorical(probs)
    action = action_dist.sample()
    action = action.item()
    return (action, action_dist.log_prob(FloatTensor([action])))
  
  def update_actor_critic(self, episode):
    (states, actions, rewards, next_states, log_probs, ended) = zip(*episode)
    
    rewards = FloatTensor(rewards)
    ended = FloatTensor(ended)
    state_value = self.value(torch.stack(states))
    next_state_value = self.value(torch.stack(next_states))
    target_value = rewards + (1 - ended) * self._gamma * next_state_value
    
    delta = target_value - state_value

    value_loss = F.mse_loss(state_value, target_value)
    
    policy_loss = []
    for log_prob, d in zip(log_probs, delta):
      policy_loss.append(-log_prob * d)
    policy_loss = torch.stack(policy_loss).sum()
    
    self.value_optimizer.zero_grad()
    value_loss.backward(retain_graph=True)
    self.value_optimizer.step()
    
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

  def train(self, env, episode, lr=1e-3, target_copylr=1e-3, lr_policy=None, lr_value=None, checkpoint=100):
    lr_policy = lr if lr_policy == None else lr_policy
    lr_value = lr if lr_value == None else lr_value
    self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr_policy, weight_decay=1e-3)
    self.value_optimizer = torch.optim.Adam(self.value.parameters(), lr=lr_value, weight_decay=1e-3)
    best_score = -99999
    running_score = None
    for i in range(episode):
      s0 = env.reset()
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode = []
      episode_ended = False
      score = 0
      while not episode_ended:
        (action, log_prob) =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step(action)
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        if episode_ended:
          ended = 1
        else:
          ended = 0
        episode.append((state, action, reward, next_state, log_prob, ended))

        s0 = s1
        state = next_state
        score += reward
        
      if running_score == None:
        running_score = score
      else:
        running_score = running_score * 0.9 + score * 0.1
        
      if (i + 1) % checkpoint == 0:
        is_best = False
        if running_score > best_score:
          is_best = True
          save_torch_model(self.policy, 'model/actor_critic_cartpole_policy_best.pth')
          best_score = running_score
        save_torch_model(self.policy,'model/actor_critic_cartpole_policy_iter_%d.pth' %(i+1))
        print('%d: running_score:%.2f, is_best:%s' %(i+1, running_score, is_best))

      self.update_actor_critic(episode)

In [112]:
env = gym.make('CartPole-v0')
agent = ActorCritic(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [177]:
agent.train(env, 1000, lr=1e-4, checkpoint=100)

100: running_score:-200.00, is_best:True
200: running_score:-200.00, is_best:False
300: running_score:-200.00, is_best:False
400: running_score:-200.00, is_best:False
500: running_score:-200.00, is_best:False
600: running_score:-200.00, is_best:False
700: running_score:-200.00, is_best:False
800: running_score:-200.00, is_best:False
900: running_score:-200.00, is_best:False
1000: running_score:-200.00, is_best:False


In [None]:
class ActorCritic_MountainCar():
  def __init__(self, env, steps_in_state = 2):
    self.steps_in_state = steps_in_state
    self.policy = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state,env.action_space.n)
    self.value = ValueNet(env.observation_space.shape[0] * steps_in_state)
    if use_cuda:
      self.policy.cuda()
      self.value.cuda()
    self.env = env
    self._gamma = 0.96
    
  def pick_action(self, state):
    probs = self.policy(state)
    action_dist = Categorical(probs)
    action = action_dist.sample()
    action = action.item()
    return (action, action_dist.log_prob(FloatTensor([action])))    
  
  def update_actor_critic(self, episode):
    (states, actions, rewards, next_states, log_probs, ended) = zip(*episode)
    
    rewards = FloatTensor(rewards)
    ended = FloatTensor(ended)
    state_value = self.value(torch.stack(states))
    next_state_value = self.value(torch.stack(next_states))
    target_value = rewards + (1 - ended) * self._gamma * next_state_value
    
    delta = target_value - state_value

    value_loss = F.mse_loss(state_value, target_value)
    
    policy_loss = []
    for log_prob, d in zip(log_probs, delta):
      policy_loss.append(-log_prob * d)
    policy_loss = torch.stack(policy_loss).sum()
    
    self.value_optimizer.zero_grad()
    value_loss.backward(retain_graph=True)
    self.value_optimizer.step()
    
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

  def train(self, env, episode, lr=1e-3, target_copylr=1e-3, lr_policy=None, lr_value=None, checkpoint=100):
    lr_policy = lr if lr_policy == None else lr_policy
    lr_value = lr if lr_value == None else lr_value
    self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr_policy, weight_decay=1e-3)
    self.value_optimizer = torch.optim.Adam(self.value.parameters(), lr=lr_value, weight_decay=1e-3)
    best_score = -99999
    running_score = None
    for i in range(episode):
      s0 = env.reset()
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode = []
      episode_ended = False
      score = 0
      while not episode_ended:
        (action, log_prob) =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step(action)
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        if episode_ended:
          ended = 1
        else:
          ended = 0
        episode.append((state, action, reward, next_state, log_prob, ended))

        s0 = s1
        state = next_state
        score += reward
        
      if running_score == None:
        running_score = score
      else:
        running_score = running_score * 0.9 + score * 0.1
        
      if (i + 1) % checkpoint == 0:
        is_best = False
        if running_score > best_score:
          is_best = True
          save_torch_model(self.policy, 'model/actor_critic_cartpole_policy_best.pth')
          best_score = running_score
        save_torch_model(self.policy,'model/actor_critic_cartpole_policy_iter_%d.pth' %(i+1))
        print('%d: running_score:%.2f, is_best:%s' %(i+1, running_score, is_best))

      self.update_actor_critic(episode)

In [171]:
env = gym.make('MountainCar-v0')
agent_mc = ActorCritic_MountainCar(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [175]:
env.observation_space.low

array([-1.2 , -0.07], dtype=float32)

In [184]:
env.observation_space.high

array([0.6 , 0.07], dtype=float32)

In [3]:
env = gym.make('MountainCar-v0')
o = env.reset()
env.render()
print(o)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[-0.44360457  0.        ]


In [183]:
(-1.2 + 0.3)/0.9

-0.9999999999999999

In [None]:
-0.07 = -1.0 , 0.07 = 1.0. in * 1.0/0.07
-1.2 = -1.0, 0.6 = 1.0

range = 1.8
in + 0.3 / 0.9 

In [142]:
agent_mc.train(env, 1000, lr=1e-6, checkpoint=100)

100: running_score:-200.00, is_best:True
200: running_score:-200.00, is_best:False
300: running_score:-200.00, is_best:False
400: running_score:-200.00, is_best:False
500: running_score:-200.00, is_best:False
600: running_score:-200.00, is_best:False
700: running_score:-200.00, is_best:False
800: running_score:-200.00, is_best:False
900: running_score:-200.00, is_best:False
1000: running_score:-200.00, is_best:False


In [20]:
class ActorCritic_target_eval():
  def __init__(self, env, steps_in_state = 2):
    self.steps_in_state = steps_in_state
    self.policy_target = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state,env.action_space.n)
    self.policy_eval = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state,env.action_space.n)
    self.value_target = ValueNet(env.observation_space.shape[0] * steps_in_state)
    self.value_eval = ValueNet(env.observation_space.shape[0] * steps_in_state)
    if use_cuda:
      self.policy_target.cuda()
      self.policy_eval.cuda()
      self.value_target.cuda()
      self.value_eval.cuda()
    self.policy_eval.load_state_dict(self.policy_target.state_dict())
    self.value_eval.load_state_dict(self.value_target.state_dict())
    self.env = env
    self._gamma = 0.96
    
  def pick_action(self, state):
    probs = self.policy_target(state)
    action_dist = Categorical(probs)
    action = action_dist.sample()
    action = action.item()
    return (action, action_dist.log_prob(FloatTensor([action])))
  
  def update_actor_critic(self, episode):
    (states, actions, rewards, next_states, log_probs, ended) = zip(*episode)
    
    rewards = FloatTensor(rewards)
    ended = FloatTensor(ended)
    state_value = self.value_target(torch.stack(states))
    next_state_value = self.value_target(torch.stack(next_states))
    target_value = rewards + (1 - ended) * self._gamma * next_state_value
    
    delta = target_value - state_value

    value_loss = F.mse_loss(state_value, target_value)
    
    policy_loss = []
    for log_prob, d in zip(log_probs, delta):
      policy_loss.append(-log_prob * d)
    policy_loss = torch.stack(policy_loss).sum()
    
    self.value_optimizer.zero_grad()
    zero_grad(self.value_target)
    value_loss.backward(retain_graph=True)
    copy_grad(self.value_target, self.value_eval)
    self.value_optimizer.step()
    
    self.policy_optimizer.zero_grad()
    zero_grad(self.policy_target)
    policy_loss.backward(retain_graph=True)
    copy_grad(self.policy_target, self.policy_eval)
    self.policy_optimizer.step()

  def train(self, env, episode, lr=1e-3, lr_policy=None, lr_value=None, checkpoint=100):
    lr_policy = lr if lr_policy == None else lr_policy
    lr_value = lr if lr_value == None else lr_value
    self.policy_optimizer = torch.optim.Adam(self.policy_eval.parameters(), lr=lr_policy, weight_decay=1e-3)
    self.value_optimizer = torch.optim.Adam(self.value_eval.parameters(), lr=lr_value, weight_decay=1e-3)
    best_score = -99999
    running_score = None
    sample_per_update = 500
    sample = []
    nn_update_count = 0
    for i in range(episode):
      s0 = env.reset()
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode_ended = False
      score = 0
      while not episode_ended:
        (action, log_prob) =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step(action)
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        if episode_ended:
          ended = 1
        else:
          ended = 0
        sample.append((state, action, reward, next_state, log_prob, ended))

        s0 = s1
        state = next_state
        score += reward
        
      if running_score == None:
        running_score = score
      else:
        running_score = running_score * 0.9 + score * 0.1

      if (i + 1) % checkpoint == 0:
        is_best = False
        if running_score > best_score:
          is_best = True
          save_torch_model(self.policy_target, 'model/actor_critic_cartpole_policy_best.pth')
          best_score = running_score
        save_torch_model(self.policy_target,'model/actor_critic_cartpole_policy_iter_%d.pth' %(i+1))
        print('%d: running_score:%.2f, is_best:%s' %(i+1, running_score, is_best))
      
      if len(sample) > sample_per_update:
        self.update_actor_critic(sample)
        nn_update_count += 1

      if nn_update_count % 5 == 0:
        update_target(self.policy_target, self.policy_eval, 0.1)
        update_target(self.value_target, self.value_eval, 0.1)
        nn_update_count = 0


In [21]:
env = gym.make('CartPole-v0')
agent_target_eval = ActorCritic_target_eval(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [22]:
agent_target_eval.train(env, 3000, lr=5e-5, checkpoint=100)

KeyboardInterrupt: 

In [138]:
class ActorCritic_target_eval_continuous():
  def __init__(self, env, steps_in_state = 2):
    self.steps_in_state = steps_in_state
    self.policy_target = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state, env.action_space.shape[0])
    self.policy_eval = PolicyNet_discret(env.observation_space.shape[0] * steps_in_state, env.action_space.shape[0])
    self.value_target = ValueNet(env.observation_space.shape[0] * steps_in_state)
    self.value_eval = ValueNet(env.observation_space.shape[0] * steps_in_state)
    if use_cuda:
      self.policy_target.cuda()
      self.policy_eval.cuda()
      self.value_target.cuda()
      self.value_eval.cuda()
    self.policy_eval.load_state_dict(self.policy_target.state_dict())
    self.value_eval.load_state_dict(self.value_target.state_dict())
    self.env = env
    self.range_scale = (env.action_space.high[0] - env.action_space.low[0]) / 2.0

    self._gamma = 0.96
    
  def pick_action(self, state):
    probs = self.policy_target(state) * self.range_scale
    action_dist = Normal(probs, 0.01)
    action = action_dist.sample()
    action = action.item()
    return (action, action_dist.log_prob(FloatTensor([action])))
  
  def update_actor_critic(self, episode):
    (states, actions, rewards, next_states, log_probs, ended) = zip(*episode)
    
    rewards = FloatTensor(rewards)
    ended = FloatTensor(ended)
    state_value = self.value_target(torch.stack(states))
    next_state_value = self.value_target(torch.stack(next_states))
    target_value = rewards + (1 - ended) * self._gamma * next_state_value
    
    delta = target_value - state_value

    value_loss = F.mse_loss(state_value, target_value)
    
    policy_loss = []
    for log_prob, d in zip(log_probs, delta):
      policy_loss.append(-log_prob * d)
    policy_loss = torch.stack(policy_loss).sum()
    
    self.value_optimizer.zero_grad()
    zero_grad(self.value_target)
    value_loss.backward(retain_graph=True)
    copy_grad(self.value_target, self.value_eval)
    self.value_optimizer.step()
    
    self.policy_optimizer.zero_grad()
    zero_grad(self.policy_target)
    policy_loss.backward()
    copy_grad(self.policy_target, self.policy_eval)
    self.policy_optimizer.step()
    
  def train(self, env, episode, lr=1e-3, lr_policy=None, lr_value=None, checkpoint=100):
    train_data = {
      'model':{
        'policy_eval':self.policy_eval.state_dict(),
        'policy_target':self.policy_target.state_dict(),
        'value_eval':self.value_eval.state_dict(),
        'value_target':self.value_target.state_dict()
      },
      'episode':0,
      'statistic':{
      }
    }
    lr_policy = lr if lr_policy == None else lr_policy
    lr_value = lr if lr_value == None else lr_value
    self.policy_optimizer = torch.optim.SGD(self.policy_eval.parameters(), lr=lr_policy, weight_decay=1e-3)
    self.value_optimizer = torch.optim.SGD(self.value_eval.parameters(), lr=lr_value, weight_decay=1e-3)
    best_score = -99999
    running_score = None
    
    for i in range(episode):
      s0 = env.reset()
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode = []
      episode_ended = False
      score = 0
      while not episode_ended:
        (action, log_prob) =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step([action])
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        if episode_ended:
          ended = 1
        else:
          ended = 0
        episode.append((state, action, reward, next_state, log_prob, ended))

        s0 = s1
        state = next_state
        score += reward
        
      if running_score == None:
        running_score = score
      else:
        running_score = running_score * 0.9 + score * 0.1
        
      if (i + 1) % checkpoint == 0:
        is_best = False
        if running_score > best_score:
          is_best = True
          save_torch_model(self.policy_target, 'model/actor_critic_pendulum_policy_best.pth')
          best_score = running_score
        save_torch_model(self.policy_target,'model/actor_critic_pendulum_policy_iter_%d.pth' %(i+1))
        print('%d: running_score:%.2f, is_best:%s' %(i+1, running_score, is_best))
        
      self.update_actor_critic(episode)

      if i % 20 == 0:
        update_target(self.policy_target, self.policy_eval, 0.1)
        update_target(self.value_target, self.value_eval, 0.1)

In [139]:
env = gym.make('Pendulum-v0')
agent_target_eval_continuous = ActorCritic_target_eval_continuous(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [140]:
agent_target_eval_continuous.train(env, 8000, lr=1e-6, checkpoint=50)

50: running_score:-1377.57, is_best:True
100: running_score:-1523.91, is_best:False
150: running_score:-1418.31, is_best:False
200: running_score:-1345.32, is_best:True
250: running_score:-1478.96, is_best:False
300: running_score:-1399.82, is_best:False
350: running_score:-1368.12, is_best:False
400: running_score:-1390.04, is_best:False


KeyboardInterrupt: 

In [86]:
class ActorCritic_continuous():
  def __init__(self, env, steps_in_state = 2):
    self.steps_in_state = steps_in_state
    self.policy = PolicyNet_continuous(env.observation_space.shape[0] * steps_in_state, env.action_space.shape[0])
    self.value = ValueNet(env.observation_space.shape[0] * steps_in_state)
    self.range_scale = (env.action_space.high[0] - env.action_space.low[0]) / 2.0
    self.env = env
    self._gamma = 0.96
    
  def predict_value(self, state):
    return self.value(state)
  
  def predict_action(self, state):
    return self.policy(state)
  
  def pick_action(self, state):
    probs = self.predict_action(state) * self.range_scale
    action_dist = Normal(probs, 0.2)
    action = action_dist.sample()
    action = action.item()
    return (action, action_dist.log_prob(FloatTensor([action])))
  
  def update_actor_critic(self, episode):
    (states, actions, rewards, next_states, log_probs, ended) = zip(*episode)
    
    rewards = FloatTensor(rewards)
    ended = FloatTensor(ended)
    state_value = self.value(torch.stack(states))
    next_state_value = self.value(torch.stack(next_states))
    target_value = rewards + (1 - ended) * self._gamma * next_state_value
    
    delta = target_value - state_value

    value_loss = F.mse_loss(state_value, target_value)
    
    policy_loss = []
    for log_prob, d in zip(log_probs, delta):
      policy_loss.append(-log_prob * d)
    policy_loss = torch.stack(policy_loss).sum()
    
    self.value_optimizer.zero_grad()
    value_loss.backward(retain_graph=True)
    self.value_optimizer.step()
    
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

  def train(self, env, episode, lr=1e-3, lr_policy=None, lr_value=None, checkpoint=100):
    lr_policy = lr if lr_policy == None else lr_policy
    lr_value = lr if lr_value == None else lr_value
    self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr_policy, weight_decay=1e-3)
    self.value_optimizer = torch.optim.Adam(self.value.parameters(), lr=lr_value, weight_decay=1e-3)
    best_score = -99999
    for i in range(episode):
      s0 = env.reset()
      seq = [s0] * self.steps_in_state
      state = FloatTensor(seq).view(-1)
      episode = []
      episode_ended = False
      score = 0
      while not episode_ended:
        (action, log_prob) =  self.pick_action(state)
        (s1, reward, episode_ended, info) = env.step([action])
        seq = seq[1:]
        seq.append(s1)
        next_state = FloatTensor(seq).view(-1)
        if episode_ended:
          ended = 1
        else:
          ended = 0
        episode.append((state, action, reward, next_state, log_prob, ended))

        s0 = s1
        state = next_state
        score += reward
        
      if score > best_score:
        save_torch_model(self.policy, 'model/actor_critic_Pendulum_policy_best.pth')
        best_score = score
        print('new best score:',best_score)
        
      self.update_actor_critic(episode)

      if (i + 1) % checkpoint == 0:
        save_torch_model(self.policy,'model/actor_critic_Pendulum_policy_iter_%d.pth' %(i+1))
        print(i+1,': score:', score)


In [87]:
env = gym.make('Pendulum-v0')
agent = ActorCritic_continuous(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [88]:
agent.train(env, 1000, lr=1e-4, checkpoint=50)

new best score: -1420.6990168430254
new best score: -1240.4158551602857
new best score: -1181.922163316851
new best score: -1116.51925322667
new best score: -1080.0822897288838
new best score: -715.8367356613004
50 : score: -1513.117540566958
100 : score: -1511.7525513447758
150 : score: -1558.7815375703308
200 : score: -1228.3368534381889
250 : score: -1386.1700872323856
300 : score: -1455.4761546471093
350 : score: -1368.2759435930686
400 : score: -1509.7915599460023
450 : score: -1517.2225833786401
500 : score: -1417.091764092609
550 : score: -1522.2141861350397
600 : score: -1366.1865629114368
650 : score: -1451.0626714816392
new best score: -528.1708752457364
700 : score: -1510.3500868767105
750 : score: -1532.600699011077
800 : score: -1483.236599622256
850 : score: -1574.0829310164986
900 : score: -1634.894965989276
950 : score: -1477.6349356056828
1000 : score: -859.2797486401734


In [56]:
env = gym.make('KungFuMaster-ram-v0')
env = gym.make('Pendulum-v0')
def avg_reward_random(env, iteration):
  total_reward = 0.0
  for i in range(iteration):
    env.reset()
    ended = False
    while not ended:
      (state, reward, ended, info) = env.step(env.action_space.sample())
      total_reward += reward
  print (total_reward / iteration)
avg_reward_random(env, 50)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
-1219.606081432608


In [206]:
l = nn.Linear(50,1)

In [211]:
l.weight.data.zero_()

tensor([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.]])

In [213]:
l.weight

Parameter containing:
tensor([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.]])