In [4]:
import numpy as np
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.normal import Normal

In [13]:
# buffer.py
class ReplayBuffer():
  """Fixed-capacity ring buffer of (state, action, reward, next state, done) transitions.

  Once `max_size` transitions have been stored, new ones overwrite the oldest
  slots in insertion order.
  """

  def __init__(self, max_size, input_shape, n_actions):
    """Pre-allocate zero-filled storage for `max_size` transitions.

    Args:
      max_size: number of slots in the buffer.
      input_shape: iterable of state dimensions; unpacked after the slot axis.
      n_actions: width of one stored action row.
    """
    self.mem_size = max_size
    self.mem_cntr = 0  # counts every transition ever stored, never wraps
    state_store_shape = (self.mem_size, *input_shape)
    self.state_memory = np.zeros(state_store_shape)
    self.new_state_memory = np.zeros(state_store_shape)
    self.action_memory = np.zeros((self.mem_size, n_actions))
    self.reward_memory = np.zeros(self.mem_size)
    self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

  def store_transition(self, state, action, reward, state_, done):
    """Write one transition into the next slot, wrapping around when full."""
    slot = self.mem_cntr % self.mem_size
    self.new_state_memory[slot] = state_
    self.state_memory[slot] = state
    self.action_memory[slot] = action
    self.reward_memory[slot] = reward
    self.terminal_memory[slot] = done
    self.mem_cntr += 1

  def sample_buffer(self, batch_size):
    """Draw `batch_size` distinct stored transitions uniformly at random.

    Returns:
      Tuple (states, actions, rewards, next_states, dones) of arrays indexed
      by the same random slot selection.
    """
    # Only slots that have actually been written are eligible.
    filled = min(self.mem_cntr, self.mem_size)
    picks = np.random.choice(filled, batch_size, replace=False)
    return (
      self.state_memory[picks],
      self.action_memory[picks],
      self.reward_memory[picks],
      self.new_state_memory[picks],
      self.terminal_memory[picks],
    )

  and should_run_async(code)


In [6]:
# networks.py
class CriticNetwork(nn.Module):
  """Q-value network: maps a (state, action) pair to a single scalar.

  Two ReLU hidden layers followed by a linear head of width 1. The module owns
  its Adam optimizer and moves itself to CUDA when available, else CPU.
  """

  def __init__(self, beta, input_dims, n_actions, fc1_dims=256, fc2_dims=256, name='critic', chkpt_dir='tmp/sac'):
    """
    Args:
      beta: learning rate passed to Adam.
      input_dims: sequence whose first entry is the state vector length.
      n_actions: length of the action vector concatenated onto the state.
      fc1_dims: width of the first hidden layer.
      fc2_dims: width of the second hidden layer.
      name: prefix of the checkpoint file name (`<name>_sac`).
      chkpt_dir: directory that holds the checkpoint file.
    """
    super(CriticNetwork, self).__init__()
    self.input_dims = input_dims
    self.fc1_dims = fc1_dims
    self.fc2_dims = fc2_dims
    self.n_actions = n_actions
    self.name = name
    self.checkpoint_dir = chkpt_dir
    self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac')

    # The first layer consumes the state and action concatenated along dim 1.
    self.fc1 = nn.Linear(self.input_dims[0] + n_actions, self.fc1_dims)
    self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
    self.q = nn.Linear(self.fc2_dims, 1)

    self.optimizer = optim.Adam(self.parameters(), lr=beta)
    self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
    self.to(self.device)

  def forward(self, state, action):
    """Return the Q estimate, shape (batch, 1), for batched states and actions."""
    action_value = self.fc1(T.cat([state, action], dim=1))
    action_value = F.relu(action_value)
    action_value = self.fc2(action_value)
    action_value = F.relu(action_value)
    q = self.q(action_value)
    return q

  def save_checkpoint(self):
    """Write the state dict to `checkpoint_file`, creating the directory if needed."""
    # T.save does not create missing parent directories, so do it here.
    if self.checkpoint_dir:
      os.makedirs(self.checkpoint_dir, exist_ok=True)
    T.save(self.state_dict(), self.checkpoint_file)

  def load_checkpoint(self):
    """Restore the state dict from `checkpoint_file` onto this module's device."""
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only host.
    self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

In [7]:
# networks.py
class ValueNetwork(nn.Module):
  """State-value network: maps a state to a single scalar.

  Two ReLU hidden layers followed by a linear head of width 1. The module owns
  its Adam optimizer and moves itself to CUDA when available, else CPU.
  """

  def __init__(self, beta, input_dims, fc1_dims=256, fc2_dims=256, name='value', chkpt_dir='tmp/sac'):
    """
    Args:
      beta: learning rate passed to Adam.
      input_dims: sequence unpacked into the first layer's input size.
      fc1_dims: width of the first hidden layer.
      fc2_dims: width of the second hidden layer.
      name: prefix of the checkpoint file name (`<name>_sac`).
      chkpt_dir: directory that holds the checkpoint file.
    """
    super(ValueNetwork, self).__init__()
    self.input_dims = input_dims
    self.fc1_dims = fc1_dims
    self.fc2_dims = fc2_dims
    self.name = name
    self.checkpoint_dir = chkpt_dir
    self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac')

    self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
    self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
    self.v = nn.Linear(self.fc2_dims, 1)

    self.optimizer = optim.Adam(self.parameters(), lr=beta)
    self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
    self.to(self.device)

  def forward(self, state):
    """Return the value estimate, shape (batch, 1), for a batch of states."""
    state_value = self.fc1(state)
    state_value = F.relu(state_value)
    state_value = self.fc2(state_value)
    state_value = F.relu(state_value)

    v = self.v(state_value)
    return v

  def save_checkpoint(self):
    """Write the state dict to `checkpoint_file`, creating the directory if needed."""
    # T.save does not create missing parent directories, so do it here.
    if self.checkpoint_dir:
      os.makedirs(self.checkpoint_dir, exist_ok=True)
    T.save(self.state_dict(), self.checkpoint_file)

  def load_checkpoint(self):
    """Restore the state dict from `checkpoint_file` onto this module's device."""
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only host.
    self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

In [17]:
# networks.py
# the harder part of the problem
class ActorNetwork(nn.Module):
  """Gaussian policy network with tanh-squashed, `max_action`-scaled outputs.

  Two ReLU hidden layers feed two linear heads that produce the per-dimension
  mean and standard deviation of a Normal distribution. The module owns its
  Adam optimizer and moves itself to CUDA when available, else CPU.
  """

  def __init__(self, alpha, input_dims, max_action, fc1_dims=256, fc2_dims=256,
               n_actions=2, name='actor', chkpt_dir='tmp/sac'):
    """
    Args:
      alpha: learning rate passed to Adam.
      input_dims: sequence unpacked into the first layer's input size.
      max_action: scalar or per-dimension bound that multiplies the tanh output.
      fc1_dims: width of the first hidden layer.
      fc2_dims: width of the second hidden layer.
      n_actions: number of action dimensions (width of both heads).
      name: prefix of the checkpoint file name (`<name>_sac`).
      chkpt_dir: directory that holds the checkpoint file.
    """
    super(ActorNetwork, self).__init__()
    self.input_dims = input_dims
    self.fc1_dims = fc1_dims
    self.fc2_dims = fc2_dims
    self.n_actions = n_actions
    self.name = name
    self.checkpoint_dir = chkpt_dir
    self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac')
    self.max_action = max_action
    # Small constant: lower bound for sigma and guard against log(0).
    self.reparam_noise = 1e-6

    self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
    self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
    self.mu = nn.Linear(self.fc2_dims, self.n_actions)
    self.sigma = nn.Linear(self.fc2_dims, self.n_actions)

    self.optimizer = optim.Adam(self.parameters(), lr=alpha)
    self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

    self.to(self.device)

  def forward(self, state):
    """Return (mu, sigma) for a batch of states; sigma is clamped to [1e-6, 1]."""
    prob = self.fc1(state)
    prob = F.relu(prob)
    prob = self.fc2(prob)
    prob = F.relu(prob)

    mu = self.mu(prob)
    sigma = self.sigma(prob)

    # Keep the standard deviation strictly positive and bounded.
    sigma = T.clamp(sigma, min=self.reparam_noise, max=1)

    return mu, sigma

  def sample_normal(self, state, reparameterize=True):
    """Sample squashed actions and their summed log-probabilities.

    Args:
      state: batch of states, shape (batch, *input_dims).
      reparameterize: if True use `rsample()` so gradients flow through the
        sample; otherwise use `sample()`.

    Returns:
      (action, log_probs): `action` is tanh(sample) * max_action with shape
      (batch, n_actions); `log_probs` has shape (batch, 1).
    """
    mu, sigma = self.forward(state)
    probabilities = Normal(mu, sigma)

    if reparameterize:
      raw_actions = probabilities.rsample()
    else:
      raw_actions = probabilities.sample()

    squashed = T.tanh(raw_actions)
    # Force float32 on the right device: NumPy bounds may be float64, which
    # would otherwise promote the actions to double.
    scale = T.as_tensor(self.max_action, dtype=T.float, device=self.device)
    action = squashed * scale

    log_probs = probabilities.log_prob(raw_actions)
    # The change-of-variables term must use the tanh output, not the scaled
    # action: with max_action > 1, 1 - action**2 can go negative and log()
    # would return NaN. The constant log(max_action) term is omitted; it does
    # not depend on the network parameters.
    log_probs = log_probs - T.log(1 - squashed.pow(2) + self.reparam_noise)
    log_probs = log_probs.sum(1, keepdim=True)

    return action, log_probs

  def save_checkpoint(self):
    """Write the state dict to `checkpoint_file`, creating the directory if needed."""
    # T.save does not create missing parent directories, so do it here.
    if self.checkpoint_dir:
      os.makedirs(self.checkpoint_dir, exist_ok=True)
    T.save(self.state_dict(), self.checkpoint_file)

  def load_checkpoint(self):
    """Restore the state dict from `checkpoint_file` onto this module's device."""
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only host.
    self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

In [20]:
# sac_torch.py
class Agent():
  """SAC-style agent: one actor, two critics, a value network and its target copy.

  Holds the replay buffer and performs one gradient update of every network
  per call to `learn()`.
  """

  def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
               gamma=0.99, n_actions=2, max_size=1000000, tau=0.005, layer1_size=256,
               layer2_size=256, batch_size=256, reward_scale=2, chkpt_dir='tmp/sac'):
    """
    Args:
      alpha: actor learning rate.
      beta: learning rate of the critic and value networks.
      input_dims: observation shape.
      env: environment; `env.action_space.high` is used as the action bound.
      gamma: discount factor applied to the target value.
      n_actions: number of action dimensions.
      max_size: replay buffer capacity.
      tau: soft-update coefficient for the target value network.
      layer1_size: width of the first hidden layer of every network.
      layer2_size: width of the second hidden layer of every network.
      batch_size: number of transitions sampled per `learn()` call.
      reward_scale: multiplier applied to rewards in the critic target.
      chkpt_dir: directory used by every network for checkpoints.

    Raises:
      ValueError: if `env` is None.
    """
    if env is None:
      raise ValueError('env is required: the actor reads env.action_space.high')

    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    # Forward layer sizes and checkpoint directory to every network, so that
    # the constructor arguments actually take effect everywhere.
    net_kwargs = dict(fc1_dims=layer1_size, fc2_dims=layer2_size, chkpt_dir=chkpt_dir)
    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions, name='actor',
                              max_action=env.action_space.high, **net_kwargs)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions, name='critic_1',
                                  **net_kwargs)
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions, name='critic_2',
                                  **net_kwargs)
    self.value = ValueNetwork(beta, input_dims, name='value', **net_kwargs)
    self.target_value = ValueNetwork(beta, input_dims, name='target_value', **net_kwargs)

    self.scale = reward_scale

    # tau=1 makes the target an exact copy of the value network at start.
    self.update_network_parameters(tau=1)

  def choose_action(self, observation):
    """Sample one action (NumPy array) from the actor for a single observation."""
    # Go through a single ndarray so the tensor is not built from a Python list.
    state = T.tensor(np.array([observation]), dtype=T.float).to(self.actor.device)
    actions, _ = self.actor.sample_normal(state, reparameterize=False)
    return actions.cpu().detach().numpy()[0]

  def remember(self, state, action, reward, new_state, done):
    """Store one transition in the replay buffer."""
    self.memory.store_transition(state, action, reward, new_state, done)

  def update_network_parameters(self, tau=None):
    """Blend the value network into the target: target = tau*value + (1-tau)*target.

    Args:
      tau: blend coefficient; defaults to `self.tau`. `tau=1` is a hard copy.
    """
    if tau is None:
      tau = self.tau
    # Both networks share one architecture, so their parameters line up 1:1.
    # no_grad keeps the update out of autograd.
    with T.no_grad():
      for target_param, param in zip(self.target_value.parameters(),
                                     self.value.parameters()):
        target_param.copy_(tau * param + (1 - tau) * target_param)

  def save_models(self):
    """Checkpoint all five networks."""
    print('...saving models...')
    self.actor.save_checkpoint()
    self.value.save_checkpoint()
    self.target_value.save_checkpoint()
    self.critic_1.save_checkpoint()
    self.critic_2.save_checkpoint()

  def load_models(self):
    """Restore all five networks from their checkpoints."""
    print('...loading models...')
    self.actor.load_checkpoint()
    self.value.load_checkpoint()
    self.target_value.load_checkpoint()
    self.critic_1.load_checkpoint()
    self.critic_2.load_checkpoint()

  def _min_q(self, state, actions):
    """Element-wise minimum of the two critics' estimates, flattened to (batch,)."""
    q1 = self.critic_1(state, actions)
    q2 = self.critic_2(state, actions)
    return T.min(q1, q2).view(-1)

  def learn(self):
    """Run one update of the value, actor and critic networks, then soft-update the target.

    Does nothing until the buffer holds at least `batch_size` transitions.
    """
    if self.memory.mem_cntr < self.batch_size:
      return

    state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

    device = self.actor.device
    reward = T.tensor(reward, dtype=T.float).to(device)
    state = T.tensor(state, dtype=T.float).to(device)
    state_ = T.tensor(new_state, dtype=T.float).to(device)
    done = T.tensor(done).to(device)
    action = T.tensor(action, dtype=T.float).to(device)

    # Every regression target is built under no_grad: targets are constants,
    # so no graph has to be retained between the three backward passes.
    with T.no_grad():
      value_ = self.target_value(state_).view(-1)
      value_[done] = 0.0  # terminal next-states contribute no future value

      sampled_actions, sampled_log_probs = self.actor.sample_normal(state, reparameterize=False)
      value_target = self._min_q(state, sampled_actions) - sampled_log_probs.view(-1)

    # Value network update.
    value = self.value(state).view(-1)
    value_loss = 0.5 * F.mse_loss(value, value_target)
    self.value.optimizer.zero_grad()
    value_loss.backward()
    self.value.optimizer.step()

    # Actor update. The Q estimate must come from the reparameterized sample,
    # otherwise no gradient flows from the critics back into the policy.
    actions, log_probs = self.actor.sample_normal(state, reparameterize=True)
    log_probs = log_probs.view(-1)
    critic_value = self._min_q(state, actions)
    actor_loss = T.mean(log_probs - critic_value)
    self.actor.optimizer.zero_grad()
    actor_loss.backward()
    self.actor.optimizer.step()

    # Critic update. zero_grad also discards the gradients that the actor
    # backward pass left on the critics.
    self.critic_1.optimizer.zero_grad()
    self.critic_2.optimizer.zero_grad()
    q_hat = self.scale * reward + self.gamma * value_
    q1_old_policy = self.critic_1(state, action).view(-1)
    q2_old_policy = self.critic_2(state, action).view(-1)
    critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
    critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

    critic_loss = critic_1_loss + critic_2_loss
    critic_loss.backward()
    self.critic_1.optimizer.step()
    self.critic_2.optimizer.step()

    self.update_network_parameters()

In [9]:
# Install PyBullet into the runtime; the later `import pybullet_envs` cell
# presumably relies on this package (the version is not pinned here).
!pip install pybullet

Collecting pybullet
  Downloading pybullet-3.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading pybullet-3.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (103.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.2/103.2 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.2.6


In [10]:
import pybullet_envs
import gym

  logger.warn(


In [11]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

  and should_run_async(code)


Mounted at /content/drive


In [21]:
if __name__ == '__main__':
  # --- setup: environment, agent sized to its spaces, run configuration ---
  env = gym.make('InvertedPendulumBulletEnv-v0')
  agent = Agent(
    input_dims=env.observation_space.shape,
    env=env,
    n_actions=env.action_space.shape[0],
  )
  n_games = 250
  filename = 'inverted_pendulum.png'
  figure_file = 'plots/' + filename

  # Start from the lowest reward the environment reports.
  best_score = env.reward_range[0]
  score_history = []
  load_checkpoint = False  # True: restore saved networks and skip learning

  if load_checkpoint:
    agent.load_models()
    env.render(mode='human')

  # --- episode loop ---
  for i in range(n_games):
    observation, done, score = env.reset(), False, 0
    while not done:
      action = agent.choose_action(observation)
      observation_, reward, done, info = env.step(action)
      score += reward
      agent.remember(observation, action, reward, observation_, done)
      if not load_checkpoint:
        agent.learn()
      observation = observation_

    score_history.append(score)
    # Running mean over at most the last 100 episodes.
    avg_score = np.mean(score_history[-100:])

    # Track the best running average; checkpointing on a new best is
    # currently disabled (no save_models call is made).
    if avg_score > best_score:
      best_score = avg_score

    print('episode ', i, 'score %.1f' % score, 'avg score %.1f' % avg_score)

  deprecation(
  deprecation(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


episode  0 score 22.0 avg score 22.0
episode  1 score 25.0 avg score 23.5
episode  2 score 33.0 avg score 26.7
episode  3 score 29.0 avg score 27.2
episode  4 score 22.0 avg score 26.2
episode  5 score 23.0 avg score 25.7
episode  6 score 46.0 avg score 28.6
episode  7 score 27.0 avg score 28.4
episode  8 score 74.0 avg score 33.4
episode  9 score 20.0 avg score 32.1
episode  10 score 36.0 avg score 32.5
episode  11 score 67.0 avg score 35.3
episode  12 score 31.0 avg score 35.0
episode  13 score 47.0 avg score 35.9
episode  14 score 23.0 avg score 35.0
episode  15 score 38.0 avg score 35.2
episode  16 score 38.0 avg score 35.4
episode  17 score 32.0 avg score 35.2
episode  18 score 20.0 avg score 34.4
episode  19 score 47.0 avg score 35.0
episode  20 score 17.0 avg score 34.1
episode  21 score 27.0 avg score 33.8
episode  22 score 23.0 avg score 33.3
episode  23 score 21.0 avg score 32.8
episode  24 score 19.0 avg score 32.3
episode  25 score 19.0 avg score 31.8
episode  26 score 37.0