In [None]:
################################################################################
# BELOW IS CODE ADAPTED FROM:
# 
# Escape Velocity Labs, 28/08/22, 'Advanced Reinforcement Learning in Python: from DQN to SAC', 
# Accessed at https://www.udemy.com/course/advanced-reinforcement/ 
#
# Note: Whilst the structure of this code is heavily taken from Escape Velocity Labs, there are also
# some self-coded elements involved in this adaptation.
################################################################################

In [None]:
### Install dependencies (system package first, then the Python packages)
### Note: you need to restart the runtime after you run this cell

!apt-get install -y xvfb # Used to display videos and GUI's in this notebook

!pip install \
    gym==0.21 \
    gym[box2d] \
    pytorch-lightning==1.6.0 \
    optuna \
    pyvirtualdisplay

# gym -> Reinforcement learning environments
# gym[box2d] -> Dependency for the box2d category of environments in gym
# pytorch-lightning -> Training-loop framework used to train the neural networks
# optuna -> Hyperparameter search
# pyvirtualdisplay -> Virtual display used to render the environment in this notebook

In [None]:
### Create a virtual display so the environment can be rendered on Google's (headless) servers

from pyvirtualdisplay import Display
# Start a hidden (visible=False) 1400x900 display for the rest of the session
Display(visible=False, size=(1400, 900)).start()

<pyvirtualdisplay.display.Display at 0x7f1d932eb690>

In [None]:
### Import remaining packages and set up variables related to hardware

import copy
import gym
import torch
import random
import itertools
import matplotlib.pyplot as plt
import pandas as pd
import optuna

import numpy as np
import torch.nn.functional as F

from collections import deque, namedtuple
from IPython.display import HTML
from base64 import b64encode

from torch import Tensor, nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
from torch.optim import AdamW

from pytorch_lightning import LightningModule, Trainer

from pytorch_lightning.callbacks import EarlyStopping

from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit

from optuna.integration import PyTorchLightningPruningCallback

# Pick the first CUDA device when one is present, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = 'cuda:0'
else:
  device = 'cpu'

# Number of CUDA devices PyTorch can see (0 on a CPU-only machine)
num_gpus = torch.cuda.device_count()

In [None]:
### Create a function which takes episode number as input, and outputs a video of the environment of the given episode

def display_video(episode=0):
  """Return an inline HTML5 video player for a recorded episode.

  Args:
    episode: Episode number; selects the file
      /content/videos/rl-video-episode-{episode}.mp4.

  Returns:
    An IPython HTML object embedding the video as a base64 data URL.

  Raises:
    FileNotFoundError: if no recording exists for that episode.
  """
  # Read-only binary mode is all that is needed, and the context manager
  # guarantees the file handle is closed once the bytes have been read.
  with open(f'/content/videos/rl-video-episode-{episode}.mp4', "rb") as video:
    video_file = video.read()
  video_url = f"data:video/mp4;base64,{b64encode(video_file).decode()}"
  return HTML(f"<video width=600 controls><source src='{video_url}'></video>")

In [None]:
### Create the actor network class

class Actor(nn.Module):
  """Deterministic policy network mapping observations to bounded actions.

  The network is an MLP with ReLU activations and a final Tanh, whose output
  is scaled by the upper action bound.

  Args:
    hidden_size: Width of every hidden layer.
    obs_size: Number of observation features.
    out_dims: Number of action dimensions.
    min: NumPy array with the lower bound of each action dimension.
    max: NumPy array with the upper bound of each action dimension.
    num_layers: Number of hidden-to-hidden layers placed after the input
      layer (must be >= 1). Values 1, 2 and 3 build exactly the same
      architectures as before; larger values simply add more layers.

  Raises:
    ValueError: if num_layers is smaller than 1.
  """

  def __init__(self, hidden_size, obs_size, out_dims, min, max, num_layers):
    super().__init__()
    if num_layers < 1:
      raise ValueError(f"num_layers must be >= 1, got {num_layers}")

    # Non-persistent buffers follow the module through .to()/.cuda() (so the
    # bounds always live on the same device as the weights) without adding
    # any keys to the state_dict.
    self.register_buffer("min", torch.from_numpy(min), persistent=False)
    self.register_buffer("max", torch.from_numpy(max), persistent=False)

    # Input layer, `num_layers` hidden-to-hidden layers, then the Tanh head.
    layers = [nn.Linear(obs_size, hidden_size), nn.ReLU()]
    for _ in range(num_layers):
      layers += [nn.Linear(hidden_size, hidden_size), nn.ReLU()]
    layers += [nn.Linear(hidden_size, out_dims), nn.Tanh()]
    self.net = nn.Sequential(*layers)

  # This function takes a state as input and outputs the noise-free action
  # (Tanh output scaled by the upper action bound) as a tensor.
  def mu(self, x):
    if isinstance(x, np.ndarray):
      # Move NumPy input to wherever this module currently lives
      x = torch.from_numpy(x).to(self.max.device)
    return self.net(x.float()) * self.max

  # This function takes a state as input, adds (optionally clipped) Gaussian
  # noise with standard deviation `epsilon` to mu(x), clamps the result to the
  # action bounds and returns it as a NumPy array.
  def forward(self, x, epsilon=0.0, noise_clip=None):
    mu = self.mu(x)
    noise = torch.normal(0, epsilon, mu.size(), device=mu.device)
    if noise_clip is not None:
      noise = torch.clamp(noise, -noise_clip, noise_clip)
    mu = mu + noise
    action = torch.max(torch.min(mu, self.max), self.min)
    # Detached on purpose: the returned array is handed to the environment or
    # used as a fixed input, never differentiated through.
    action = action.detach().cpu().numpy()
    return action


In [None]:
### Create the Q-Network class

class DQN(nn.Module):
  """Critic (Q-network) that scores a state-action pair with a single value.

  Args:
    hidden_size: Width of every hidden layer.
    obs_size: Number of observation features.
    out_dims: Number of action dimensions (concatenated to the observation).
    num_layers: Number of hidden-to-hidden layers placed after the input
      layer (must be >= 1). Values 1, 2 and 3 build exactly the same
      architectures as before; larger values simply add more layers.

  Raises:
    ValueError: if num_layers is smaller than 1.
  """

  def __init__(self, hidden_size, obs_size, out_dims, num_layers):
    super().__init__()
    if num_layers < 1:
      raise ValueError(f"num_layers must be >= 1, got {num_layers}")

    # Input layer, `num_layers` hidden-to-hidden layers, then a linear head
    # producing one Q-value per state-action pair.
    layers = [nn.Linear(obs_size + out_dims, hidden_size), nn.ReLU()]
    for _ in range(num_layers):
      layers += [nn.Linear(hidden_size, hidden_size), nn.ReLU()]
    layers.append(nn.Linear(hidden_size, 1))
    self.net = nn.Sequential(*layers)

  # This function takes a state-action pair and returns a Q-value (predicted expected return)
  def forward(self, state, action):
    # NumPy inputs are moved to the device the network's weights live on, so
    # the module keeps working wherever it has been moved.
    target_device = next(self.parameters()).device
    if isinstance(state, np.ndarray):
      state = torch.from_numpy(state).to(target_device)
    if isinstance(action, np.ndarray):
      action = torch.from_numpy(action).to(target_device)
    in_vector = torch.hstack((state, action))
    return self.net(in_vector.float())


In [None]:
### Create the ReplayBuffer class, from which we make our buffer

class ReplayBuffer:
  """Fixed-capacity store of experiences backed by a deque.

  Once `capacity` items are held, appending a new experience drops the
  oldest one.
  """

  def __init__(self, capacity):
    # deque(maxlen=...) discards from the left when it is full
    self.buffer = deque(maxlen=capacity)

  def append(self, experience):
    """Add one experience to the right-hand end of the buffer."""
    self.buffer.append(experience)

  def sample(self, batch_size):
    """Return `batch_size` distinct experiences chosen at random."""
    picked = random.sample(self.buffer, batch_size)
    return picked

  def __len__(self):
    """Number of experiences currently stored."""
    stored = self.buffer
    return len(stored)

In [None]:
### Create the RLDataset class, which is used to create a dataset object out of samples from the buffer to be used as input for the training step

class RLDataset(IterableDataset):
  """Iterable dataset that serves a fresh random sample of the buffer.

  Args:
    buffer: Object exposing `sample(n)`, which returns n experiences.
    sample_size: How many experiences to draw each time the dataset is
      iterated.
  """

  def __init__(self, buffer, sample_size=200):
    self.sample_size = sample_size
    self.buffer = buffer

  def __iter__(self):
    """Draw `sample_size` experiences from the buffer and yield them one by one."""
    drawn = self.buffer.sample(self.sample_size)
    yield from drawn


In [None]:
### Create a function to incrementally move the weights of the target network closer to those of the main network

def polyak_average(net, target_net, tau=0.01):
    """Blend the weights of `net` into `target_net` in place.

    Every parameter of `target_net` becomes
    tau * net_param + (1 - tau) * target_param; `net` is left untouched.
    Parameters are paired in the order `parameters()` yields them.
    """
    for source, target in zip(net.parameters(), target_net.parameters()):
        blended = tau * source.data + (1 - tau) * target.data
        target.data.copy_(blended)

In [None]:
### Create a function which wraps the default environment from gym in layers of additional functionality

def create_environment(name, max_steps):
  """Build the gym environment `name` with the wrappers used for training.

  The raw environment is wrapped, innermost first, in:
    * TimeLimit, with max_episode_steps=max_steps
    * RecordEpisodeStatistics

  Video recording is currently switched off; to re-enable it, wrap the
  time-limited environment before the statistics wrapper with:
    RecordVideo(env, video_folder='./videos', episode_trigger=lambda x: x % 50 == 0)
  """
  time_limited = TimeLimit(gym.make(name), max_episode_steps=max_steps)
  return RecordEpisodeStatistics(time_limited)

In [None]:
### This class enacts the TD3 algorithm

class TD3(LightningModule):
  """TD3 agent (twin critics, target networks, delayed actor updates).

  The module owns its environment, replay buffer, actor, two critics and
  their target copies. One training epoch draws `samples_per_epoch`
  experiences from the buffer, then plays one episode with the current actor.

  Args:
    env_name: Name passed to create_environment().
    capacity: Maximum number of experiences kept in the replay buffer.
    batch_size: Batch size of the training DataLoader.
    actor_lr: Learning rate of the actor optimizer.
    critic_lr: Learning rate of the (shared) critic optimizer.
    hidden_size: Hidden layer width of actor and critics.
    gamma: Discount factor.
    loss_fn: Loss applied between predicted and target Q-values.
    optim: Optimizer class used for both optimizers.
    eps_start: Noise scale used at the start of training.
    eps_end: Lower bound of the noise scale.
    eps_last_episode: Number of epochs over which the noise scale decays.
    samples_per_epoch: Experiences sampled per epoch; also the number of
      experiences collected before training starts.
    tau: Polyak averaging coefficient for the target networks.
    epsilon: Initial noise scale stored on the module.
    max_steps: Maximum number of steps per episode.
    num_layers: Depth argument forwarded to Actor and DQN.
  """

  def __init__(self, env_name, capacity=50_000, batch_size=100,
               actor_lr=1e-3, critic_lr=1e-3, hidden_size=128, gamma=0.995,
               loss_fn=F.smooth_l1_loss, optim=AdamW, eps_start=1.0, eps_end=0.1,
               eps_last_episode=100, samples_per_epoch=1_000, tau=0.004, epsilon=1, max_steps=400, num_layers=1):

    super().__init__()

    self.env = create_environment(env_name, max_steps)
    obs_size = self.env.observation_space.shape[0]
    action_dims = self.env.action_space.shape[0]
    max_action = self.env.action_space.high
    min_action = self.env.action_space.low

    self.q_net1 = DQN(hidden_size, obs_size, action_dims, num_layers)
    self.q_net2 = DQN(hidden_size, obs_size, action_dims, num_layers)
    self.actor = Actor(hidden_size, obs_size, action_dims, min_action, max_action, num_layers)

    self.target_q_net1 = copy.deepcopy(self.q_net1)
    self.target_q_net2 = copy.deepcopy(self.q_net2)
    self.target_actor = copy.deepcopy(self.actor)

    self.epsilon = epsilon

    self.buffer = ReplayBuffer(capacity=capacity)

    # Return of the training episode played at the end of every epoch
    self.ep_returns = []

    # Best training-episode return seen so far. It starts at -inf so that the
    # first episode always becomes the initial best, even if its return is negative.
    self.max_return = float('-inf')
    self.best_actor = copy.deepcopy(self.actor)

    self.save_hyperparameters()

    self.hp_results = None

    # Fill the buffer with random-action experience until it holds at least
    # self.hparams.samples_per_epoch experiences (1000 by default)
    while len(self.buffer) < self.hparams.samples_per_epoch:
      print(f"{len(self.buffer)} samples in experience buffer. Filling...")
      self.play_episode(epsilon=self.hparams.eps_start)

  # This function plays a single full episode and stores all of the experience in the buffer.
  # Without an actor, actions are sampled uniformly from the action space.
  @torch.no_grad()
  def play_episode(self, actor=None, epsilon=0.):
    state = self.env.reset()
    done = False

    while not done:
      # Explicit None check: whether a policy was supplied should not depend
      # on the truthiness of the module object.
      if actor is not None:
        action = actor(state, epsilon=epsilon)
      else:
        action = self.env.action_space.sample()
      next_state, reward, done, info = self.env.step(action)
      exp = (state, action, reward, done, next_state)
      self.buffer.append(exp)
      state = next_state


  # This function returns the actor's action for the given state
  # (the actor's default epsilon of 0.0 is used, i.e. zero-scale noise)
  def forward(self, x):
    output = self.actor(x)
    return output

  # This function configures the optimizers before the process begins:
  # index 0 updates both critics together, index 1 updates the actor
  def configure_optimizers(self):
    q_net_params = itertools.chain(self.q_net1.parameters(), self.q_net2.parameters())
    q_net_optimizer = self.hparams.optim(q_net_params, lr=self.hparams.critic_lr)
    actor_optimizer = self.hparams.optim(self.actor.parameters(), lr=self.hparams.actor_lr)
    return [q_net_optimizer, actor_optimizer]

  # This function creates the dataloader, which feeds experience to the training_step() function.
  def train_dataloader(self):
    dataset = RLDataset(self.buffer, self.hparams.samples_per_epoch)
    dataloader = DataLoader(
        dataset=dataset,
        batch_size = self.hparams.batch_size
    )
    return dataloader

  # Training step to update the neural networks
  def training_step(self, batch, batch_idx, optimizer_idx):

    states, actions, rewards, dones, next_states = batch
    rewards = rewards.unsqueeze(1)
    dones = dones.unsqueeze(1)

    # Update the target networks
    # NOTE(review): this runs on every training_step call, which presumably
    # means once per optimizer for each batch -- confirm that is intended.
    polyak_average(self.q_net1, self.target_q_net1, tau=self.hparams.tau)
    polyak_average(self.q_net2, self.target_q_net2, tau=self.hparams.tau)
    polyak_average(self.actor, self.target_actor, tau=self.hparams.tau)

    # If the q_net optimizer is selected, update the Q-network
    if optimizer_idx == 0:
      state_action_values1 = self.q_net1(states, actions)
      state_action_values2 = self.q_net2(states, actions)

      # The TD target is a fixed regression target, so it is built without
      # autograd; this avoids backpropagating into the target critics, whose
      # parameters are not part of any optimizer.
      with torch.no_grad():
        next_actions = self.target_actor(next_states, epsilon=self.epsilon, noise_clip=0.05)
        # Select the lowest next state/action value
        next_state_action_values = torch.min(
            self.target_q_net1(next_states, next_actions),
            self.target_q_net2(next_states, next_actions)
        )
        # Set all terminal states to have a value of 0
        next_state_action_values[dones] = 0.0
        expected_state_action_values = rewards + self.hparams.gamma * next_state_action_values

      q_loss1 = self.hparams.loss_fn(state_action_values1, expected_state_action_values)
      q_loss2 = self.hparams.loss_fn(state_action_values2, expected_state_action_values)
      total_loss = q_loss1 + q_loss2
      self.log_dict({"episode/Q-Loss": total_loss})
      return total_loss

    # If the actor optimizer is selected, update the Actor network
    # (only on even batches; on odd batches nothing is returned)
    elif optimizer_idx == 1 and batch_idx % 2 == 0:
      mu = self.actor.mu(states)
      actor_loss = - self.q_net1(states, mu).mean()
      self.log_dict({"episode/Actor Loss": actor_loss})
      return actor_loss


  # This function is called at the end of each epoch
  def training_epoch_end(self, training_step_outputs):

    # Decrease epsilon value as time goes on to reduce exploration and increase reward
    self.epsilon = max(
        self.hparams.eps_end,
        self.hparams.eps_start - self.current_epoch / self.hparams.eps_last_episode
    )

    # An episode is played out in full at the end of each epoch, therefore one epoch lasts as long as one episode
    self.play_episode(actor=self.actor, epsilon=self.epsilon)
    # Read the return once, right away: calculate_hp_metric() below plays more
    # episodes in the same environment and appends to the same return_queue.
    episode_return = self.env.return_queue[-1]
    self.log('episode/Return', episode_return)

    # If this is the best return so far, remember both the return and the actor
    if episode_return > self.max_return:
      self.max_return = episode_return
      self.best_actor = copy.deepcopy(self.actor)

    # Every 100 episodes calculate and log the hp metric
    if len(self.ep_returns)%100 == 0:
      hp_met = self.calculate_hp_metric()
      self.log('hp_metric', hp_met)

    self.ep_returns.append(episode_return)


  # This function calculates the hp metric used by Optuna to select the next parameters
  # This metric is the average reward of 10 episodes (tests), and is calculated every 100 episodes of training
  # Note: the evaluation experiences are also stored in the replay buffer.
  @torch.no_grad()
  def calculate_hp_metric(self):
    num_tests = 10
    ep_returns = 0
    for test in range(num_tests):
      state = self.env.reset()
      done = False
      while not done:
        action = self.actor(state, epsilon=0)
        next_state, reward, done, info = self.env.step(action)
        exp = (state, action, reward, done, next_state)
        self.buffer.append(exp)
        state = next_state
        ep_returns += reward
    return (ep_returns / num_tests)
    


In [None]:
### Setting up the tensorboard to display the results

# Remove the log and video directories left over from earlier runs
!rm -r /content/lightning_logs/
!rm -r /content/videos/
# Load the TensorBoard notebook extension and point it at the Lightning log directory
%load_ext tensorboard
%tensorboard --logdir /content/lightning_logs/

In [None]:
### Create the objective function for Optuna

def objective(trial):
  """Optuna objective: train one TD3 agent and return its last 'hp_metric'.

  Two hyperparameters are searched: `hidden_size` (integer in 50..500) and
  `num_layers` (integer in 1..3). The per-episode returns collected during
  training are written to a CSV file named after the sampled values.
  """
  # Sample the search space (hidden_size first, then num_layers)
  hidden_size = trial.suggest_int("hidden_size", 50, 500)
  num_layers = trial.suggest_int("num_layers", 1, 3)

  algo = TD3('BipedalWalker-v3', hidden_size=hidden_size, num_layers=num_layers)

  # The pruning callback reports 'hp_metric' to Optuna so that unpromising
  # trials can be stopped early
  pruning_callback = PyTorchLightningPruningCallback(trial, monitor='hp_metric')
  trainer = Trainer(
      gpus=num_gpus,
      max_epochs=3000,
      callbacks=[pruning_callback]
  )

  # Send the sampled values to tensorboard, then train the algorithm
  trainer.logger.log_hyperparams(dict(hidden_size=hidden_size, num_layers=num_layers))
  trainer.fit(algo)

  # Store the episode returns of this trial as a single-column CSV
  returns_frame = pd.DataFrame(algo.ep_returns)
  returns_frame.columns = [f'It [{hidden_size}, {num_layers}]']
  returns_frame.to_csv(f'TD3_Optuna_results_[{algo.hparams.hidden_size}, {algo.hparams.num_layers}].csv')

  final_metric = trainer.callback_metrics['hp_metric']
  return final_metric.item()


In [None]:
### Create the study for Optuna

# The pruner stops trials early if they are not very promising
pruner = optuna.pruners.SuccessiveHalvingPruner()
# Tell optuna to maximise the value returned by the objective function (ie the last 'hp_metric')
study = optuna.create_study(direction='maximize', pruner=pruner)

[32m[I 2022-07-17 11:03:27,130][0m A new study created in memory with name: no-name-dc5a0e63-9632-485a-8512-32599bfac631[0m


In [None]:
### Running the hyperparameter search

# n_trials is the number of trials (calls to the objective function) run in the search
study.optimize(objective, n_trials=50)

0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
527 samples in experience buffer. Filling...
598 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


998 samples in experience buffer. Filling...



  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 210 K 
1 | q_net2        | DQN   | 210 K 
2 | actor         | Actor | 210 K 
3 | target_q_net1 | DQN   | 210 K 
4 | target_q_net2 | DQN   | 210 K 
5 | target_actor  | Actor | 210 K 
6 | best_actor    | Actor | 210 K 
----------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
5.900     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 11:41:10,779][0m Trial 0 finished with value: 110.34634921415697 and parameters: {'hidden_size': 444, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
54 samples in experience buffer. Filling...
454 samples in experience buffer. Filling...
514 samples in experience buffer. Filling...
609 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 9.5 K 
1 | q_net2        | DQN   | 9.5 K 
2 | actor         | Actor | 9.4 K 
3 | target_q_net1 | DQN   | 9.5 K 
4 | target_q_net2 | DQN   | 9.5 K 
5 | target_actor  | Actor | 9.4 K 
6 | best_actor    | Actor | 9.4 K 
----------------------------------------
66.0 K    Trainable params
0         Non-trainable params
66.0 K    Total params
0.264     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 12:00:32,985][0m Trial 1 finished with value: -84.72759615056168 and parameters: {'hidden_size': 83, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
101 samples in experience buffer. Filling...
501 samples in experience buffer. Filling...
607 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 79.1 K
1 | q_net2        | DQN   | 79.1 K
2 | actor         | Actor | 79.0 K
3 | target_q_net1 | DQN   | 79.1 K
4 | target_q_net2 | DQN   | 79.1 K
5 | target_actor  | Actor | 79.0 K
6 | best_actor    | Actor | 79.0 K
----------------------------------------
553 K     Trainable params
0         Non-trainable params
553 K     Total params
2.214     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 12:28:13,676][0m Trial 2 finished with value: -52.35985683580975 and parameters: {'hidden_size': 157, 'num_layers': 3}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
481 samples in experience buffer. Filling...
592 samples in experience buffer. Filling...
744 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 76.7 K
1 | q_net2        | DQN   | 76.7 K
2 | actor         | Actor | 76.5 K
3 | target_q_net1 | DQN   | 76.7 K
4 | target_q_net2 | DQN   | 76.7 K
5 | target_actor  | Actor | 76.5 K
6 | best_actor    | Actor | 76.5 K
----------------------------------------
536 K     Trainable params
0         Non-trainable params
536 K     Total params
2.146     Total estimated model params size (MB)


805 samples in experience buffer. Filling...
880 samples in experience buffer. Filling...
964 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 12:52:17,592][0m Trial 3 finished with value: -27.06652229148104 and parameters: {'hidden_size': 188, 'num_layers': 2}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
81 samples in experience buffer. Filling...
240 samples in experience buffer. Filling...
307 samples in experience buffer. Filling...
367 samples in experience buffer. Filling...
767 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 215 K 
1 | q_net2        | DQN   | 215 K 
2 | actor         | Actor | 214 K 
3 | target_q_net1 | DQN   | 215 K 
4 | target_q_net2 | DQN   | 215 K 
5 | target_actor  | Actor | 214 K 
6 | best_actor    | Actor | 214 K 
----------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
6.017     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 13:31:23,983][0m Trial 4 finished with value: 52.35060151799752 and parameters: {'hidden_size': 320, 'num_layers': 2}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 566 K 
1 | q_net2        | DQN   | 566 K 
2 | actor         | Actor | 565 K 
3 | target_q_net1 | DQN   | 566 K 
4 | target_q_net2 | DQN   | 566 K 
5 | target_actor  | Actor | 565 K 
6 | best_actor    | Actor | 565 K 
----------------------------------------
4.0 M     Trainable params
0         Non-trainable params
4.0 M     Total params
15.851    Total estimated model params size (MB)


800 samples in experience buffer. Filling...
872 samples in experience buffer. Filling...
956 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 14:43:07,241][0m Trial 5 finished with value: 63.4644715462009 and parameters: {'hidden_size': 429, 'num_layers': 3}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
85 samples in experience buffer. Filling...
147 samples in experience buffer. Filling...
225 samples in experience buffer. Filling...
291 samples in experience buffer. Filling...
691 samples in experience buffer. Filling...
763 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 88.9 K
1 | q_net2        | DQN   | 88.9 K
2 | actor         | Actor | 88.7 K
3 | target_q_net1 | DQN   | 88.9 K
4 | target_q_net2 | DQN   | 88.9 K
5 | target_actor  | Actor | 88.7 K
6 | best_actor    | Actor | 88.7 K
----------------------------------------
621 K     Trainable params
0         Non-trainable params
621 K     Total params
2.487     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 15:09:32,118][0m Trial 6 finished with value: 26.983726389229645 and parameters: {'hidden_size': 203, 'num_layers': 2}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 21.2 K
1 | q_net2        | DQN   | 21.2 K
2 | actor         | Actor | 21.1 K
3 | target_q_net1 | DQN   | 21.2 K
4 | target_q_net2 | DQN   | 21.2 K
5 | target_actor  | Actor | 21.1 K
6 | best_actor    | Actor | 21.1 K
----------------------------------------
148 K     Trainable params
0         Non-trainable params
148 K     Total params
0.593     Total estimated model params size (MB)


800 samples in experience buffer. Filling...
863 samples in experience buffer. Filling...
929 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 15:30:32,509][0m Trial 7 finished with value: 63.748804105309354 and parameters: {'hidden_size': 131, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
488 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 35.3 K
1 | q_net2        | DQN   | 35.3 K
2 | actor         | Actor | 35.1 K
3 | target_q_net1 | DQN   | 35.3 K
4 | target_q_net2 | DQN   | 35.3 K
5 | target_actor  | Actor | 35.1 K
6 | best_actor    | Actor | 35.1 K
----------------------------------------
246 K     Trainable params
0         Non-trainable params
246 K     Total params
0.986     Total estimated model params size (MB)


888 samples in experience buffer. Filling...
960 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 15:50:47,371][0m Trial 8 finished with value: -14.256235964359558 and parameters: {'hidden_size': 125, 'num_layers': 2}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
494 samples in experience buffer. Filling...
894 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 42.5 K
1 | q_net2        | DQN   | 42.5 K
2 | actor         | Actor | 42.4 K
3 | target_q_net1 | DQN   | 42.5 K
4 | target_q_net2 | DQN   | 42.5 K
5 | target_actor  | Actor | 42.4 K
6 | best_actor    | Actor | 42.4 K
----------------------------------------
297 K     Trainable params
0         Non-trainable params
297 K     Total params
1.189     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 16:14:38,449][0m Trial 9 finished with value: 2.0823716410726267 and parameters: {'hidden_size': 138, 'num_layers': 2}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
800 samples in experience buffer. Filling...
861 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 247 K 
1 | q_net2        | DQN   | 247 K 
2 | actor         | Actor | 246 K 
3 | target_q_net1 | DQN   | 247 K 
4 | target_q_net2 | DQN   | 247 K 
5 | target_actor  | Actor | 246 K 
6 | best_actor    | Actor | 246 K 
----------------------------------------
1.7 M     Trainable params
0         Non-trainable params
1.7 M     Total params
6.918     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 16:51:21,941][0m Trial 10 finished with value: -44.36515900898081 and parameters: {'hidden_size': 482, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
62 samples in experience buffer. Filling...
462 samples in experience buffer. Filling...
583 samples in experience buffer. Filling...
666 samples in experience buffer. Filling...
748 samples in experience buffer. Filling...
806 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 139 K 
1 | q_net2        | DQN   | 139 K 
2 | actor         | Actor | 138 K 
3 | target_q_net1 | DQN   | 139 K 
4 | target_q_net2 | DQN   | 139 K 
5 | target_actor  | Actor | 138 K 
6 | best_actor    | Actor | 138 K 
----------------------------------------
973 K     Trainable params
0         Non-trainable params
973 K     Total params
3.895     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 17:20:41,447][0m Trial 11 finished with value: 21.211878871531514 and parameters: {'hidden_size': 358, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
485 samples in experience buffer. Filling...
575 samples in experience buffer. Filling...
646 samples in experience buffer. Filling...
734 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 70.3 K
1 | q_net2        | DQN   | 70.3 K
2 | actor         | Actor | 70.0 K
3 | target_q_net1 | DQN   | 70.3 K
4 | target_q_net2 | DQN   | 70.3 K
5 | target_actor  | Actor | 70.0 K
6 | best_actor    | Actor | 70.0 K
----------------------------------------
491 K     Trainable params
0         Non-trainable params
491 K     Total params
1.964     Total estimated model params size (MB)


818 samples in experience buffer. Filling...
933 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 17:43:47,341][0m Trial 12 finished with value: 101.78839511620184 and parameters: {'hidden_size': 250, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
502 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 75.1 K
1 | q_net2        | DQN   | 75.1 K
2 | actor         | Actor | 74.9 K
3 | target_q_net1 | DQN   | 75.1 K
4 | target_q_net2 | DQN   | 75.1 K
5 | target_actor  | Actor | 74.9 K
6 | best_actor    | Actor | 74.9 K
----------------------------------------
525 K     Trainable params
0         Non-trainable params
525 K     Total params
2.100     Total estimated model params size (MB)


902 samples in experience buffer. Filling...
965 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 18:09:34,782][0m Trial 13 finished with value: 80.3119496451433 and parameters: {'hidden_size': 259, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
51 samples in experience buffer. Filling...
131 samples in experience buffer. Filling...
531 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 150 K 
1 | q_net2        | DQN   | 150 K 
2 | actor         | Actor | 150 K 
3 | target_q_net1 | DQN   | 150 K 
4 | target_q_net2 | DQN   | 150 K 
5 | target_actor  | Actor | 150 K 
6 | best_actor    | Actor | 150 K 
----------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.215     Total estimated model params size (MB)


931 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 18:42:19,711][0m Trial 14 finished with value: 66.6623923383556 and parameters: {'hidden_size': 373, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
477 samples in experience buffer. Filling...
584 samples in experience buffer. Filling...
687 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 76.2 K
1 | q_net2        | DQN   | 76.2 K
2 | actor         | Actor | 76.0 K
3 | target_q_net1 | DQN   | 76.2 K
4 | target_q_net2 | DQN   | 76.2 K
5 | target_actor  | Actor | 76.0 K
6 | best_actor    | Actor | 76.0 K
----------------------------------------
532 K     Trainable params
0         Non-trainable params
532 K     Total params
2.131     Total estimated model params size (MB)


764 samples in experience buffer. Filling...
816 samples in experience buffer. Filling...
870 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 19:07:49,320][0m Trial 15 finished with value: 53.10935184500173 and parameters: {'hidden_size': 261, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
59 samples in experience buffer. Filling...
142 samples in experience buffer. Filling...
542 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 241 K 
1 | q_net2        | DQN   | 241 K 
2 | actor         | Actor | 240 K 
3 | target_q_net1 | DQN   | 241 K 
4 | target_q_net2 | DQN   | 241 K 
5 | target_actor  | Actor | 240 K 
6 | best_actor    | Actor | 240 K 
----------------------------------------
1.7 M     Trainable params
0         Non-trainable params
1.7 M     Total params
6.752     Total estimated model params size (MB)


942 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 19:48:42,178][0m Trial 16 finished with value: 17.203119261961973 and parameters: {'hidden_size': 476, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
504 samples in experience buffer. Filling...
904 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 502 K 
1 | q_net2        | DQN   | 502 K 
2 | actor         | Actor | 502 K 
3 | target_q_net1 | DQN   | 502 K 
4 | target_q_net2 | DQN   | 502 K 
5 | target_actor  | Actor | 502 K 
6 | best_actor    | Actor | 502 K 
----------------------------------------
3.5 M     Trainable params
0         Non-trainable params
3.5 M     Total params
14.079    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 20:53:16,099][0m Trial 17 finished with value: -39.488340001166904 and parameters: {'hidden_size': 404, 'num_layers': 3}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
99 samples in experience buffer. Filling...
161 samples in experience buffer. Filling...
561 samples in experience buffer. Filling...
647 samples in experience buffer. Filling...
728 samples in experience buffer. Filling...
784 samples in experience buffer. Filling...
843 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 119 K 
1 | q_net2        | DQN   | 119 K 
2 | actor         | Actor | 119 K 
3 | target_q_net1 | DQN   | 119 K 
4 | target_q_net2 | DQN   | 119 K 
5 | target_actor  | Actor | 119 K 
6 | best_actor    | Actor | 119 K 
----------------------------------------
837 K     Trainable params
0         Non-trainable params
837 K     Total params
3.351     Total estimated model params size (MB)


924 samples in experience buffer. Filling...
987 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 21:21:11,462][0m Trial 18 finished with value: -79.86496478970585 and parameters: {'hidden_size': 331, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
491 samples in experience buffer. Filling...
556 samples in experience buffer. Filling...
622 samples in experience buffer. Filling...
716 samples in experience buffer. Filling...
814 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 187 K 
1 | q_net2        | DQN   | 187 K 
2 | actor         | Actor | 186 K 
3 | target_q_net1 | DQN   | 187 K 
4 | target_q_net2 | DQN   | 187 K 
5 | target_actor  | Actor | 186 K 
6 | best_actor    | Actor | 186 K 
----------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.237     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 21:55:42,788][0m Trial 19 finished with value: -46.57269696224477 and parameters: {'hidden_size': 298, 'num_layers': 2}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
70 samples in experience buffer. Filling...
186 samples in experience buffer. Filling...
286 samples in experience buffer. Filling...
686 samples in experience buffer. Filling...
765 samples in experience buffer. Filling...
832 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 51.5 K
1 | q_net2        | DQN   | 51.5 K
2 | actor         | Actor | 51.3 K
3 | target_q_net1 | DQN   | 51.5 K
4 | target_q_net2 | DQN   | 51.5 K
5 | target_actor  | Actor | 51.3 K
6 | best_actor    | Actor | 51.3 K
----------------------------------------
359 K     Trainable params
0         Non-trainable params
359 K     Total params
1.440     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 22:16:54,285][0m Trial 20 finished with value: 73.02228948050616 and parameters: {'hidden_size': 212, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
476 samples in experience buffer. Filling...
876 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 75.7 K
1 | q_net2        | DQN   | 75.7 K
2 | actor         | Actor | 75.4 K
3 | target_q_net1 | DQN   | 75.7 K
4 | target_q_net2 | DQN   | 75.7 K
5 | target_actor  | Actor | 75.4 K
6 | best_actor    | Actor | 75.4 K
----------------------------------------
528 K     Trainable params
0         Non-trainable params
528 K     Total params
2.115     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 22:39:20,064][0m Trial 21 finished with value: -50.423687361367364 and parameters: {'hidden_size': 260, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
70 samples in experience buffer. Filling...
470 samples in experience buffer. Filling...
870 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 64.0 K
1 | q_net2        | DQN   | 64.0 K
2 | actor         | Actor | 63.8 K
3 | target_q_net1 | DQN   | 64.0 K
4 | target_q_net2 | DQN   | 64.0 K
5 | target_actor  | Actor | 63.8 K
6 | best_actor    | Actor | 63.8 K
----------------------------------------
447 K     Trainable params
0         Non-trainable params
447 K     Total params
1.790     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 23:02:51,554][0m Trial 22 finished with value: -81.17211778016551 and parameters: {'hidden_size': 238, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
112 samples in experience buffer. Filling...
186 samples in experience buffer. Filling...
256 samples in experience buffer. Filling...
307 samples in experience buffer. Filling...
369 samples in experience buffer. Filling...
427 samples in experience buffer. Filling...
509 samples in experience buffer. Filling...
608 samples in experience buffer. Filling...
691 samples in experience buffer. Filling...
791 samples in experience buffer. Filling...
903 samples in experience buffer. Filling...
952 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 102 K 
1 | q_net2        | DQN   | 102 K 
2 | actor         | Actor | 102 K 
3 | target_q_net1 | DQN   | 102 K 
4 | target_q_net2 | DQN   | 102 K 
5 | target_actor  | Actor | 102 K 
6 | best_actor    | Actor | 102 K 
----------------------------------------
716 K     Trainable params
0         Non-trainable params
716 K     Total params
2.866     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-17 23:28:37,294][0m Trial 23 finished with value: -106.20008635200293 and parameters: {'hidden_size': 305, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
498 samples in experience buffer. Filling...
898 samples in experience buffer. Filling...
968 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 189 K 
1 | q_net2        | DQN   | 189 K 
2 | actor         | Actor | 189 K 
3 | target_q_net1 | DQN   | 189 K 
4 | target_q_net2 | DQN   | 189 K 
5 | target_actor  | Actor | 189 K 
6 | best_actor    | Actor | 189 K 
----------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.299     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 00:01:47,827][0m Trial 24 finished with value: -116.15150977600618 and parameters: {'hidden_size': 420, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
59 samples in experience buffer. Filling...
459 samples in experience buffer. Filling...
592 samples in experience buffer. Filling...
638 samples in experience buffer. Filling...
702 samples in experience buffer. Filling...
780 samples in experience buffer. Filling...
868 samples in experience buffer. Filling...
951 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 120 K 
1 | q_net2        | DQN   | 120 K 
2 | actor         | Actor | 120 K 
3 | target_q_net1 | DQN   | 120 K 
4 | target_q_net2 | DQN   | 120 K 
5 | target_actor  | Actor | 120 K 
6 | best_actor    | Actor | 120 K 
----------------------------------------
845 K     Trainable params
0         Non-trainable params
845 K     Total params
3.383     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 00:30:39,985][0m Trial 25 finished with value: -40.93612259332531 and parameters: {'hidden_size': 238, 'num_layers': 2}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
800 samples in experience buffer. Filling...
882 samples in experience buffer. Filling...
934 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 89.5 K
1 | q_net2        | DQN   | 89.5 K
2 | actor         | Actor | 89.2 K
3 | target_q_net1 | DQN   | 89.5 K
4 | target_q_net2 | DQN   | 89.5 K
5 | target_actor  | Actor | 89.2 K
6 | best_actor    | Actor | 89.2 K
----------------------------------------
625 K     Trainable params
0         Non-trainable params
625 K     Total params
2.502     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 00:57:42,365][0m Trial 26 finished with value: -4.400307548061615 and parameters: {'hidden_size': 284, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
107 samples in experience buffer. Filling...
166 samples in experience buffer. Filling...
218 samples in experience buffer. Filling...
291 samples in experience buffer. Filling...
425 samples in experience buffer. Filling...
507 samples in experience buffer. Filling...
578 samples in experience buffer. Filling...
647 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 131 K 
1 | q_net2        | DQN   | 131 K 
2 | actor         | Actor | 131 K 
3 | target_q_net1 | DQN   | 131 K 
4 | target_q_net2 | DQN   | 131 K 
5 | target_actor  | Actor | 131 K 
6 | best_actor    | Actor | 131 K 
----------------------------------------
922 K     Trainable params
0         Non-trainable params
922 K     Total params
3.689     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 01:27:48,229][0m Trial 27 finished with value: -91.77899850787375 and parameters: {'hidden_size': 348, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
88 samples in experience buffer. Filling...
164 samples in experience buffer. Filling...
564 samples in experience buffer. Filling...
617 samples in experience buffer. Filling...
664 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 9.1 K 
1 | q_net2        | DQN   | 9.1 K 
2 | actor         | Actor | 9.1 K 
3 | target_q_net1 | DQN   | 9.1 K 
4 | target_q_net2 | DQN   | 9.1 K 
5 | target_actor  | Actor | 9.1 K 
6 | best_actor    | Actor | 9.1 K 
----------------------------------------
63.7 K    Trainable params
0         Non-trainable params
63.7 K    Total params
0.255     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 01:48:17,553][0m Trial 28 finished with value: -40.21808911555293 and parameters: {'hidden_size': 60, 'num_layers': 2}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
473 samples in experience buffer. Filling...
543 samples in experience buffer. Filling...
943 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 36.1 K
1 | q_net2        | DQN   | 36.1 K
2 | actor         | Actor | 35.9 K
3 | target_q_net1 | DQN   | 36.1 K
4 | target_q_net2 | DQN   | 36.1 K
5 | target_actor  | Actor | 35.9 K
6 | best_actor    | Actor | 35.9 K
----------------------------------------
251 K     Trainable params
0         Non-trainable params
251 K     Total params
1.007     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 02:08:56,483][0m Trial 29 finished with value: 78.25854496167017 and parameters: {'hidden_size': 175, 'num_layers': 1}. Best is trial 0 with value: 110.34634921415697.[0m


0 samples in experience buffer. Filling...
95 samples in experience buffer. Filling...
495 samples in experience buffer. Filling...
895 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 167 K 
1 | q_net2        | DQN   | 167 K 
2 | actor         | Actor | 167 K 
3 | target_q_net1 | DQN   | 167 K 
4 | target_q_net2 | DQN   | 167 K 
5 | target_actor  | Actor | 167 K 
6 | best_actor    | Actor | 167 K 
----------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.684     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 02:42:52,482][0m Trial 30 finished with value: 130.7904513843903 and parameters: {'hidden_size': 394, 'num_layers': 1}. Best is trial 30 with value: 130.7904513843903.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
464 samples in experience buffer. Filling...
542 samples in experience buffer. Filling...
942 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 164 K 
1 | q_net2        | DQN   | 164 K 
2 | actor         | Actor | 163 K 
3 | target_q_net1 | DQN   | 164 K 
4 | target_q_net2 | DQN   | 164 K 
5 | target_actor  | Actor | 163 K 
6 | best_actor    | Actor | 163 K 
----------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.593     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 03:15:43,946][0m Trial 31 finished with value: -116.39164660742134 and parameters: {'hidden_size': 390, 'num_layers': 1}. Best is trial 30 with value: 130.7904513843903.[0m


0 samples in experience buffer. Filling...
75 samples in experience buffer. Filling...
162 samples in experience buffer. Filling...
241 samples in experience buffer. Filling...
294 samples in experience buffer. Filling...
694 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 226 K 
1 | q_net2        | DQN   | 226 K 
2 | actor         | Actor | 226 K 
3 | target_q_net1 | DQN   | 226 K 
4 | target_q_net2 | DQN   | 226 K 
5 | target_actor  | Actor | 226 K 
6 | best_actor    | Actor | 226 K 
----------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.345     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 03:57:27,717][0m Trial 32 finished with value: -6.826511742327689 and parameters: {'hidden_size': 461, 'num_layers': 1}. Best is trial 30 with value: 130.7904513843903.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
465 samples in experience buffer. Filling...
865 samples in experience buffer. Filling...
925 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 212 K 
1 | q_net2        | DQN   | 212 K 
2 | actor         | Actor | 212 K 
3 | target_q_net1 | DQN   | 212 K 
4 | target_q_net2 | DQN   | 212 K 
5 | target_actor  | Actor | 212 K 
6 | best_actor    | Actor | 212 K 
----------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
5.951     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 04:31:56,882][0m Trial 33 finished with value: -15.637990998413613 and parameters: {'hidden_size': 446, 'num_layers': 1}. Best is trial 30 with value: 130.7904513843903.[0m


0 samples in experience buffer. Filling...
53 samples in experience buffer. Filling...
453 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 264 K 
1 | q_net2        | DQN   | 264 K 
2 | actor         | Actor | 263 K 
3 | target_q_net1 | DQN   | 264 K 
4 | target_q_net2 | DQN   | 264 K 
5 | target_actor  | Actor | 263 K 
6 | best_actor    | Actor | 263 K 
----------------------------------------
1.8 M     Trainable params
0         Non-trainable params
1.8 M     Total params
7.399     Total estimated model params size (MB)


853 samples in experience buffer. Filling...
932 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 05:15:26,513][0m Trial 34 finished with value: 105.95789769063317 and parameters: {'hidden_size': 499, 'num_layers': 1}. Best is trial 30 with value: 130.7904513843903.[0m


0 samples in experience buffer. Filling...
58 samples in experience buffer. Filling...
139 samples in experience buffer. Filling...
216 samples in experience buffer. Filling...
309 samples in experience buffer. Filling...
361 samples in experience buffer. Filling...
421 samples in experience buffer. Filling...
504 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 513 K 
1 | q_net2        | DQN   | 513 K 
2 | actor         | Actor | 513 K 
3 | target_q_net1 | DQN   | 513 K 
4 | target_q_net2 | DQN   | 513 K 
5 | target_actor  | Actor | 513 K 
6 | best_actor    | Actor | 513 K 
----------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.385    Total estimated model params size (MB)


904 samples in experience buffer. Filling...
985 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 06:20:44,150][0m Trial 35 finished with value: -32.37369726429968 and parameters: {'hidden_size': 499, 'num_layers': 2}. Best is trial 30 with value: 130.7904513843903.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...
469 samples in experience buffer. Filling...
524 samples in experience buffer. Filling...
592 samples in experience buffer. Filling...
712 samples in experience buffer. Filling...
908 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 587 K 
1 | q_net2        | DQN   | 587 K 
2 | actor         | Actor | 586 K 
3 | target_q_net1 | DQN   | 587 K 
4 | target_q_net2 | DQN   | 587 K 
5 | target_actor  | Actor | 586 K 
6 | best_actor    | Actor | 586 K 
----------------------------------------
4.1 M     Trainable params
0         Non-trainable params
4.1 M     Total params
16.440    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 07:35:31,529][0m Trial 36 finished with value: -64.282964604733 and parameters: {'hidden_size': 437, 'num_layers': 3}. Best is trial 30 with value: 130.7904513843903.[0m


0 samples in experience buffer. Filling...
400 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 172 K 
1 | q_net2        | DQN   | 172 K 
2 | actor         | Actor | 172 K 
3 | target_q_net1 | DQN   | 172 K 
4 | target_q_net2 | DQN   | 172 K 
5 | target_actor  | Actor | 172 K 
6 | best_actor    | Actor | 172 K 
----------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.822     Total estimated model params size (MB)


800 samples in experience buffer. Filling...
896 samples in experience buffer. Filling...


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 08:12:01,575][0m Trial 37 finished with value: -22.22088970165048 and parameters: {'hidden_size': 400, 'num_layers': 1}. Best is trial 30 with value: 130.7904513843903.[0m


0 samples in experience buffer. Filling...
68 samples in experience buffer. Filling...
468 samples in experience buffer. Filling...
868 samples in experience buffer. Filling...
929 samples in experience buffer. Filling...
997 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 225 K 
1 | q_net2        | DQN   | 225 K 
2 | actor         | Actor | 225 K 
3 | target_q_net1 | DQN   | 225 K 
4 | target_q_net2 | DQN   | 225 K 
5 | target_actor  | Actor | 225 K 
6 | best_actor    | Actor | 225 K 
----------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.319     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 08:52:50,020][0m Trial 38 finished with value: 101.1280982088175 and parameters: {'hidden_size': 460, 'num_layers': 1}. Best is trial 30 with value: 130.7904513843903.[0m


0 samples in experience buffer. Filling...
120 samples in experience buffer. Filling...
207 samples in experience buffer. Filling...
268 samples in experience buffer. Filling...
349 samples in experience buffer. Filling...
749 samples in experience buffer. Filling...
828 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 366 K 
1 | q_net2        | DQN   | 366 K 
2 | actor         | Actor | 365 K 
3 | target_q_net1 | DQN   | 366 K 
4 | target_q_net2 | DQN   | 366 K 
5 | target_actor  | Actor | 365 K 
6 | best_actor    | Actor | 365 K 
----------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.250    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

[32m[I 2022-07-18 09:47:51,509][0m Trial 39 finished with value: 96.78961360647165 and parameters: {'hidden_size': 420, 'num_layers': 2}. Best is trial 30 with value: 130.7904513843903.[0m


0 samples in experience buffer. Filling...
54 samples in experience buffer. Filling...
132 samples in experience buffer. Filling...
212 samples in experience buffer. Filling...
612 samples in experience buffer. Filling...


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type  | Params
----------------------------------------
0 | q_net1        | DQN   | 513 K 
1 | q_net2        | DQN   | 513 K 
2 | actor         | Actor | 513 K 
3 | target_q_net1 | DQN   | 513 K 
4 | target_q_net2 | DQN   | 513 K 
5 | target_actor  | Actor | 513 K 
6 | best_actor    | Actor | 513 K 
----------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params
14.385    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [None]:
### Display the hyperparameters of the best trial found by the study
# NOTE(review): `study` is presumably the Optuna study whose trials are logged in the
# output above (searching over 'hidden_size' and 'num_layers') - confirm against the
# cell that creates it. Left as a bare expression so the notebook renders the result.
study.best_params