In [1]:
import argparse
import datetime
import os,json
import pprint

import numpy as np
import torch, random
from torch import nn
from torch.utils.tensorboard import SummaryWriter

import gymnasium as gym

from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer
from tianshou.policy import RainbowPolicy, TD3Policy, DQNPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.exploration import GaussianNoise
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import Actor, Critic
from tianshou.env import ShmemVectorEnv

In [2]:
seed = 2023
torch.manual_seed(seed)
np.random.seed(seed)

In [3]:
def make_env(seed, task, training_num, test_num):
    env = gym.make(task)
    train_envs = ShmemVectorEnv(
        [lambda: gym.make(task) for _ in range(training_num)]
    )
    test_envs = ShmemVectorEnv([lambda: gym.make(task) for _ in range(test_num)])
    # env.seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)
    
    return env, train_envs, test_envs

In [4]:
env, train_envs, test_envs = make_env(seed = 2023, task = "ALE/Breakout-v5", training_num = 1, test_num = 1)

In [5]:
class Net(nn.Module):
    def __init__(self, n_channels, filters, strides, obs_space, action_space):
        super().__init__()
        
        in_channel = obs_space.shape[-1]
        self.cnn = nn.Sequential(
            nn.Conv2d(in_channel, n_channels[0], filters[0], strides[0]),
            nn.Conv2d(n_channels[0], n_channels[1], filters[1], strides[1]),
            nn.Conv2d(n_channels[1], n_channels[2], filters[2], strides[2]),
        )
        
        with torch.no_grad():
            obs_sample = torch.tensor(obs_space.sample()).float()
            obs_sample = torch.permute(obs_sample, (2, 0, 1))
            output_dim = self.cnn(obs_sample).flatten().size(dim = 0)
            
        self.linear = nn.Sequential(
            # nn.Linear(output_dim, int(np.prod(action_space.shape))),
            nn.Linear(output_dim, action_space.n),
        )

    def forward(self, obs, state=None, info={}):
        if not isinstance(obs, torch.Tensor):
            obs = torch.tensor(obs, dtype=torch.float)
        batch = obs.shape[0]
        
        obs = torch.permute(obs, (0, 3, 1, 2))
        preprocessed = self.cnn(obs).flatten(1)
        logits = self.linear(preprocessed.view(batch, -1)).unsqueeze(dim = -1)

        return logits, state

In [6]:
device = 'cpu'
n_channels = [32, 64, 64]
strides = [4, 2, 1]
filters = [8, 4, 1]
lr = 1e-4
gamma = 0.99
n_step = 1
start_timesteps = 25000
#Collector
buffer_size = 100000

#Trainer
max_epoch = 200
step_per_epoch = 1000
step_per_collect = 1
test_num = 10
batch_size = 32
update_per_step = 1


In [7]:
# model
model = Net(n_channels, filters, strides, env.observation_space, env.action_space)
model_optim = torch.optim.Adam(model.parameters(), lr=lr)

policy = RainbowPolicy(
    model,
    model_optim,
    discount_factor=gamma,
    estimation_step=n_step,
)


In [8]:
# collector
buffer = ReplayBuffer(buffer_size)

train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
test_collector = Collector(policy, test_envs, exploration_noise = True)
train_collector.collect(n_step = start_timesteps, random=True)

{'n/ep': 133,
 'n/st': 25000,
 'rews': array([2., 4., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 2., 1., 2., 0., 1.,
        0., 0., 2., 1., 2., 3., 3., 1., 3., 2., 0., 0., 2., 3., 2., 3., 2.,
        1., 1., 0., 3., 1., 0., 1., 2., 2., 0., 3., 0., 0., 1., 0., 2., 0.,
        1., 1., 3., 0., 2., 0., 2., 0., 4., 0., 1., 2., 0., 1., 0., 0., 0.,
        0., 2., 5., 3., 0., 2., 3., 2., 0., 1., 2., 0., 2., 0., 4., 2., 1.,
        0., 0., 1., 1., 3., 2., 2., 1., 2., 1., 1., 3., 1., 3., 1., 2., 2.,
        0., 3., 2., 1., 1., 0., 0., 0., 0., 3., 1., 0., 0., 5., 1., 0., 1.,
        2., 4., 2., 1., 3., 0., 0., 0., 0., 2., 0., 1., 3., 1.]),
 'lens': array([209, 279, 133, 128, 193, 176, 177, 178, 154, 176, 142, 139, 197,
        189, 207, 135, 168, 134, 137, 190, 180, 203, 263, 239, 173, 247,
        217, 138, 130, 260, 273, 211, 253, 255, 172, 156, 125, 240, 165,
        132, 163, 201, 212, 128, 231, 138, 128, 183, 133, 212, 162, 179,
        189, 263, 133, 205, 135, 227, 136, 291, 136, 170, 212, 1

In [9]:
# log
now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
algo_name = "rainbow_baselines"
log_name = os.path.join("breakout", algo_name, str(seed), now)
log_path = os.path.join("log_results", log_name)

# logger
writer = SummaryWriter(log_path)
# writer.add_text("args", str(args))
logger = TensorboardLogger(writer)

def save_best_fn(policy):
    torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

In [10]:
result = offpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    max_epoch,
    step_per_epoch,
    step_per_collect,
    test_num,
    batch_size,
    save_best_fn=save_best_fn,
    logger=logger,
    train_fn=lambda epoch, env_step: policy.set_eps(0.2),
    test_fn=lambda epoch, env_step: policy.set_eps(0.1),
    update_per_step=update_per_step,
    test_in_train = False
)
pprint.pprint(result)

Epoch #1: 1001it [00:51, 19.35it/s, env_step=1000, len=282, loss=-14994881957068.801, n/ep=0, n/st=1, rew=3.00]        


Epoch #1: test_reward: 1.800000 ± 1.249000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #2: 1001it [00:59, 16.73it/s, env_step=2000, len=133, loss=-237662449350737.906, n/ep=0, n/st=1, rew=0.00]       


Epoch #2: test_reward: 1.900000 ± 1.640122, best_reward: 2.700000 ± 2.368544 in #0


Epoch #3: 1001it [01:03, 15.80it/s, env_step=3000, len=162, loss=-1029486274286714.875, n/ep=0, n/st=1, rew=1.00]      


Epoch #3: test_reward: 1.000000 ± 1.414214, best_reward: 2.700000 ± 2.368544 in #0


Epoch #4: 1001it [01:03, 15.79it/s, env_step=4000, len=136, loss=-2764289637844254.500, n/ep=0, n/st=1, rew=0.00]      


Epoch #4: test_reward: 2.300000 ± 2.325941, best_reward: 2.700000 ± 2.368544 in #0


Epoch #5: 1001it [00:56, 17.73it/s, env_step=5000, len=201, loss=-5819028677832212.000, n/ep=0, n/st=1, rew=2.00]      


Epoch #5: test_reward: 1.600000 ± 1.496663, best_reward: 2.700000 ± 2.368544 in #0


Epoch #6: 1001it [00:52, 19.24it/s, env_step=6000, len=257, loss=-10586597338812252.000, n/ep=0, n/st=1, rew=3.00]     


Epoch #6: test_reward: 1.600000 ± 1.019804, best_reward: 2.700000 ± 2.368544 in #0


Epoch #7: 1001it [00:49, 20.09it/s, env_step=7000, len=129, loss=-17503656460523930.000, n/ep=0, n/st=1, rew=0.00]     


Epoch #7: test_reward: 1.000000 ± 0.774597, best_reward: 2.700000 ± 2.368544 in #0


Epoch #8: 1001it [00:49, 20.10it/s, env_step=8000, len=206, loss=-27031925249309408.000, n/ep=0, n/st=1, rew=2.00]     


Epoch #8: test_reward: 2.000000 ± 1.341641, best_reward: 2.700000 ± 2.368544 in #0


Epoch #9: 1001it [00:51, 19.52it/s, env_step=9000, len=127, loss=-39656300283764408.000, n/ep=0, n/st=1, rew=0.00]     


Epoch #9: test_reward: 0.900000 ± 0.830662, best_reward: 2.700000 ± 2.368544 in #0


Epoch #10: 1001it [00:50, 19.78it/s, env_step=10000, len=160, loss=-55952355048963440.000, n/ep=0, n/st=1, rew=0.00]   


Epoch #10: test_reward: 1.300000 ± 1.100000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #11: 1001it [00:53, 18.80it/s, env_step=11000, len=228, loss=-76413993539652816.000, n/ep=0, n/st=1, rew=2.00]   


Epoch #11: test_reward: 0.900000 ± 0.700000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #12: 1001it [00:57, 17.46it/s, env_step=12000, len=134, loss=-101765630537980976.000, n/ep=0, n/st=1, rew=0.00]  


Epoch #12: test_reward: 1.000000 ± 1.000000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #13: 1001it [00:56, 17.57it/s, env_step=13000, len=210, loss=-132513440824683392.000, n/ep=0, n/st=1, rew=2.00]  


Epoch #13: test_reward: 1.900000 ± 2.700000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #14: 1001it [00:54, 18.27it/s, env_step=14000, len=195, loss=-169437175196952832.000, n/ep=0, n/st=1, rew=1.00]  


Epoch #14: test_reward: 1.100000 ± 0.830662, best_reward: 2.700000 ± 2.368544 in #0


Epoch #15: 1001it [00:51, 19.54it/s, env_step=15000, len=162, loss=-213322070241910464.000, n/ep=0, n/st=1, rew=1.00]  


Epoch #15: test_reward: 1.400000 ± 0.916515, best_reward: 2.700000 ± 2.368544 in #0


Epoch #16: 1001it [00:51, 19.26it/s, env_step=16000, len=204, loss=-264722993987117376.000, n/ep=0, n/st=1, rew=2.00]  


Epoch #16: test_reward: 2.000000 ± 1.788854, best_reward: 2.700000 ± 2.368544 in #0


Epoch #17: 1001it [00:51, 19.30it/s, env_step=17000, len=204, loss=-324472800377212736.000, n/ep=0, n/st=1, rew=2.00]  


Epoch #17: test_reward: 1.000000 ± 0.894427, best_reward: 2.700000 ± 2.368544 in #0


Epoch #18: 1001it [00:52, 19.03it/s, env_step=18000, len=236, loss=-393485053007872640.000, n/ep=0, n/st=1, rew=2.00]  


Epoch #18: test_reward: 1.300000 ± 1.345362, best_reward: 2.700000 ± 2.368544 in #0


Epoch #19: 1001it [00:51, 19.25it/s, env_step=19000, len=142, loss=-472737735689245824.000, n/ep=0, n/st=1, rew=0.00]  


Epoch #19: test_reward: 1.400000 ± 1.356466, best_reward: 2.700000 ± 2.368544 in #0


Epoch #20: 1001it [00:52, 19.15it/s, env_step=20000, len=172, loss=-562546300368925504.000, n/ep=0, n/st=1, rew=1.00]  


Epoch #20: test_reward: 1.600000 ± 1.356466, best_reward: 2.700000 ± 2.368544 in #0


Epoch #21: 1001it [00:52, 19.24it/s, env_step=21000, len=133, loss=-664571950030892544.000, n/ep=0, n/st=1, rew=0.00]  


Epoch #21: test_reward: 1.200000 ± 0.979796, best_reward: 2.700000 ± 2.368544 in #0


Epoch #22: 1001it [00:52, 19.24it/s, env_step=22000, len=133, loss=-779638732218842496.000, n/ep=0, n/st=1, rew=0.00]  


Epoch #22: test_reward: 1.300000 ± 1.846619, best_reward: 2.700000 ± 2.368544 in #0


Epoch #23: 1001it [00:52, 19.24it/s, env_step=23000, len=151, loss=-908550654846262272.000, n/ep=0, n/st=1, rew=0.00]  


Epoch #23: test_reward: 1.000000 ± 0.774597, best_reward: 2.700000 ± 2.368544 in #0


Epoch #24: 1001it [00:52, 19.18it/s, env_step=24000, len=271, loss=-1052079379225342080.000, n/ep=0, n/st=1, rew=4.00] 


Epoch #24: test_reward: 2.000000 ± 1.183216, best_reward: 2.700000 ± 2.368544 in #0


Epoch #25: 1001it [00:52, 19.16it/s, env_step=25000, len=136, loss=-1211145399310986752.000, n/ep=0, n/st=1, rew=0.00] 


Epoch #25: test_reward: 2.300000 ± 1.100000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #26: 1001it [00:47, 21.02it/s, env_step=26000, len=162, loss=-1387690366738502656.000, n/ep=0, n/st=1, rew=1.00] 


Epoch #26: test_reward: 0.900000 ± 1.757840, best_reward: 2.700000 ± 2.368544 in #0


Epoch #27: 1001it [00:47, 21.04it/s, env_step=27000, len=162, loss=-1582302308572761856.000, n/ep=0, n/st=1, rew=1.00] 


Epoch #27: test_reward: 1.700000 ± 1.676305, best_reward: 2.700000 ± 2.368544 in #0


Epoch #28: 1001it [00:47, 21.00it/s, env_step=28000, len=206, loss=-1796900280797925120.000, n/ep=0, n/st=1, rew=2.00] 


Epoch #28: test_reward: 2.000000 ± 1.183216, best_reward: 2.700000 ± 2.368544 in #0


Epoch #29: 1001it [00:47, 20.99it/s, env_step=29000, len=145, loss=-2033754978335001088.000, n/ep=0, n/st=1, rew=0.00] 


Epoch #29: test_reward: 2.000000 ± 0.894427, best_reward: 2.700000 ± 2.368544 in #0


Epoch #30: 1001it [00:48, 20.65it/s, env_step=30000, len=165, loss=-2288244367661633280.000, n/ep=1, n/st=1, rew=1.00] 


Epoch #30: test_reward: 0.700000 ± 0.900000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #31: 1001it [00:47, 20.99it/s, env_step=31000, len=153, loss=-2567866954526899712.000, n/ep=0, n/st=1, rew=1.00] 


Epoch #31: test_reward: 1.600000 ± 1.959592, best_reward: 2.700000 ± 2.368544 in #0


Epoch #32: 1001it [00:47, 21.17it/s, env_step=32000, len=204, loss=-2870103533766070784.000, n/ep=0, n/st=1, rew=2.00] 


Epoch #32: test_reward: 1.600000 ± 0.663325, best_reward: 2.700000 ± 2.368544 in #0


Epoch #33: 1001it [00:46, 21.37it/s, env_step=33000, len=215, loss=-3201375048501318144.000, n/ep=0, n/st=1, rew=2.00] 


Epoch #33: test_reward: 0.700000 ± 1.004988, best_reward: 2.700000 ± 2.368544 in #0


Epoch #34: 1001it [00:46, 21.32it/s, env_step=34000, len=209, loss=-3558434800325005312.000, n/ep=0, n/st=1, rew=2.00] 


Epoch #34: test_reward: 1.000000 ± 0.774597, best_reward: 2.700000 ± 2.368544 in #0


Epoch #35: 1001it [00:46, 21.37it/s, env_step=35000, len=210, loss=-3945234959044161024.000, n/ep=0, n/st=1, rew=2.00] 


Epoch #35: test_reward: 1.000000 ± 0.894427, best_reward: 2.700000 ± 2.368544 in #0


Epoch #36: 1001it [00:46, 21.34it/s, env_step=36000, len=181, loss=-4362087365397876736.000, n/ep=0, n/st=1, rew=1.00] 


Epoch #36: test_reward: 1.600000 ± 1.019804, best_reward: 2.700000 ± 2.368544 in #0


Epoch #37: 1001it [00:47, 21.26it/s, env_step=37000, len=125, loss=-4805973835228895232.000, n/ep=0, n/st=1, rew=0.00] 


Epoch #37: test_reward: 2.200000 ± 1.886796, best_reward: 2.700000 ± 2.368544 in #0


Epoch #38: 1001it [00:46, 21.47it/s, env_step=38000, len=170, loss=-5285661841271753728.000, n/ep=0, n/st=1, rew=1.00] 


Epoch #38: test_reward: 1.500000 ± 1.118034, best_reward: 2.700000 ± 2.368544 in #0


Epoch #39: 1001it [00:46, 21.45it/s, env_step=39000, len=163, loss=-5800735474011791360.000, n/ep=0, n/st=1, rew=1.00] 


Epoch #39: test_reward: 1.600000 ± 1.685230, best_reward: 2.700000 ± 2.368544 in #0


Epoch #40: 1001it [00:47, 20.95it/s, env_step=40000, len=291, loss=-6350766017372665856.000, n/ep=0, n/st=1, rew=4.00] 


Epoch #40: test_reward: 1.700000 ± 1.615549, best_reward: 2.700000 ± 2.368544 in #0


Epoch #41: 1001it [00:49, 20.04it/s, env_step=41000, len=125, loss=-6939520211274160128.000, n/ep=0, n/st=1, rew=0.00] 


Epoch #41: test_reward: 2.300000 ± 1.100000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #42: 1001it [00:49, 20.20it/s, env_step=42000, len=130, loss=-7566486057131683840.000, n/ep=0, n/st=1, rew=0.00] 


Epoch #42: test_reward: 1.300000 ± 1.486607, best_reward: 2.700000 ± 2.368544 in #0


Epoch #43: 1001it [00:49, 20.14it/s, env_step=43000, len=128, loss=-8238639626369986560.000, n/ep=0, n/st=1, rew=0.00] 


Epoch #43: test_reward: 1.100000 ± 1.220656, best_reward: 2.700000 ± 2.368544 in #0


Epoch #44: 1001it [00:49, 20.29it/s, env_step=44000, len=204, loss=-8952673021764231168.000, n/ep=0, n/st=1, rew=2.00] 


Epoch #44: test_reward: 1.300000 ± 1.187434, best_reward: 2.700000 ± 2.368544 in #0


Epoch #45: 1001it [00:49, 20.27it/s, env_step=45000, len=160, loss=-9703218592474359808.000, n/ep=0, n/st=1, rew=1.00] 


Epoch #45: test_reward: 1.000000 ± 1.000000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #46: 1001it [00:49, 20.09it/s, env_step=46000, len=218, loss=-10505328982538113024.000, n/ep=0, n/st=1, rew=2.00]


Epoch #46: test_reward: 2.100000 ± 0.538516, best_reward: 2.700000 ± 2.368544 in #0


Epoch #47: 1001it [00:49, 20.20it/s, env_step=47000, len=136, loss=-11355222707584778240.000, n/ep=0, n/st=1, rew=0.00]


Epoch #47: test_reward: 2.400000 ± 1.113553, best_reward: 2.700000 ± 2.368544 in #0


Epoch #48: 1001it [00:49, 20.19it/s, env_step=48000, len=177, loss=-12258778526434584576.000, n/ep=0, n/st=1, rew=1.00]


Epoch #48: test_reward: 1.900000 ± 1.513275, best_reward: 2.700000 ± 2.368544 in #0


Epoch #49: 1001it [00:49, 20.23it/s, env_step=49000, len=159, loss=-13211704726345953280.000, n/ep=0, n/st=1, rew=1.00]


Epoch #49: test_reward: 1.400000 ± 1.280625, best_reward: 2.700000 ± 2.368544 in #0


Epoch #50: 1001it [00:49, 20.12it/s, env_step=50000, len=135, loss=-14225275292730796032.000, n/ep=0, n/st=1, rew=0.00]


Epoch #50: test_reward: 1.400000 ± 0.800000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #51: 1001it [00:49, 20.23it/s, env_step=51000, len=213, loss=-15289769421297711104.000, n/ep=0, n/st=1, rew=2.00]


Epoch #51: test_reward: 1.100000 ± 1.220656, best_reward: 2.700000 ± 2.368544 in #0


Epoch #52: 1001it [00:49, 20.20it/s, env_step=52000, len=212, loss=-16415896289330540544.000, n/ep=0, n/st=1, rew=2.00]


Epoch #52: test_reward: 1.300000 ± 1.268858, best_reward: 2.700000 ± 2.368544 in #0


Epoch #53: 1001it [00:49, 20.21it/s, env_step=53000, len=206, loss=-17604493636728719360.000, n/ep=0, n/st=1, rew=2.00]


Epoch #53: test_reward: 1.200000 ± 0.979796, best_reward: 2.700000 ± 2.368544 in #0


Epoch #54: 1001it [00:49, 20.11it/s, env_step=54000, len=142, loss=-18833166159857938432.000, n/ep=0, n/st=1, rew=0.00]


Epoch #54: test_reward: 2.300000 ± 1.004988, best_reward: 2.700000 ± 2.368544 in #0


Epoch #55: 1001it [00:49, 20.32it/s, env_step=55000, len=140, loss=-20140051925858537472.000, n/ep=0, n/st=1, rew=0.00]


Epoch #55: test_reward: 0.500000 ± 0.806226, best_reward: 2.700000 ± 2.368544 in #0


Epoch #56: 1001it [00:49, 20.31it/s, env_step=56000, len=176, loss=-21514349167572500480.000, n/ep=0, n/st=1, rew=1.00]


Epoch #56: test_reward: 2.100000 ± 1.757840, best_reward: 2.700000 ± 2.368544 in #0


Epoch #57: 1001it [00:49, 20.11it/s, env_step=57000, len=134, loss=-22966512371410214912.000, n/ep=0, n/st=1, rew=0.00]


Epoch #57: test_reward: 0.800000 ± 1.077033, best_reward: 2.700000 ± 2.368544 in #0


Epoch #58: 1001it [00:48, 20.64it/s, env_step=58000, len=126, loss=-24468699396069875712.000, n/ep=0, n/st=1, rew=0.00]


Epoch #58: test_reward: 1.100000 ± 0.943398, best_reward: 2.700000 ± 2.368544 in #0


Epoch #59: 1001it [00:48, 20.60it/s, env_step=59000, len=185, loss=-26054299946780995584.000, n/ep=1, n/st=1, rew=1.00]


Epoch #59: test_reward: 1.200000 ± 1.326650, best_reward: 2.700000 ± 2.368544 in #0


Epoch #60: 1001it [00:48, 20.61it/s, env_step=60000, len=205, loss=-27721150778396389376.000, n/ep=0, n/st=1, rew=2.00]


Epoch #60: test_reward: 0.700000 ± 1.004988, best_reward: 2.700000 ± 2.368544 in #0


Epoch #61: 1001it [00:48, 20.63it/s, env_step=61000, len=132, loss=-29471403437259522048.000, n/ep=0, n/st=1, rew=0.00]


Epoch #61: test_reward: 0.900000 ± 1.374773, best_reward: 2.700000 ± 2.368544 in #0


Epoch #62: 1001it [00:48, 20.52it/s, env_step=62000, len=207, loss=-31296255189297954816.000, n/ep=0, n/st=1, rew=2.00]


Epoch #62: test_reward: 1.100000 ± 1.374773, best_reward: 2.700000 ± 2.368544 in #0


Epoch #63: 1001it [00:48, 20.61it/s, env_step=63000, len=163, loss=-33193947871438405632.000, n/ep=0, n/st=1, rew=1.00]


Epoch #63: test_reward: 1.100000 ± 0.830662, best_reward: 2.700000 ± 2.368544 in #0


Epoch #64: 1001it [00:48, 20.56it/s, env_step=64000, len=228, loss=-35216408581120741376.000, n/ep=0, n/st=1, rew=2.00]


Epoch #64: test_reward: 1.400000 ± 1.019804, best_reward: 2.700000 ± 2.368544 in #0


Epoch #65: 1001it [00:49, 20.40it/s, env_step=65000, len=143, loss=-37274923112706686976.000, n/ep=0, n/st=1, rew=0.00]


Epoch #65: test_reward: 0.700000 ± 0.640312, best_reward: 2.700000 ± 2.368544 in #0


Epoch #66: 1001it [00:51, 19.54it/s, env_step=66000, len=129, loss=-39414954182932692992.000, n/ep=0, n/st=1, rew=0.00]


Epoch #66: test_reward: 2.500000 ± 1.431782, best_reward: 2.700000 ± 2.368544 in #0


Epoch #67: 1001it [00:53, 18.56it/s, env_step=67000, len=232, loss=-41672085748403355648.000, n/ep=0, n/st=1, rew=2.00]


Epoch #67: test_reward: 1.900000 ± 1.757840, best_reward: 2.700000 ± 2.368544 in #0


Epoch #68: 1001it [00:54, 18.52it/s, env_step=68000, len=209, loss=-44030921215853527040.000, n/ep=0, n/st=1, rew=2.00]


Epoch #68: test_reward: 0.600000 ± 0.800000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #69: 1001it [00:53, 18.69it/s, env_step=69000, len=177, loss=-46495936873041747968.000, n/ep=0, n/st=1, rew=1.00]


Epoch #69: test_reward: 1.300000 ± 1.345362, best_reward: 2.700000 ± 2.368544 in #0


Epoch #70: 1001it [00:52, 19.23it/s, env_step=70000, len=209, loss=-49047810298445668352.000, n/ep=0, n/st=1, rew=2.00]


Epoch #70: test_reward: 0.900000 ± 0.830662, best_reward: 2.700000 ± 2.368544 in #0


Epoch #71: 1001it [00:51, 19.43it/s, env_step=71000, len=132, loss=-51689412624788668416.000, n/ep=0, n/st=1, rew=0.00]


Epoch #71: test_reward: 1.300000 ± 1.100000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #72: 1001it [00:54, 18.27it/s, env_step=72000, len=140, loss=-54473620817605844992.000, n/ep=0, n/st=1, rew=0.00]


Epoch #72: test_reward: 1.900000 ± 1.044031, best_reward: 2.700000 ± 2.368544 in #0


Epoch #73: 1001it [00:50, 19.79it/s, env_step=73000, len=303, loss=-57357281200549773312.000, n/ep=0, n/st=1, rew=4.00]


Epoch #73: test_reward: 0.800000 ± 0.871780, best_reward: 2.700000 ± 2.368544 in #0


Epoch #74: 1001it [00:49, 20.19it/s, env_step=74000, len=207, loss=-60342264878528151552.000, n/ep=0, n/st=1, rew=2.00]


Epoch #74: test_reward: 2.500000 ± 1.746425, best_reward: 2.700000 ± 2.368544 in #0


Epoch #75: 1001it [00:54, 18.26it/s, env_step=75000, len=258, loss=-63471827694508834816.000, n/ep=0, n/st=1, rew=3.00]


Epoch #75: test_reward: 1.700000 ± 1.268858, best_reward: 2.700000 ± 2.368544 in #0


Epoch #76: 1001it [00:55, 18.19it/s, env_step=76000, len=160, loss=-66706337400024924160.000, n/ep=0, n/st=1, rew=1.00]


Epoch #76: test_reward: 1.200000 ± 1.077033, best_reward: 2.700000 ± 2.368544 in #0


Epoch #77: 1001it [00:55, 18.11it/s, env_step=77000, len=135, loss=-70056148315977515008.000, n/ep=0, n/st=1, rew=0.00]


Epoch #77: test_reward: 1.300000 ± 1.345362, best_reward: 2.700000 ± 2.368544 in #0


Epoch #78: 1001it [00:50, 19.96it/s, env_step=78000, len=170, loss=-73567543152884809728.000, n/ep=0, n/st=1, rew=1.00]


Epoch #78: test_reward: 2.000000 ± 1.414214, best_reward: 2.700000 ± 2.368544 in #0


Epoch #79: 1001it [00:49, 20.34it/s, env_step=79000, len=140, loss=-77083277674226008064.000, n/ep=0, n/st=1, rew=0.00]


Epoch #79: test_reward: 1.000000 ± 1.095445, best_reward: 2.700000 ± 2.368544 in #0


Epoch #80: 1001it [00:49, 20.36it/s, env_step=80000, len=140, loss=-80749392224976273408.000, n/ep=0, n/st=1, rew=0.00]


Epoch #80: test_reward: 1.400000 ± 1.200000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #81: 1001it [00:49, 20.31it/s, env_step=81000, len=213, loss=-84590415590137315328.000, n/ep=0, n/st=1, rew=2.00]


Epoch #81: test_reward: 0.900000 ± 1.300000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #82: 1001it [00:49, 20.33it/s, env_step=82000, len=185, loss=-88561043052793626624.000, n/ep=0, n/st=1, rew=1.00]


Epoch #82: test_reward: 1.100000 ± 1.135782, best_reward: 2.700000 ± 2.368544 in #0


Epoch #83: 1001it [00:50, 19.94it/s, env_step=83000, len=271, loss=-92664372860790407168.000, n/ep=0, n/st=1, rew=3.00]


Epoch #83: test_reward: 1.300000 ± 1.004988, best_reward: 2.700000 ± 2.368544 in #0


Epoch #84: 1001it [00:49, 20.33it/s, env_step=84000, len=249, loss=-96895268535607345152.000, n/ep=0, n/st=1, rew=3.00]


Epoch #84: test_reward: 1.100000 ± 1.135782, best_reward: 2.700000 ± 2.368544 in #0


Epoch #85: 1001it [00:48, 20.44it/s, env_step=85000, len=251, loss=-101320304538736033792.000, n/ep=0, n/st=1, rew=3.00]


Epoch #85: test_reward: 1.400000 ± 1.496663, best_reward: 2.700000 ± 2.368544 in #0


Epoch #86: 1001it [00:49, 20.36it/s, env_step=86000, len=134, loss=-105868750958419329024.000, n/ep=0, n/st=1, rew=0.00]


Epoch #86: test_reward: 1.300000 ± 1.004988, best_reward: 2.700000 ± 2.368544 in #0


Epoch #87: 1001it [00:49, 20.35it/s, env_step=87000, len=168, loss=-110582144529204838400.000, n/ep=0, n/st=1, rew=1.00]


Epoch #87: test_reward: 1.000000 ± 0.632456, best_reward: 2.700000 ± 2.368544 in #0


Epoch #88: 1001it [00:49, 20.32it/s, env_step=88000, len=182, loss=-115418231722924556288.000, n/ep=0, n/st=1, rew=1.00]


Epoch #88: test_reward: 2.100000 ± 1.700000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #89: 1001it [00:49, 20.20it/s, env_step=89000, len=288, loss=-120456561238277865472.000, n/ep=0, n/st=1, rew=4.00]


Epoch #89: test_reward: 0.700000 ± 0.900000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #90: 1001it [00:49, 20.27it/s, env_step=90000, len=145, loss=-125611094179328966656.000, n/ep=0, n/st=1, rew=0.00]


Epoch #90: test_reward: 1.200000 ± 0.979796, best_reward: 2.700000 ± 2.368544 in #0


Epoch #91: 1001it [00:49, 20.29it/s, env_step=91000, len=199, loss=-131015916516850712576.000, n/ep=0, n/st=1, rew=2.00]


Epoch #91: test_reward: 1.300000 ± 1.268858, best_reward: 2.700000 ± 2.368544 in #0


Epoch #92: 1001it [00:49, 20.31it/s, env_step=92000, len=212, loss=-136521683345660903424.000, n/ep=0, n/st=1, rew=2.00]


Epoch #92: test_reward: 1.200000 ± 1.536229, best_reward: 2.700000 ± 2.368544 in #0


Epoch #93: 1001it [00:49, 20.41it/s, env_step=93000, len=175, loss=-142278038462182621184.000, n/ep=0, n/st=1, rew=1.00]


Epoch #93: test_reward: 1.300000 ± 1.187434, best_reward: 2.700000 ± 2.368544 in #0


Epoch #94: 1001it [00:49, 20.19it/s, env_step=94000, len=218, loss=-148182089744210493440.000, n/ep=0, n/st=1, rew=2.00]


Epoch #94: test_reward: 1.700000 ± 1.004988, best_reward: 2.700000 ± 2.368544 in #0


Epoch #95: 1001it [00:49, 20.29it/s, env_step=95000, len=213, loss=-154130229947176353792.000, n/ep=0, n/st=1, rew=2.00]


Epoch #95: test_reward: 1.600000 ± 1.562050, best_reward: 2.700000 ± 2.368544 in #0


Epoch #96: 1001it [00:49, 20.27it/s, env_step=96000, len=133, loss=-160247633983895076864.000, n/ep=0, n/st=1, rew=0.00]


Epoch #96: test_reward: 1.600000 ± 0.663325, best_reward: 2.700000 ± 2.368544 in #0


Epoch #97: 1001it [00:49, 20.28it/s, env_step=97000, len=172, loss=-166656536321245118464.000, n/ep=0, n/st=1, rew=1.00]


Epoch #97: test_reward: 1.300000 ± 1.791647, best_reward: 2.700000 ± 2.368544 in #0


Epoch #98: 1001it [00:49, 20.31it/s, env_step=98000, len=139, loss=-173283797093903106048.000, n/ep=0, n/st=1, rew=0.00]


Epoch #98: test_reward: 0.300000 ± 0.640312, best_reward: 2.700000 ± 2.368544 in #0


Epoch #99: 1001it [00:49, 20.18it/s, env_step=99000, len=133, loss=-180023465954041626624.000, n/ep=0, n/st=1, rew=0.00]


Epoch #99: test_reward: 1.600000 ± 1.113553, best_reward: 2.700000 ± 2.368544 in #0


Epoch #100: 1001it [00:49, 20.24it/s, env_step=100000, len=134, loss=-187061253758107451392.000, n/ep=0, n/st=1, rew=0.00]


Epoch #100: test_reward: 1.300000 ± 1.345362, best_reward: 2.700000 ± 2.368544 in #0


Epoch #101: 1001it [00:49, 20.27it/s, env_step=101000, len=137, loss=-194307561039670312960.000, n/ep=0, n/st=1, rew=0.00]


Epoch #101: test_reward: 1.500000 ± 1.500000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #102: 1001it [00:49, 20.38it/s, env_step=102000, len=131, loss=-201734744141267435520.000, n/ep=0, n/st=1, rew=0.00]


Epoch #102: test_reward: 1.500000 ± 1.284523, best_reward: 2.700000 ± 2.368544 in #0


Epoch #103: 1001it [00:49, 20.32it/s, env_step=103000, len=225, loss=-209368016008015806464.000, n/ep=0, n/st=1, rew=2.00]


Epoch #103: test_reward: 0.900000 ± 0.830662, best_reward: 2.700000 ± 2.368544 in #0


Epoch #104: 1001it [00:49, 20.20it/s, env_step=104000, len=127, loss=-217135457672303542272.000, n/ep=0, n/st=1, rew=0.00]


Epoch #104: test_reward: 1.400000 ± 1.113553, best_reward: 2.700000 ± 2.368544 in #0


Epoch #105: 1001it [00:50, 19.97it/s, env_step=105000, len=134, loss=-225203369652105281536.000, n/ep=0, n/st=1, rew=0.00]


Epoch #105: test_reward: 0.800000 ± 0.748331, best_reward: 2.700000 ± 2.368544 in #0


Epoch #106: 1001it [00:52, 19.03it/s, env_step=106000, len=206, loss=-233507719732734623744.000, n/ep=0, n/st=1, rew=2.00]


Epoch #106: test_reward: 1.200000 ± 1.469694, best_reward: 2.700000 ± 2.368544 in #0


Epoch #107: 1001it [00:51, 19.30it/s, env_step=107000, len=129, loss=-242196718035413073920.000, n/ep=0, n/st=1, rew=0.00]


Epoch #107: test_reward: 1.000000 ± 0.774597, best_reward: 2.700000 ± 2.368544 in #0


Epoch #108: 1001it [00:55, 17.99it/s, env_step=108000, len=135, loss=-250802238600560967680.000, n/ep=0, n/st=1, rew=0.00]


Epoch #108: test_reward: 0.500000 ± 0.806226, best_reward: 2.700000 ± 2.368544 in #0


Epoch #109: 1001it [00:55, 18.01it/s, env_step=109000, len=223, loss=-259746847672105697280.000, n/ep=0, n/st=1, rew=2.00]


Epoch #109: test_reward: 1.000000 ± 1.000000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #110: 1001it [00:49, 20.15it/s, env_step=110000, len=186, loss=-269008130518042476544.000, n/ep=0, n/st=1, rew=1.00]


Epoch #110: test_reward: 2.100000 ± 1.374773, best_reward: 2.700000 ± 2.368544 in #0


Epoch #111: 1001it [00:49, 20.20it/s, env_step=111000, len=159, loss=-278418590175823790080.000, n/ep=0, n/st=1, rew=0.00]


Epoch #111: test_reward: 2.100000 ± 1.300000, best_reward: 2.700000 ± 2.368544 in #0


Epoch #112: 1001it [00:49, 20.27it/s, env_step=112000, len=132, loss=-288115448159502172160.000, n/ep=0, n/st=1, rew=0.00]


Epoch #112: test_reward: 1.800000 ± 1.469694, best_reward: 2.700000 ± 2.368544 in #0


Epoch #113: 1001it [00:49, 20.24it/s, env_step=113000, len=258, loss=-298178522328223449088.000, n/ep=0, n/st=1, rew=3.00]


Epoch #113: test_reward: 1.800000 ± 0.979796, best_reward: 2.700000 ± 2.368544 in #0


Epoch #114: 1001it [00:49, 20.24it/s, env_step=114000, len=299, loss=-308248433875972718592.000, n/ep=0, n/st=1, rew=4.00]


Epoch #114: test_reward: 1.500000 ± 1.024695, best_reward: 2.700000 ± 2.368544 in #0


Epoch #115: 1001it [00:49, 20.18it/s, env_step=115000, len=130, loss=-318546523817279946752.000, n/ep=0, n/st=1, rew=0.00]


Epoch #115: test_reward: 2.100000 ± 1.044031, best_reward: 2.700000 ± 2.368544 in #0


Epoch #116: 1001it [00:49, 20.25it/s, env_step=116000, len=180, loss=-329000580802401140736.000, n/ep=0, n/st=1, rew=1.00]


Epoch #116: test_reward: 1.700000 ± 1.417745, best_reward: 2.700000 ± 2.368544 in #0


Epoch #117: 1001it [00:49, 20.13it/s, env_step=117000, len=176, loss=-340003759579025309696.000, n/ep=0, n/st=1, rew=1.00]


Epoch #117: test_reward: 1.400000 ± 1.280625, best_reward: 2.700000 ± 2.368544 in #0


Epoch #118: 1001it [00:49, 20.20it/s, env_step=118000, len=207, loss=-351107851356924870656.000, n/ep=0, n/st=1, rew=2.00]


Epoch #118: test_reward: 2.600000 ± 1.428286, best_reward: 2.700000 ± 2.368544 in #0


Epoch #119: 1001it [00:50, 19.97it/s, env_step=119000, len=140, loss=-362611186889947086848.000, n/ep=0, n/st=1, rew=0.00]


Epoch #119: test_reward: 2.800000 ± 0.979796, best_reward: 2.800000 ± 0.979796 in #119


Epoch #120: 1001it [00:51, 19.29it/s, env_step=120000, len=206, loss=-374426751003791720448.000, n/ep=0, n/st=1, rew=2.00]


Epoch #120: test_reward: 1.400000 ± 1.685230, best_reward: 2.800000 ± 0.979796 in #119


Epoch #121: 1001it [00:57, 17.40it/s, env_step=121000, len=182, loss=-386655725508336812032.000, n/ep=0, n/st=1, rew=1.00]


Epoch #121: test_reward: 2.000000 ± 0.632456, best_reward: 2.800000 ± 0.979796 in #119


Epoch #122: 1001it [00:54, 18.38it/s, env_step=122000, len=284, loss=-398717386009069289472.000, n/ep=0, n/st=1, rew=3.00]


Epoch #122: test_reward: 1.500000 ± 1.627882, best_reward: 2.800000 ± 0.979796 in #119


Epoch #123: 1001it [00:56, 17.78it/s, env_step=123000, len=162, loss=-411388198469587828736.000, n/ep=0, n/st=1, rew=1.00]


Epoch #123: test_reward: 1.000000 ± 0.774597, best_reward: 2.800000 ± 0.979796 in #119


Epoch #124: 1001it [00:57, 17.37it/s, env_step=124000, len=139, loss=-424286960273401970688.000, n/ep=0, n/st=1, rew=0.00]


Epoch #124: test_reward: 1.900000 ± 1.300000, best_reward: 2.800000 ± 0.979796 in #119


Epoch #125: 1001it [00:51, 19.28it/s, env_step=125000, len=227, loss=-437373286446384545792.000, n/ep=0, n/st=1, rew=2.00]


Epoch #125: test_reward: 2.500000 ± 1.284523, best_reward: 2.800000 ± 0.979796 in #119


Epoch #126: 1001it [00:49, 20.05it/s, env_step=126000, len=155, loss=-450990528257532624896.000, n/ep=0, n/st=1, rew=1.00]


Epoch #126: test_reward: 1.100000 ± 1.220656, best_reward: 2.800000 ± 0.979796 in #119


Epoch #127: 1001it [00:51, 19.25it/s, env_step=127000, len=195, loss=-464880381398374744064.000, n/ep=0, n/st=1, rew=1.00]


Epoch #127: test_reward: 1.600000 ± 1.356466, best_reward: 2.800000 ± 0.979796 in #119


Epoch #128: 1001it [00:58, 17.23it/s, env_step=128000, len=124, loss=-479016970929833115648.000, n/ep=0, n/st=1, rew=0.00]


Epoch #128: test_reward: 1.100000 ± 1.220656, best_reward: 2.800000 ± 0.979796 in #119


Epoch #129: 1001it [00:57, 17.52it/s, env_step=129000, len=157, loss=-493710445332652097536.000, n/ep=0, n/st=1, rew=1.00]


Epoch #129: test_reward: 1.700000 ± 1.486607, best_reward: 2.800000 ± 0.979796 in #119


Epoch #130: 1001it [00:56, 17.62it/s, env_step=130000, len=142, loss=-508552320028223602688.000, n/ep=0, n/st=1, rew=0.00]


Epoch #130: test_reward: 1.300000 ± 1.268858, best_reward: 2.800000 ± 0.979796 in #119


Epoch #131: 1001it [00:56, 17.60it/s, env_step=131000, len=194, loss=-523857233375014420480.000, n/ep=0, n/st=1, rew=1.00]


Epoch #131: test_reward: 2.100000 ± 1.044031, best_reward: 2.800000 ± 0.979796 in #119


Epoch #132: 1001it [00:57, 17.42it/s, env_step=132000, len=133, loss=-539231625653212741632.000, n/ep=0, n/st=1, rew=0.00]


Epoch #132: test_reward: 1.400000 ± 1.019804, best_reward: 2.800000 ± 0.979796 in #119


Epoch #133: 1001it [00:59, 16.80it/s, env_step=133000, len=137, loss=-555146356648109473792.000, n/ep=0, n/st=1, rew=0.00]


Epoch #133: test_reward: 0.600000 ± 0.663325, best_reward: 2.800000 ± 0.979796 in #119


Epoch #134: 1001it [00:57, 17.27it/s, env_step=134000, len=273, loss=-571128387958094036992.000, n/ep=0, n/st=1, rew=3.00]


Epoch #134: test_reward: 1.000000 ± 1.000000, best_reward: 2.800000 ± 0.979796 in #119


Epoch #135: 1001it [00:58, 16.99it/s, env_step=135000, len=144, loss=-587879584826312556544.000, n/ep=0, n/st=1, rew=0.00]


Epoch #135: test_reward: 1.200000 ± 1.077033, best_reward: 2.800000 ± 0.979796 in #119


Epoch #136: 1001it [00:58, 17.01it/s, env_step=136000, len=124, loss=-605198695107322970112.000, n/ep=0, n/st=1, rew=0.00]


Epoch #136: test_reward: 1.600000 ± 1.562050, best_reward: 2.800000 ± 0.979796 in #119


Epoch #137: 1001it [00:59, 16.72it/s, env_step=137000, len=252, loss=-622245947707889418240.000, n/ep=0, n/st=1, rew=3.00]


Epoch #137: test_reward: 1.600000 ± 1.200000, best_reward: 2.800000 ± 0.979796 in #119


Epoch #138: 1001it [01:00, 16.53it/s, env_step=138000, len=196, loss=-639685159843024994304.000, n/ep=0, n/st=1, rew=2.00]


Epoch #138: test_reward: 1.200000 ± 1.166190, best_reward: 2.800000 ± 0.979796 in #119


Epoch #139: 1001it [00:57, 17.55it/s, env_step=139000, len=228, loss=-657404595094912368640.000, n/ep=0, n/st=1, rew=2.00]


Epoch #139: test_reward: 1.800000 ± 1.469694, best_reward: 2.800000 ± 0.979796 in #119


Epoch #140: 1001it [00:51, 19.50it/s, env_step=140000, len=135, loss=-675947537464037474304.000, n/ep=0, n/st=1, rew=0.00]


Epoch #140: test_reward: 1.300000 ± 0.781025, best_reward: 2.800000 ± 0.979796 in #119


Epoch #141: 1001it [00:50, 19.84it/s, env_step=141000, len=239, loss=-694251736276353810432.000, n/ep=0, n/st=1, rew=2.00]


Epoch #141: test_reward: 1.300000 ± 1.417745, best_reward: 2.800000 ± 0.979796 in #119


Epoch #142: 1001it [00:52, 18.92it/s, env_step=142000, len=125, loss=-713192476674752380928.000, n/ep=0, n/st=1, rew=0.00]


Epoch #142: test_reward: 1.900000 ± 1.374773, best_reward: 2.800000 ± 0.979796 in #119


Epoch #143: 1001it [00:50, 19.67it/s, env_step=143000, len=179, loss=-733027867694439727104.000, n/ep=0, n/st=1, rew=1.00]


Epoch #143: test_reward: 1.200000 ± 1.077033, best_reward: 2.800000 ± 0.979796 in #119


Epoch #144: 1001it [00:50, 19.99it/s, env_step=144000, len=238, loss=-753156737894907117568.000, n/ep=0, n/st=1, rew=3.00]


Epoch #144: test_reward: 0.900000 ± 0.943398, best_reward: 2.800000 ± 0.979796 in #119


Epoch #145: 1001it [00:50, 19.98it/s, env_step=145000, len=133, loss=-773288947092285685760.000, n/ep=0, n/st=1, rew=0.00]


Epoch #145: test_reward: 1.300000 ± 0.900000, best_reward: 2.800000 ± 0.979796 in #119


Epoch #146: 1001it [00:50, 19.85it/s, env_step=146000, len=220, loss=-794840880887820582912.000, n/ep=0, n/st=1, rew=2.00]


Epoch #146: test_reward: 1.100000 ± 0.943398, best_reward: 2.800000 ± 0.979796 in #119


Epoch #147: 1001it [00:51, 19.61it/s, env_step=147000, len=262, loss=-815198329176312774656.000, n/ep=0, n/st=1, rew=3.00]


Epoch #147: test_reward: 1.300000 ± 1.417745, best_reward: 2.800000 ± 0.979796 in #119


Epoch #148: 1001it [00:50, 19.79it/s, env_step=148000, len=134, loss=-836861446291335938048.000, n/ep=0, n/st=1, rew=0.00]


Epoch #148: test_reward: 1.800000 ± 1.326650, best_reward: 2.800000 ± 0.979796 in #119


Epoch #149: 1001it [00:50, 19.96it/s, env_step=149000, len=267, loss=-858720555136330235904.000, n/ep=0, n/st=1, rew=3.00]


Epoch #149: test_reward: 1.300000 ± 1.004988, best_reward: 2.800000 ± 0.979796 in #119


Epoch #150: 1001it [00:50, 19.94it/s, env_step=150000, len=227, loss=-880954563317555134464.000, n/ep=0, n/st=1, rew=2.00]


Epoch #150: test_reward: 0.600000 ± 0.800000, best_reward: 2.800000 ± 0.979796 in #119


Epoch #151: 1001it [00:50, 19.91it/s, env_step=151000, len=130, loss=-903732562001457577984.000, n/ep=0, n/st=1, rew=0.00]


Epoch #151: test_reward: 2.100000 ± 1.445683, best_reward: 2.800000 ± 0.979796 in #119


Epoch #152: 1001it [00:54, 18.20it/s, env_step=152000, len=161, loss=-927260376246319972352.000, n/ep=0, n/st=1, rew=1.00]


Epoch #152: test_reward: 0.600000 ± 1.019804, best_reward: 2.800000 ± 0.979796 in #119


Epoch #153: 1001it [00:53, 18.66it/s, env_step=153000, len=143, loss=-950535029082378993664.000, n/ep=0, n/st=1, rew=0.00]


Epoch #153: test_reward: 1.600000 ± 1.019804, best_reward: 2.800000 ± 0.979796 in #119


Epoch #154: 1001it [00:54, 18.38it/s, env_step=154000, len=203, loss=-974775653373013262336.000, n/ep=0, n/st=1, rew=2.00]


Epoch #154: test_reward: 1.500000 ± 1.024695, best_reward: 2.800000 ± 0.979796 in #119


Epoch #155: 1001it [00:50, 19.78it/s, env_step=155000, len=288, loss=-999351505916660678656.000, n/ep=0, n/st=1, rew=3.00]


Epoch #155: test_reward: 1.300000 ± 1.268858, best_reward: 2.800000 ± 0.979796 in #119


Epoch #156: 1001it [00:49, 20.03it/s, env_step=156000, len=135, loss=-1024679220344348606464.000, n/ep=0, n/st=1, rew=0.00]


Epoch #156: test_reward: 1.100000 ± 1.300000, best_reward: 2.800000 ± 0.979796 in #119


Epoch #157: 1001it [00:50, 19.92it/s, env_step=157000, len=204, loss=-1050138369512411758592.000, n/ep=0, n/st=1, rew=2.00]


Epoch #157: test_reward: 1.300000 ± 1.100000, best_reward: 2.800000 ± 0.979796 in #119


Epoch #158: 1001it [00:51, 19.60it/s, env_step=158000, len=224, loss=-1076498378639749808128.000, n/ep=0, n/st=1, rew=2.00]


Epoch #158: test_reward: 1.900000 ± 1.220656, best_reward: 2.800000 ± 0.979796 in #119


Epoch #159: 1001it [00:55, 18.17it/s, env_step=159000, len=133, loss=-1102649523482744324096.000, n/ep=0, n/st=1, rew=0.00]


Epoch #159: test_reward: 1.200000 ± 0.871780, best_reward: 2.800000 ± 0.979796 in #119


Epoch #160: 1001it [00:56, 17.76it/s, env_step=160000, len=243, loss=-1129402429456323969024.000, n/ep=0, n/st=1, rew=3.00]


Epoch #160: test_reward: 1.400000 ± 0.916515, best_reward: 2.800000 ± 0.979796 in #119


Epoch #161: 1001it [00:57, 17.32it/s, env_step=161000, len=180, loss=-1157572191094359982080.000, n/ep=0, n/st=1, rew=1.00]


Epoch #161: test_reward: 2.900000 ± 1.640122, best_reward: 2.900000 ± 1.640122 in #161


Epoch #162: 1001it [00:55, 17.91it/s, env_step=162000, len=251, loss=-1185301435949594443776.000, n/ep=0, n/st=1, rew=3.00]


Epoch #162: test_reward: 2.200000 ± 1.166190, best_reward: 2.900000 ± 1.640122 in #161


Epoch #163: 1001it [00:58, 17.18it/s, env_step=163000, len=199, loss=-1214265182905623379968.000, n/ep=0, n/st=1, rew=2.00]


Epoch #163: test_reward: 2.000000 ± 1.000000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #164: 1001it [00:57, 17.33it/s, env_step=164000, len=211, loss=-1243849891807773589504.000, n/ep=0, n/st=1, rew=2.00]


Epoch #164: test_reward: 1.300000 ± 1.552417, best_reward: 2.900000 ± 1.640122 in #161


Epoch #165: 1001it [00:57, 17.30it/s, env_step=165000, len=237, loss=-1272174749676414173184.000, n/ep=0, n/st=1, rew=3.00]


Epoch #165: test_reward: 1.600000 ± 1.200000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #166: 1001it [00:58, 17.03it/s, env_step=166000, len=190, loss=-1302515725545990520832.000, n/ep=0, n/st=1, rew=1.00]


Epoch #166: test_reward: 1.600000 ± 0.916515, best_reward: 2.900000 ± 1.640122 in #161


Epoch #167: 1001it [01:01, 16.40it/s, env_step=167000, len=126, loss=-1331974161958838730752.000, n/ep=0, n/st=1, rew=0.00]


Epoch #167: test_reward: 2.000000 ± 1.095445, best_reward: 2.900000 ± 1.640122 in #161


Epoch #168: 1001it [00:58, 17.02it/s, env_step=168000, len=154, loss=-1363231853976815665152.000, n/ep=0, n/st=1, rew=1.00]


Epoch #168: test_reward: 1.300000 ± 0.900000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #169: 1001it [00:52, 19.23it/s, env_step=169000, len=133, loss=-1394328189057018363904.000, n/ep=0, n/st=1, rew=0.00]


Epoch #169: test_reward: 1.600000 ± 1.019804, best_reward: 2.900000 ± 1.640122 in #161


Epoch #170: 1001it [00:51, 19.53it/s, env_step=170000, len=202, loss=-1425964262256687972352.000, n/ep=0, n/st=1, rew=2.00]


Epoch #170: test_reward: 1.300000 ± 1.004988, best_reward: 2.900000 ± 1.640122 in #161


Epoch #171: 1001it [00:54, 18.47it/s, env_step=171000, len=258, loss=-1458012061571165978624.000, n/ep=0, n/st=1, rew=3.00]


Epoch #171: test_reward: 1.400000 ± 1.113553, best_reward: 2.900000 ± 1.640122 in #161


Epoch #172: 1001it [00:58, 17.08it/s, env_step=172000, len=185, loss=-1492571564841199992832.000, n/ep=0, n/st=1, rew=1.00]


Epoch #172: test_reward: 1.300000 ± 1.004988, best_reward: 2.900000 ± 1.640122 in #161


Epoch #173: 1001it [00:58, 17.18it/s, env_step=173000, len=211, loss=-1526386956640677003264.000, n/ep=0, n/st=1, rew=2.00]


Epoch #173: test_reward: 1.100000 ± 1.220656, best_reward: 2.900000 ± 1.640122 in #161


Epoch #174: 1001it [00:58, 17.05it/s, env_step=174000, len=181, loss=-1561051178453671346176.000, n/ep=0, n/st=1, rew=1.00]


Epoch #174: test_reward: 1.700000 ± 1.100000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #175: 1001it [00:59, 16.94it/s, env_step=175000, len=140, loss=-1596482983935530500096.000, n/ep=0, n/st=1, rew=0.00]


Epoch #175: test_reward: 0.700000 ± 0.900000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #176: 1001it [00:59, 16.87it/s, env_step=176000, len=247, loss=-1631780666590987288576.000, n/ep=0, n/st=1, rew=3.00]


Epoch #176: test_reward: 1.300000 ± 1.100000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #177: 1001it [01:00, 16.46it/s, env_step=177000, len=179, loss=-1666965009124946608128.000, n/ep=0, n/st=1, rew=1.00]


Epoch #177: test_reward: 1.200000 ± 1.249000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #178: 1001it [00:59, 16.76it/s, env_step=178000, len=211, loss=-1703751943512557092864.000, n/ep=0, n/st=1, rew=2.00]


Epoch #178: test_reward: 1.000000 ± 1.183216, best_reward: 2.900000 ± 1.640122 in #161


Epoch #179: 1001it [00:59, 16.95it/s, env_step=179000, len=133, loss=-1741627922880934445056.000, n/ep=0, n/st=1, rew=0.00]


Epoch #179: test_reward: 1.200000 ± 1.536229, best_reward: 2.900000 ± 1.640122 in #161


Epoch #180: 1001it [00:58, 17.13it/s, env_step=180000, len=231, loss=-1779906048363288330240.000, n/ep=0, n/st=1, rew=3.00]


Epoch #180: test_reward: 0.800000 ± 1.077033, best_reward: 2.900000 ± 1.640122 in #161


Epoch #181: 1001it [00:58, 17.08it/s, env_step=181000, len=137, loss=-1818156459819435032576.000, n/ep=0, n/st=1, rew=0.00]


Epoch #181: test_reward: 1.900000 ± 1.300000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #182: 1001it [01:00, 16.42it/s, env_step=182000, len=129, loss=-1857135301775186198528.000, n/ep=0, n/st=1, rew=0.00]


Epoch #182: test_reward: 1.200000 ± 1.077033, best_reward: 2.900000 ± 1.640122 in #161


Epoch #183: 1001it [00:51, 19.27it/s, env_step=183000, len=141, loss=-1896894676656105717760.000, n/ep=0, n/st=1, rew=0.00]


Epoch #183: test_reward: 1.400000 ± 1.113553, best_reward: 2.900000 ± 1.640122 in #161


Epoch #184: 1001it [00:51, 19.35it/s, env_step=184000, len=253, loss=-1937558585448433713152.000, n/ep=0, n/st=1, rew=3.00]


Epoch #184: test_reward: 1.100000 ± 0.830662, best_reward: 2.900000 ± 1.640122 in #161


Epoch #185: 1001it [00:51, 19.41it/s, env_step=185000, len=178, loss=-1978034481622693183488.000, n/ep=0, n/st=1, rew=1.00]


Epoch #185: test_reward: 1.200000 ± 1.326650, best_reward: 2.900000 ± 1.640122 in #161


Epoch #186: 1001it [00:52, 19.10it/s, env_step=186000, len=133, loss=-2020870831173772771328.000, n/ep=0, n/st=1, rew=0.00]


Epoch #186: test_reward: 1.300000 ± 1.552417, best_reward: 2.900000 ± 1.640122 in #161


Epoch #187: 1001it [00:51, 19.58it/s, env_step=187000, len=195, loss=-2063807757363530432512.000, n/ep=0, n/st=1, rew=2.00]


Epoch #187: test_reward: 1.300000 ± 0.900000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #188: 1001it [00:51, 19.62it/s, env_step=188000, len=187, loss=-2106394820008861237248.000, n/ep=0, n/st=1, rew=1.00]


Epoch #188: test_reward: 1.000000 ± 1.095445, best_reward: 2.900000 ± 1.640122 in #161


Epoch #189: 1001it [00:50, 19.64it/s, env_step=189000, len=185, loss=-2150299837159768588288.000, n/ep=0, n/st=1, rew=1.00]


Epoch #189: test_reward: 1.600000 ± 1.200000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #190: 1001it [00:51, 19.62it/s, env_step=190000, len=132, loss=-2195182715268267769856.000, n/ep=0, n/st=1, rew=0.00]


Epoch #190: test_reward: 1.000000 ± 1.000000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #191: 1001it [00:50, 19.64it/s, env_step=191000, len=140, loss=-2240869879975219822592.000, n/ep=0, n/st=1, rew=0.00]


Epoch #191: test_reward: 1.200000 ± 0.748331, best_reward: 2.900000 ± 1.640122 in #161


Epoch #192: 1001it [00:51, 19.58it/s, env_step=192000, len=279, loss=-2286483096983665377280.000, n/ep=0, n/st=1, rew=3.00]


Epoch #192: test_reward: 1.300000 ± 1.791647, best_reward: 2.900000 ± 1.640122 in #161


Epoch #193: 1001it [00:50, 19.73it/s, env_step=193000, len=137, loss=-2333367015885972439040.000, n/ep=0, n/st=1, rew=0.00]


Epoch #193: test_reward: 1.400000 ± 1.280625, best_reward: 2.900000 ± 1.640122 in #161


Epoch #194: 1001it [00:50, 19.66it/s, env_step=194000, len=133, loss=-2380947437581275496448.000, n/ep=0, n/st=1, rew=0.00]


Epoch #194: test_reward: 1.000000 ± 0.894427, best_reward: 2.900000 ± 1.640122 in #161


Epoch #195: 1001it [00:51, 19.58it/s, env_step=195000, len=127, loss=-2429265232236193513472.000, n/ep=0, n/st=1, rew=0.00]


Epoch #195: test_reward: 1.400000 ± 0.916515, best_reward: 2.900000 ± 1.640122 in #161


Epoch #196: 1001it [00:54, 18.43it/s, env_step=196000, len=131, loss=-2479101578760965586944.000, n/ep=0, n/st=1, rew=0.00]


Epoch #196: test_reward: 0.900000 ± 0.943398, best_reward: 2.900000 ± 1.640122 in #161


Epoch #197: 1001it [00:55, 17.91it/s, env_step=197000, len=130, loss=-2529073689111254532096.000, n/ep=0, n/st=1, rew=0.00]


Epoch #197: test_reward: 1.500000 ± 1.204159, best_reward: 2.900000 ± 1.640122 in #161


Epoch #198: 1001it [00:53, 18.62it/s, env_step=198000, len=175, loss=-2578654763164898426880.000, n/ep=0, n/st=1, rew=1.00]


Epoch #198: test_reward: 1.800000 ± 1.249000, best_reward: 2.900000 ± 1.640122 in #161


Epoch #199: 1001it [00:51, 19.57it/s, env_step=199000, len=237, loss=-2628661742635301994496.000, n/ep=0, n/st=1, rew=3.00]


Epoch #199: test_reward: 1.700000 ± 1.268858, best_reward: 2.900000 ± 1.640122 in #161


Epoch #200: 1001it [00:51, 19.43it/s, env_step=200000, len=231, loss=-2680093804580044275712.000, n/ep=0, n/st=1, rew=2.00]


Epoch #200: test_reward: 1.400000 ± 1.113553, best_reward: 2.900000 ± 1.640122 in #161
{'best_result': '2.90 ± 1.64',
 'best_reward': 2.9,
 'duration': '11944.42s',
 'test_episode': 2010,
 'test_speed': '301.54 step/s',
 'test_step': 451988,
 'test_time': '1498.91s',
 'train_episode': 1056,
 'train_speed': '19.15 step/s',
 'train_step': 200000,
 'train_time/collector': '938.47s',
 'train_time/model': '9507.03s'}


In [11]:
support = torch.nn.Parameter(
            torch.linspace(-10, 10, 51),
            requires_grad=False,
        )
support

Parameter containing:
tensor([-10.0000,  -9.6000,  -9.2000,  -8.8000,  -8.4000,  -8.0000,  -7.6000,
         -7.2000,  -6.8000,  -6.4000,  -6.0000,  -5.6000,  -5.2000,  -4.8000,
         -4.4000,  -4.0000,  -3.6000,  -3.2000,  -2.8000,  -2.4000,  -2.0000,
         -1.6000,  -1.2000,  -0.8000,  -0.4000,   0.0000,   0.4000,   0.8000,
          1.2000,   1.6000,   2.0000,   2.4000,   2.8000,   3.2000,   3.6000,
          4.0000,   4.4000,   4.8000,   5.2000,   5.6000,   6.0000,   6.4000,
          6.8000,   7.2000,   7.6000,   8.0000,   8.4000,   8.8000,   9.2000,
          9.6000,  10.0000])

In [16]:
support * torch.tensor([[2], [3]])

tensor([[-20.0000, -19.2000, -18.4000, -17.6000, -16.8000, -16.0000, -15.2000,
         -14.4000, -13.6000, -12.8000, -12.0000, -11.2000, -10.4000,  -9.6000,
          -8.8000,  -8.0000,  -7.2000,  -6.4000,  -5.6000,  -4.8000,  -4.0000,
          -3.2000,  -2.4000,  -1.6000,  -0.8000,   0.0000,   0.8000,   1.6000,
           2.4000,   3.2000,   4.0000,   4.8000,   5.6000,   6.4000,   7.2000,
           8.0000,   8.8000,   9.6000,  10.4000,  11.2000,  12.0000,  12.8000,
          13.6000,  14.4000,  15.2000,  16.0000,  16.8000,  17.6000,  18.4000,
          19.2000,  20.0000],
        [-30.0000, -28.8000, -27.6000, -26.4000, -25.2000, -24.0000, -22.8000,
         -21.6000, -20.4000, -19.2000, -18.0000, -16.8000, -15.6000, -14.4000,
         -13.2000, -12.0000, -10.8000,  -9.6000,  -8.4000,  -7.2000,  -6.0000,
          -4.8000,  -3.6000,  -2.4000,  -1.2000,   0.0000,   1.2000,   2.4000,
           3.6000,   4.8000,   6.0000,   7.2000,   8.4000,   9.6000,  10.8000,
          12.0000,  13

In [12]:
a = torch.tensor([1,2,3])
a

tensor([1, 2, 3])

In [14]:
a.unsqueeze(dim = -1)

tensor([[1],
        [2],
        [3]])