In [None]:
# test pi_0
import os
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.env import DummyVectorEnv
from tianshou.utils.net.common import Net
from tianshou.trainer import offpolicy_trainer    ##
from tianshou.data import Collector, ReplayBuffer ##
from tianshou.utils.net.continuous import Actor, ActorProb, Critic


#-------------------------------------------------------------
# The imports below are specific to Distral (policy distillation)
from distral_offpolicy_trainer import Distral_offpolicy_trainer
####from distral_collector import Distral_Collector    # surprisingly, this one is not needed either
from distral_task_policy import TaskPolicy
from distral_distilled_policy import DistilledPolicy
#-------------------------------------------------------------



def get_args():
    """Build the experiment's command-line parser and return the parsed options.

    Every option is declared as a ``(flag, type, default)`` row and registered
    in table order.  Parsing uses ``parse_known_args``, so arguments the parser
    does not recognise are ignored instead of raising an error.

    Returns:
        argparse.Namespace: the recognised options (dashes in flag names
        become underscores, e.g. ``--buffer-size`` -> ``buffer_size``).
    """
    option_table = (
        ('--task', str, 'Swimmer-v2'),  # alternative: 'Pendulum-v0'
        ('--seed', int, 0),
        ('--buffer-size', int, 20000),
        ('--actor-lr', float, 3e-4),
        ('--critic-lr', float, 1e-3),
        ('--il-lr', float, 1e-3),
        ('--gamma', float, 0.99),
        ('--tau', float, 0.005),
        ('--alpha', float, 0.2),
        ('--epoch', int, 20),
        ('--step-per-epoch', int, 2400),
        ('--collect-per-step', int, 10),
        ('--batch-size', int, 128),
        ('--layer-num', int, 1),
        ('--training-num', int, 8),
        ('--test-num', int, 100),
        ('--logdir', str, 'log'),
        ('--render', float, 0.),
        ('--rew-norm', int, 1),
        ('--ignore-done', int, 1),
        ('--n-step', int, 4),
        # Prefer the GPU whenever torch reports one as available.
        ('--device', str, 'cuda' if torch.cuda.is_available() else 'cpu'),
    )

    cli = argparse.ArgumentParser()
    for flag, value_type, fallback in option_table:
        cli.add_argument(flag, type=value_type, default=fallback)

    recognised, _ignored = cli.parse_known_args()
    return recognised


# Parse the options, then seed NumPy's and PyTorch's global RNGs with the
# same value.
args = get_args()
np.random.seed(args.seed)
torch.manual_seed(args.seed)
#args.step_per_epoch = 800
# Hard-coded override: the value of --epoch is replaced by 3 here.
args.epoch = 3

torch.set_num_threads(1)  # a single thread is enough for the NN
# Reference environment: its spaces are inspected below to size the networks.
env = gym.make(args.task)
if args.task == 'Pendulum-v0':
    # Set the reward threshold for Pendulum-v0 explicitly.
    env.spec.reward_threshold = -250
# Use the space's `.shape` when it is truthy, otherwise fall back to `.n`
# (presumably the discrete-space case -- confirm).
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
# Upper bound of the first action dimension only.
# NOTE(review): assumes every action dimension shares this bound -- confirm.
args.max_action = env.action_space.high[0]
# you can also use tianshou.env.SubprocVectorEnv
# train_envs = gym.make(args.task)

def _make_vector_env(num_envs):
    """Return a DummyVectorEnv wrapping ``num_envs`` new ``args.task`` envs."""
    return DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(num_envs)])


def _make_critic():
    """Build one critic and its Adam optimizer.

    The underlying ``Net`` receives both ``state_shape`` and ``action_shape``
    with ``concat=True``.

    Returns:
        tuple: ``(net, critic, optimizer)``.
    """
    body = Net(args.layer_num, args.state_shape,
               args.action_shape, concat=True, device=args.device)
    critic = Critic(body, args.device).to(args.device)
    optimizer = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    return body, critic, optimizer


def _build_task_model():
    """Create the environments, networks, optimizers and TaskPolicy of one task.

    Objects are created in exactly the order the script originally used
    (train envs, test envs, seeding, actor, critic 1, critic 2, policy), so
    the random numbers consumed by network initialisation are unchanged.

    Returns:
        tuple: ``(train_envs, test_envs, net, actor, actor_optim, net_c1,
        critic1, critic1_optim, net_c2, critic2, critic2_optim, policy)``.
    """
    train_envs = _make_vector_env(args.training_num)
    test_envs = _make_vector_env(args.test_num)

    # seed
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = ActorProb(
        net, args.action_shape, args.max_action, args.device, unbounded=True
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

    net_c1, critic1, critic1_optim = _make_critic()
    net_c2, critic2, critic2_optim = _make_critic()

    policy = TaskPolicy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        action_range=[env.action_space.low[0], env.action_space.high[0]],
        tau=args.tau, gamma=args.gamma, alpha=args.alpha,
        reward_normalization=args.rew_norm,
        ignore_done=args.ignore_done,
        estimation_step=args.n_step)

    return (train_envs, test_envs,
            net, actor, actor_optim,
            net_c1, critic1, critic1_optim,
            net_c2, critic2, critic2_optim,
            policy)


# Both models use the same task, seed and hyper-parameters; every original
# module-level name is still bound so that later code keeps working.

# model 1
(train_envs_1, test_envs_1,
 net_1, actor_1, actor_optim_1,
 net_c1_1, critic1_1, critic1_optim_1,
 net_c2_1, critic2_1, critic2_optim_1,
 policy_1) = _build_task_model()

# model 2
(train_envs_2, test_envs_2,
 net_2, actor_2, actor_optim_2,
 net_c1_2, critic1_2, critic1_optim_2,
 net_c2_2, critic2_2, critic2_optim_2,
 policy_2) = _build_task_model()


# collector
# Each task policy gets a training collector with its own ReplayBuffer of
# args.buffer_size entries, plus a test collector constructed without a buffer
# argument.
train_collector_1 = Collector(
    policy_1, train_envs_1, ReplayBuffer(args.buffer_size))
test_collector_1 = Collector(policy_1, test_envs_1)
train_collector_2 = Collector(
    policy_2, train_envs_2, ReplayBuffer(args.buffer_size))
test_collector_2 = Collector(policy_2, test_envs_2)
# train_collector.collect(n_step=args.buffer_size)
# log
# TensorBoard events go to <logdir>/<task>/sac_distral.
log_path = os.path.join(args.logdir, args.task, 'sac_distral')
writer = SummaryWriter(log_path)


def save_fn(policy):
    """Write ``policy``'s state dict to ``policy.pth`` inside ``log_path``.

    NOTE(review): the file name is fixed, so every call replaces the file
    written by the previous call, whichever policy that call was for.
    """
    weights = policy.state_dict()
    checkpoint_file = os.path.join(log_path, 'policy.pth')
    torch.save(weights, checkpoint_file)


def stop_fn(mean_rewards):
    """Report whether the environment's reward threshold has been reached.

    Args:
        mean_rewards: reward value to compare against the threshold (the
            name suggests a mean over test episodes -- confirm with the
            trainer that calls this).

    Returns:
        The result of ``mean_rewards >= env.spec.reward_threshold``, or
        ``False`` when the environment defines no threshold (``env.spec`` is
        ``None`` or its ``reward_threshold`` is ``None``).
    """
    threshold = getattr(env.spec, 'reward_threshold', None)
    if threshold is None:
        # Comparing a number with None raises TypeError, so a missing
        # threshold is treated as "never reached".
        return False
    return mean_rewards >= threshold


# Number of alternating (task training -> distillation) rounds, and the number
# of epochs given to each distillation run.
NUM_ROUNDS = 10
DISTILL_EPOCHS = 20


def _report_and_watch(train_result, policy):
    """Print a trainer result, then roll ``policy`` out for one episode.

    Args:
        train_result: value returned by a trainer call; pretty-printed as is.
        policy: policy switched to eval mode and run on the module-level
            ``env`` for a single episode.

    Returns:
        tuple: ``(collector, rollout_result)`` -- the collector created for
        the rollout and the value returned by its ``collect`` call.
    """
    pprint.pprint(train_result)
    # Let's watch its performance!
    policy.eval()
    watcher = Collector(policy, env)
    rollout = watcher.collect(n_episode=1, render=args.render)
    print(f'Final reward: {rollout["rew"]}, length: {rollout["len"]}')
    return watcher, rollout


# NOTE(review): every trainer call below receives the same save_fn, which
# writes to one fixed file name, so the three policies overwrite each other's
# checkpoint -- confirm whether that is intended.
for itr in range(NUM_ROUNDS):
    # Train task policy 1, then task policy 2, with identical settings.
    for task_policy, task_train_collector, task_test_collector in (
            (policy_1, train_collector_1, test_collector_1),
            (policy_2, train_collector_2, test_collector_2)):
        result = offpolicy_trainer(
            task_policy, task_train_collector, task_test_collector,
            args.epoch, args.step_per_epoch, args.collect_per_step,
            args.test_num, args.batch_size,
            stop_fn=stop_fn, save_fn=save_fn, writer=writer)
        if __name__ == '__main__':
            collector, result = _report_and_watch(result, task_policy)

    if itr == 0:
        # The distilled policy is created once, after the first round of task
        # training (the same point as before), so the random numbers drawn
        # for its initialisation are unchanged.
        if args.task == 'Pendulum-v0':
            env.spec.reward_threshold = -300  # lower the goal
        net = ActorProb(
            Net(1, args.state_shape, device=args.device),
            args.action_shape, args.max_action, args.device
        ).to(args.device)
        optim = torch.optim.Adam(net.parameters(), lr=args.il_lr)
        distilled_policy = DistilledPolicy(
            net, optim,
            action_range=[env.action_space.low[0], env.action_space.high[0]],
            mode='continuous')
        distilled_policy_test_collector = Collector(
            distilled_policy,
            DummyVectorEnv(
                [lambda: gym.make(args.task) for _ in range(args.test_num)]))

    # train distilled policy
    # Both training collectors are reset before the distillation trainer
    # uses them.
    train_collector_1.reset()
    train_collector_2.reset()

    result = Distral_offpolicy_trainer(
        distilled_policy, train_collector_1, train_collector_2,
        distilled_policy_test_collector, DISTILL_EPOCHS,
        args.step_per_epoch // 5, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    if __name__ == '__main__':
        collector, result = _report_and_watch(result, distilled_policy)

Epoch #1: 2401it [00:27, 88.67it/s, env_step=24000, len=1000, loss/actor=-19.829323, loss/critic1=57.594335, loss/critic2=57.912906, n/ep=8, n/st=8000, rew=28.19, v/ep=4.26, v/st=4261.11]                           
Epoch #2:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch #1: test_reward: 30.411748 ± 3.299431, best_reward: 30.411748 ± 3.299431 in #1


Epoch #2: 2401it [00:27, 87.19it/s, env_step=48000, len=1000, loss/actor=-24.099110, loss/critic1=4.999235, loss/critic2=5.078643, n/ep=8, n/st=8000, rew=39.19, v/ep=4.66, v/st=4659.87]                             
Epoch #3:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch #2: test_reward: 45.800676 ± 2.478960, best_reward: 45.800676 ± 2.478960 in #2


Epoch #3: 2401it [00:26, 89.05it/s, env_step=72000, len=1000, loss/actor=5.716048, loss/critic1=6.864888, loss/critic2=6.587452, n/ep=8, n/st=8000, rew=25.27, v/ep=4.32, v/st=4322.36]                             


Epoch #3: test_reward: 34.668162 ± 1.736513, best_reward: 45.800676 ± 2.478960 in #2
{'best_result': '45.80 ± 2.48',
 'best_reward': 45.800675757776474,
 'duration': '111.75s',
 'test_episode': 300.0,
 'test_speed': '9978.28 step/s',
 'test_step': 300000,
 'test_time': '30.07s',
 'train_episode': 72.0,
 'train_speed': '881.49 step/s',
 'train_step': 72000,
 'train_time/collector': '16.65s',
 'train_time/model': '65.03s'}


Epoch #1:   0%|          | 0/2400 [00:00<?, ?it/s]

Final reward: 32.76420152375009, length: 1000.0


Epoch #1: 2401it [00:27, 88.07it/s, env_step=24000, len=1000, loss/actor=-5.636499, loss/critic1=38.515647, loss/critic2=38.030474, n/ep=8, n/st=8000, rew=30.00, v/ep=4.39, v/st=4387.43]                           
Epoch #2:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch #1: test_reward: 33.050672 ± 2.220341, best_reward: 33.050672 ± 2.220341 in #1


Epoch #2: 2401it [00:27, 86.51it/s, env_step=48000, len=1000, loss/actor=7.497104, loss/critic1=1.411483, loss/critic2=1.351916, n/ep=8, n/st=8000, rew=32.84, v/ep=4.31, v/st=4311.14]                             
Epoch #3:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch #2: test_reward: 30.895671 ± 3.348600, best_reward: 33.050672 ± 2.220341 in #1


Epoch #3: 2401it [00:27, 87.02it/s, env_step=72000, len=1000, loss/actor=7.638667, loss/critic1=1.940789, loss/critic2=2.010615, n/ep=8, n/st=8000, rew=33.37, v/ep=4.27, v/st=4271.93]                           


Epoch #3: test_reward: 31.513203 ± 1.910661, best_reward: 33.050672 ± 2.220341 in #1
{'best_result': '33.05 ± 2.22',
 'best_reward': 33.05067233637816,
 'duration': '114.16s',
 'test_episode': 300.0,
 'test_speed': '9539.17 step/s',
 'test_step': 300000,
 'test_time': '31.45s',
 'train_episode': 72.0,
 'train_speed': '870.54 step/s',
 'train_step': 72000,
 'train_time/collector': '16.74s',
 'train_time/model': '65.96s'}
Final reward: 33.36484841347395, length: 1000.0


Epoch #1: 481it [00:05, 85.27it/s, env_step=8000, len=1000, n/ep=8, n/st=8000, rew=35.45, v/ep=4.12, v/st=4115.18]                          
Epoch #2:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #1: test_reward: 20.694402 ± 1.940805, best_reward: 20.694402 ± 1.940805 in #1


Epoch #2: 481it [00:05, 86.02it/s, env_step=24000, len=1000, n/ep=8, n/st=8000, rew=34.29, v/ep=4.24, v/st=4239.12]                          
Epoch #3:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #2: test_reward: 24.165471 ± 2.072411, best_reward: 24.165471 ± 2.072411 in #2


Epoch #3: 481it [00:05, 83.52it/s, env_step=40000, len=1000, n/ep=8, n/st=8000, rew=34.77, v/ep=4.16, v/st=4157.87]                         
Epoch #4:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #3: test_reward: 18.428151 ± 1.790882, best_reward: 24.165471 ± 2.072411 in #2


Epoch #4: 481it [00:05, 84.61it/s, env_step=56000, len=1000, n/ep=8, n/st=8000, rew=34.34, v/ep=4.20, v/st=4200.16]                         
Epoch #5:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #4: test_reward: 19.231434 ± 1.598835, best_reward: 24.165471 ± 2.072411 in #2


Epoch #5: 481it [00:05, 85.28it/s, env_step=72000, len=1000, n/ep=8, n/st=8000, rew=35.24, v/ep=4.02, v/st=4023.25]                         
Epoch #6:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #5: test_reward: 18.848443 ± 2.065006, best_reward: 24.165471 ± 2.072411 in #2


Epoch #6: 481it [00:05, 85.52it/s, env_step=88000, len=1000, n/ep=8, n/st=8000, rew=33.69, v/ep=4.20, v/st=4198.93]                         
Epoch #7:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #6: test_reward: 16.226321 ± 2.121226, best_reward: 24.165471 ± 2.072411 in #2


Epoch #7: 481it [00:05, 84.99it/s, env_step=104000, len=1000, n/ep=8, n/st=8000, rew=34.87, v/ep=4.22, v/st=4220.62]                         
Epoch #8:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #7: test_reward: 21.685700 ± 2.686777, best_reward: 24.165471 ± 2.072411 in #2


Epoch #8: 481it [00:05, 86.70it/s, env_step=120000, len=1000, n/ep=8, n/st=8000, rew=34.60, v/ep=4.25, v/st=4254.67]                          
Epoch #9:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #8: test_reward: 26.605751 ± 1.860812, best_reward: 26.605751 ± 1.860812 in #8


Epoch #9: 481it [00:05, 88.06it/s, env_step=136000, len=1000, n/ep=8, n/st=8000, rew=34.19, v/ep=4.34, v/st=4343.98]                          
Epoch #10:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #9: test_reward: 24.657786 ± 1.970144, best_reward: 26.605751 ± 1.860812 in #8


Epoch #10: 481it [00:05, 83.58it/s, env_step=152000, len=1000, n/ep=8, n/st=8000, rew=33.92, v/ep=4.31, v/st=4306.62]                         
Epoch #11:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #10: test_reward: 23.832915 ± 2.751337, best_reward: 26.605751 ± 1.860812 in #8


Epoch #11: 481it [00:05, 85.77it/s, env_step=168000, len=1000, n/ep=8, n/st=8000, rew=34.94, v/ep=4.20, v/st=4204.74]                         
Epoch #12:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #11: test_reward: 21.158019 ± 2.990783, best_reward: 26.605751 ± 1.860812 in #8


Epoch #12: 481it [00:05, 89.00it/s, env_step=184000, len=1000, n/ep=8, n/st=8000, rew=35.20, v/ep=4.19, v/st=4191.99]                         
Epoch #13:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #12: test_reward: 25.485538 ± 1.233821, best_reward: 26.605751 ± 1.860812 in #8


Epoch #13: 481it [00:05, 85.97it/s, env_step=200000, len=1000, n/ep=8, n/st=8000, rew=33.82, v/ep=4.25, v/st=4250.37]                         
Epoch #14:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #13: test_reward: 23.460654 ± 2.214894, best_reward: 26.605751 ± 1.860812 in #8


Epoch #14: 481it [00:05, 83.63it/s, env_step=216000, len=1000, n/ep=8, n/st=8000, rew=35.20, v/ep=4.23, v/st=4229.10]                         
Epoch #15:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #14: test_reward: 23.873407 ± 1.610401, best_reward: 26.605751 ± 1.860812 in #8


Epoch #15: 481it [00:05, 84.86it/s, env_step=232000, len=1000, n/ep=8, n/st=8000, rew=35.92, v/ep=4.20, v/st=4195.89]                         
Epoch #16:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #15: test_reward: 29.060145 ± 0.902724, best_reward: 29.060145 ± 0.902724 in #15


Epoch #16: 481it [00:05, 85.39it/s, env_step=248000, len=1000, n/ep=8, n/st=8000, rew=34.20, v/ep=4.29, v/st=4291.87]                          
Epoch #17:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #16: test_reward: 24.727574 ± 2.114161, best_reward: 29.060145 ± 0.902724 in #15


Epoch #17: 481it [00:05, 83.38it/s, env_step=264000, len=1000, n/ep=8, n/st=8000, rew=34.64, v/ep=4.18, v/st=4179.26]                         
Epoch #18:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #17: test_reward: 25.352780 ± 1.972908, best_reward: 29.060145 ± 0.902724 in #15


Epoch #18: 481it [00:05, 87.47it/s, env_step=280000, len=1000, n/ep=8, n/st=8000, rew=34.72, v/ep=4.36, v/st=4359.94]                          
Epoch #19:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #18: test_reward: 25.950047 ± 1.708709, best_reward: 29.060145 ± 0.902724 in #15


Epoch #19: 481it [00:05, 82.20it/s, env_step=296000, len=1000, n/ep=8, n/st=8000, rew=34.54, v/ep=4.07, v/st=4067.90]                          
Epoch #20:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #19: test_reward: 24.965775 ± 2.457616, best_reward: 29.060145 ± 0.902724 in #15


Epoch #20: 481it [00:05, 87.01it/s, env_step=312000, len=1000, n/ep=8, n/st=8000, rew=34.64, v/ep=4.33, v/st=4328.00]                          


Epoch #20: test_reward: 28.044324 ± 1.239596, best_reward: 29.060145 ± 0.902724 in #15
{'best_result': '29.06 ± 0.90',
 'best_reward': 29.060144506520086,
 'duration': '327.10s',
 'test_episode': 2000.0,
 'test_speed': '9356.46 step/s',
 'test_step': 2000000,
 'test_time': '213.76s',
 'train_episode': 160.0,
 'train_speed': '1411.69 step/s',
 'train_step': 160000,
 'train_time/collector': '37.95s',
 'train_time/model': '75.39s'}


Epoch #1:   0%|          | 0/2400 [00:00<?, ?it/s]

Final reward: 26.887919973347007, length: 1000.0


Epoch #1: 2401it [00:27, 85.95it/s, env_step=24000, len=1000, loss/actor=-9.450734, loss/critic1=0.948970, loss/critic2=0.918185, n/ep=8, n/st=8000, rew=38.84, v/ep=4.30, v/st=4303.25]                            
Epoch #2:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch #1: test_reward: 29.019034 ± 3.335907, best_reward: 29.019034 ± 3.335907 in #1


Epoch #2: 2401it [00:27, 86.67it/s, env_step=48000, len=1000, loss/actor=4.019114, loss/critic1=1.232047, loss/critic2=1.196373, n/ep=8, n/st=8000, rew=32.33, v/ep=4.30, v/st=4302.10]                            
Epoch #3:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch #2: test_reward: 31.372385 ± 2.222536, best_reward: 31.372385 ± 2.222536 in #2


Epoch #3: 2401it [00:24, 97.76it/s, env_step=72000, len=1000, loss/actor=3.027163, loss/critic1=0.553395, loss/critic2=0.496708, n/ep=8, n/st=8000, rew=33.77, v/ep=4.64, v/st=4635.48]                           


Epoch #3: test_reward: 37.084234 ± 2.498763, best_reward: 37.084234 ± 2.498763 in #3
{'best_result': '37.08 ± 2.50',
 'best_reward': 37.08423425652073,
 'duration': '108.71s',
 'test_episode': 300.0,
 'test_speed': '10556.56 step/s',
 'test_step': 300000,
 'test_time': '28.42s',
 'train_episode': 72.0,
 'train_speed': '896.68 step/s',
 'train_step': 72000,
 'train_time/collector': '16.37s',
 'train_time/model': '63.92s'}


Epoch #1:   0%|          | 0/2400 [00:00<?, ?it/s]

Final reward: 38.46644630756987, length: 1000.0


Epoch #1: 2401it [00:25, 95.02it/s, env_step=24000, len=1000, loss/actor=8.673987, loss/critic1=1.127765, loss/critic2=1.154422, n/ep=8, n/st=8000, rew=38.93, v/ep=4.72, v/st=4723.52]                           
Epoch #2:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch #1: test_reward: 40.654142 ± 3.352417, best_reward: 40.654142 ± 3.352417 in #1


Epoch #2: 2401it [00:24, 97.98it/s, env_step=48000, len=1000, loss/actor=8.251520, loss/critic1=0.903367, loss/critic2=0.900132, n/ep=8, n/st=8000, rew=37.66, v/ep=4.96, v/st=4964.76]                           
Epoch #3:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch #2: test_reward: 38.659874 ± 2.479557, best_reward: 40.654142 ± 3.352417 in #1


Epoch #3: 2401it [00:24, 96.65it/s, env_step=72000, len=1000, loss/actor=4.920770, loss/critic1=1.405331, loss/critic2=1.429772, n/ep=8, n/st=8000, rew=35.14, v/ep=4.74, v/st=4737.21]                           


Epoch #3: test_reward: 36.198693 ± 9.374089, best_reward: 40.654142 ± 3.352417 in #1
{'best_result': '40.65 ± 3.35',
 'best_reward': 40.65414183878467,
 'duration': '101.06s',
 'test_episode': 300.0,
 'test_speed': '11381.24 step/s',
 'test_step': 300000,
 'test_time': '26.36s',
 'train_episode': 72.0,
 'train_speed': '963.88 step/s',
 'train_step': 72000,
 'train_time/collector': '15.10s',
 'train_time/model': '59.60s'}


Epoch #1:   0%|          | 0/480 [00:00<?, ?it/s]

Final reward: 38.77892641631444, length: 1000.0


Epoch #1: 481it [00:04, 98.72it/s, env_step=8000, len=1000, n/ep=8, n/st=8000, rew=36.41, v/ep=4.92, v/st=4922.72]                         
Epoch #2:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #1: test_reward: 37.353772 ± 1.073546, best_reward: 37.353772 ± 1.073546 in #1


Epoch #2: 481it [00:04, 99.56it/s, env_step=24000, len=1000, n/ep=8, n/st=8000, rew=37.77, v/ep=4.90, v/st=4904.75]                         
Epoch #3:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #2: test_reward: 37.192656 ± 1.163928, best_reward: 37.353772 ± 1.073546 in #1


Epoch #3: 481it [00:04, 99.06it/s, env_step=40000, len=1000, n/ep=8, n/st=8000, rew=36.88, v/ep=4.88, v/st=4876.17]                         
Epoch #4:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #3: test_reward: 39.247144 ± 1.139200, best_reward: 39.247144 ± 1.139200 in #3


Epoch #4: 481it [00:05, 88.58it/s, env_step=56000, len=1000, n/ep=8, n/st=8000, rew=37.55, v/ep=4.33, v/st=4328.62]                         
Epoch #5:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #4: test_reward: 36.061168 ± 1.619854, best_reward: 39.247144 ± 1.139200 in #3


Epoch #5: 481it [00:05, 87.78it/s, env_step=72000, len=1000, n/ep=8, n/st=8000, rew=37.96, v/ep=4.32, v/st=4321.81]                         
Epoch #6:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #5: test_reward: 38.897759 ± 2.427856, best_reward: 39.247144 ± 1.139200 in #3


Epoch #6: 481it [00:05, 81.17it/s, env_step=88000, len=1000, n/ep=8, n/st=8000, rew=38.36, v/ep=4.24, v/st=4236.29]                         
Epoch #7:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #6: test_reward: 41.409287 ± 2.234166, best_reward: 41.409287 ± 2.234166 in #6


Epoch #7: 481it [00:05, 84.50it/s, env_step=104000, len=1000, n/ep=8, n/st=8000, rew=37.39, v/ep=4.27, v/st=4270.46]                         
Epoch #8:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #7: test_reward: 41.747759 ± 1.995299, best_reward: 41.747759 ± 1.995299 in #7


Epoch #8: 481it [00:05, 86.02it/s, env_step=120000, len=1000, n/ep=8, n/st=8000, rew=35.91, v/ep=4.23, v/st=4230.25]                          
Epoch #9:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #8: test_reward: 36.438003 ± 1.204271, best_reward: 41.747759 ± 1.995299 in #7


Epoch #9: 481it [00:05, 85.25it/s, env_step=136000, len=1000, n/ep=8, n/st=8000, rew=37.42, v/ep=4.35, v/st=4347.42]                          
