In [1]:
# test pi_0
import os
import gym
import torch
import pprint
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from tianshou.env import DummyVectorEnv
from tianshou.utils.net.common import Net
from tianshou.trainer import offpolicy_trainer    ##
from tianshou.data import Collector, ReplayBuffer ##
from tianshou.utils.net.continuous import Actor, ActorProb, Critic


#-------------------------------------------------------------
# The imports below are specific to the Distral policy
from distral_offpolicy_trainer import Distral_offpolicy_trainer
####from distral_collector import Distral_Collector    # surprisingly, this one is not needed either
from distral_task_policy import TaskPolicy
from distral_distilled_policy import DistilledPolicy
#-------------------------------------------------------------



def get_args():
    """Build the experiment configuration from the command line.

    Every option has a default, so calling this with no arguments yields a
    complete configuration. Unrecognised arguments are ignored rather than
    rejected (``parse_known_args``), and only the namespace of recognised
    options is returned.

    Returns:
        argparse.Namespace: the parsed options, one attribute per flag
        (dashes in flag names become underscores).
    """
    # (flag, type, default) -- kept in the original declaration order.
    option_table = (
        ('--task', str, 'Swimmer-v2'),  # alternative: 'Pendulum-v0'
        ('--seed', int, 0),
        ('--buffer-size', int, 20000),
        ('--actor-lr', float, 3e-4),
        ('--critic-lr', float, 1e-3),
        ('--il-lr', float, 1e-3),
        ('--gamma', float, 0.99),
        ('--tau', float, 0.005),
        ('--alpha', float, 0.2),
        ('--epoch', int, 20),
        ('--step-per-epoch', int, 2400),
        ('--collect-per-step', int, 10),
        ('--batch-size', int, 128),
        ('--layer-num', int, 1),
        ('--training-num', int, 8),
        ('--test-num', int, 100),
        ('--logdir', str, 'log'),
        ('--render', float, 0.),
        ('--rew-norm', int, 1),
        ('--ignore-done', int, 1),
        ('--n-step', int, 4),
    )

    cli = argparse.ArgumentParser()
    for flag, kind, fallback in option_table:
        cli.add_argument(flag, type=kind, default=fallback)

    # The device default depends on the machine: CUDA when available, else CPU.
    if torch.cuda.is_available():
        default_device = 'cuda'
    else:
        default_device = 'cpu'
    cli.add_argument('--device', type=str, default=default_device)

    recognised, _ignored = cli.parse_known_args()
    return recognised


# Parse the run configuration and seed NumPy's and PyTorch's global RNGs
# with the same seed.
args = get_args()
np.random.seed(args.seed)
torch.manual_seed(args.seed)
#args.step_per_epoch = 800
#args.epoch = 2

torch.set_num_threads(1)  # we just need only one thread for NN
# Reference environment: its spaces and spec are read here, and the same
# instance is reused further down for the single-episode rollouts.
env = gym.make(args.task)
if args.task == 'Pendulum-v0':
    # Override the reward threshold that stop_fn compares against.
    env.spec.reward_threshold = -250
# Use `.n` when `.shape` is empty/falsy
# (presumably the discrete-space case -- TODO confirm).
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
# Only the first action dimension's upper bound is read.
# NOTE(review): assumes all action dimensions share that bound -- confirm.
args.max_action = env.action_space.high[0]
# you can also use tianshou.env.SubprocVectorEnv
# train_envs = gym.make(args.task)

# model 1

# Vectorised environments for task policy 1: `training_num` copies that feed
# the training collector and `test_num` copies that feed the test collector.
train_envs_1 = DummyVectorEnv(
    [lambda: gym.make(args.task) for _ in range(args.training_num)])
# test_envs = gym.make(args.task)
test_envs_1 = DummyVectorEnv(
    [lambda: gym.make(args.task) for _ in range(args.test_num)])

# seed
train_envs_1.seed(args.seed)
test_envs_1.seed(args.seed)

# Actor: a state-only Net wrapped in ActorProb, with its own Adam optimiser.
# NOTE: the networks below are built after torch.manual_seed(), so the order
# of these constructor calls presumably fixes the initial weights.
net_1 = Net(args.layer_num, args.state_shape, device=args.device)
actor_1 = ActorProb(
    net_1, args.action_shape, args.max_action, args.device, unbounded=True
).to(args.device)
actor_optim_1 = torch.optim.Adam(actor_1.parameters(), lr=args.actor_lr)
# Two critics, each with its own state+action Net (concat=True) and its own
# Adam optimiser (presumably SAC-style twin Q-networks -- confirm in TaskPolicy).
net_c1_1 = Net(args.layer_num, args.state_shape,
               args.action_shape, concat=True, device=args.device)
critic1_1 = Critic(net_c1_1, args.device).to(args.device)
critic1_optim_1 = torch.optim.Adam(critic1_1.parameters(), lr=args.critic_lr)
net_c2_1 = Net(args.layer_num, args.state_shape,
               args.action_shape, concat=True, device=args.device)
critic2_1 = Critic(net_c2_1, args.device).to(args.device)
critic2_optim_1 = torch.optim.Adam(critic2_1.parameters(), lr=args.critic_lr)
# Bundle actor, critics and optimisers into the task policy. The action range
# is taken from the first action dimension of the reference `env`.
policy_1 = TaskPolicy(
    actor_1, actor_optim_1, critic1_1, critic1_optim_1, critic2_1, critic2_optim_1,
    action_range=[env.action_space.low[0], env.action_space.high[0]],
    tau=args.tau, gamma=args.gamma, alpha=args.alpha,
    reward_normalization=args.rew_norm,
    ignore_done=args.ignore_done,
    estimation_step=args.n_step)

# model 2

# Vectorised environments for task policy 2: `training_num` copies that feed
# the training collector and `test_num` copies that feed the test collector.
# NOTE(review): these use the same `args.task` and the same `args.seed` as the
# environments of policy 1 -- confirm that identical tasks/seeds are intended.
train_envs_2 = DummyVectorEnv(
    [lambda: gym.make(args.task) for _ in range(args.training_num)])
# test_envs = gym.make(args.task)
test_envs_2 = DummyVectorEnv(
    [lambda: gym.make(args.task) for _ in range(args.test_num)])

# seed
train_envs_2.seed(args.seed)
test_envs_2.seed(args.seed)

# Actor: a state-only Net wrapped in ActorProb, with its own Adam optimiser.
# These are fresh modules; nothing is shared with policy 1's networks.
net_2 = Net(args.layer_num, args.state_shape, device=args.device)
actor_2 = ActorProb(
    net_2, args.action_shape, args.max_action, args.device, unbounded=True
).to(args.device)
actor_optim_2 = torch.optim.Adam(actor_2.parameters(), lr=args.actor_lr)
# Two critics, each with its own state+action Net (concat=True) and its own
# Adam optimiser (presumably SAC-style twin Q-networks -- confirm in TaskPolicy).
net_c1_2 = Net(args.layer_num, args.state_shape,
               args.action_shape, concat=True, device=args.device)
critic1_2 = Critic(net_c1_2, args.device).to(args.device)
critic1_optim_2 = torch.optim.Adam(critic1_2.parameters(), lr=args.critic_lr)
net_c2_2 = Net(args.layer_num, args.state_shape,
               args.action_shape, concat=True, device=args.device)
critic2_2 = Critic(net_c2_2, args.device).to(args.device)
critic2_optim_2 = torch.optim.Adam(critic2_2.parameters(), lr=args.critic_lr)
# Bundle actor, critics and optimisers into the task policy. The action range
# is taken from the first action dimension of the reference `env`.
policy_2 = TaskPolicy(
    actor_2, actor_optim_2, critic1_2, critic1_optim_2, critic2_2, critic2_optim_2,
    action_range=[env.action_space.low[0], env.action_space.high[0]],
    tau=args.tau, gamma=args.gamma, alpha=args.alpha,
    reward_normalization=args.rew_norm,
    ignore_done=args.ignore_done,
    estimation_step=args.n_step)


# collector
# Per task policy: one training collector with its own replay buffer of
# `buffer_size` entries, and one test collector without a buffer argument.
train_collector_1 = Collector(
    policy_1, train_envs_1, ReplayBuffer(args.buffer_size))
test_collector_1 = Collector(policy_1, test_envs_1)
train_collector_2 = Collector(
    policy_2, train_envs_2, ReplayBuffer(args.buffer_size))
test_collector_2 = Collector(policy_2, test_envs_2)
# train_collector.collect(n_step=args.buffer_size)
# log
# TensorBoard events go to <logdir>/<task>/sac_distral; save_fn writes its
# checkpoint into the same directory.
log_path = os.path.join(args.logdir, args.task, 'sac_distral')
writer = SummaryWriter(log_path)


def save_fn(policy):
    """Write the policy's weights to ``policy.pth`` inside ``log_path``.

    Args:
        policy: any object exposing ``state_dict()``.

    The file name is fixed, so each call replaces the previous checkpoint.
    """
    weights = policy.state_dict()
    checkpoint_file = os.path.join(log_path, 'policy.pth')
    torch.save(weights, checkpoint_file)


def stop_fn(mean_rewards):
    """Decide whether training may stop early.

    Args:
        mean_rewards: the reward statistic passed in by the trainer
            (presumably the mean test reward -- confirm against the trainer).

    Returns:
        The result of ``mean_rewards >= env.spec.reward_threshold``, or
        ``False`` when the environment spec defines no threshold.
    """
    threshold = env.spec.reward_threshold
    if threshold is None:
        # Without a registered threshold there is nothing to compare against;
        # comparing with None would raise TypeError, so never stop early.
        return False
    return mean_rewards >= threshold


def _report_and_watch(policy, train_result):
    """Print a trainer summary, then roll out one episode with `policy`.

    Args:
        policy: policy to switch to eval mode and run on the module-level `env`.
        train_result: value returned by the trainer; pretty-printed as is.

    Returns:
        ``(collector, episode_stats)``: the collector created for the rollout
        and the result of its single-episode ``collect`` call. The caller
        rebinds the module-level ``collector`` / ``result`` names with them,
        exactly as the inlined code used to.
    """
    pprint.pprint(train_result)
    # Let's watch its performance!
    policy.eval()
    watcher = Collector(policy, env)
    episode_stats = watcher.collect(n_episode=1, render=args.render)
    print(f'Final reward: {episode_stats["rew"]}, length: {episode_stats["len"]}')
    return watcher, episode_stats


# Alternate between training the two task policies and training the distilled
# policy, for 10 rounds.
# NOTE(review): all trainers receive the same save_fn, which writes to a single
# 'policy.pth' -- the three policies presumably overwrite each other's
# checkpoint; confirm whether that is intended.
for itr in range(10):
    # Phase 1: train each task policy in turn (policy 1 first, then policy 2).
    for task_policy, task_train_collector, task_test_collector in (
            (policy_1, train_collector_1, test_collector_1),
            (policy_2, train_collector_2, test_collector_2)):
        result = offpolicy_trainer(
            task_policy, task_train_collector, task_test_collector, args.epoch,
            args.step_per_epoch, args.collect_per_step, args.test_num,
            args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
        if __name__ == '__main__':
            collector, result = _report_and_watch(task_policy, result)

    if itr == 0:
        # One-time setup of the distilled policy. It runs after the first
        # round of task-policy training so the lowered Pendulum threshold
        # below only takes effect from this point on.
        if args.task == 'Pendulum-v0':
            env.spec.reward_threshold = -300  # lower the goal
        net = ActorProb(
            Net(1, args.state_shape, device=args.device),
            args.action_shape, args.max_action, args.device
        ).to(args.device)

        optim = torch.optim.Adam(net.parameters(), lr=args.il_lr)

        distilled_policy = DistilledPolicy(
            net, optim,
            action_range=[env.action_space.low[0], env.action_space.high[0]],
            mode='continuous')

        distilled_policy_test_collector = Collector(
            distilled_policy,
            DummyVectorEnv(
                [lambda: gym.make(args.task) for _ in range(args.test_num)])
        )

    # Phase 2: train the distilled policy from both task collectors.
    train_collector_1.reset()
    train_collector_2.reset()

    # Fixed 20 epochs, each one fifth of the task policies' steps per epoch.
    result = Distral_offpolicy_trainer(
        distilled_policy, train_collector_1, train_collector_2,
        distilled_policy_test_collector, 20,
        args.step_per_epoch // 5, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    if __name__ == '__main__':
        collector, result = _report_and_watch(distilled_policy, result)

Epoch #1:  60%|######    | 1440/2400 [00:16<00:10, 88.78it/s, env_step=16000, len=200.00, n/ep=8.00, n/st=1600.00, rew=-185.96, rew_std=130.38, v/ep=25.28, v/st=5056.09]                                             
Epoch #1:   0%|          | 0/2400 [00:00<?, ?it/s]

{'best_result': '-222.02 ± 120.40',
 'best_reward': -222.02202675443905,
 'duration': '16.22s',
 'test_episode': 100.0,
 'test_speed': '13745.45 step/s',
 'test_step': 20000,
 'test_time': '1.46s',
 'train_episode': 80.0,
 'train_speed': '1083.53 step/s',
 'train_step': 16000,
 'train_time/collector': '3.05s',
 'train_time/model': '11.72s'}
Final reward: -132.40889986952368, length: 200.0


Epoch #1:  60%|######    | 1440/2400 [00:17<00:11, 82.62it/s, env_step=16000, len=200.00, n/ep=8.00, n/st=1600.00, rew=-177.92, rew_std=155.62, v/ep=26.32, v/st=5264.67]                                               


{'best_result': '-172.05 ± 117.14',
 'best_reward': -172.054880058112,
 'duration': '17.43s',
 'test_episode': 200.0,
 'test_speed': '14034.90 step/s',
 'test_step': 40000,
 'test_time': '2.85s',
 'train_episode': 80.0,
 'train_speed': '1097.39 step/s',
 'train_step': 16000,
 'train_time/collector': '3.06s',
 'train_time/model': '11.52s'}
Final reward: -124.99528559181373, length: 200.0


Epoch #1: 481it [00:03, 136.55it/s, env_step=8000, len=200, n/ep=8, n/st=1600, rew=-317.25, v/ep=25.98, v/st=5196.65]                         
Epoch #2:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #1: test_reward: -604.957454 ± 470.947396, best_reward: -604.957454 ± 470.947396 in #1


Epoch #2: 481it [00:03, 140.45it/s, env_step=17600, len=200, n/ep=8, n/st=1600, rew=-281.49, v/ep=26.30, v/st=5259.27]                         
Epoch #3:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #2: test_reward: -750.766964 ± 91.319559, best_reward: -604.957454 ± 470.947396 in #1


Epoch #3: 481it [00:03, 140.87it/s, env_step=27200, len=200, n/ep=8, n/st=1600, rew=-170.78, v/ep=25.97, v/st=5194.41]                         
Epoch #4:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #3: test_reward: -698.085307 ± 113.068231, best_reward: -604.957454 ± 470.947396 in #1


Epoch #4: 481it [00:03, 138.05it/s, env_step=36800, len=200, n/ep=8, n/st=1600, rew=-217.66, v/ep=24.95, v/st=4989.43]                         
Epoch #5:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #4: test_reward: -528.700706 ± 154.796381, best_reward: -528.700706 ± 154.796381 in #4


Epoch #5: 481it [00:03, 136.27it/s, env_step=46400, len=200, n/ep=8, n/st=1600, rew=-208.67, v/ep=23.46, v/st=4692.60]                         
Epoch #6:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #5: test_reward: -404.197649 ± 224.046713, best_reward: -404.197649 ± 224.046713 in #5


Epoch #6: 481it [00:03, 138.71it/s, env_step=56000, len=200, n/ep=8, n/st=1600, rew=-214.63, v/ep=26.02, v/st=5203.12]                         
Epoch #7:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #6: test_reward: -491.545951 ± 168.746814, best_reward: -404.197649 ± 224.046713 in #5


Epoch #7: 481it [00:03, 137.44it/s, env_step=65600, len=200, n/ep=8, n/st=1600, rew=-225.14, v/ep=26.59, v/st=5318.13]                         
Epoch #8:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #7: test_reward: -507.884893 ± 191.929256, best_reward: -404.197649 ± 224.046713 in #5


Epoch #8: 481it [00:03, 136.65it/s, env_step=75200, len=200, n/ep=8, n/st=1600, rew=-318.80, v/ep=25.86, v/st=5172.70]                         
Epoch #9:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #8: test_reward: -597.134398 ± 243.003785, best_reward: -404.197649 ± 224.046713 in #5


Epoch #9: 481it [00:03, 139.98it/s, env_step=84800, len=200, n/ep=8, n/st=1600, rew=-160.56, v/ep=26.62, v/st=5323.62]                         
Epoch #10:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #9: test_reward: -548.594743 ± 158.957307, best_reward: -404.197649 ± 224.046713 in #5


Epoch #10: 481it [00:03, 139.70it/s, env_step=94400, len=200, n/ep=8, n/st=1600, rew=-260.96, v/ep=26.13, v/st=5226.04]                         
Epoch #11:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #10: test_reward: -392.737516 ± 255.773762, best_reward: -392.737516 ± 255.773762 in #10


Epoch #11: 481it [00:03, 140.18it/s, env_step=104000, len=200, n/ep=8, n/st=1600, rew=-205.89, v/ep=26.56, v/st=5312.85]                         
Epoch #12:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #11: test_reward: -368.477263 ± 222.701019, best_reward: -368.477263 ± 222.701019 in #11


Epoch #12: 481it [00:03, 140.47it/s, env_step=113600, len=200, n/ep=8, n/st=1600, rew=-208.01, v/ep=26.70, v/st=5340.30]                         
Epoch #13:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #12: test_reward: -485.146436 ± 195.913573, best_reward: -368.477263 ± 222.701019 in #11


Epoch #13: 481it [00:03, 139.41it/s, env_step=123200, len=200, n/ep=8, n/st=1600, rew=-196.44, v/ep=26.24, v/st=5248.08]                         
Epoch #14:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #13: test_reward: -369.032541 ± 252.038044, best_reward: -368.477263 ± 222.701019 in #11


Epoch #14: 481it [00:03, 138.89it/s, env_step=132800, len=200, n/ep=8, n/st=1600, rew=-245.36, v/ep=25.00, v/st=5000.06]                         
Epoch #15:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #14: test_reward: -372.231960 ± 263.245327, best_reward: -368.477263 ± 222.701019 in #11


Epoch #15: 481it [00:03, 140.07it/s, env_step=142400, len=200, n/ep=8, n/st=1600, rew=-255.63, v/ep=26.42, v/st=5284.06]                         
Epoch #16:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #15: test_reward: -340.367904 ± 219.201510, best_reward: -340.367904 ± 219.201510 in #15


Epoch #16: 481it [00:03, 140.70it/s, env_step=152000, len=200, n/ep=8, n/st=1600, rew=-264.18, v/ep=26.53, v/st=5305.46]                         
Epoch #17:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #16: test_reward: -543.094916 ± 280.280518, best_reward: -340.367904 ± 219.201510 in #15


Epoch #17: 481it [00:03, 139.71it/s, env_step=161600, len=200, n/ep=8, n/st=1600, rew=-230.84, v/ep=26.26, v/st=5252.09]                         
Epoch #18:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #17: test_reward: -452.765935 ± 275.591989, best_reward: -340.367904 ± 219.201510 in #15


Epoch #18: 481it [00:03, 140.04it/s, env_step=171200, len=200, n/ep=8, n/st=1600, rew=-255.96, v/ep=26.58, v/st=5315.14]                         
Epoch #19:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #18: test_reward: -320.896624 ± 228.002984, best_reward: -320.896624 ± 228.002984 in #18


Epoch #19: 481it [00:03, 139.55it/s, env_step=180800, len=200, n/ep=8, n/st=1600, rew=-177.45, v/ep=26.05, v/st=5209.48]                         
Epoch #20:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #19: test_reward: -394.481045 ± 252.956734, best_reward: -320.896624 ± 228.002984 in #18


Epoch #20: 481it [00:03, 140.40it/s, env_step=190400, len=200, n/ep=8, n/st=1600, rew=-208.60, v/ep=26.68, v/st=5335.31]                         
Epoch #1:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch #20: test_reward: -332.793022 ± 208.452075, best_reward: -320.896624 ± 228.002984 in #18
{'best_result': '-320.90 ± 228.00',
 'best_reward': -320.89662382868005,
 'duration': '97.64s',
 'test_episode': 2000.0,
 'test_speed': '14062.94 step/s',
 'test_step': 400000,
 'test_time': '28.44s',
 'train_episode': 480.0,
 'train_speed': '1387.37 step/s',
 'train_step': 96000,
 'train_time/collector': '18.46s',
 'train_time/model': '50.73s'}
Final reward: -253.36232680599616, length: 200.0


Epoch #1:   0%|          | 0/2400 [00:01<?, ?it/s, env_step=1600, len=200.00, n/ep=8.00, n/st=1600.00, rew=-240.19, rew_std=73.05, v/ep=25.69, v/st=5137.35]
Epoch #1:   0%|          | 0/2400 [00:00<?, ?it/s]

{'best_result': '-208.34 ± 127.22',
 'best_reward': -208.33845814146082,
 'duration': '1.74s',
 'test_episode': 100.0,
 'test_speed': '14047.58 step/s',
 'test_step': 20000,
 'test_time': '1.42s',
 'train_episode': 8.0,
 'train_speed': '5046.60 step/s',
 'train_step': 1600,
 'train_time/collector': '0.31s',
 'train_time/model': '0.01s'}
Final reward: -260.3295328745536, length: 200.0


Epoch #1:   0%|          | 0/2400 [00:01<?, ?it/s, env_step=1600, len=200.00, n/ep=8.00, n/st=1600.00, rew=-219.15, rew_std=102.31, v/ep=26.30, v/st=5260.08]
Epoch #1:   0%|          | 0/480 [00:00<?, ?it/s]

{'best_result': '-198.14 ± 119.25',
 'best_reward': -198.14143381569207,
 'duration': '1.70s',
 'test_episode': 100.0,
 'test_speed': '14372.89 step/s',
 'test_step': 20000,
 'test_time': '1.39s',
 'train_episode': 8.0,
 'train_speed': '5167.00 step/s',
 'train_step': 1600,
 'train_time/collector': '0.30s',
 'train_time/model': '0.01s'}
Final reward: -127.19963327142413, length: 200.0


Epoch #1: 481it [00:03, 140.55it/s, env_step=8000, len=200, n/ep=8, n/st=1600, rew=-226.01, v/ep=26.78, v/st=5355.20]                         
Epoch #2:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #1: test_reward: -340.829180 ± 227.785150, best_reward: -340.829180 ± 227.785150 in #1


Epoch #2: 481it [00:03, 142.27it/s, env_step=17600, len=200, n/ep=8, n/st=1600, rew=-250.82, v/ep=26.75, v/st=5349.32]                         
Epoch #3:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #2: test_reward: -421.689846 ± 276.842119, best_reward: -340.829180 ± 227.785150 in #1


Epoch #3: 481it [00:03, 141.03it/s, env_step=27200, len=200, n/ep=8, n/st=1600, rew=-156.84, v/ep=26.68, v/st=5335.44]                         
Epoch #4:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #3: test_reward: -520.911354 ± 255.808791, best_reward: -340.829180 ± 227.785150 in #1


Epoch #4: 481it [00:03, 140.62it/s, env_step=36800, len=200, n/ep=8, n/st=1600, rew=-181.01, v/ep=26.49, v/st=5298.45]                         
Epoch #5:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #4: test_reward: -376.586001 ± 261.626159, best_reward: -340.829180 ± 227.785150 in #1


Epoch #5: 481it [00:03, 139.61it/s, env_step=46400, len=200, n/ep=8, n/st=1600, rew=-206.05, v/ep=26.56, v/st=5311.11]                         
Epoch #6:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #5: test_reward: -462.374770 ± 329.470723, best_reward: -340.829180 ± 227.785150 in #1


Epoch #6: 481it [00:03, 139.62it/s, env_step=56000, len=200, n/ep=8, n/st=1600, rew=-221.40, v/ep=26.35, v/st=5270.72]                         
Epoch #7:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #6: test_reward: -378.144179 ± 222.875786, best_reward: -340.829180 ± 227.785150 in #1


Epoch #7: 481it [00:03, 136.55it/s, env_step=65600, len=200, n/ep=8, n/st=1600, rew=-233.54, v/ep=25.43, v/st=5086.25]                         
Epoch #8:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #7: test_reward: -393.466410 ± 215.599042, best_reward: -340.829180 ± 227.785150 in #1


Epoch #8: 481it [00:03, 140.41it/s, env_step=75200, len=200, n/ep=8, n/st=1600, rew=-209.63, v/ep=26.49, v/st=5298.14]                         
Epoch #9:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #8: test_reward: -469.960507 ± 197.303619, best_reward: -340.829180 ± 227.785150 in #1


Epoch #9: 481it [00:03, 140.10it/s, env_step=84800, len=200, n/ep=8, n/st=1600, rew=-202.85, v/ep=26.55, v/st=5309.39]                         
Epoch #10:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #9: test_reward: -381.124571 ± 232.391805, best_reward: -340.829180 ± 227.785150 in #1


Epoch #10: 481it [00:03, 125.23it/s, env_step=94400, len=200, n/ep=8, n/st=1600, rew=-301.90, v/ep=23.40, v/st=4680.19]                         
Epoch #11:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #10: test_reward: -322.097723 ± 203.623002, best_reward: -322.097723 ± 203.623002 in #10


Epoch #11: 481it [00:04, 116.59it/s, env_step=104000, len=200, n/ep=8, n/st=1600, rew=-161.30, v/ep=22.50, v/st=4500.66]                         
Epoch #12:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch #11: test_reward: -367.457856 ± 222.676505, best_reward: -322.097723 ± 203.623002 in #10


Epoch #12:  67%|######6   | 320/480 [00:02<00:01, 121.12it/s, env_step=110400, len=200, n/ep=8, n/st=1600, rew=-190.62, v/ep=23.25, v/st=4649.17]


KeyboardInterrupt: 