In [1]:
import copy
import glob
import os
import time
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import algo
from arguments import get_args
from envs import make_vec_envs_ViZDoom, make_vec_envs
from model import Policy
from storage import RolloutStorage
from utils import get_vec_normalize
from visualize import visdom_plot

import os
import json

In [2]:
# All artefacts of this run (histories, hyperparameters, model) live in one folder.
result_dir = "../ppo_record_6/"
os.makedirs(result_dir, exist_ok=True)

# Build the three record-file paths in one pass; order matches `fileL` below.
reward_history, loss_history, parameter_save = (
    os.path.join(result_dir, record_name)
    for record_name in ("reward_history", "loss_history", "parameter.json")
)
fileL = [reward_history, loss_history, parameter_save]

In [3]:
# Clear out record files left over from a previous run so the append-mode
# history writers further down start from empty files.
for stale_path in fileL:
    try:
        os.remove(stale_path)
    except OSError:
        # Nothing to delete (or not removable) -- move on to the next path.
        continue

In [4]:
# Parse the baseline run configuration; the preset functions defined in this
# cell overwrite selected fields on this namespace in place.
args = get_args()

def ppo_hyper(namespace=None):
    """Apply the PPO hyperparameter preset to an argument namespace.

    Args:
        namespace: Object whose attributes are overwritten with the preset.
            When omitted, the notebook-level ``args`` is used, so the
            existing zero-argument call ``ppo_hyper()`` behaves as before.

    Returns:
        None. The namespace is modified in place.
    """
    # Resolve the target lazily so the global is only touched when no
    # explicit namespace is supplied.
    target = args if namespace is None else namespace

    target.algo = "ppo"
    target.use_gae = False
    target.lr = 1e-5
    target.value_loss_coef = 1.0
    target.num_processes = 8
    target.num_steps = 512
    target.num_mini_batch = 4
    target.entropy_coef = 0.01
    target.gamma = 1.0
    target.ppo_epoch = 10
    target.clip_param = 0.1

def a2c_hyper(namespace=None):
    """Apply the A2C hyperparameter preset to an argument namespace.

    Only the fields listed here are overwritten; every other field keeps
    whatever value the namespace already holds.

    Args:
        namespace: Object whose attributes are overwritten with the preset.
            When omitted, the notebook-level ``args`` is used, so the
            existing zero-argument call ``a2c_hyper()`` behaves as before.

    Returns:
        None. The namespace is modified in place.
    """
    # Resolve the target lazily so the global is only touched when no
    # explicit namespace is supplied.
    target = args if namespace is None else namespace

    target.algo = "a2c"
    target.gamma = 1.0
    target.num_steps = 512
    target.num_processes = 8
    target.entropy_coef = 0.001

In [5]:
# Overwrite the PPO-related fields on `args`, then display the resulting
# namespace as the cell output.
ppo_hyper()
args

Namespace(add_timestep=False, algo='ppo', alpha=0.99, clip_param=0.1, cuda=True, entropy_coef=0.01, env_name='PongNoFrameskip-v4', eps=1e-05, eval_interval=None, gamma=1.0, log_dir='/tmp/gym/', log_interval=10, lr=1e-05, max_grad_norm=0.5, no_cuda=False, num_frames=10000000.0, num_mini_batch=4, num_processes=8, num_steps=512, port=8097, ppo_epoch=10, recurrent_policy=False, save_dir='./trained_models/', save_interval=100, seed=1, tau=0.95, use_gae=False, value_loss_coef=1.0, vis=False, vis_interval=100)

In [6]:
# Snapshot of the settings that define this run; written to disk as JSON in
# the next cell. Key order here is the order they appear in the saved file.
parameters = {
    field: getattr(args, field)
    for field in (
        'algo', 'gamma', 'num_steps', 'num_processes', 'value_loss_coef',
        'eps', 'entropy_coef', 'lr', 'use_gae', 'max_grad_norm', 'seed',
    )
}

# Settings that only exist for one of the two algorithms.
if parameters['algo'] == "ppo":
    parameters.update(
        clip_param=args.clip_param,
        ppo_epoch=args.ppo_epoch,
        num_mini_batch=args.num_mini_batch,
    )
elif parameters['algo'] == "a2c":
    parameters['alpha'] = args.alpha

# tau is only recorded when GAE is switched on.
if parameters['use_gae']:
    parameters['tau'] = args.tau
    

In [7]:
# Persist the run's hyperparameters next to its results. The `with` block
# closes the handle deterministically, so the JSON is flushed to disk before
# training starts (the original left the file object open and unreferenced).
with open(parameter_save, "w") as param_file:
    json.dump(parameters, param_file)

In [8]:
# One update consumes num_steps transitions from each of num_processes
# environments, so the frame budget is floor-divided by both in turn.
frame_budget = int(args.num_frames)
num_updates = (frame_budget // args.num_steps) // args.num_processes

# Seed the CPU generator, and the CUDA generator too when running on GPU.
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [9]:
# Fail fast on an unsupported algorithm, before any environment is created.
# Previously every value other than 'a2c' silently fell through to PPO.
if args.algo not in ('a2c', 'ppo'):
    raise ValueError(
        "Unsupported algo {!r}: this notebook only builds 'a2c' or 'ppo' agents".format(args.algo))

# Restrict PyTorch to a single intra-op CPU thread.
torch.set_num_threads(1)
device = torch.device("cuda:0" if args.cuda else "cpu")

# Vectorised ViZDoom environments, one per process.
envs = make_vec_envs_ViZDoom(args.seed, args.num_processes, device)
# Alternative generic environment factory, kept for reference:
#envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
#                        args.gamma, args.log_dir, args.add_timestep, device, False)

# Policy/value network sized from the environment's observation and action spaces.
actor_critic = Policy(envs.observation_space.shape, envs.action_space,
    base_kwargs={'recurrent': args.recurrent_policy})
actor_critic.to(device)

# Build the optimiser wrapper matching the selected algorithm.
if args.algo == 'a2c':
    agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
else:  # 'ppo' (guaranteed by the validation above)
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)

# Buffer holding num_steps transitions for each of the num_processes environments.
rollouts = RolloutStorage(args.num_steps, args.num_processes,
                    envs.observation_space.shape, envs.action_space,
                    actor_critic.recurrent_hidden_state_size)

  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  wa

In [10]:
# Start every environment and place the first observation in slot 0 of the
# rollout buffer, then move the buffer to the training device.
obs = envs.reset()
rollouts.obs[0].copy_(obs)
rollouts.to(device)

# Sliding windows over the most recently reported episodes; the logged
# averages are taken over at most `recent_count` entries.
recent_count = 50
episode_rewards, episode_lengths = (
    deque(maxlen=recent_count) for _ in range(2)
)

In [11]:
# Accumulator lists; the loop below never writes to them. They are kept so
# that any cell outside this view that reads these names still finds them.
acc_step = []
acc_reward = []
acc_length = []

for j in range(num_updates):
    # --- Collect one rollout: args.num_steps transitions from every environment.
    for step in range(args.num_steps):
        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

        # Observe reward and next obs
        obs, reward, done, infos = envs.step(action)

        # Record totals for the environments that reported them on this step.
        for info in infos:
            if 'Episode_Total_Reward' in info:
                episode_rewards.append(info['Episode_Total_Reward'])
            if 'Episode_Total_Len' in info:
                episode_lengths.append(info['Episode_Total_Len'])

        # Mask is 0.0 for environments whose episode just ended, 1.0 otherwise.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])
        rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

    # --- Bootstrap value for the observation after the last stored step.
    # No explicit .detach() is needed: nothing computed under no_grad tracks gradients.
    with torch.no_grad():
        next_value = actor_critic.get_value(rollouts.obs[-1],
                                            rollouts.recurrent_hidden_states[-1],
                                            rollouts.masks[-1])

    rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

    # --- Optimise on the collected rollout, then recycle the buffer.
    value_loss, action_loss, dist_entropy = agent.update(rollouts)

    rollouts.after_update()

    # Environment steps consumed so far, summed over all processes.
    total_num_steps = (j + 1) * args.num_processes * args.num_steps

    with open(loss_history, 'a') as the_file:
        the_file.write("{} {} {} {} \n".format(total_num_steps, value_loss, action_loss, dist_entropy))

    if len(episode_rewards) > 0:
        # Compute the window averages once and reuse them for console and file.
        avg_reward = np.mean(episode_rewards)
        avg_length = np.mean(episode_lengths)

        # The leading number is an environment-step count, so it is labelled
        # "steps" (it was previously mislabelled "updates").
        print("{} steps: avg reward = {}, avg length = {}".format(total_num_steps, avg_reward,
                                                                  avg_length))

        with open(reward_history, 'a') as the_file:
            the_file.write('{} {} {} \n'.format(total_num_steps, avg_reward, avg_length))
        
    

4096 updates: avg reward = -574.8571428571429, avg length = 425.14285714285717
8192 updates: avg reward = -545.6, avg length = 454.4
12288 updates: avg reward = -546.6666666666666, avg length = 453.3333333333333
16384 updates: avg reward = -553.939393939394, avg length = 446.06060606060606
20480 updates: avg reward = -553.560975609756, avg length = 446.4390243902439
24576 updates: avg reward = -553.92, avg length = 446.08
28672 updates: avg reward = -551.36, avg length = 448.64
32768 updates: avg reward = -554.56, avg length = 445.44
36864 updates: avg reward = -557.12, avg length = 442.88
40960 updates: avg reward = -553.28, avg length = 446.72
45056 updates: avg reward = -548.8, avg length = 451.2
49152 updates: avg reward = -550.08, avg length = 449.92
53248 updates: avg reward = -545.6, avg length = 454.4
57344 updates: avg reward = -549.44, avg length = 450.56
61440 updates: avg reward = -539.2, avg length = 460.8
65536 updates: avg reward = -544.32, avg length = 455.68
69632 upda

585728 updates: avg reward = -524.48, avg length = 475.52
589824 updates: avg reward = -530.88, avg length = 469.12
593920 updates: avg reward = -525.12, avg length = 474.88
598016 updates: avg reward = -529.6, avg length = 470.4
602112 updates: avg reward = -522.56, avg length = 477.44
606208 updates: avg reward = -521.92, avg length = 478.08
610304 updates: avg reward = -512.32, avg length = 487.68
614400 updates: avg reward = -513.6, avg length = 486.4
618496 updates: avg reward = -524.48, avg length = 475.52
622592 updates: avg reward = -532.8, avg length = 467.2
626688 updates: avg reward = -530.24, avg length = 469.76
630784 updates: avg reward = -518.08, avg length = 481.92
634880 updates: avg reward = -520.0, avg length = 480.0
638976 updates: avg reward = -513.6, avg length = 486.4
643072 updates: avg reward = -493.76, avg length = 506.24
647168 updates: avg reward = -494.4, avg length = 505.6
651264 updates: avg reward = -494.4, avg length = 505.6
655360 updates: avg reward =

1167360 updates: avg reward = -463.68, avg length = 536.32
1171456 updates: avg reward = -470.72, avg length = 529.28
1175552 updates: avg reward = -438.72, avg length = 561.28
1179648 updates: avg reward = -429.12, avg length = 570.88
1183744 updates: avg reward = -428.48, avg length = 571.52
1187840 updates: avg reward = -424.0, avg length = 576.0
1191936 updates: avg reward = -435.52, avg length = 564.48
1196032 updates: avg reward = -443.84, avg length = 556.16
1200128 updates: avg reward = -434.88, avg length = 565.12
1204224 updates: avg reward = -457.92, avg length = 542.08
1208320 updates: avg reward = -464.32, avg length = 535.68
1212416 updates: avg reward = -475.2, avg length = 524.8
1216512 updates: avg reward = -469.44, avg length = 530.56
1220608 updates: avg reward = -463.04, avg length = 536.96
1224704 updates: avg reward = -493.76, avg length = 506.24
1228800 updates: avg reward = -475.2, avg length = 524.8
1232896 updates: avg reward = -464.32, avg length = 535.68
123

1740800 updates: avg reward = -450.24, avg length = 549.76
1744896 updates: avg reward = -464.32, avg length = 535.68
1748992 updates: avg reward = -442.56, avg length = 557.44
1753088 updates: avg reward = -449.6, avg length = 550.4
1757184 updates: avg reward = -470.08, avg length = 529.92
1761280 updates: avg reward = -473.28, avg length = 526.72
1765376 updates: avg reward = -469.44, avg length = 530.56
1769472 updates: avg reward = -459.2, avg length = 540.8
1773568 updates: avg reward = -483.52, avg length = 516.48
1777664 updates: avg reward = -486.08, avg length = 513.92
1781760 updates: avg reward = -479.68, avg length = 520.32
1785856 updates: avg reward = -478.4, avg length = 521.6
1789952 updates: avg reward = -486.08, avg length = 513.92
1794048 updates: avg reward = -482.88, avg length = 517.12
1798144 updates: avg reward = -480.32, avg length = 519.68
1802240 updates: avg reward = -472.64, avg length = 527.36
1806336 updates: avg reward = -477.76, avg length = 522.24
181

2314240 updates: avg reward = -438.72, avg length = 561.28
2318336 updates: avg reward = -470.72, avg length = 529.28
2322432 updates: avg reward = -466.24, avg length = 533.76
2326528 updates: avg reward = -458.56, avg length = 541.44
2330624 updates: avg reward = -475.2, avg length = 524.8
2334720 updates: avg reward = -504.0, avg length = 496.0
2338816 updates: avg reward = -503.36, avg length = 496.64
2342912 updates: avg reward = -488.0, avg length = 512.0
2347008 updates: avg reward = -480.96, avg length = 519.04
2351104 updates: avg reward = -478.4, avg length = 521.6
2355200 updates: avg reward = -474.56, avg length = 525.44
2359296 updates: avg reward = -472.64, avg length = 527.36
2363392 updates: avg reward = -470.08, avg length = 529.92
2367488 updates: avg reward = -475.2, avg length = 524.8
2371584 updates: avg reward = -488.0, avg length = 512.0
2375680 updates: avg reward = -485.44, avg length = 514.56
2379776 updates: avg reward = -498.24, avg length = 501.76
2383872 u

2887680 updates: avg reward = -431.68, avg length = 568.32
2891776 updates: avg reward = -427.84, avg length = 572.16
2895872 updates: avg reward = -440.64, avg length = 559.36
2899968 updates: avg reward = -439.36, avg length = 560.64
2904064 updates: avg reward = -425.28, avg length = 574.72
2908160 updates: avg reward = -424.64, avg length = 575.36
2912256 updates: avg reward = -431.04, avg length = 568.96
2916352 updates: avg reward = -367.76, avg length = 612.24
2920448 updates: avg reward = -356.24, avg length = 623.76
2924544 updates: avg reward = -337.68, avg length = 642.32
2928640 updates: avg reward = -340.24, avg length = 639.76
2932736 updates: avg reward = -323.6, avg length = 656.4
2936832 updates: avg reward = -326.16, avg length = 653.84
2940928 updates: avg reward = -339.6, avg length = 640.4
2945024 updates: avg reward = -353.68, avg length = 626.32
2949120 updates: avg reward = -401.6, avg length = 598.4
2953216 updates: avg reward = -387.52, avg length = 612.48
295

3461120 updates: avg reward = -473.28, avg length = 526.72
3465216 updates: avg reward = -483.52, avg length = 516.48
3469312 updates: avg reward = -485.44, avg length = 514.56
3473408 updates: avg reward = -469.44, avg length = 530.56
3477504 updates: avg reward = -463.68, avg length = 536.32
3481600 updates: avg reward = -446.4, avg length = 553.6
3485696 updates: avg reward = -444.48, avg length = 555.52
3489792 updates: avg reward = -435.52, avg length = 564.48
3493888 updates: avg reward = -409.92, avg length = 590.08
3497984 updates: avg reward = -410.56, avg length = 589.44
3502080 updates: avg reward = -428.48, avg length = 571.52
3506176 updates: avg reward = -418.88, avg length = 581.12
3510272 updates: avg reward = -477.12, avg length = 522.88
3514368 updates: avg reward = -480.96, avg length = 519.04
3518464 updates: avg reward = -488.64, avg length = 511.36
3522560 updates: avg reward = -495.68, avg length = 504.32
3526656 updates: avg reward = -482.88, avg length = 517.12

4034560 updates: avg reward = -462.4, avg length = 537.6
4038656 updates: avg reward = -456.0, avg length = 544.0
4042752 updates: avg reward = -470.08, avg length = 529.92
4046848 updates: avg reward = -466.24, avg length = 533.76
4050944 updates: avg reward = -440.64, avg length = 559.36
4055040 updates: avg reward = -440.64, avg length = 559.36
4059136 updates: avg reward = -461.76, avg length = 538.24
4063232 updates: avg reward = -445.76, avg length = 554.24
4067328 updates: avg reward = -441.28, avg length = 558.72
4071424 updates: avg reward = -448.96, avg length = 551.04
4075520 updates: avg reward = -438.08, avg length = 561.92
4079616 updates: avg reward = -450.24, avg length = 549.76
4083712 updates: avg reward = -470.72, avg length = 529.28
4087808 updates: avg reward = -477.12, avg length = 522.88
4091904 updates: avg reward = -486.72, avg length = 513.28
4096000 updates: avg reward = -466.88, avg length = 533.12
4100096 updates: avg reward = -487.36, avg length = 512.64
4

4608000 updates: avg reward = -459.2, avg length = 540.8
4612096 updates: avg reward = -451.52, avg length = 548.48
4616192 updates: avg reward = -467.52, avg length = 532.48
4620288 updates: avg reward = -479.68, avg length = 520.32
4624384 updates: avg reward = -459.2, avg length = 540.8
4628480 updates: avg reward = -457.92, avg length = 542.08
4632576 updates: avg reward = -458.56, avg length = 541.44
4636672 updates: avg reward = -463.04, avg length = 536.96
4640768 updates: avg reward = -475.2, avg length = 524.8
4644864 updates: avg reward = -468.8, avg length = 531.2
4648960 updates: avg reward = -480.32, avg length = 519.68
4653056 updates: avg reward = -492.48, avg length = 507.52
4657152 updates: avg reward = -473.28, avg length = 526.72
4661248 updates: avg reward = -489.28, avg length = 510.72
4665344 updates: avg reward = -482.24, avg length = 517.76
4669440 updates: avg reward = -473.28, avg length = 526.72
4673536 updates: avg reward = -482.88, avg length = 517.12
46776

5185536 updates: avg reward = -462.4, avg length = 537.6
5189632 updates: avg reward = -431.68, avg length = 568.32
5193728 updates: avg reward = -421.44, avg length = 578.56
5197824 updates: avg reward = -416.96, avg length = 583.04
5201920 updates: avg reward = -419.52, avg length = 580.48
5206016 updates: avg reward = -425.92, avg length = 574.08
5210112 updates: avg reward = -441.92, avg length = 558.08
5214208 updates: avg reward = -436.16, avg length = 563.84
5218304 updates: avg reward = -447.04, avg length = 552.96
5222400 updates: avg reward = -463.68, avg length = 536.32
5226496 updates: avg reward = -468.16, avg length = 531.84
5230592 updates: avg reward = -442.56, avg length = 557.44
5234688 updates: avg reward = -436.8, avg length = 563.2
5238784 updates: avg reward = -420.16, avg length = 579.84
5242880 updates: avg reward = -445.12, avg length = 554.88
5246976 updates: avg reward = -441.92, avg length = 558.08
5251072 updates: avg reward = -443.84, avg length = 556.16
5

5758976 updates: avg reward = -349.2, avg length = 630.8
5763072 updates: avg reward = -362.0, avg length = 618.0
5767168 updates: avg reward = -408.08, avg length = 571.92
5771264 updates: avg reward = -463.68, avg length = 536.32
5775360 updates: avg reward = -465.6, avg length = 534.4
5779456 updates: avg reward = -459.2, avg length = 540.8
5783552 updates: avg reward = -471.36, avg length = 528.64
5787648 updates: avg reward = -466.24, avg length = 533.76
5791744 updates: avg reward = -450.24, avg length = 549.76
5795840 updates: avg reward = -448.32, avg length = 551.68
5799936 updates: avg reward = -439.36, avg length = 560.64
5804032 updates: avg reward = -427.2, avg length = 572.8
5808128 updates: avg reward = -432.96, avg length = 567.04
5812224 updates: avg reward = -440.64, avg length = 559.36
5816320 updates: avg reward = -451.52, avg length = 548.48
5820416 updates: avg reward = -456.64, avg length = 543.36
5824512 updates: avg reward = -455.36, avg length = 544.64
5828608

6381568 updates: avg reward = -414.4, avg length = 585.6
6385664 updates: avg reward = -404.8, avg length = 595.2
6389760 updates: avg reward = -377.28, avg length = 622.72
6393856 updates: avg reward = -379.2, avg length = 620.8
6397952 updates: avg reward = -370.24, avg length = 629.76
6402048 updates: avg reward = -381.12, avg length = 618.88
6406144 updates: avg reward = -377.92, avg length = 622.08
6410240 updates: avg reward = -401.6, avg length = 598.4
6414336 updates: avg reward = -384.96, avg length = 615.04
6418432 updates: avg reward = -401.6, avg length = 598.4
6422528 updates: avg reward = -413.12, avg length = 586.88
6426624 updates: avg reward = -424.0, avg length = 576.0
6430720 updates: avg reward = -409.28, avg length = 590.72
6434816 updates: avg reward = -390.08, avg length = 609.92
6438912 updates: avg reward = -402.88, avg length = 597.12
6443008 updates: avg reward = -436.16, avg length = 563.84
6447104 updates: avg reward = -422.72, avg length = 577.28
6451200 u

6955008 updates: avg reward = -338.32, avg length = 641.68
6959104 updates: avg reward = -338.96, avg length = 641.04
6963200 updates: avg reward = -347.92, avg length = 632.08
6967296 updates: avg reward = -387.6, avg length = 592.4
6971392 updates: avg reward = -410.0, avg length = 570.0
6975488 updates: avg reward = -404.88, avg length = 575.12
6979584 updates: avg reward = -397.2, avg length = 582.8
6983680 updates: avg reward = -418.88, avg length = 581.12
6987776 updates: avg reward = -431.04, avg length = 568.96
6991872 updates: avg reward = -390.08, avg length = 609.92
6995968 updates: avg reward = -381.76, avg length = 618.24
7000064 updates: avg reward = -356.16, avg length = 643.84
7004160 updates: avg reward = -299.2, avg length = 700.8
7008256 updates: avg reward = -288.32, avg length = 711.68
7012352 updates: avg reward = -303.68, avg length = 696.32
7016448 updates: avg reward = -267.84, avg length = 732.16
7020544 updates: avg reward = -287.68, avg length = 712.32
70246

7528448 updates: avg reward = -258.24, avg length = 741.76
7532544 updates: avg reward = -255.68, avg length = 744.32
7536640 updates: avg reward = -283.84, avg length = 716.16
7540736 updates: avg reward = -288.96, avg length = 711.04
7544832 updates: avg reward = -288.96, avg length = 711.04
7548928 updates: avg reward = -301.76, avg length = 698.24
7553024 updates: avg reward = -331.84, avg length = 668.16
7557120 updates: avg reward = -351.68, avg length = 648.32
7561216 updates: avg reward = -378.56, avg length = 621.44
7565312 updates: avg reward = -358.72, avg length = 641.28
7569408 updates: avg reward = -276.88, avg length = 703.12
7573504 updates: avg reward = -274.32, avg length = 705.68
7577600 updates: avg reward = -294.8, avg length = 685.2
7581696 updates: avg reward = -301.84, avg length = 678.16
7585792 updates: avg reward = -302.48, avg length = 677.52
7589888 updates: avg reward = -314.64, avg length = 665.36
7593984 updates: avg reward = -329.36, avg length = 650.64

8105984 updates: avg reward = -239.28, avg length = 700.72
8110080 updates: avg reward = -292.96, avg length = 667.04
8114176 updates: avg reward = -196.96, avg length = 763.04
8118272 updates: avg reward = -253.84, avg length = 726.16
8122368 updates: avg reward = -257.04, avg length = 722.96
8126464 updates: avg reward = -260.24, avg length = 719.76
8130560 updates: avg reward = -228.88, avg length = 751.12
8134656 updates: avg reward = -245.52, avg length = 734.48
8138752 updates: avg reward = -217.36, avg length = 762.64
8142848 updates: avg reward = -265.92, avg length = 734.08
8146944 updates: avg reward = -256.32, avg length = 743.68
8151040 updates: avg reward = -281.28, avg length = 718.72
8155136 updates: avg reward = -294.72, avg length = 705.28
8159232 updates: avg reward = -322.24, avg length = 677.76
8163328 updates: avg reward = -346.56, avg length = 653.44
8167424 updates: avg reward = -312.64, avg length = 687.36
8171520 updates: avg reward = -294.08, avg length = 705.

8679424 updates: avg reward = -268.56, avg length = 711.44
8683520 updates: avg reward = -269.2, avg length = 710.8
8687616 updates: avg reward = -328.64, avg length = 671.36
8691712 updates: avg reward = -367.68, avg length = 632.32
8695808 updates: avg reward = -377.28, avg length = 622.72
8699904 updates: avg reward = -369.6, avg length = 630.4
8704000 updates: avg reward = -410.56, avg length = 589.44
8708096 updates: avg reward = -369.68, avg length = 610.32
8712192 updates: avg reward = -365.2, avg length = 614.8
8716288 updates: avg reward = -347.92, avg length = 632.08
8720384 updates: avg reward = -329.36, avg length = 650.64
8724480 updates: avg reward = -314.64, avg length = 665.36
8728576 updates: avg reward = -299.28, avg length = 680.72
8732672 updates: avg reward = -273.04, avg length = 706.96
8736768 updates: avg reward = -244.24, avg length = 735.76
8740864 updates: avg reward = -241.04, avg length = 738.96
8744960 updates: avg reward = -298.56, avg length = 701.44
874

9252864 updates: avg reward = -309.52, avg length = 670.48
9256960 updates: avg reward = -315.92, avg length = 664.08
9261056 updates: avg reward = -288.4, avg length = 691.6
9265152 updates: avg reward = -312.72, avg length = 667.28
9269248 updates: avg reward = -313.36, avg length = 666.64
9273344 updates: avg reward = -303.76, avg length = 676.24
9277440 updates: avg reward = -308.24, avg length = 671.76
9281536 updates: avg reward = -299.28, avg length = 680.72
9285632 updates: avg reward = -278.8, avg length = 701.2
9289728 updates: avg reward = -335.68, avg length = 664.32
9293824 updates: avg reward = -369.6, avg length = 630.4
9297920 updates: avg reward = -306.32, avg length = 673.68
9302016 updates: avg reward = -306.96, avg length = 673.04
9306112 updates: avg reward = -287.76, avg length = 692.24
9310208 updates: avg reward = -277.52, avg length = 702.48
9314304 updates: avg reward = -312.08, avg length = 667.92
9318400 updates: avg reward = -284.56, avg length = 695.44
932

9826304 updates: avg reward = -370.88, avg length = 629.12
9830400 updates: avg reward = -377.92, avg length = 622.08
9834496 updates: avg reward = -357.52, avg length = 622.48
9838592 updates: avg reward = -382.48, avg length = 597.52
9842688 updates: avg reward = -390.8, avg length = 589.2
9846784 updates: avg reward = -383.76, avg length = 596.24
9850880 updates: avg reward = -355.04, avg length = 604.96
9854976 updates: avg reward = -364.64, avg length = 595.36
9859072 updates: avg reward = -349.28, avg length = 610.72
9863168 updates: avg reward = -339.68, avg length = 620.32
9867264 updates: avg reward = -369.68, avg length = 610.32
9871360 updates: avg reward = -321.68, avg length = 658.32
9875456 updates: avg reward = -303.12, avg length = 676.88
9879552 updates: avg reward = -287.12, avg length = 692.88
9883648 updates: avg reward = -347.2, avg length = 652.8
9887744 updates: avg reward = -322.88, avg length = 677.12
9891840 updates: avg reward = -313.28, avg length = 686.72
9

In [12]:
# Sanity check: largest value found in obs[0][0], i.e. the first entry along
# the two leading axes of the most recent observation batch.
obs[0][0].max()

tensor(0.9386, device='cuda:0')

In [13]:
# Store the trained network's weights (state dict only) alongside the run's
# other record files.
MODEL_SAVE_PATH = os.path.join(result_dir, "model.save")
trained_weights = actor_critic.state_dict()
torch.save(trained_weights, MODEL_SAVE_PATH)